In [0]:
%run ./setAccess

In [0]:
%run ./cleaningBase

In [0]:
%run ./cleaningFunctions

In [0]:
# Pipeline parameters are delivered to this notebook as widgets.
# User-defined parameters, set once per database:
rawFileNamePattern, configFileName = (
    dbutils.widgets.get(widgetName)
    for widgetName in ("rawFileNamePattern", "configFileName")
)

# Trigger parameters, describing the file that started this run:
fileName, filePath = (
    dbutils.widgets.get(widgetName)
    for widgetName in ("fileName", "filePath")
)

# Pre-validate the file before anything is read from storage.
# NOTE(review): the three helpers are defined outside this cell (presumably in
# the notebooks pulled in via %run); how they signal a failed check is not
# visible here.
checkIfNotRejectedDir(filePath)
checkFileNamePattern(fileName, rawFileNamePattern)
checkHeader(fileName)

# Read the per-database cleaning config (a multi-line JSON document) from the
# config directory and collect it to the driver as a list of Rows.
config = (
    spark.read
    .format("json")
    .options(multiline="true")
    .load(
        "".join([
            "abfss://", container, "@", storageAccountName,
            ".dfs.core.windows.net/", dirs["config"], configFileName,
        ])
    )
    .collect()
)

# Read the CSV to be cleaned: the first line is treated as the header, column
# types are inferred, and records the parser considers malformed are dropped.
data = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true", mode="dropmalformed")
    .load(
        "".join([
            "abfss://", container, "@", storageAccountName,
            ".dfs.core.windows.net/", dirs["lvl1"], fileName,
        ])
    )
)

# Match the columns of the loaded data against the config assumptions
# (getColumnsForCleaning is defined outside this cell).
data = getColumnsForCleaning(config, data)

# Keep only the configured target columns and rebuild the DataFrame from its
# RDD so that the schema returned by buildSchema(config) is imposed on it.
data = spark.createDataFrame(
    data.select(*(row.asDict()["targetName"] for row in config)).rdd,
    schema=buildSchema(config),
)

# Remove duplicates, keyed on the columns flagged as "primaryKey" in config.
primaryKeyColumns = [
    conf.asDict()["targetName"] for conf in config if conf.asDict()["primaryKey"]
]
# Guard the empty case: dropDuplicates([]) de-duplicates on *no* columns, which
# Spark evaluates as one single group and therefore collapses the whole
# DataFrame to one row. When the config flags no primary key, fall back to
# removing only rows that are identical in every column.
if primaryKeyColumns:
    data = data.dropDuplicates(primaryKeyColumns)
else:
    data = data.dropDuplicates()

# Apply the default null handling and the config-specified cleaning functions
# (see: Notebook: cleaningFunctions) to every configured column.
from pyspark.sql.functions import trim,col
for conf in config:
    # Convert the Row to a dict once per column instead of on every lookup.
    columnConf = conf.asDict()
    colName = columnConf["targetName"]
    if columnConf["type"]=="String":
        # String columns: strip surrounding whitespace, then null-fill with "NA".
        data = (data.withColumn(colName, trim(col(colName)))
               .fillna("NA",subset=colName))
    else:
        # Every other configured type: null-fill with 0.
        data = data.fillna(0,subset=colName)
    # Each entry is indexed as [0] = key into `func`, [1] = argument expression.
    # "applyFunctions" can come back as null (e.g. a key omitted from one JSON
    # entry); treat that as "nothing to apply" instead of failing on iteration.
    for function in columnConf["applyFunctions"] or []:
        # SECURITY NOTE(review): eval() executes arbitrary Python taken from the
        # config file. This is only acceptable while that file is fully trusted
        # and write-protected; if the arguments are always plain literals,
        # consider ast.literal_eval instead - TODO confirm against real configs.
        data = func[function[0]](
            colName,
            data,
            eval(function[1])
        )
    
# Persist the cleaned data as parquet under dirs["lvl2"], replacing any output
# already stored under the same name.
# Only the final extension is stripped from fileName: cutting at the first dot
# would map e.g. "a.2020.csv" and "a.2021.csv" to the same target, and with
# mode("overwrite") one would silently replace the other.
# No "header" option is set: it is a CSV option and the parquet writer does
# not use it.
(data.write
    .format("parquet")
    .mode("overwrite")
    .save("abfss://"+container+"@"+storageAccountName+".dfs.core.windows.net/"+
          dirs["lvl2"]+fileName.rsplit(".", 1)[0]+".parquet"))