In [0]:
from fnmatch import fnmatch

from pyspark.sql.functions import col, lit
from pyspark.sql.types import *

def rejectFile(fileName):
    """Move a file from the ``lvl1`` directory to the ``lvl1-rejected`` directory.

    Source and destination are ``abfss://`` paths built from the
    notebook-level ``container``, ``storageAccountName`` and ``dirs`` values.

    Args:
        fileName: Name of the file, relative to ``dirs["lvl1"]``.

    Returns:
        The result of ``dbutils.fs.mv`` for the move.
    """
    # Both paths share the same storage root; build it once.
    storageRoot = "abfss://" + container + "@" + storageAccountName + ".dfs.core.windows.net/"
    return dbutils.fs.mv(
        storageRoot + dirs["lvl1"] + fileName,
        storageRoot + dirs["lvl1-rejected"] + fileName,
        recurse=True,
    )

def checkIfNotRejectedDir(filePath):
    """Raise when ``filePath`` is not the container's ``lvl1`` directory.

    ``filePath`` (with a trailing ``/`` appended) is compared against
    ``container + "/" + dirs["lvl1"]``; both names come from the notebook
    scope.

    Args:
        filePath: Directory path to check, without a trailing slash.

    Raises:
        Exception: If the path differs from the ``lvl1`` directory.
    """
    triggeredDir = filePath + "/"
    watchedDir = container + "/" + dirs["lvl1"]
    if triggeredDir == watchedDir:
        return
    # NOTE: signalling this without an exception might be preferable.
    raise Exception(f"False-trigger, generated by moving files to \'rejected\'.")

def checkFileNamePattern(fileName, pattern):
    """Reject the file when its name does not fit the expected pattern.

    A name is accepted when it matches ``pattern`` (``fnmatch`` wildcards)
    and contains at most one ``.``. Otherwise the file is moved to
    'rejected' via ``rejectFile`` and an exception is raised.

    Args:
        fileName: Name of the file to validate.
        pattern: ``fnmatch``-style pattern the name must match.

    Raises:
        Exception: If the name is not accepted; the message reports the
            pattern that was checked and the result of the move.
    """
    if fnmatch(fileName, pattern) and len(fileName.split(".")) <= 2:
        return
    # Perform the move before building the message so the side effect is
    # explicit rather than hidden inside string formatting.
    moved = rejectFile(fileName)
    # Report the pattern actually used for the check (the parameter), not an
    # unrelated notebook-level variable.
    raise Exception(f"The filename ({fileName}) does not match the pattern ({pattern}) or contains more than a single \'.\'. File moved to \'rejected\' (success?: {moved}).")
        
def checkHeader(fileName):
    """Reject the file when its first line does not look like a header row.

    The first line of the CSV in ``dirs["lvl1"]`` is read as data
    (``header=false``). The file is considered headerless when it yields no
    rows at all, or when any cell of the first line consists only of digits.
    In that case the file is moved to 'rejected' via ``rejectFile``.

    Args:
        fileName: Name of the file, relative to ``dirs["lvl1"]``.

    Raises:
        Exception: If the file is considered headerless; the message
            reports the result of the move.
    """
    firstRows = (spark.read.format("csv")
                 .option("header", "false")
                 .load("abfss://"+container+"@"+storageAccountName+".dfs.core.windows.net/"+
                       dirs["lvl1"]+fileName)
                 .limit(1)
                 .collect()
                )
    # An empty file yields no rows: there is no header line to inspect, so
    # treat it as headerless instead of failing with IndexError.
    # Empty cells can come back as None, which has no isdigit(); skip them.
    looksLikeHeader = bool(firstRows) and not any(
        cell is not None and cell.isdigit() for cell in firstRows[0]
    )
    if looksLikeHeader:
        return
    moved = rejectFile(fileName)
    raise Exception(f"File ({fileName}) has no headers. File moved to \'rejected\' (success?: {moved}).")

def getColumnsForCleaning(config, data, fileName=None):
    """Rename, cast and complete the columns of ``data`` as described by ``config``.

    For each entry of ``config``, the column names ``data`` had on entry are
    matched case-insensitively (``fnmatch`` wildcards) against the entry's
    ``targetName`` and ``namePatterns``:

    * exactly one match: the column is renamed to ``targetName`` and cast
      to the entry's ``type``;
    * no match and ``nullable`` is truthy: a null-filled column named
      ``targetName`` (cast to ``type``) is added;
    * no match and not nullable, or more than one match: the file is moved
      to 'rejected' via ``rejectFile`` and an exception is raised.

    Args:
        config: Iterable of entries exposing ``asDict()`` with the keys
            ``targetName``, ``namePatterns``, ``type`` and ``nullable``.
        data: DataFrame whose columns are to be cleaned.
        fileName: Name of the file ``data`` was loaded from; used in error
            messages and for rejection. When omitted, the notebook-level
            ``fileName`` variable is used (the previous behaviour).

    Returns:
        The transformed DataFrame.

    Raises:
        Exception: If a non-nullable column is missing or several columns
            match the same entry.
    """
    if fileName is None:
        # Backward compatibility: earlier callers relied on a global.
        fileName = globals().get("fileName")

    dataCols = data.columns
    for colSettings in config:
        settings = colSettings.asDict()
        target = settings["targetName"]
        patterns = [target] + settings["namePatterns"]
        upperPatterns = [p.upper() for p in patterns]  # assuming: case-insensitivity
        # dict.fromkeys de-duplicates while keeping the column order, so the
        # error message below is deterministic.
        matches = list(dict.fromkeys(
            c for c in dataCols
            if any(fnmatch(c.upper(), p) for p in upperPatterns)
        ))

        if not matches:
            if not settings["nullable"]:
                moved = rejectFile(fileName)
                raise Exception(f"File ({fileName}) has no column matching any pattern ({patterns}) and is set as no-nullable. File moved to \'rejected\' (success?: {moved}).")
            # No matching column, but the column is nullable: add a
            # null-filled column named after the target.
            data = data.withColumn(target, lit(None).cast(settings["type"]))
        elif len(matches) > 1:
            moved = rejectFile(fileName)
            raise Exception(f"File ({fileName}) has multiple columns ({matches}) matching the patterns ({patterns}). File moved to \'rejected\' (success?: {moved}).")
        else:
            data = data.withColumnRenamed(matches[0], target)
            data = data.withColumn(target, col(target).cast(settings["type"]))
    return data

def buildSchema(config):
    """Build a ``StructType`` with one field per configuration entry.

    Each entry's ``asDict()`` must provide ``targetName`` (field name),
    ``type`` (one of "Integer", "Long", "String", "Float", "Double",
    "Boolean") and ``nullable``.

    Args:
        config: Iterable of entries exposing ``asDict()``.

    Returns:
        A ``StructType`` whose fields follow the order of ``config``.

    Raises:
        KeyError: If an entry's ``type`` is not one of the supported names.
    """
    sparkTypeByName = {
        "Integer": IntegerType(),
        "Long": LongType(),
        "String": StringType(),
        "Float": FloatType(),
        "Double": DoubleType(),
        "Boolean": BooleanType()
    }
    fields = []
    for entry in config:
        settings = entry.asDict()
        fieldName = settings["targetName"]
        fieldType = sparkTypeByName[settings["type"]]
        fields.append(StructField(fieldName, fieldType, nullable=settings["nullable"]))
    return StructType(fields)