## Business Central merge data notebook
In this part the files in the delta folder will be merged with the Lakehouse table.
- It first iterates over the folders to append the new files to the existing table.
- After that it will remove all duplicates by sorting the table.
- Finally it will remove all records from the table that were deleted in Business Central.

Please adjust the parameters in the first code cell before running the notebook.

In [None]:
%%pyspark
# settings
# Spark session options used by all following cells.
# The V-Order key below was previously misspelled ("sprk.…"), so the intended option was never set.
spark.conf.set("spark.sql.parquet.vorder.enabled", "true")
spark.conf.set("spark.microsoft.delta.optimizewrite.enabled", "true")
spark.conf.set("spark.sql.parquet.filterPushdown", "true")
spark.conf.set("spark.sql.parquet.mergeSchema", "false")
spark.conf.set("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version", "2")
spark.conf.set("spark.sql.delta.commitProtocol.enabled", "true")

# file paths
folder_path_spark = 'Files/deltas/'  # delta folder as passed to spark.read; this is mostly the default
folder_path_json = '/lakehouse/default/Files/'  # folder holding the *.cdm.json schema files; this is mostly the default
folder_path_reset = '/lakehouse/default/Files/reset/'  # folder holding the reset marker files; this is mostly the default
folder_path = '/lakehouse/default/Files/deltas/'  # delta folder as seen by os/glob; this is mostly the default

# parameters
workspace = 'businessCentral'  # can also be a GUID
Lakehouse = 'businessCentral'  # can also be a GUID
Remove_delta = True  # will remove the delta files if everything is processed
Drop_table_if_mismatch = False  # option to drop the table if the json file has different columns than the table
no_Partition = 258  # how many partitions are used in the dataframe, a good starting point might be 2-4 partitions per CPU core in your Spark cluster
DecimalFormat = 'float'  # how to format the decimal numbers, can be 'float' or 'decimal(10,3)'. If you change this it will be a breaking change for the table
DateTimeFormat = 'date'  # how to format the datetime, can be 'timestamp' or 'date'. If you change this it will be a breaking change for the table

In [None]:
%%pyspark
import os
import json
from pyspark.sql.types import *

# Optionally drop every Lakehouse table whose number of columns differs from
# the number of attributes in its <entity>.cdm.json definition.
if Drop_table_if_mismatch:
    # Look the catalog up once instead of once per schema file; names of
    # tables dropped below are removed from this set again.
    existing_tables = {t.name for t in spark.catalog.listTables()}

    for filename in os.listdir(folder_path_json):
        # only entity definitions are of interest; exclude the manifest files
        if "manifest" in filename or not filename.endswith(".cdm.json"):
            continue

        table_name = filename.replace("-", "").replace(".cdm.json", "")
        if table_name not in existing_tables:
            continue

        # count number of columns in the existing table
        SQL_Query = "SELECT * FROM " + Lakehouse + "." + table_name
        num_cols_table = len(spark.sql(SQL_Query).columns)

        # count number of columns in the json file; the context manager
        # closes the file handle, which was previously left open
        with open(folder_path_json + filename) as f:
            schema = json.load(f)
        num_names = len(schema["definitions"][0]["hasAttributes"])

        if num_cols_table != num_names:
            spark.sql("DROP TABLE IF EXISTS " + Lakehouse + "." + table_name)
            existing_tables.discard(table_name)

In [None]:
%%pyspark
import os
import glob
from pyspark.sql.types import *

# Every file in the reset folder names a table that has to be rebuilt:
# drop that table and then remove the marker file itself.
reset_markers = os.listdir(folder_path_reset) if os.path.exists(folder_path_reset) else []

for filename in reset_markers:
    # table name = marker file name without dashes and without ".txt"
    table_name = filename.replace("-", "").replace(".txt", "")

    df = spark.sql(f"DROP TABLE IF EXISTS {Lakehouse}.{table_name}")

    marker_path = folder_path_reset + '/' + filename
    try:
        os.remove(marker_path)
    except OSError as e:
        # report the failure and continue with the next marker
        print(f"Error: {filename} : {e.strerror}")

In [None]:
%%pyspark
import glob
import json
import os

from pyspark.sql import Window
from pyspark.sql.functions import col
from pyspark.sql.functions import desc
from pyspark.sql.functions import row_number
from pyspark.sql.types import *
from pyspark.sql.utils import AnalysisException
# Merge the exported delta files of every entity folder into its Lakehouse table.
#
# For each sub-folder of `folder_path` that contains files:
#   1. read all CSV files of the folder and cast the columns as described by
#      the entity's <folder>.cdm.json;
#   2. union the result with the existing table, if there is one;
#   3. remove every record for which a delete marker (a row without
#      SystemCreatedAt) is present;
#   4. keep only the most recently modified row per record;
#   5. overwrite the Delta table and, when Remove_delta is set, delete the
#      files that were processed.
#
# Each folder is processed once. The folder used to be re-read and re-merged
# once per file it contained, deleting one file per pass.
file_list = []

# CDM 'dataFormat' -> Spark SQL type name. Formats that are not listed are
# handed to cast() unchanged.
# NOTE(review): Int64 is cast to a 32-bit "int"; larger values would presumably
# become null. "bigint" looks more appropriate but changes the table schema.
spark_type_by_format = {
    "String": "string",
    "Guid": "string",
    "Code": "string",  # was "object", which is not a Spark SQL type name
    "Option": "string",
    "Date": "date",
    "Time": "string",
    "DateTime": DateTimeFormat,
    "Duration": "string",  # was "timedelta", which is not a Spark SQL type name; the raw value is kept
    "Decimal": DecimalFormat,
    "Boolean": "boolean",
    "Integer": "int",
    "Int64": "int",
    "Int32": "int",
}

for entry in os.scandir(folder_path):
    if not entry.is_dir():
        continue

    # List the delta files before Spark reads the folder, so that only files
    # captured here are deleted afterwards.
    file_list = glob.glob(folder_path + entry.name + '/*')
    if not file_list:
        continue  # nothing to merge for this entity

    table_name = entry.name.replace("-", "")
    df_new = (
        spark.read.option("minPartitions", no_Partition)
        .format("csv")
        .option("header", "true")
        .load(folder_path_spark + entry.name + "/*")
    )

    # Parse the schema to get column names and data types
    with open(folder_path_json + entry.name + ".cdm.json") as f:
        schema = json.load(f)
    attributes = schema["definitions"][0]["hasAttributes"]
    column_names = [attr["name"] for attr in attributes]
    ContainsCompany = '$Company' in column_names

    for attr in attributes:
        col_name = attr["name"]
        col_type = spark_type_by_format.get(attr["dataFormat"], attr["dataFormat"])
        if col_name == 'SystemModifiedAt-2000000003':
            # must be a timestamp regardless of DateTimeFormat: it decides
            # which version of a record is the latest
            col_type = "timestamp"
        # The former override that cast 'SystemModifiedBy-2000000004' to
        # timestamp has been removed; the column now follows its own dataFormat.
        # NOTE(review): tables written with the old cast presumably store that
        # column as timestamp and may need a reset before this type can be
        # written - confirm.
        df_new = df_new.withColumn(col_name, df_new[col_name].cast(col_type))

    # check if the table exists
    if table_name in [t.name for t in spark.catalog.listTables()]:
        # read the old data into a new dataframe and union with the new dataframe
        SQL_Query = "SELECT * FROM " + Lakehouse + "." + table_name
        df_old = spark.sql(SQL_Query)
        df_new = df_new.union(df_old).repartition(no_Partition)

    # a record is identified by its systemId, per company when the entity has one
    if ContainsCompany:
        key_columns = ['$Company', 'systemId-2000000000']
    else:
        key_columns = ['systemId-2000000000']

    # delete all records for which a delete marker is present
    df_deletes = df_new.filter(df_new['SystemCreatedAt-2000000001'].isNull())
    df_new = df_new.join(df_deletes, key_columns, 'leftanti')

    # Remove duplicates: keep the row with the latest SystemModifiedAt per
    # record. A window with row_number() is used because dropDuplicates()
    # after orderBy() does not guarantee which of the duplicate rows survives.
    latest_first = Window.partitionBy(*key_columns).orderBy(desc('SystemModifiedAt-2000000003'))
    df_new = (
        df_new.withColumn('_row_rank', row_number().over(latest_first))
        .filter(col('_row_rank') == 1)
        .drop('_row_rank')
    )

    # overwrite the table (or create it when it is not there yet)
    df_new.write.mode("overwrite").format("delta").save("Tables/" + table_name)

    # delete the files that were merged
    if Remove_delta:
        for delta_file in file_list:
            try:
                os.remove(delta_file)
            except OSError as e:  # report the failure and continue with the next file
                print(f"Error: {delta_file} : {e.strerror}")
        file_list = []  # clear the list