## Business Central merge data notebook
In this part the files in the delta folder will be merged with the Lakehouse table.
- It first iterates over the folders to append their files to the existing table.
- After that it will remove all duplicates by sorting the table.
- Finally, it will remove all records from the table that have been deleted in Business Central.

Please change the parameters in the first part.

In [11]:
%%pyspark
# settings
# Session-level Spark options, applied before any table is read or written.
# The V-Order key must start with "spark."; a misspelled key is stored
# silently and has no effect.
spark.conf.set("spark.sql.parquet.vorder.enabled", "true")
spark.conf.set("spark.microsoft.delta.optimizewrite.enabled", "true")
spark.conf.set("spark.sql.parquet.filterPushdown", "true")
spark.conf.set("spark.sql.parquet.mergeSchema", "false")
spark.conf.set("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version", "2")
spark.conf.set("spark.sql.delta.commitProtocol.enabled", "true")

# parameters
# folder_path_spark: delta folder as handed to spark.read (relative path)
folder_path_spark = 'Files/deltas/'  # this is mostly the default
# folder_path: the same folder as seen by os.scandir / os.listdir / os.remove
folder_path = '/lakehouse/default/Files/deltas/'  # this is mostly the default
# workspace: not referenced by the other cells visible in this notebook
workspace = 'fabricTest'  # can also be a GUID
# Lakehouse: used as the qualifier in "<Lakehouse>.<table>" SQL statements
Lakehouse = 'businessCentral'  # can also be a GUID
# Remove_delta: when True, each delta file is deleted after it was appended
Remove_delta = True

StatementMeta(, daf5b681-1577-45ce-aec5-3696d3564aa0, 13, Finished, Available)

In [12]:
%%pyspark
import json
import os
from pyspark.sql.types import *
from pyspark.sql.utils import AnalysisException
from pyspark.sql.functions import col

# Append every delta file to the Delta table of its entity.
#
# Each sub-folder of folder_path holds the delta files of one entity. The
# column types come from "<folder name>.cdm.json"; every CSV file is read,
# cast column by column, appended to "Tables/<table name>" and, when
# Remove_delta is set, deleted afterwards.

# CDM dataFormat -> Spark SQL type name handed to Column.cast().
# A format that is not listed here is passed to cast() unchanged.
cdm_to_spark_type = {
    "String": "string",
    "Guid": "string",
    # was "object", which is not a type name Spark can parse; treated as
    # text like the other string formats
    "Code": "string",
    "Option": "string",
    "Date": "date",
    # NOTE(review): "date" drops the time of day; "timestamp" would keep it,
    # but changes the column type of tables that already exist - confirm.
    "DateTime": "date",
    # NOTE(review): "timedelta" is not a Spark SQL type name, so cast() will
    # fail if a Duration column shows up - the right target type needs to
    # be confirmed against the exported data.
    "Duration": "timedelta",
    # NOTE(review): Spark's "float" is single precision - confirm that this
    # is precise enough for the exported decimals.
    "Decimal": "float",
    "Boolean": "boolean",
    "Integer": "int",
    # NOTE(review): Spark's "int" is 32-bit - confirm whether Int64 columns
    # need "bigint".
    "Int64": "int",
    "Int32": "int",
}

for entry in os.scandir(folder_path):
    if not entry.is_dir():
        continue

    table_name = entry.name.replace("-", "")

    filenames = os.listdir(folder_path + entry.name)
    if not filenames:
        # nothing to import, so the schema file is not needed either
        continue

    # The schema depends only on the folder: read it once per folder and
    # close the file again right away.
    with open("/lakehouse/default/Files/" + entry.name + ".cdm.json") as f:
        schema = json.load(f)

    # Parse the schema to get column names and the Spark type of each column
    columns = [
        (attr["name"], cdm_to_spark_type.get(attr["dataFormat"], attr["dataFormat"]))
        for attr in schema["definitions"][0]["hasAttributes"]
    ]

    for filename in filenames:
        # if there is a reset in the file then drop the whole table
        if "-reset" in filename:
            spark.sql("DROP TABLE IF EXISTS " + Lakehouse + "." + table_name)

        df_spark = (
            spark.read.format("csv")
            .option("header", "true")
            .load(folder_path_spark + entry.name + "/" + filename)
        )

        # the CSV is read as text; cast every column named in the schema
        for col_name, col_type in columns:
            df_spark = df_spark.withColumn(col_name, df_spark[col_name].cast(col_type))

        df_spark.write.mode("append").format("delta").save("Tables/" + table_name)

        # delete the file
        if Remove_delta:
            os.remove(folder_path + entry.name + "/" + filename)

StatementMeta(, daf5b681-1577-45ce-aec5-3696d3564aa0, 14, Finished, Available)

  [(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]
  [(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]


In [13]:
%%pyspark
from pyspark.sql.types import *
import json

# Remove the records that were deleted in Business Central.
#
# A row whose 'SystemCreatedAt-2000000001' is null is treated as a delete
# marker: every row that shares its 'systemId-2000000000' (the marker row
# included) is removed from the table.

# Open the manifest.cdm.json to read all the tables
with open("/lakehouse/default/Files/deltas.manifest.cdm.json") as f:
    schema = json.load(f)

table_names = [attr["entityName"] for attr in schema["entities"]]
for table_name in table_names:
    table_name = table_name.replace("-", "")

    # remove deletes
    SQL_Query = "SELECT * FROM " + Lakehouse + "." + table_name
    df_spark = spark.sql(SQL_Query)
    df_deletes = df_spark.filter(df_spark['SystemCreatedAt-2000000001'].isNull())
    deleted_ids = df_deletes.select('systemId-2000000000').distinct()

    # without delete markers the table would be rewritten unchanged
    if not deleted_ids.take(1):
        continue

    # One anti join drops every row whose systemId has a delete marker; the
    # ids stay on the cluster instead of being collected to the driver and
    # turned into one filter per id. Rows with a null systemId never match
    # the join and are kept.
    df_spark = df_spark.join(deleted_ids, on='systemId-2000000000', how='left_anti')

    df_spark.write.mode("overwrite").format("delta").save("Tables/" + table_name)

StatementMeta(, daf5b681-1577-45ce-aec5-3696d3564aa0, 15, Finished, Available)

  [(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]
  [(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]


In [14]:
%%pyspark
from pyspark.sql.types import *
from pyspark.sql.functions import desc
import json

# Keep only the most recently modified row of every record.
#
# For each table listed in the manifest the rows are ranked per
# 'systemId-2000000000' by 'SystemModifiedAt-2000000003' (newest first) and
# only the first row of every group is written back.
from pyspark.sql import Window
from pyspark.sql.functions import col, row_number

# Open the manifest.cdm.json to read all the tables
with open("/lakehouse/default/Files/deltas.manifest.cdm.json") as f:
    schema = json.load(f)

table_names = [attr["entityName"] for attr in schema["entities"]]
for table_name in table_names:
    table_name = table_name.replace("-", "")

    # remove duplicates by filtering on systemID and systemModifiedAt fields
    SQL_Query = "SELECT * FROM " + Lakehouse + "." + table_name
    df_spark = spark.sql(SQL_Query)

    # dropDuplicates() after orderBy() does not guarantee which row of a
    # group survives; ranking inside a window makes the choice explicit.
    newest_first = Window.partitionBy('systemId-2000000000').orderBy(
        desc('SystemModifiedAt-2000000003')
    )
    df_spark = (
        df_spark.withColumn("_row_number", row_number().over(newest_first))
        .filter(col("_row_number") == 1)
        .drop("_row_number")  # helper column must not end up in the table
    )

    df_spark.write.mode("overwrite").format("delta").save("Tables/" + table_name)

StatementMeta(, daf5b681-1577-45ce-aec5-3696d3564aa0, 16, Finished, Available)

  [(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]
  [(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]
