
### Connecting ADLS Gen2 with Azure Databricks Notebook

In [0]:
'''
Prerequisites (performed once, outside this notebook):
1. Create the App Registration in the Azure Portal.
2. Create a client secret under Certificates & secrets for that App Registration.
3. In ADLS Gen2, under IAM, add a Role Assignment for the App Registration's user
   to establish the connectivity between ADLS and Azure Databricks.
4. Store the client secret value in a Databricks secret scope. Never paste the
   secret value into the notebook: notebooks are shared, exported and versioned.
'''

storage_account = "dmgproductionadls2"                     # Storage account name
application_id = "7729fa92-7775-4604-9238-20b3d90fdecf"    # Application ID from App Registration
directory_id = "72a18fcd-e918-4ce9-aea4-0d2739f1bc60"      # Directory ID from App Registration

# Location of the client secret inside Databricks secrets.
# TODO(review): the scope/key names below are placeholders - set them to the
# scope and key that actually hold the App Registration secret in this workspace.
secret_scope = "adls-secrets"
secret_key = "adls-client-secret"
client_secret = dbutils.secrets.get(scope=secret_scope, key=secret_key)

# Every setting below is keyed by the same storage-account host name.
account_host = f"{storage_account}.dfs.core.windows.net"

spark.conf.set(f"fs.azure.account.auth.type.{account_host}", "OAuth")
spark.conf.set(f"fs.azure.account.oauth.provider.type.{account_host}", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
spark.conf.set(f"fs.azure.account.oauth2.client.id.{account_host}", application_id)
spark.conf.set(f"fs.azure.account.oauth2.client.secret.{account_host}", client_secret)   # Value from App Registration -> Certificates & secrets, read from the secret scope
spark.conf.set(f"fs.azure.account.oauth2.client.endpoint.{account_host}", f"https://login.microsoftonline.com/{directory_id}/oauth2/token")


### Accessing the bronze layer and getting all the file names for further processing

In [0]:
# Build the list of table folder names found directly under the bronze Sales path.
bronze_base_path = 'abfss://bronze@dmgproductionadls2.dfs.core.windows.net/Sales/'

# A single listing call is enough; each entry's `name` is kept exactly as returned
# (the printed result shows folder names carrying a trailing "/").
tables_list = [entry.name for entry in dbutils.fs.ls(bronze_base_path)]
print(tables_list)

['CountryRegionCurrency/', 'CreditCard/', 'Currency/', 'CurrencyRate/', 'Customer/', 'PersonCreditCard/', 'SalesOrderDetail/', 'SalesOrderHeader/', 'SalesOrderHeaderSalesReason/', 'SalesPerson/', 'SalesPersonQuotaHistory/', 'SalesReason/', 'SalesTaxRate/', 'SalesTerritory/', 'SalesTerritoryHistory/', 'ShoppingCartItem/', 'SpecialOffer/', 'SpecialOfferProduct/', 'Store/']



### Reading the Bronze layer files into Spark dataframes dynamically

In [0]:
# Load every bronze table into a Spark DataFrame, stored in a dictionary keyed by "df_<TableName>".
df = {}
for table in tables_list:
    # `table` is a folder name such as "Customer/"; the Parquet file inside it shares the folder's name.
    file_name = table.split("/")[0]
    file_df = "df_" + file_name
    file_path = bronze_base_path + table + file_name + ".parquet"

    # No `header` option here: it is a CSV reader option and Parquet files carry their own schema.
    df[file_df] = spark.read.parquet(file_path)

    print(f'The name of the dataframe is {file_df} and count is {df[file_df].count()}')

    # show() prints the rows itself and returns None, so call it on its own line
    # instead of passing its result to print().
    print("The data is:")
    df[file_df].show(2)

The name of the dataframe is df_CountryRegionCurrency and count is 109
+-----------------+------------+--------------------+
|CountryRegionCode|CurrencyCode|        ModifiedDate|
+-----------------+------------+--------------------+
|               AE|         AED|2014-02-08 10:17:...|
|               AR|         ARS|2014-02-08 10:17:...|
+-----------------+------------+--------------------+
only showing top 2 rows

The data is:
 None
The name of the dataframe is df_CreditCard and count is 19118
+------------+------------+--------------+--------+-------+-------------------+
|CreditCardID|    CardType|    CardNumber|ExpMonth|ExpYear|       ModifiedDate|
+------------+------------+--------------+--------+-------+-------------------+
|           1|SuperiorCard|33332664695310|      11|   2006|2013-07-29 00:00:00|
|           2| Distinguish|55552127249722|       8|   2005|2013-12-05 00:00:00|
+------------+------------+--------------+--------+-------+-------------------+
only showing top 2 


## Data Cleaning


#### Defining the function to check null values

In [0]:
from pyspark.sql.functions import col, count, when

def check_nulls(df, df_name):
    """Report whether any column of ``df`` contains null values.

    Args:
        df: Spark DataFrame to inspect.
        df_name: Label for the DataFrame, used only in the printed messages.

    Prints an OK line when every per-column null count is zero; otherwise prints
    a warning line followed by the {column: null_count} dictionary.
    Returns nothing.
    """
    # One aggregate expression per column, each aliased back to the column name.
    null_exprs = [
        count(when(col(column_name).isNull(), 1)).alias(column_name)
        for column_name in df.columns
    ]

    # The aggregation yields one row; turn it into {column: null_count}.
    per_column = df.select(*null_exprs).collect()[0].asDict()

    has_nulls = any(null_count > 0 for null_count in per_column.values())
    if not has_nulls:
        print(f"<<OK>> No nulls found in {df_name} <<OK>>")
        return

    print(f"!!!!! Null values detected in {df_name}!!!!!")
    print(per_column)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-8903882494457152>, line 5[0m
[1;32m      3[0m file_name[38;5;241m=[39mtable[38;5;241m.[39msplit([38;5;124m"[39m[38;5;124m/[39m[38;5;124m"[39m)[[38;5;241m0[39m]
[1;32m      4[0m file_df[38;5;241m=[39m[38;5;124m"[39m[38;5;124mdf_[39m[38;5;124m"[39m[38;5;241m+[39mfile_name
[0;32m----> 5[0m file_path [38;5;241m=[39m base_path[38;5;241m+[39mtable[38;5;241m+[39mfile_name[38;5;241m+[39m[38;5;124m"[39m[38;5;124m.parquet[39m[38;5;124m"[39m
[1;32m      6[0m df[file_df][38;5;241m=[39mspark[38;5;241m.[39mread[38;5;241m.[39mparquet(file_path,header[38;5;241m=[39m[38;5;28;01mTrue[39;00m)
[1;32m      7[0m [38;5;28mprint[39m([38;5;124mf[39m[38;5;124m'[39m[38;5;124mThe name of the dataframe is [39m[38;5;132;01m{[39;00mfile_df[38;5;132;01m}[39;

In [0]:
# For every loaded dataframe: print its key, its row count, and the null-check result.
for table, frame in df.items():
    print("\nThe  table is  :", table)
    print("The count of df:", frame.count())
    check_nulls(frame, table)


The  table is  : df_CountryRegionCurrency
The count of df: 109
<<OK>> No nulls found in df_CountryRegionCurrency <<OK>>

The  table is  : df_CreditCard
The count of df: 19118
<<OK>> No nulls found in df_CreditCard <<OK>>

The  table is  : df_Currency
The count of df: 105
<<OK>> No nulls found in df_Currency <<OK>>

The  table is  : df_CurrencyRate
The count of df: 13532
<<OK>> No nulls found in df_CurrencyRate <<OK>>

The  table is  : df_Customer
The count of df: 19820
!!!!! Null values detected in df_Customer!!!!!
{'CustomerID': 0, 'PersonID': 701, 'StoreID': 18484, 'TerritoryID': 0, 'AccountNumber': 0, 'rowguid': 0, 'ModifiedDate': 0}

The  table is  : df_PersonCreditCard
The count of df: 19118
<<OK>> No nulls found in df_PersonCreditCard <<OK>>

The  table is  : df_SalesOrderDetail
The count of df: 121317
!!!!! Null values detected in df_SalesOrderDetail!!!!!
{'SalesOrderID': 0, 'SalesOrderDetailID': 0, 'CarrierTrackingNumber': 60398, 'OrderQty': 0, 'ProductID': 0, 'SpecialOfferID'

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-8903882494457152>, line 5[0m
[1;32m      3[0m file_name[38;5;241m=[39mtable[38;5;241m.[39msplit([38;5;124m"[39m[38;5;124m/[39m[38;5;124m"[39m)[[38;5;241m0[39m]
[1;32m      4[0m file_df[38;5;241m=[39m[38;5;124m"[39m[38;5;124mdf_[39m[38;5;124m"[39m[38;5;241m+[39mfile_name
[0;32m----> 5[0m file_path [38;5;241m=[39m base_path[38;5;241m+[39mtable[38;5;241m+[39mfile_name[38;5;241m+[39m[38;5;124m"[39m[38;5;124m.parquet[39m[38;5;124m"[39m
[1;32m      6[0m df[file_df][38;5;241m=[39mspark[38;5;241m.[39mread[38;5;241m.[39mparquet(file_path,header[38;5;241m=[39m[38;5;28;01mTrue[39;00m)
[1;32m      7[0m [38;5;28mprint[39m([38;5;124mf[39m[38;5;124m'[39m[38;5;124mThe name of the dataframe is [39m[38;5;132;01m{[39;00mfile_df[38;5;132;01m}[39;


The cell above shows that most of the dataframes have no null values; however, the dataframes listed below do. Let's review them to decide whether the data needs to be imputed.

- df_Customer
- df_SalesOrderDetail
- df_SalesOrderHeader 
- df_SalesPerson 
- df_SalesTerritoryHistory 
- df_SpecialOffer 

In [0]:
# Impute the null ID values in df_Customer.

# Preview the dataframe before the change
print("The Original table is  :")
df['df_Customer'].show(5)

# Replace nulls with 0, first in StoreID and then in PersonID; non-null values are kept as they are.
customer = df['df_Customer']
for id_column in ("StoreID", "PersonID"):
    current_value = col(id_column)
    customer = customer.withColumn(id_column, when(current_value.isNull(), 0).otherwise(current_value))
df['df_Customer'] = customer

# Preview the dataframe after the change
print("The Updated table is  :")
df['df_Customer'].show(5)

# Confirm that no nulls remain
check_nulls(df['df_Customer'], 'df_Customer')

The Original table is  :
+----------+--------+-------+-----------+-------------+--------------------+--------------------+
|CustomerID|PersonID|StoreID|TerritoryID|AccountNumber|             rowguid|        ModifiedDate|
+----------+--------+-------+-----------+-------------+--------------------+--------------------+
|         1|    NULL|    934|          1|   AW00000001|3f5ae95e-b87d-4ae...|2014-09-12 11:15:...|
|         2|    NULL|   1028|          1|   AW00000002|e552f657-a9af-4a7...|2014-09-12 11:15:...|
|         3|    NULL|    642|          4|   AW00000003|130774b1-db21-4ef...|2014-09-12 11:15:...|
|         4|    NULL|    932|          4|   AW00000004|ff862851-1daa-404...|2014-09-12 11:15:...|
|         5|    NULL|   1026|          4|   AW00000005|83905bdc-6f5e-4f7...|2014-09-12 11:15:...|
+----------+--------+-------+-----------+-------------+--------------------+--------------------+
only showing top 5 rows

The Updated table is  :
+----------+--------+-------+-----------+---

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-3271532783269673>, line 9[0m
[1;32m      6[0m df[[38;5;124m'[39m[38;5;124mdf_Customer[39m[38;5;124m'[39m][38;5;241m.[39mshow([38;5;241m2500[39m)
[1;32m      8[0m [38;5;66;03m# Update EndDate where it's NULL as 2 monthsr from the StartDate. [39;00m
[0;32m----> 9[0m df[[38;5;124m'[39m[38;5;124mdf_Customer[39m[38;5;124m'[39m] [38;5;241m=[39m df[[38;5;124m'[39m[38;5;124mdf_Customer[39m[38;5;124m'[39m][38;5;241m.[39mwithColumn([38;5;124m"[39m[38;5;124mEndDate[39m[38;5;124m"[39m, when(col([38;5;124m"[39m[38;5;124mEndDate[39m[38;5;124m"[39m)[38;5;241m.[39misNull(), add_months(col([38;5;124m"[39m[38;5;124mStartDate[39m[38;5;124m"[39m), [38;5;241m2[39m))[38;5;241m.[39motherwise(col([38;5;124m"[39m[38;5;124mEndDate[39m[38;5;124m"[39m)))
[1

In [0]:
# Fixing null in the table df_SalesOrderDetail

from pyspark.sql.functions import col, when, add_months

# Preview the dataframe before the change
print("The Original table is  :")
df['df_SalesOrderDetail'].show(5)

# Replace null CarrierTrackingNumber values with the placeholder text "Not Available".
tracking_number = col("CarrierTrackingNumber")
df['df_SalesOrderDetail'] = df['df_SalesOrderDetail'].withColumn(
    "CarrierTrackingNumber",
    when(tracking_number.isNull(), "Not Available").otherwise(tracking_number),
)

# Preview the dataframe after the change
print("The Updated table is  :")
df['df_SalesOrderDetail'].show(5)

# Confirm that no nulls remain
check_nulls(df['df_SalesOrderDetail'], 'df_SalesOrderDetail')

The Original table is  :
+------------+------------------+---------------------+--------+---------+--------------+---------+-----------------+-----------+--------------------+-------------------+
|SalesOrderID|SalesOrderDetailID|CarrierTrackingNumber|OrderQty|ProductID|SpecialOfferID|UnitPrice|UnitPriceDiscount|  LineTotal|             rowguid|       ModifiedDate|
+------------+------------------+---------------------+--------+---------+--------------+---------+-----------------+-----------+--------------------+-------------------+
|       43659|                 1|         4911-403C-98|       1|      776|             1|2024.9940|           0.0000|2024.994000|b207c96d-d9e6-402...|2011-05-31 00:00:00|
|       43659|                 2|         4911-403C-98|       3|      777|             1|2024.9940|           0.0000|6074.982000|7abb600d-1e77-41b...|2011-05-31 00:00:00|
|       43659|                 3|         4911-403C-98|       1|      778|             1|2024.9940|           0.0000|202

In [0]:
# Fixing nulls in the table df_SalesOrderHeader

# Show original DataFrame (preview left disabled, as in the original cell)
print("The Original table is  :")
#df['df_SalesOrderHeader'].show(5)

sales_order_header = df['df_SalesOrderHeader']

# Column -> value written where the column is null. Each column appears exactly
# once, so no column is imputed twice.
null_replacements = {
    "SalesPersonID": 0,
    "PurchaseOrderNumber": 0,
    "CurrencyRateID": 0,
    "CreditCardID": 0,
    "CreditCardApprovalCode": "Not Available",
}
for column_name, replacement in null_replacements.items():
    sales_order_header = sales_order_header.withColumn(
        column_name,
        when(col(column_name).isNull(), replacement).otherwise(col(column_name)),
    )

# Dropping the Comment column as all its values are null
sales_order_header = sales_order_header.drop('Comment')

df['df_SalesOrderHeader'] = sales_order_header

# Show Updated DataFrame (preview left disabled, as in the original cell)
print("The Updated table is  :")
#df['df_SalesOrderHeader'].show(5)

# Confirm that no nulls remain
check_nulls(df['df_SalesOrderHeader'], 'df_SalesOrderHeader')

The Original table is  :
The Updated table is  :
<<OK>> No nulls found in df_SalesOrderHeader <<OK>>


[0;36m  File [0;32m<command-3271532783269675>, line 32[0;36m[0m
[0;31m    check_nulls(df['df_SalesOrderHeader'], 'df_SalesOrderHeader')'''[0m
[0m                                                                 ^[0m
[0;31mSyntaxError[0m[0;31m:[0m incomplete input


In [0]:
# Impute the null values in df_SalesPerson.

# Preview the dataframe before the change
print("The Original table is  :")
df['df_SalesPerson'].show(5)

# Average SalesQuota, pulled back to the driver as a single value.
quota_average_df = df['df_SalesPerson'].select('SalesQuota').agg({'SalesQuota': 'avg'})
avg_SalesQuota = quota_average_df.collect()[0][0]

# Null SalesQuota -> the average computed above; null TerritoryID -> 0.
sales_person = df['df_SalesPerson']
quota = col("SalesQuota")
sales_person = sales_person.withColumn("SalesQuota", when(quota.isNull(), avg_SalesQuota).otherwise(quota))
territory = col("TerritoryID")
sales_person = sales_person.withColumn("TerritoryID", when(territory.isNull(), 0).otherwise(territory))
df['df_SalesPerson'] = sales_person

# Preview the dataframe after the change, then confirm that no nulls remain
print("The Updated table is  :")
df['df_SalesPerson'].show(5)
check_nulls(df['df_SalesPerson'], 'df_SalesPerson')

The Original table is  :
+----------------+-----------+-----------+---------+-------------+------------+-------------+--------------------+-------------------+
|BusinessEntityID|TerritoryID| SalesQuota|    Bonus|CommissionPct|    SalesYTD|SalesLastYear|             rowguid|       ModifiedDate|
+----------------+-----------+-----------+---------+-------------+------------+-------------+--------------------+-------------------+
|             274|       NULL|       NULL|   0.0000|       0.0000| 559697.5639|       0.0000|48754992-9ee0-4c0...|2010-12-28 00:00:00|
|             275|          2|300000.0000|4100.0000|       0.0120|3763178.1787| 1750406.4785|1e0a7274-3064-4f5...|2011-05-24 00:00:00|
|             276|          4|250000.0000|2000.0000|       0.0150|4251368.5497| 1439156.0291|4dd9eee4-8e81-4f8...|2011-05-24 00:00:00|
|             277|          3|250000.0000|2500.0000|       0.0150|3189418.3662| 1997186.2037|39012928-bfec-424...|2011-05-24 00:00:00|
|             278|          6|

In [0]:
# Fixing null in the table df_SalesTerritoryHistory
from pyspark.sql.functions import date_add, col

# Preview the dataframe before the change
print("The Original table is  :")
df['df_SalesTerritoryHistory'].show(5)

# Where EndDate is null, use StartDate plus 365 days; existing EndDate values are kept.
end_date = col("EndDate")
one_year_after_start = date_add(col("StartDate"), 365)
df['df_SalesTerritoryHistory'] = df['df_SalesTerritoryHistory'].withColumn(
    "EndDate",
    when(end_date.isNull(), one_year_after_start).otherwise(end_date),
)

# Preview the dataframe after the change
print("The Updated table is  :")
df['df_SalesTerritoryHistory'].show(5)

# Confirm that no nulls remain
check_nulls(df['df_SalesTerritoryHistory'], 'df_SalesTerritoryHistory')

The Original table is  :
+----------------+-----------+-------------------+-------------------+--------------------+-------------------+
|BusinessEntityID|TerritoryID|          StartDate|            EndDate|             rowguid|       ModifiedDate|
+----------------+-----------+-------------------+-------------------+--------------------+-------------------+
|             275|          2|2011-05-31 00:00:00|2012-11-29 00:00:00|8563ce6a-00ff-47d...|2012-11-22 00:00:00|
|             275|          3|2012-11-30 00:00:00|               NULL|2f44304c-ee87-4c7...|2012-11-23 00:00:00|
|             276|          4|2011-05-31 00:00:00|               NULL|64bcb1b3-a793-40b...|2011-05-24 00:00:00|
|             277|          3|2011-05-31 00:00:00|2012-11-29 00:00:00|3e9f893d-5142-46c...|2012-11-22 00:00:00|
|             277|          2|2012-11-30 00:00:00|               NULL|132e4721-32dd-4a7...|2012-11-23 00:00:00|
+----------------+-----------+-------------------+-------------------+---------

In [0]:
# Fixing null in the table df_SpecialOffer
from pyspark.sql.functions import col, when, lit, coalesce

# Show Original DataFrame
print("The Original table is  :")
df['df_SpecialOffer'].show(6)

special_offer = df['df_SpecialOffer']

# Step 1: set MaxQty to 0 where it is null and MinQty is 0.
special_offer = special_offer.withColumn(
    "MaxQty",
    when(col("MaxQty").isNull() & (col("MinQty") == 0), 0).otherwise(col("MaxQty")),
)

# Step 2: average (MaxQty - MinQty) over the rows where both values are present.
# Both columns are collected together in one job, so every MaxQty is paired with
# the MinQty of the same row (two separate collects would rely on both jobs
# returning rows in the same order).
qty_rows = special_offer.select('MaxQty', 'MinQty').collect()
qty_gaps = [
    row['MaxQty'] - row['MinQty']
    for row in qty_rows
    if row['MaxQty'] is not None and row['MinQty'] is not None
]
# Rounded to a whole number; falls back to 0 when no row has both values.
avg_qty_to_update = round(sum(qty_gaps) / len(qty_gaps)) if qty_gaps else 0

# Step 3: impute the remaining null MaxQty values as MinQty + the average gap calculated above.
special_offer = special_offer.withColumn(
    "MaxQty",
    when(col("MaxQty").isNull(), col("MinQty") + lit(avg_qty_to_update)).otherwise(col("MaxQty")),
)

df['df_SpecialOffer'] = special_offer

# Show Updated DataFrame
print("The Updated table is  :")
df['df_SpecialOffer'].show(6)

# Confirm that no nulls remain
check_nulls(df['df_SpecialOffer'], 'df_SpecialOffer')

The Original table is  :
+--------------+--------------------+-----------+---------------+-----------+-------------------+-------------------+------+------+--------------------+-------------------+
|SpecialOfferID|         Description|DiscountPct|           Type|   Category|          StartDate|            EndDate|MinQty|MaxQty|             rowguid|       ModifiedDate|
+--------------+--------------------+-----------+---------------+-----------+-------------------+-------------------+------+------+--------------------+-------------------+
|             1|         No Discount|     0.0000|    No Discount|No Discount|2011-05-01 00:00:00|2014-11-30 00:00:00|     0|  NULL|0290c4f5-191f-433...|2011-04-01 00:00:00|
|             2|Volume Discount 1...|     0.0200|Volume Discount|   Reseller|2011-05-31 00:00:00|2014-05-30 00:00:00|    11|    14|d7542ee7-15db-454...|2011-05-01 00:00:00|
|             3|Volume Discount 1...|     0.0500|Volume Discount|   Reseller|2011-05-31 00:00:00|2014-05-30 00

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-8875516628818281>, line 32[0m
[1;32m     29[0m df[[38;5;124m'[39m[38;5;124mdf_SpecialOffer[39m[38;5;124m'[39m][38;5;241m.[39mshow([38;5;241m10[39m)
[1;32m     31[0m check_nulls(df[[38;5;124m'[39m[38;5;124mdf_SpecialOffer[39m[38;5;124m'[39m],[38;5;124m'[39m[38;5;124mdf_SpecialOffer[39m[38;5;124m'[39m)
[0;32m---> 32[0m dilip

[0;31mNameError[0m: name 'dilip' is not defined

In [0]:
# Run the null check again on every table to confirm the imputations took effect.
for table in df:
    frame = df[table]
    print("\nThe  table is  :", table)
    print("The count of df:", frame.count())
    check_nulls(frame, table)


The  table is  : df_CountryRegionCurrency
The count of df: 109
<<OK>> No nulls found in df_CountryRegionCurrency <<OK>>

The  table is  : df_CreditCard
The count of df: 19118
<<OK>> No nulls found in df_CreditCard <<OK>>

The  table is  : df_Currency
The count of df: 105
<<OK>> No nulls found in df_Currency <<OK>>

The  table is  : df_CurrencyRate
The count of df: 13532
<<OK>> No nulls found in df_CurrencyRate <<OK>>

The  table is  : df_Customer
The count of df: 19820
<<OK>> No nulls found in df_Customer <<OK>>

The  table is  : df_PersonCreditCard
The count of df: 19118
<<OK>> No nulls found in df_PersonCreditCard <<OK>>

The  table is  : df_SalesOrderDetail
The count of df: 121317
<<OK>> No nulls found in df_SalesOrderDetail <<OK>>

The  table is  : df_SalesOrderHeader
The count of df: 31465
<<OK>> No nulls found in df_SalesOrderHeader <<OK>>

The  table is  : df_SalesOrderHeaderSalesReason
The count of df: 27647
<<OK>> No nulls found in df_SalesOrderHeaderSalesReason <<OK>>

The  

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-8875516628818281>, line 32[0m
[1;32m     29[0m df[[38;5;124m'[39m[38;5;124mdf_SpecialOffer[39m[38;5;124m'[39m][38;5;241m.[39mshow([38;5;241m10[39m)
[1;32m     31[0m check_nulls(df[[38;5;124m'[39m[38;5;124mdf_SpecialOffer[39m[38;5;124m'[39m],[38;5;124m'[39m[38;5;124mdf_SpecialOffer[39m[38;5;124m'[39m)
[0;32m---> 32[0m dilip

[0;31mNameError[0m: name 'dilip' is not defined


View table schema and then transform Timestamp datatype to Date in all tables since the time values in timestamp are zeros. 

In [0]:
# Create the function to print the Schema. 
def print_schema(df, df_name):
    """Print a heading naming the table, followed by the schema of ``df``.

    Args:
        df: Object exposing ``printSchema()`` (a Spark DataFrame in this notebook).
        df_name: Table label inserted into the heading line.
    """
    heading = "\nThe Schema of the table {} is :".format(df_name)
    print(heading)
    df.printSchema()

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-8875516628818281>, line 32[0m
[1;32m     29[0m df[[38;5;124m'[39m[38;5;124mdf_SpecialOffer[39m[38;5;124m'[39m][38;5;241m.[39mshow([38;5;241m10[39m)
[1;32m     31[0m check_nulls(df[[38;5;124m'[39m[38;5;124mdf_SpecialOffer[39m[38;5;124m'[39m],[38;5;124m'[39m[38;5;124mdf_SpecialOffer[39m[38;5;124m'[39m)
[0;32m---> 32[0m dilip

[0;31mNameError[0m: name 'dilip' is not defined

In [0]:
# Print the schema of every table so the timestamp columns can be identified.
for file, frame in df.items():
    print_schema(frame, file)


The Schema of the table df_CountryRegionCurrency is :
root
 |-- CountryRegionCode: string (nullable = true)
 |-- CurrencyCode: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)


The Schema of the table df_CreditCard is :
root
 |-- CreditCardID: integer (nullable = true)
 |-- CardType: string (nullable = true)
 |-- CardNumber: string (nullable = true)
 |-- ExpMonth: integer (nullable = true)
 |-- ExpYear: integer (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)


The Schema of the table df_Currency is :
root
 |-- CurrencyCode: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)


The Schema of the table df_CurrencyRate is :
root
 |-- CurrencyRateID: integer (nullable = true)
 |-- CurrencyRateDate: timestamp (nullable = true)
 |-- FromCurrencyCode: string (nullable = true)
 |-- ToCurrencyCode: string (nullable = true)
 |-- AverageRate: decimal(19,4) (nullable = true)
 |-- EndOfDayRate: decimal

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-8875516628818281>, line 32[0m
[1;32m     29[0m df[[38;5;124m'[39m[38;5;124mdf_SpecialOffer[39m[38;5;124m'[39m][38;5;241m.[39mshow([38;5;241m10[39m)
[1;32m     31[0m check_nulls(df[[38;5;124m'[39m[38;5;124mdf_SpecialOffer[39m[38;5;124m'[39m],[38;5;124m'[39m[38;5;124mdf_SpecialOffer[39m[38;5;124m'[39m)
[0;32m---> 32[0m dilip

[0;31mNameError[0m: name 'dilip' is not defined

In [0]:
# Transforming the date columns to date format
from pyspark.sql.functions import to_date

# Convert every column whose name contains "date"/"Date" to a date, printing each
# table's schema before and after the conversion.
for file in df.keys():
    print("\n",file," in Original Schema")
    print_schema(df[file],file)

    # The loop variable must not be called `col`: at notebook scope that would
    # rebind pyspark.sql.functions.col, which check_nulls and the imputation
    # cells call, and they would then fail when run after this cell.
    for col_name, _col_type in df[file].dtypes:
        if 'date' in col_name or 'Date' in col_name:
            # NOTE(review): the schemas printed earlier show these columns as
            # timestamps; the 'MM/dd/yyyy' pattern presumably only matters for
            # string inputs - confirm before relying on it.
            df[file] = df[file].withColumn(col_name, to_date(col_name, 'MM/dd/yyyy'))

    print("\n",file," in transformed Schema")
    print_schema(df[file],file)


 df_CountryRegionCurrency  in Original Schema

The Schema of the table df_CountryRegionCurrency is :
root
 |-- CountryRegionCode: string (nullable = true)
 |-- CurrencyCode: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)


 df_CountryRegionCurrency  in transformed Schema

The Schema of the table df_CountryRegionCurrency is :
root
 |-- CountryRegionCode: string (nullable = true)
 |-- CurrencyCode: string (nullable = true)
 |-- ModifiedDate: date (nullable = true)


 df_CreditCard  in Original Schema

The Schema of the table df_CreditCard is :
root
 |-- CreditCardID: integer (nullable = true)
 |-- CardType: string (nullable = true)
 |-- CardNumber: string (nullable = true)
 |-- ExpMonth: integer (nullable = true)
 |-- ExpYear: integer (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)


 df_CreditCard  in transformed Schema

The Schema of the table df_CreditCard is :
root
 |-- CreditCardID: integer (nullable = true)
 |-- CardType: string (nullable 

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-8875516628818281>, line 32[0m
[1;32m     29[0m df[[38;5;124m'[39m[38;5;124mdf_SpecialOffer[39m[38;5;124m'[39m][38;5;241m.[39mshow([38;5;241m10[39m)
[1;32m     31[0m check_nulls(df[[38;5;124m'[39m[38;5;124mdf_SpecialOffer[39m[38;5;124m'[39m],[38;5;124m'[39m[38;5;124mdf_SpecialOffer[39m[38;5;124m'[39m)
[0;32m---> 32[0m dilip

[0;31mNameError[0m: name 'dilip' is not defined

In [0]:
# Preview the first five rows of every table to check that the timestamp columns now hold dates.
for file, frame in df.items():
    frame.show(5)

+-----------------+------------+------------+
|CountryRegionCode|CurrencyCode|ModifiedDate|
+-----------------+------------+------------+
|               AE|         AED|  2014-02-08|
|               AR|         ARS|  2014-02-08|
|               AT|         ATS|  2014-02-08|
|               AT|         EUR|  2008-04-30|
|               AU|         AUD|  2014-02-08|
+-----------------+------------+------------+
only showing top 5 rows

+------------+-------------+--------------+--------+-------+------------+
|CreditCardID|     CardType|    CardNumber|ExpMonth|ExpYear|ModifiedDate|
+------------+-------------+--------------+--------+-------+------------+
|           1| SuperiorCard|33332664695310|      11|   2006|  2013-07-29|
|           2|  Distinguish|55552127249722|       8|   2005|  2013-12-05|
|           3|ColonialVoice|77778344838353|       7|   2005|  2014-01-14|
|           4|ColonialVoice|77774915718248|       7|   2006|  2013-05-20|
|           5|        Vista|11114404600042|

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-8875516628818281>, line 32[0m
[1;32m     29[0m df[[38;5;124m'[39m[38;5;124mdf_SpecialOffer[39m[38;5;124m'[39m][38;5;241m.[39mshow([38;5;241m10[39m)
[1;32m     31[0m check_nulls(df[[38;5;124m'[39m[38;5;124mdf_SpecialOffer[39m[38;5;124m'[39m],[38;5;124m'[39m[38;5;124mdf_SpecialOffer[39m[38;5;124m'[39m)
[0;32m---> 32[0m dilip

[0;31mNameError[0m: name 'dilip' is not defined

## Writing the cleaned-up data to the silver layer

In [0]:
# Persist every cleaned dataframe to the silver layer as Parquet, one folder per dataframe key.

silver_base_path = 'abfss://silver@dmgproductionadls2.dfs.core.windows.net/Sales/'
bronze_base_path = 'abfss://bronze@dmgproductionadls2.dfs.core.windows.net/Sales/'

for file, cleaned_frame in df.items():
    # The target folder is named after the dictionary key (e.g. ".../Sales/df_Customer/").
    folder_path = f"{silver_base_path}{file}/"
    # "overwrite" replaces whatever already exists at the target folder.
    overwrite_writer = cleaned_frame.write.mode("overwrite")
    overwrite_writer.parquet(folder_path)
    print(f"✅ File saved successfully: {folder_path}")

abfss://silver@dmgproductionadls2.dfs.core.windows.net/Sales/df_CountryRegionCurrency/
✅ File saved successfully: abfss://silver@dmgproductionadls2.dfs.core.windows.net/Sales/df_CountryRegionCurrency/
abfss://silver@dmgproductionadls2.dfs.core.windows.net/Sales/df_CreditCard/
✅ File saved successfully: abfss://silver@dmgproductionadls2.dfs.core.windows.net/Sales/df_CreditCard/
abfss://silver@dmgproductionadls2.dfs.core.windows.net/Sales/df_Currency/
✅ File saved successfully: abfss://silver@dmgproductionadls2.dfs.core.windows.net/Sales/df_Currency/
abfss://silver@dmgproductionadls2.dfs.core.windows.net/Sales/df_CurrencyRate/
✅ File saved successfully: abfss://silver@dmgproductionadls2.dfs.core.windows.net/Sales/df_CurrencyRate/
abfss://silver@dmgproductionadls2.dfs.core.windows.net/Sales/df_Customer/
✅ File saved successfully: abfss://silver@dmgproductionadls2.dfs.core.windows.net/Sales/df_Customer/
abfss://silver@dmgproductionadls2.dfs.core.windows.net/Sales/df_PersonCreditCard/
✅ Fil

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-8875516628818281>, line 32[0m
[1;32m     29[0m df[[38;5;124m'[39m[38;5;124mdf_SpecialOffer[39m[38;5;124m'[39m][38;5;241m.[39mshow([38;5;241m10[39m)
[1;32m     31[0m check_nulls(df[[38;5;124m'[39m[38;5;124mdf_SpecialOffer[39m[38;5;124m'[39m],[38;5;124m'[39m[38;5;124mdf_SpecialOffer[39m[38;5;124m'[39m)
[0;32m---> 32[0m dilip

[0;31mNameError[0m: name 'dilip' is not defined