# Running Connection notebook

In [0]:
%run /Workspace/Users/bhimsendabby2023@gmail.com/Drafts/connection_to_adlgen2_using_ServicePrinciple

[SecretScope(name='adlgen2')]

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

#Reading Customer File Data From ADLGen2 Bronze

In [0]:
schema = "customer_id string, customer_name string, email string, join_date date, country string"

In [0]:
customer_df = spark.read.format('csv').option("header", "true").schema(schema).load(base_path+'customer/customer.csv')


In [0]:
customer_df.show()

+-----------+--------------+--------------------+----------+-------+
|customer_id| customer_name|               email| join_date|country|
+-----------+--------------+--------------------+----------+-------+
|       C001| Alice Johnson|   alice@example.com|2023-01-15|    USA|
|       C002|     Bob Smith|        bob@test.com|2023-02-20|     UK|
|       C003| Charlie Brown|                NULL|2023-03-05| Canada|
|       C004|  David Miller|   david.m@gmail.com|      NULL|Germany|
|       C005|     Eve White|     eve@example.com|2023-05-12| France|
|       NULL|        Mosasa|bellaxsxa@example...|2025-08-05|     UK|
|       C006| Alice Johnson|   alice@example.com|2023-01-15|    USA|
|       C006| Alice Johnson|   alice@example.com|2023-01-15|    USA|
|       C007|    Frank Ross|      frank@test.com|2023-06-01|    USA|
|       C008|     Grace Lee|                NULL|2023-06-15|  Japan|
|       C009|    Henry Ford|      henry@ford.com|2023-07-20|     UK|
|       C009|    Henry Ford|      

# Replacing Null values like date with old date, email with unknown

In [0]:
customer_df = customer_df.fillna({'email':'unknown','join_date':'2026-02-13'})
customer_df.show()

+-----------+--------------+--------------------+----------+-------+
|customer_id| customer_name|               email| join_date|country|
+-----------+--------------+--------------------+----------+-------+
|       C001| Alice Johnson|   alice@example.com|2023-01-15|    USA|
|       C002|     Bob Smith|        bob@test.com|2023-02-20|     UK|
|       C003| Charlie Brown|             unknown|2023-03-05| Canada|
|       C004|  David Miller|   david.m@gmail.com|2026-02-13|Germany|
|       C005|     Eve White|     eve@example.com|2023-05-12| France|
|       NULL|        Mosasa|bellaxsxa@example...|2025-08-05|     UK|
|       C006| Alice Johnson|   alice@example.com|2023-01-15|    USA|
|       C006| Alice Johnson|   alice@example.com|2023-01-15|    USA|
|       C007|    Frank Ross|      frank@test.com|2023-06-01|    USA|
|       C008|     Grace Lee|             unknown|2023-06-15|  Japan|
|       C009|    Henry Ford|      henry@ford.com|2023-07-20|     UK|
|       C009|    Henry Ford|      

# Removing the null value primary keys records

In [0]:
customer_df = customer_df.dropna()
customer_df.show()

+-----------+--------------+-----------------+----------+-------+
|customer_id| customer_name|            email| join_date|country|
+-----------+--------------+-----------------+----------+-------+
|       C001| Alice Johnson|alice@example.com|2023-01-15|    USA|
|       C002|     Bob Smith|     bob@test.com|2023-02-20|     UK|
|       C003| Charlie Brown|          unknown|2023-03-05| Canada|
|       C004|  David Miller|david.m@gmail.com|2026-02-13|Germany|
|       C005|     Eve White|  eve@example.com|2023-05-12| France|
|       C006| Alice Johnson|alice@example.com|2023-01-15|    USA|
|       C006| Alice Johnson|alice@example.com|2023-01-15|    USA|
|       C007|    Frank Ross|   frank@test.com|2023-06-01|    USA|
|       C008|     Grace Lee|          unknown|2023-06-15|  Japan|
|       C009|    Henry Ford|   henry@ford.com|2023-07-20|     UK|
|       C009|    Henry Ford|   henry@ford.com|2023-07-20|     UK|
|       C010|Isabella Moore|bella@example.com|2023-08-05|    USA|
+---------

In [0]:
customer_df = customer_df.withColumn("Validated_Date", coalesce(to_date(col("join_date"),'MM/dd/yyyy'),to_date(col("join_date"),'yyyy-MM-dd'),to_date(col("join_date"),'MM-dd-yyyy')))
customer_df.show()



+-----------+--------------+-----------------+----------+-------+--------------+
|customer_id| customer_name|            email| join_date|country|Validated_Date|
+-----------+--------------+-----------------+----------+-------+--------------+
|       C001| Alice Johnson|alice@example.com|2023-01-15|    USA|    2023-01-15|
|       C002|     Bob Smith|     bob@test.com|2023-02-20|     UK|    2023-02-20|
|       C003| Charlie Brown|          unknown|2023-03-05| Canada|    2023-03-05|
|       C004|  David Miller|david.m@gmail.com|2026-02-13|Germany|    2026-02-13|
|       C005|     Eve White|  eve@example.com|2023-05-12| France|    2023-05-12|
|       C006| Alice Johnson|alice@example.com|2023-01-15|    USA|    2023-01-15|
|       C006| Alice Johnson|alice@example.com|2023-01-15|    USA|    2023-01-15|
|       C007|    Frank Ross|   frank@test.com|2023-06-01|    USA|    2023-06-01|
|       C008|     Grace Lee|          unknown|2023-06-15|  Japan|    2023-06-15|
|       C009|    Henry Ford|

In [0]:
customer_df = customer_df.withColumn("TimeStamp_Col", current_timestamp())
customer_df.show()

+-----------+--------------+-----------------+----------+-------+--------------+--------------------+
|customer_id| customer_name|            email| join_date|country|Validated_Date|       TimeStamp_Col|
+-----------+--------------+-----------------+----------+-------+--------------+--------------------+
|       C001| Alice Johnson|alice@example.com|2023-01-15|    USA|    2023-01-15|2026-02-13 05:58:...|
|       C002|     Bob Smith|     bob@test.com|2023-02-20|     UK|    2023-02-20|2026-02-13 05:58:...|
|       C003| Charlie Brown|          unknown|2023-03-05| Canada|    2023-03-05|2026-02-13 05:58:...|
|       C004|  David Miller|david.m@gmail.com|2026-02-13|Germany|    2026-02-13|2026-02-13 05:58:...|
|       C005|     Eve White|  eve@example.com|2023-05-12| France|    2023-05-12|2026-02-13 05:58:...|
|       C006| Alice Johnson|alice@example.com|2023-01-15|    USA|    2023-01-15|2026-02-13 05:58:...|
|       C006| Alice Johnson|alice@example.com|2023-01-15|    USA|    2023-01-15|20

In [0]:
customer_df_1 = customer_df

#Removing the duplicates using drop_duplicate based on customer_id

In [0]:
customer_df_1 = customer_df_1.drop_duplicates(['customer_id'])
customer_df_1.show()

+-----------+--------------+-----------------+----------+-------+--------------+--------------------+
|customer_id| customer_name|            email| join_date|country|Validated_Date|       TimeStamp_Col|
+-----------+--------------+-----------------+----------+-------+--------------+--------------------+
|       C006| Alice Johnson|alice@example.com|2023-01-15|    USA|    2023-01-15|2026-02-13 05:58:...|
|       C010|Isabella Moore|bella@example.com|2023-08-05|    USA|    2023-08-05|2026-02-13 05:58:...|
|       C007|    Frank Ross|   frank@test.com|2023-06-01|    USA|    2023-06-01|2026-02-13 05:58:...|
|       C003| Charlie Brown|          unknown|2023-03-05| Canada|    2023-03-05|2026-02-13 05:58:...|
|       C004|  David Miller|david.m@gmail.com|2026-02-13|Germany|    2026-02-13|2026-02-13 05:58:...|
|       C009|    Henry Ford|   henry@ford.com|2023-07-20|     UK|    2023-07-20|2026-02-13 05:58:...|
|       C008|     Grace Lee|          unknown|2023-06-15|  Japan|    2023-06-15|20

#Removing duplicates using Window function

In [0]:

windowPar = Window.partitionBy(col('customer_id')).orderBy(col('TimeStamp_Col'))

In [0]:
customer_df = customer_df.withColumn('count',row_number().over(windowPar))
customer_df.show()

+-----------+--------------+-----------------+----------+-------+--------------+--------------------+-----+
|customer_id| customer_name|            email| join_date|country|Validated_Date|       TimeStamp_Col|count|
+-----------+--------------+-----------------+----------+-------+--------------+--------------------+-----+
|       C001| Alice Johnson|alice@example.com|2023-01-15|    USA|    2023-01-15|2026-02-13 05:58:...|    1|
|       C002|     Bob Smith|     bob@test.com|2023-02-20|     UK|    2023-02-20|2026-02-13 05:58:...|    1|
|       C003| Charlie Brown|          unknown|2023-03-05| Canada|    2023-03-05|2026-02-13 05:58:...|    1|
|       C004|  David Miller|david.m@gmail.com|2026-02-13|Germany|    2026-02-13|2026-02-13 05:58:...|    1|
|       C005|     Eve White|  eve@example.com|2023-05-12| France|    2023-05-12|2026-02-13 05:58:...|    1|
|       C006| Alice Johnson|alice@example.com|2023-01-15|    USA|    2023-01-15|2026-02-13 05:58:...|    1|
|       C006| Alice Johnson|

In [0]:
customer_df = customer_df.filter(col('count')==1)
customer_df.show()

+-----------+--------------+-----------------+----------+-------+--------------+--------------------+-----+
|customer_id| customer_name|            email| join_date|country|Validated_Date|       TimeStamp_Col|count|
+-----------+--------------+-----------------+----------+-------+--------------+--------------------+-----+
|       C001| Alice Johnson|alice@example.com|2023-01-15|    USA|    2023-01-15|2026-02-13 05:58:...|    1|
|       C002|     Bob Smith|     bob@test.com|2023-02-20|     UK|    2023-02-20|2026-02-13 05:58:...|    1|
|       C003| Charlie Brown|          unknown|2023-03-05| Canada|    2023-03-05|2026-02-13 05:58:...|    1|
|       C004|  David Miller|david.m@gmail.com|2026-02-13|Germany|    2026-02-13|2026-02-13 05:58:...|    1|
|       C005|     Eve White|  eve@example.com|2023-05-12| France|    2023-05-12|2026-02-13 05:58:...|    1|
|       C006| Alice Johnson|alice@example.com|2023-01-15|    USA|    2023-01-15|2026-02-13 05:58:...|    1|
|       C007|    Frank Ross|

In [0]:
customer_df = customer_df.withColumnRenamed('TimeStamp_Col','date_timestamp')
customer_df = customer_df.withColumnRenamed('Validated_Date','validated_date')

In [0]:
customer_df = customer_df.select("customer_id","customer_name","email","country","validated_Date","date_timestamp")
customer_df.show()

+-----------+--------------+-----------------+-------+--------------+--------------------+
|customer_id| customer_name|            email|country|validated_Date|      date_timestamp|
+-----------+--------------+-----------------+-------+--------------+--------------------+
|       C001| Alice Johnson|alice@example.com|    USA|    2023-01-15|2026-02-13 05:58:...|
|       C002|     Bob Smith|     bob@test.com|     UK|    2023-02-20|2026-02-13 05:58:...|
|       C003| Charlie Brown|          unknown| Canada|    2023-03-05|2026-02-13 05:58:...|
|       C004|  David Miller|david.m@gmail.com|Germany|    2026-02-13|2026-02-13 05:58:...|
|       C005|     Eve White|  eve@example.com| France|    2023-05-12|2026-02-13 05:58:...|
|       C006| Alice Johnson|alice@example.com|    USA|    2023-01-15|2026-02-13 05:58:...|
|       C007|    Frank Ross|   frank@test.com|    USA|    2023-06-01|2026-02-13 05:58:...|
|       C008|     Grace Lee|          unknown|  Japan|    2023-06-15|2026-02-13 05:58:...|

In [0]:
customer_df.write.format('delta').saveAsTable('bhim_bricks.dbo.customer')

In [0]:
%sql

desc formatted bhim_bricks.dbo.customer

col_name,data_type,comment
customer_id,string,
customer_name,string,
email,string,
country,string,
validated_Date,date,
date_timestamp,timestamp,
,,
# Delta Statistics Columns,,
Column Names,"date_timestamp, customer_id, customer_name, email, validated_Date, country",
Column Selection Method,first-32,
