In [0]:
# Show the secret keys registered under the 'dbclientretail' scope.
dbutils.secrets.list(scope='dbclientretail')

[SecretMetadata(key='clientsecretretail')]

In [0]:
# Read the 'clientsecretretail' value from the 'dbclientretail' secret scope;
# it is used below as the service principal's client secret.
secret = dbutils.secrets.get(scope='dbclientretail', key='clientsecretretail')

In [0]:
# Service principal identity: application (client) id, directory (tenant) id,
# and the client secret fetched from the secret scope in an earlier cell.
client_id = "6e518dc2-6192-4de7-bfd8-a2081c0ed276"
directory_id = "80afa3f4-1563-46f1-a268-04f5fb610dd9"
client_secret = secret

# OAuth token endpoint for the directory above.
token_endpoint = "https://login.microsoftonline.com/" + directory_id + "/oauth2/token"

# OAuth client-credentials settings handed to dbutils.fs.mount as extra_configs.
configs = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": client_id,
    "fs.azure.account.oauth2.client.secret": client_secret,
    "fs.azure.account.oauth2.client.endpoint": token_endpoint,
}


In [0]:
# Mount the curated container.
#
# Mounts abfss://curated@dlretail.dfs.core.windows.net/ at /mnt/dlretail/curated
# using the OAuth settings in `configs` (defined in an earlier cell).

storage_account_name = "dlretail"
container_name = "raw"  # not referenced by this cell; kept because it is a notebook-level name
processed_container_name = "curated"

mount_point = f"/mnt/{storage_account_name}/{processed_container_name}"

# If the mount point is already in use, drop it so the mount below starts clean.
if any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts()):
    # Unmount the existing mount point
    dbutils.fs.unmount(mount_point)
    print(f"Unmounted existing mount at {mount_point}")

try:
    dbutils.fs.mount(
        source=f"abfss://{processed_container_name}@{storage_account_name}.dfs.core.windows.net/",
        mount_point=mount_point,
        extra_configs=configs
    )
except Exception as e:
    # Report the failure, then stop the notebook: the cells that follow read from
    # this mount point, and continuing without it would only fail later with a
    # less helpful error.
    print(f"Error mounting: {e}")
    raise
else:
    print(f"Mounted {processed_container_name} successfully at {mount_point}")

/mnt/dlretail/curated has been unmounted.
Unmounted existing mount at /mnt/dlretail/curated
Mounted raw successfully at /mnt/dlretail/curated


In [0]:
# List the top-level entries of the curated mount to confirm it is readable.
curated_root = "/mnt/dlretail/curated"
dbutils.fs.ls(curated_root)

[FileInfo(path='dbfs:/mnt/dlretail/curated/df1/', name='df1/', size=0, modificationTime=1732398377000),
 FileInfo(path='dbfs:/mnt/dlretail/curated/df2/', name='df2/', size=0, modificationTime=1732398380000),
 FileInfo(path='dbfs:/mnt/dlretail/curated/df3/', name='df3/', size=0, modificationTime=1732398382000),
 FileInfo(path='dbfs:/mnt/dlretail/curated/df4/', name='df4/', size=0, modificationTime=1732398384000),
 FileInfo(path='dbfs:/mnt/dlretail/curated/df5/', name='df5/', size=0, modificationTime=1732398386000)]

In [0]:

from pyspark.sql.functions import col


# Root of the curated container mount
curated_path = "/mnt/dlretail/curated"

# Read the Delta folders df1..df4 into the address, city, country and customer
# DataFrames (in that order).
address_df, city_df, country_df, customer_df = (
    spark.read.format("delta").load(f"{curated_path}/{folder}")
    for folder in ("df1", "df2", "df3", "df4")
)

# Preview the first three rows of each DataFrame
for preview_df in (address_df, city_df, country_df, customer_df):
    preview_df.show(3)

+----------+--------------------+--------+-------+-----------+-----------+
|address_id|             address|district|city_id|postal_code|      phone|
+----------+--------------------+--------+-------+-----------+-----------+
|         5|      1913 Hanoi Way|Nagasaki|    463|      35200|28303384290|
|        17|  270 Amroha Parkway|Osmaniye|    384|      29610| 6.9548E+11|
|        29|934 San Felipe de...|    Sind|    472|      99780|1.96496E+11|
+----------+--------------------+--------+-------+-----------+-----------+
only showing top 3 rows

+-------+--------------------+----------+
|city_id|                city|country_id|
+-------+--------------------+----------+
|      6|         Addis Abeba|        31|
|     18|Allappuzha (Allep...|        44|
|     30|            Araatuba|        15|
+-------+--------------------+----------+
only showing top 3 rows

+----------+--------+
|country_id| country|
+----------+--------+
|         4|  Angola|
|        14| Bolivia|
|        24|Colombia|

In [0]:
# Keep only the customers whose `activebool` flag compares equal to True
is_active = customer_df["activebool"] == True
active_customers_df = customer_df.where(is_active)
active_customers_df.show(5)

+-----------+--------+----------+---------+--------------------+----------+----------+-------------------+------+
|customer_id|store_id|first_name|last_name|               email|address_id|activebool|        create_date|active|
+-----------+--------+----------+---------+--------------------+----------+----------+-------------------+------+
|        524|       1|     Jared|      Ely|jared.ely@sakilac...|       530|      TRUE|2006-02-14 00:00:00|     1|
|          1|       1|      Mary|    Smith|mary.smith@sakila...|         5|      TRUE|2006-02-14 00:00:00|     1|
|          2|       1|  Patricia|  Johnson|patricia.johnson@...|         6|      TRUE|2006-02-14 00:00:00|     1|
|          3|       1|     Linda| Williams|linda.williams@sa...|         7|      TRUE|2006-02-14 00:00:00|     1|
|          4|       2|   Barbara|    Jones|barbara.jones@sak...|         8|      TRUE|2006-02-14 00:00:00|     1|
+-----------+--------+----------+---------+--------------------+----------+----------+--

In [0]:
# Chain three inner joins: customer -> address -> city -> country.
# Each join keeps the key column from both sides, so the result contains
# duplicate address_id / city_id / country_id columns.

# Active customers with their address
customer_address_df = active_customers_df.join(
    address_df,
    on=active_customers_df["address_id"] == address_df["address_id"],
    how="inner",
)

# ... plus the city of that address
customer_address_city_df = customer_address_df.join(
    city_df,
    on=customer_address_df["city_id"] == city_df["city_id"],
    how="inner",
)

# ... plus the country of that city
final_df = customer_address_city_df.join(
    country_df,
    on=customer_address_city_df["country_id"] == country_df["country_id"],
    how="inner",
)
final_df.show(5)

+-----------+--------+----------+---------+--------------------+----------+----------+-------------------+------+----------+-------------------+-----------+-------+-----------+-----------+-------+-----------------+----------+----------+-------+
|customer_id|store_id|first_name|last_name|               email|address_id|activebool|        create_date|active|address_id|            address|   district|city_id|postal_code|      phone|city_id|             city|country_id|country_id|country|
+-----------+--------+----------+---------+--------------------+----------+----------+-------------------+------+----------+-------------------+-----------+-------+-----------+-----------+-------+-----------------+----------+----------+-------+
|         11|       2|      Lisa| Anderson|lisa.anderson@sak...|        15|      TRUE|2006-02-14 00:00:00|     1|        15|1542 Tarlac Parkway|   Kanagawa|    440|       1027|6.35297E+11|    440|       Sagamihara|        50|        50|  Japan|
|         23|       

In [0]:
# Pick the output columns, referencing each through the DataFrame it came from
# so the duplicated join keys in final_df are not involved.
selected_columns = [
    # customer attributes
    active_customers_df["customer_id"],
    active_customers_df["first_name"],
    active_customers_df["last_name"],
    active_customers_df["email"],
    active_customers_df["activebool"].alias("active"),
    # address attributes
    address_df["address"],
    address_df["district"],
    address_df["postal_code"],
    address_df["phone"],
    # location names
    city_df["city"],
    country_df["country"],
]
result_df = final_df.select(*selected_columns)

result_df.show(5)

+-----------+----------+---------+--------------------+------+-------------------+-----------+-----------+-----------+-----------------+-------+
|customer_id|first_name|last_name|               email|active|            address|   district|postal_code|      phone|             city|country|
+-----------+----------+---------+--------------------+------+-------------------+-----------+-----------+-----------+-----------------+-------+
|         11|      Lisa| Anderson|lisa.anderson@sak...|  TRUE|1542 Tarlac Parkway|   Kanagawa|       1027|6.35297E+11|       Sagamihara|  Japan|
|         23|     Sarah|    Lewis|sarah.lewis@sakil...|  TRUE|1780 Hino Boulevard|    Liepaja|       7716|9.02731E+11|          Liepaja| Latvia|
|         35|  Virginia|    Green|virginia.green@sa...|  TRUE|   391 Callao Drive|Midi-Pyrnes|      34021|4.40512E+11|         Toulouse| France|
|         47|   Frances|   Parker|frances.parker@sa...|  TRUE|  686 Garland Manor|       Cear|      52535|69493378813|Juazeiro do 

In [0]:
from pyspark.sql.types import *

# Schema describing the columns of result_df; every field is nullable.
# NOTE(review): this schema is not applied anywhere in the visible cells — confirm whether it is still needed.
result_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("customer_id", IntegerType()),
        ("first_name", StringType()),
        ("last_name", StringType()),
        ("email", StringType()),
        ("active", BooleanType()),
        ("address", StringType()),
        ("district", StringType()),
        ("postal_code", StringType()),
        ("phone", StringType()),
        ("city", StringType()),
        ("country", StringType()),
    ]
])

In [0]:
# Mount the staging container.
#
# Mounts abfss://staging@dlretail.dfs.core.windows.net/ at /mnt/dlretail/staging
# using the OAuth settings in `configs` (defined in an earlier cell).

storage_account_name = "dlretail"
container_name = "staging"

mount_point = f"/mnt/{storage_account_name}/{container_name}"

# If the mount point is already in use, drop it so the mount below starts clean.
if any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts()):
    # Unmount the existing mount point
    dbutils.fs.unmount(mount_point)
    print(f"Unmounted existing mount at {mount_point}")

try:
    dbutils.fs.mount(
        source=f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/",
        mount_point=mount_point,
        extra_configs=configs
    )
except Exception as e:
    # Report the failure, then stop the notebook: a later cell writes to this
    # path, and without the mount that write would not reach the storage
    # container.
    print(f"Error mounting: {e}")
    raise
else:
    print(f"Mounted {container_name} successfully at {mount_point}")

/mnt/dlretail/staging has been unmounted.
Unmounted existing mount at /mnt/dlretail/staging
Mounted staging successfully at /mnt/dlretail/staging


In [0]:

# Root of the staging container mount and the Delta folder for this dataset
staging_path = "/mnt/dlretail/staging"
active_customers_path = staging_path + "/active_customers"

# Persist result_df as Delta.
# NOTE(review): "append" mode adds these rows again on every re-run of this cell — confirm that is intended.
delta_writer = result_df.write.format("delta").mode("append")
delta_writer.save(active_customers_path)

print("Data saved successfully in Delta format in the staging container.")

# Read the folder back and display it to verify the write
staging_df = spark.read.format("delta").load(active_customers_path)
staging_df.show()

Data saved successfully in Delta format in the staging container.
+-----------+----------+----------+--------------------+------+--------------------+--------------------+-----------+-----------+--------------------+------------------+
|customer_id|first_name| last_name|               email|active|             address|            district|postal_code|      phone|                city|           country|
+-----------+----------+----------+--------------------+------+--------------------+--------------------+-----------+-----------+--------------------+------------------+
|         12|     Nancy|    Thomas|nancy.thomas@saki...|  TRUE|    808 Bhopal Manor|             Haryana|      10672|4.65888E+11|        Yamuna Nagar|             India|
|         24|  Kimberly|       Lee|kimberly.lee@saki...|  TRUE|       96 Tafuna Way|              Crdoba|      99865| 9.3473E+11|              Crdoba|         Argentina|
|         36|  Kathleen|     Adams|kathleen.adams@sa...|  TRUE|334 Munger (Mongh...|