## Bronze layer

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DecimalType, LongType
from pyspark.sql.functions import col, lit, current_timestamp, to_date, monotonically_increasing_id
from datetime import datetime, timezone

## Initialize SparkSession with Blob credentials

In [0]:
tenant_id = dbutils.secrets.get(scope="databricks scope", key="tenant-id")
client_id = dbutils.secrets.get(scope="databricks scope", key="client-id")
client_secret = dbutils.secrets.get(scope="databricks scope", key="client-secret")
spark = SparkSession.builder.appName("Bronze Batch Ingestion") \
    .config("fs.azure.account.auth.type.demotri.dfs.core.windows.net", "OAuth") \
    .config("fs.azure.account.oauth.provider.type.demotri.dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider") \
    .config("fs.azure.account.oauth2.client.id.demotri.dfs.core.windows.net",client_id) \
    .config("fs.azure.account.oauth2.client.secret.demotri.dfs.core.windows.net",client_secret) \
    .config("fs.azure.account.oauth2.client.endpoint.demotri.dfs.core.windows.net", f"https://login.microsoftonline.com/{tenant_id}/oauth2/token") \
    .getOrCreate()


## JDBC Azure SQL config

In [0]:
db_user = dbutils.secrets.get(scope="databricks scope", key="db-user")
db_password = dbutils.secrets.get(scope="databricks scope", key="db-password")

jdbc_url = (
    "jdbc:sqlserver://testsecondserver.database.windows.net:1433;"
    "databaseName=testNewdb;"
    "encrypt=true;"
    "trustServerCertificate=false;"
    "hostNameInCertificate=*.database.windows.net;"
    "loginTimeout=30;"
)
connection_properties = {
    "user": db_user,
    "password": db_password,
    "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver"
}

## Read SQL table

In [0]:
jdbc_df = spark.read.jdbc(
    url=jdbc_url,
    table="dbo.jdbc_data",
    properties=connection_properties
)


## Read Blob CSV (uploaded as blob_storage_data.csv)

In [0]:
storage_key = dbutils.secrets.get(scope="databricks scope", key="demotri-key")

spark.conf.set(
    "fs.azure.account.key.demotri.dfs.core.windows.net",
    storage_key
)


In [0]:
blob_path = "abfss://data@demotri.dfs.core.windows.net/blob_storage_data.csv"

df = spark.read.option("header", "true").csv(blob_path)


## Join Datasets

In [0]:
# Perform INNER JOIN on 'customer_id'
joined_df = jdbc_df.alias("j").join(df.alias("b"), on="customer_id", how="inner")

# Show number of joined records
print("Joined Count:", joined_df.count())

# Show schema of the resulting DataFrame
joined_df.printSchema()

# Display top 10 records (Databricks-specific)
joined_df.orderBy("customer_id")


Joined Count: 1000
root
 |-- Customer_ID: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Annual_Income: string (nullable = true)
 |-- Satisfaction_Score: string (nullable = true)
 |-- Email_Opt_In: string (nullable = true)
 |-- Target_Churn: string (nullable = true)
 |-- Total_Spend: string (nullable = true)
 |-- Years_as_Customer: string (nullable = true)
 |-- Num_of_Purchases: string (nullable = true)
 |-- Average_Transaction_Amount: string (nullable = true)
 |-- Num_of_Returns: string (nullable = true)
 |-- Num_of_Support_Contacts: string (nullable = true)
 |-- Last_Purchase_Days_Ago: string (nullable = true)
 |-- Promotion_Response: string (nullable = true)



DataFrame[Customer_ID: string, Age: string, Gender: string, Annual_Income: string, Satisfaction_Score: string, Email_Opt_In: string, Target_Churn: string, Total_Spend: string, Years_as_Customer: string, Num_of_Purchases: string, Average_Transaction_Amount: string, Num_of_Returns: string, Num_of_Support_Contacts: string, Last_Purchase_Days_Ago: string, Promotion_Response: string]

# Create a schema 

In [0]:
# Create a schema if not exists
spark.sql("CREATE SCHEMA IF NOT EXISTS rawdata.Bronze_layer")

DataFrame[]

In [0]:
from pyspark.sql.functions import current_date
from datetime import datetime, timezone

# Add the ingest_date column to the DataFrame
new_df = joined_df.withColumn("ingest_date", current_date())

# Optional: Enable dynamic partition overwrite to avoid replacing the whole table
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

# Write to Delta table with partitioning
new_df.write.format("delta") \
    .mode("overwrite") \
    .partitionBy("ingest_date") \
     \
    .saveAsTable("rawdata.bronze_layer.rawfile") 

# Confirmation message
print(f"Bronze ingestion completed successfully for {datetime.now(timezone.utc).strftime('%Y-%m-%d')}")

Bronze ingestion completed successfully for 2025-12-26
