# Step 1: Import necessary libraries

In [0]:
# Databricks notebook source
# Bronze Layer Ingestion Notebook
# This notebook reads raw CSV files (customers, products, promotions,
# pos_sales, stores) and writes each one to a Delta table in the `bronze`
# database.
from pyspark.sql import SparkSession

# Databricks already provides a session; getOrCreate() returns the active
# session when one exists instead of building a second one, so binding
# `spark` explicitly here is safe to re-run.
spark = SparkSession.builder.getOrCreate()

# Step 2: Define file paths

In [0]:

# Directory that holds all raw retail CSV files. Every source path below is
# derived from it, so re-pointing the notebook at another location is a
# one-line change.
RETAIL_VOLUME_DIR = "/Volumes/workspace/retail/retail"

# One raw CSV per bronze table; the ingestion cells below read these names.
customers_path = f"{RETAIL_VOLUME_DIR}/customers.csv"
products_path = f"{RETAIL_VOLUME_DIR}/products.csv"
promotions_path = f"{RETAIL_VOLUME_DIR}/promotions.csv"
pos_sales_path = f"{RETAIL_VOLUME_DIR}/pos_sales.csv"
stores_path = f"{RETAIL_VOLUME_DIR}/stores.csv"


# Step 3: Read customers CSV and create bronze table

In [0]:

# Load the customers CSV. The first row supplies the column names and Spark
# infers the column types from the data; supply an explicit schema instead
# if the types must stay fixed between runs.
df_customers = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(customers_path)
)

# Make sure the target database exists before any table is written into it.
spark.sql("CREATE DATABASE IF NOT EXISTS bronze")

# Persist as a Delta table. "overwrite" replaces an existing table and
# overwriteSchema allows its schema to be replaced as well; use
# mode('append') instead for incremental loads.
(
    df_customers.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("bronze.customers")
)

print("Bronze customers table created successfully.")

# Read the table back and show up to five rows as a quick sanity check.
bronze_customers_preview = spark.table("bronze.customers").limit(5)
display(bronze_customers_preview)


# Step 4: Read products CSV and create bronze table

In [0]:


# Load the products CSV: header row gives the column names, column types
# are inferred from the data.
df_products = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(products_path)
)

# Persist as a Delta table, replacing any existing table and its schema.
# Relies on the `bronze` database created earlier in this notebook.
(
    df_products.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("bronze.products")
)

print("Bronze products table created successfully.")

# Read the table back and show up to five rows as a quick sanity check.
bronze_products_preview = spark.table("bronze.products").limit(5)
display(bronze_products_preview)

# Step 5: Read promotions CSV and create bronze table

In [0]:
# Load the promotions CSV: header row gives the column names, column types
# are inferred from the data.
df_promotions = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(promotions_path)
)

# Persist as a Delta table, replacing any existing table and its schema.
# Relies on the `bronze` database created earlier in this notebook.
(
    df_promotions.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("bronze.promotions")
)

print("Bronze promotions table created successfully.")

# Read the table back and show up to five rows as a quick sanity check.
bronze_promotions_preview = spark.table("bronze.promotions").limit(5)
display(bronze_promotions_preview)


# Step 6: Read pos_sales CSV and create bronze table

In [0]:
# Load the point-of-sale CSV: header row gives the column names, column
# types are inferred from the data.
df_pos_sales = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(pos_sales_path)
)

# Persist as a Delta table, replacing any existing table and its schema.
# Relies on the `bronze` database created earlier in this notebook.
(
    df_pos_sales.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("bronze.pos_sales")
)

# Name the table exactly as it is stored (bronze.pos_sales), matching the
# messages printed for the other bronze tables.
print("Bronze pos_sales table created successfully.")

# Read the table back and show up to five rows as a quick sanity check.
bronze_pos_sales_preview = spark.table("bronze.pos_sales").limit(5)
display(bronze_pos_sales_preview)


# Step 7: Read stores CSV and create bronze table

In [0]:
# Load the stores CSV: header row gives the column names, column types are
# inferred from the data.
df_stores = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(stores_path)
)

# Persist as a Delta table, replacing any existing table and its schema.
# Relies on the `bronze` database created earlier in this notebook.
(
    df_stores.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("bronze.stores")
)

print("Bronze stores table created successfully.")

# Read the table back and show up to five rows as a quick sanity check.
bronze_stores_preview = spark.table("bronze.stores").limit(5)
display(bronze_stores_preview)