In [0]:
#/Volumes/workspace/default/1912/products.csv

In [0]:
%sql
CREATE DATABASE IF NOT EXISTS bronze;

In [0]:
%sql
SHOW DATABASES;

databaseName
bronze
default
information_schema
insurance_bronze
insurance_gold
insurance_silver


In [0]:
#/Volumes/workspace/default/1912/products.csv
sales_b1_df = spark.read.csv(
    "/Volumes/workspace/default/1912/sales_batch_1.csv",
    header=True,
    inferSchema=True
)

display(sales_b1_df)


transaction_id,transaction_timestamp,store_id,product_id,quantity,unit_price,discount,total_amount,currency
T00112,2024-06-01T07:18:51.000Z,S012,P0096,2,97.66,14.16,181.16,INR
T00698,2024-06-02T02:34:26.000Z,S015,P0056,1,263.09,11.48,251.61,USD
T00311,2024-05-31T22:24:13.000Z,S015,P0059,1,126.21,1.48,124.73,USD
T00463,2024-06-03T01:01:54.000Z,S019,P0194,1,171.84,8.32,163.52,EUR
T00426,2024-06-01T11:13:33.000Z,S021,P0151,5,98.56,12.42,480.38,USD
T00963,2024-06-02T19:14:24.000Z,INVALID_STORE,P0096,-1,31.77,0.05,-31.82,INR
T00406,2024-06-02T21:30:55.000Z,S010,P0156,1,209.34,18.48,190.86,INR
T00592,2024-06-02T17:26:54.000Z,S035,P0069,2,30.71,3.32,58.1,INR
T00661,2024-06-02T11:27:33.000Z,S031,P0040,-1,166.59,8.76,-175.35,EUR
T01181,2024-06-01T14:07:30.000Z,S031,P0171,-1,285.73,19.99,-305.72,INR


In [0]:
products_df = spark.read.csv(
    "/Volumes/workspace/default/1912/products.csv",
    header=True,
    inferSchema=True
)
display(products_df)


product_id,product_name,category,brand,standard_price
P0001,Product_1,Home,BrandB,127.08
P0002,Product_2,Sports,BrandC,374.77
P0003,Product_3,Beauty,BrandD,184.19
P0004,Product_4,Clothing,BrandD,50.0
P0005,Product_5,Electronics,BrandA,80.66
P0006,Product_6,Clothing,BrandD,300.22
P0007,Product_7,Beauty,BrandD,299.96
P0008,Product_8,Beauty,BrandC,485.68
P0009,Product_9,Electronics,BrandA,342.45
P0010,Product_10,Sports,BrandC,385.46


In [0]:
stores_df = spark.read.csv(
    "/Volumes/workspace/default/1912/stores.csv",
    header=True,
    inferSchema=True
)

display(stores_df)


store_id,store_name,region,country
S001,Store_1,North,India
S002,Store_2,East,India
S003,Store_3,South,India
S004,Store_4,North,Germany
S005,Store_5,North,Germany
S006,Store_6,West,India
S007,Store_7,North,India
S008,Store_8,South,India
S009,Store_9,North,Germany
S010,Store_10,South,Germany


In [0]:
from pyspark.sql.functions import current_timestamp, lit

sales_bronze = sales_b1_df \
    .withColumn("ingestion_timestamp", current_timestamp()) \
    .withColumn("source_system", lit("sales_csv"))

products_bronze = products_df \
    .withColumn("ingestion_timestamp", current_timestamp()) \
    .withColumn("source_system", lit("product_csv"))

stores_bronze = stores_df \
    .withColumn("ingestion_timestamp", current_timestamp()) \
    .withColumn("source_system", lit("store_csv"))


In [0]:
sales_bronze.write.format("delta") \
    .mode("append") \
    .saveAsTable("bronze.sales_raw")


In [0]:
products_bronze.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("bronze.products_raw")


In [0]:
stores_bronze.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("bronze.stores_raw")


In [0]:
%sql
SELECT COUNT(*) FROM bronze.sales_raw;



COUNT(*)
700


In [0]:
%sql
SELECT COUNT(*) FROM bronze.products_raw;


COUNT(*)
200


In [0]:
%sql
SELECT COUNT(*) AS stores_count FROM bronze.stores_raw;


stores_count
50


In [0]:
# Creating bronze logs table
spark.sql("""
CREATE TABLE IF NOT EXISTS bronze_logs (
    run_id STRING,
    status STRING,
    timestamp TIMESTAMP,
    record_count BIGINT,
    message STRING
)
""")

DataFrame[]

In [0]:
from datetime import datetime
import uuid
run_id = str(uuid.uuid4())
df = spark.table("bronze.sales_raw")
row_count = df.count()
spark.sql(
    f"""
    INSERT INTO bronze_logs
    VALUES (
        '{run_id}',
        'SUCCESS',
        current_timestamp(),
        {row_count},
        NULL
    )
    """
)


DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

In [0]:
#Monitoring
spark.sql("""
          SELECT * FROM bronze_logs ORDER BY timestamp DESC;
""")


DataFrame[run_id: string, status: string, timestamp: timestamp, record_count: bigint, message: string]

In [0]:
#to show incremental logic
batch2 = spark.read.csv(
    "/Volumes/workspace/default/1912/sales_batch_2.csv",
    header=True,
    inferSchema=True
)

batch2 \
  .withColumn("ingestion_timestamp", current_timestamp()) \
  .withColumn("source_system", lit("sales_batch_2")) \
  .write.format("delta") \
  .mode("append") \
  .saveAsTable("bronze.sales_raw")
