## Step 1: Dataset & Storage Setup

**Dataset Source:** [Online Retail II (UCI) - Kaggle](https://www.kaggle.com/datasets/mashlyn/online-retail-ii-uci/data)

**Setup Instructions:**
1. Create a **Volume** within the `lab_2026` catalog.
2. Download the dataset and upload it to the `/.compressed` directory of the created volume.


In [0]:
import os

CATALOG_NAME = 'workspace'
SCHEMA_NAME = 'lab_2026'
VOLUME_DIR = os.path.abspath('/Volumes/{}/{}/'.format(CATALOG_NAME, SCHEMA_NAME))
RAW_DIR = os.path.join(VOLUME_DIR, 'raw')
COMPRESSED_RAW_DIR = os.path.join(RAW_DIR, '.compressed')
if not os.path.exists(COMPRESSED_RAW_DIR):
    raise Exception(f"{COMPRESSED_RAW_DIR} Not Found!")

INPUT_FILE = 'online_retail_II.csv'
compressed_file_path = os.path.join(COMPRESSED_RAW_DIR, INPUT_FILE + '.zip')
dbutils.fs.ls(compressed_file_path)

[FileInfo(path='dbfs:/Volumes/workspace/lab_2026/raw/.compressed/online_retail_II.csv.zip', name='online_retail_II.csv.zip', size=15217139, modificationTime=1766781209000)]

In [0]:
import zipfile

csv_file_path = os.path.join(RAW_DIR)
with zipfile.ZipFile(compressed_file_path, "r") as zip_ref:
    zip_ref.extractall(csv_file_path)
dbutils.fs.ls(csv_file_path)

[FileInfo(path='dbfs:/Volumes/workspace/lab_2026/raw/.compressed/', name='.compressed/', size=0, modificationTime=1766782485612),
 FileInfo(path='dbfs:/Volumes/workspace/lab_2026/raw/online_retail_II.csv', name='online_retail_II.csv', size=94850204, modificationTime=1766782485000)]

In [0]:
spark.read.csv(csv_file_path, header=True, inferSchema=False).limit(5).display()

Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
489434,22041,"""RECORD FRAME 7"""" SINGLE SIZE """,48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


## Step 2: The Ingestion Logic

In [0]:
def ingest_raw_to_bronze(source_path, table_name):
    from pyspark.sql import functions as F

    source_df = spark.read.csv(source_path, header=True, inferSchema=False)

    renamed_columns = dict()
    for col in source_df.columns:
        renamed_columns[col] = col.replace(' ', '_').lower()

    raw_df = (
        source_df
        .withColumn('_hash_md5', F.md5(F.concat_ws(',', *source_df.columns)))
        .withColumn('_ingest_timestamp', F.current_timestamp())
        .withColumn('_ingest_author', F.current_user())
        .withColumn('_source_file', F.col("_metadata.file_path"))
        .withColumnsRenamed(renamed_columns)
    )

    (
        raw_df
        .write
        .format('delta')
        .mode('overwrite')
        .option('mergeSchema', True)
        .saveAsTable(table_name)
    )

TABLE_NAME = 'online_retail'
table_name = f'{SCHEMA_NAME}.bronze_{TABLE_NAME}'
ingest_raw_to_bronze(csv_file_path, table_name)
spark.read.table(table_name).display()

invoice,stockcode,description,quantity,invoicedate,price,customer_id,country,_hash_md5,_ingest_timestamp,_ingest_author,_source_file
489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,8f540e2dceb601d1c14994732a5dec97,2025-12-26T20:54:49.150Z,gopinadh5g7@sasi.ac.in,dbfs:/Volumes/workspace/lab_2026/raw/online_retail_II.csv
489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,3a80e4c67de6e819b5daeb79395ab317,2025-12-26T20:54:49.150Z,gopinadh5g7@sasi.ac.in,dbfs:/Volumes/workspace/lab_2026/raw/online_retail_II.csv
489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,5ff842f39a6d8bdb90aa2522efa4072f,2025-12-26T20:54:49.150Z,gopinadh5g7@sasi.ac.in,dbfs:/Volumes/workspace/lab_2026/raw/online_retail_II.csv
489434,22041,"""RECORD FRAME 7"""" SINGLE SIZE """,48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom,af0d02f57d2e9daee8e5e3d1561359fd,2025-12-26T20:54:49.150Z,gopinadh5g7@sasi.ac.in,dbfs:/Volumes/workspace/lab_2026/raw/online_retail_II.csv
489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,69145db326f632aa37e590fbded6f0ab,2025-12-26T20:54:49.150Z,gopinadh5g7@sasi.ac.in,dbfs:/Volumes/workspace/lab_2026/raw/online_retail_II.csv
489434,22064,PINK DOUGHNUT TRINKET POT,24,2009-12-01 07:45:00,1.65,13085.0,United Kingdom,d909f1dce88457d0e4723cc8cf7a8d69,2025-12-26T20:54:49.150Z,gopinadh5g7@sasi.ac.in,dbfs:/Volumes/workspace/lab_2026/raw/online_retail_II.csv
489434,21871,SAVE THE PLANET MUG,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,f7a1f3e1eb542ef39cf2b255a4c4c0a3,2025-12-26T20:54:49.150Z,gopinadh5g7@sasi.ac.in,dbfs:/Volumes/workspace/lab_2026/raw/online_retail_II.csv
489434,21523,FANCY FONT HOME SWEET HOME DOORMAT,10,2009-12-01 07:45:00,5.95,13085.0,United Kingdom,61121fde713280adf87973245df2e166,2025-12-26T20:54:49.150Z,gopinadh5g7@sasi.ac.in,dbfs:/Volumes/workspace/lab_2026/raw/online_retail_II.csv
489435,22350,CAT BOWL,12,2009-12-01 07:46:00,2.55,13085.0,United Kingdom,eaf2e356b76cee2872ed7a5d09389dd4,2025-12-26T20:54:49.150Z,gopinadh5g7@sasi.ac.in,dbfs:/Volumes/workspace/lab_2026/raw/online_retail_II.csv
489435,22349,"DOG BOWL , CHASING BALL DESIGN",12,2009-12-01 07:46:00,3.75,13085.0,United Kingdom,e3aad9c7dac0fee528d0978f4d1d4ad9,2025-12-26T20:54:49.150Z,gopinadh5g7@sasi.ac.in,dbfs:/Volumes/workspace/lab_2026/raw/online_retail_II.csv


In [0]:
%sql

DESCRIBE FORMATTED lab_2026.bronze_online_retail

col_name,data_type,comment
invoice,string,
stockcode,string,
description,string,
quantity,string,
invoicedate,string,
price,string,
customer_id,string,
country,string,
_hash_md5,string,
_ingest_timestamp,timestamp,


In [0]:
%sql

DESCRIBE HISTORY lab_2026.bronze_online_retail

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
3,2025-12-26T20:54:53.000Z,76410191890103,gopinadh5g7@sasi.ac.in,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(2884203686590536),1226-191418-v62hfpy0-v2n,2.0,WriteSerializable,False,"Map(numFiles -> 2, numRemovedFiles -> 2, numRemovedBytes -> 41105496, numDeletionVectorsRemoved -> 0, numOutputRows -> 1067371, numOutputBytes -> 41105496)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
2,2025-12-26T20:41:37.000Z,76410191890103,gopinadh5g7@sasi.ac.in,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(2884203686590536),1226-191418-v62hfpy0-v2n,1.0,WriteSerializable,False,"Map(numFiles -> 2, numRemovedFiles -> 2, numRemovedBytes -> 41105496, numDeletionVectorsRemoved -> 0, numOutputRows -> 1067371, numOutputBytes -> 41105496)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
1,2025-12-26T20:41:28.000Z,76410191890103,gopinadh5g7@sasi.ac.in,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(2884203686590536),1226-191418-v62hfpy0-v2n,0.0,WriteSerializable,False,"Map(numFiles -> 2, numRemovedFiles -> 2, numRemovedBytes -> 41105496, numDeletionVectorsRemoved -> 0, numOutputRows -> 1067371, numOutputBytes -> 41105496)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
0,2025-12-26T20:41:15.000Z,76410191890103,gopinadh5g7@sasi.ac.in,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> true)",,List(2884203686590536),1226-191418-v62hfpy0-v2n,,WriteSerializable,False,"Map(numFiles -> 2, numRemovedFiles -> 0, numRemovedBytes -> 0, numDeletionVectorsRemoved -> 0, numOutputRows -> 1067371, numOutputBytes -> 41105496)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
