kaggle dataset [online_retail_II.csv](https://www.kaggle.com/datasets/mashlyn/online-retail-ii-uci/data)

- create a volume named `raw` in the `lab_2026` schema of the `workspace` catalog
- download the dataset archive as `online_retail_II.csv.zip` and place it in the `.compressed` directory of that volume


In [0]:
import os

# Location of the lab data, built as /Volumes/<CATALOG_NAME>/<SCHEMA_NAME>/raw.
CATALOG_NAME = 'workspace'
SCHEMA_NAME = 'lab_2026'
VOLUME_DIR = os.path.abspath(f'/Volumes/{CATALOG_NAME}/{SCHEMA_NAME}/')
RAW_DIR = os.path.join(VOLUME_DIR, 'raw')

# The downloaded archive is expected to already sit in this directory.
COMPRESSED_RAW_DIR = os.path.join(RAW_DIR, '.compressed')
if not os.path.exists(COMPRESSED_RAW_DIR):
    # FileNotFoundError is the specific built-in for a missing path; it is
    # still an Exception subclass, so any `except Exception` keeps working.
    raise FileNotFoundError(f"{COMPRESSED_RAW_DIR} Not Found!")

# Name of the CSV; the archive is expected to be named '<INPUT_FILE>.zip'.
INPUT_FILE = 'online_retail_II.csv'
compressed_file_path = os.path.join(COMPRESSED_RAW_DIR, INPUT_FILE + '.zip')

# List the archive so the cell output shows it
# (presumably this also fails early when the archive is absent - confirm).
dbutils.fs.ls(compressed_file_path)

In [0]:
import zipfile

# Unpack the archive directly into RAW_DIR; csv_file_path therefore names the
# directory that receives the extracted files, not a single file.
csv_file_path = RAW_DIR
with zipfile.ZipFile(compressed_file_path, mode="r") as archive:
    archive.extractall(path=csv_file_path)

# List the extraction directory so the cell output shows what was unpacked.
dbutils.fs.ls(csv_file_path)

In [0]:
# Preview the first five rows of the extracted CSV data, read with a header
# row and without schema inference.
(
    spark.read
    .option('header', True)
    .option('inferSchema', False)
    .csv(csv_file_path)
    .limit(5)
    .display()
)

In [0]:
# Base table name; ingest_raw_to_bronze saves to '{SCHEMA_NAME}.bronze_{TABLE_NAME}'.
TABLE_NAME = 'online_retail'

def ingest_raw_to_bronze(source_path, table_name):
    """Load the CSV data at ``source_path`` and overwrite its bronze Delta table.

    The CSV is read with a header row and without schema inference. Four
    audit columns are appended (``_hash_md5``, ``_ingest_timestamp``,
    ``_ingest_author``, ``_source_file``), the source column names are
    lower-cased with spaces replaced by underscores, and the result is saved
    as ``{SCHEMA_NAME}.bronze_{table_name}`` in overwrite mode.

    Relies on the notebook-level ``spark`` session and ``SCHEMA_NAME``.

    Args:
        source_path: Path handed to ``spark.read.csv``.
        table_name: Suffix of the target table name, placed after ``bronze_``.
    """
    from pyspark.sql import functions as F

    source_df = spark.read.csv(source_path, header=True, inferSchema=False)
    original_names = source_df.columns

    # Source column name -> normalized name (spaces to underscores, lower-case).
    normalized_names = {
        name: name.replace(' ', '_').lower() for name in original_names
    }

    # NOTE(review): concat_ws is documented to skip NULL inputs, so rows that
    # differ only in which column is NULL may produce the same hash - confirm
    # this is acceptable for downstream use of _hash_md5.
    row_hash = F.md5(F.concat_ws(',', *original_names))
    source_file = F.col("_metadata.file_path")

    # Audit columns are added before the rename, so the hash expression can
    # refer to the original column names.
    with_hash = source_df.withColumn('_hash_md5', row_hash)
    with_timestamp = with_hash.withColumn('_ingest_timestamp', F.current_timestamp())
    with_author = with_timestamp.withColumn('_ingest_author', F.current_user())
    with_source = with_author.withColumn('_source_file', source_file)
    raw_df = with_source.withColumnsRenamed(normalized_names)

    target_table = f'{SCHEMA_NAME}.bronze_{table_name}'
    writer = (
        raw_df.write
        .format('delta')
        .mode('overwrite')
        .option('mergeSchema', True)
    )
    writer.saveAsTable(target_table)

# Ingest the extracted CSV data, then display the bronze table that was written.
ingest_raw_to_bronze(source_path=csv_file_path, table_name=TABLE_NAME)
bronze_table = f'{SCHEMA_NAME}.bronze_{TABLE_NAME}'
spark.read.table(bronze_table).display()

In [0]:
%sql

-- Describe the bronze table that ingest_raw_to_bronze saves for 'online_retail'.
DESCRIBE FORMATTED lab_2026.bronze_online_retail

In [0]:
%sql

-- List the history of the bronze table that ingest_raw_to_bronze overwrites on each run.
DESCRIBE HISTORY lab_2026.bronze_online_retail