In [0]:
%sql
-- Bootstrap the Unity Catalog objects used by this notebook. Every statement is
-- idempotent (IF NOT EXISTS), so the cell can be re-run safely.

-- Catalog, then make it the session default.
CREATE CATALOG IF NOT EXISTS main;
USE CATALOG main;

-- Schema inside that catalog, then make it the session default.
CREATE SCHEMA IF NOT EXISTS main.retail;
USE SCHEMA retail;

-- Volume that holds the uploaded source files.
CREATE VOLUME IF NOT EXISTS main.retail.raw;


In [0]:
%sql
-- Show the files currently stored in the raw volume. A later cell reads
-- Superstore.xlsx from this location.
LIST '/Volumes/main/retail/raw/';


In [0]:
import pandas as pd

# Read the Superstore workbook from the raw volume into a pandas DataFrame.
pdf = pd.read_excel(io="/Volumes/main/retail/raw/Superstore.xlsx")

def clean_col(name: object) -> str:
    """Normalize a column label into a lowercase snake_case identifier.

    The label is converted to text, stripped of surrounding whitespace and
    lowercased. Spaces, hyphens and slashes become underscores, and
    parentheses are removed.

    Args:
        name: The original column label. Non-string labels (for example an
            integer header cell) are converted with ``str()`` first instead
            of raising ``AttributeError``.

    Returns:
        The normalized column name.
    """
    # Single-pass character mapping: separators -> "_", parentheses dropped.
    char_map = str.maketrans({" ": "_", "-": "_", "/": "_", "(": None, ")": None})
    return str(name).strip().lower().translate(char_map)

# Replace every header with its normalized form, then echo the resulting names.
pdf.columns = list(map(clean_col, pdf.columns))

print(pdf.columns.tolist())


In [0]:
# Hand the cleaned pandas frame over to Spark.
df = spark.createDataFrame(pdf)

# Bronze layer: persist the data as a Delta table, replacing any existing contents.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("main.retail.superstore_bronze")
)

In [0]:
%sql
USE CATALOG main;
USE SCHEMA retail;

-- Silver layer: a typed and trimmed copy of main.retail.superstore_bronze plus
-- two derived columns (order_month, profit_margin). The table is rebuilt from
-- scratch on every run (CREATE OR REPLACE).
CREATE OR REPLACE TABLE main.retail.superstore_silver AS
WITH typed AS (
  SELECT
    /* Keys */
    TRY_CAST(row_id AS INT)                   AS row_id,
    CAST(order_id AS STRING)                  AS order_id,
    CAST(customer_id AS STRING)               AS customer_id,
    CAST(product_id AS STRING)                AS product_id,

    /* Dates – try ISO (yyyy-MM-dd) first, then US style (M/d/yyyy).
       try_to_date yields NULL when the format does not match, so COALESCE can
       fall through to the next format. Plain to_date raises an error on a
       mismatch when ANSI mode is enabled, which would abort the whole
       statement before the fallback is ever evaluated. */
    COALESCE(
      try_to_date(order_date, 'yyyy-MM-dd'),
      try_to_date(order_date, 'M/d/yyyy')
    )                                         AS order_date,

    COALESCE(
      try_to_date(ship_date, 'yyyy-MM-dd'),
      try_to_date(ship_date, 'M/d/yyyy')
    )                                         AS ship_date,

    /* Text dimensions (trim surrounding whitespace) */
    TRIM(ship_mode)                           AS ship_mode,
    TRIM(customer_name)                       AS customer_name,
    TRIM(segment)                             AS segment,
    TRIM(country)                             AS country,
    TRIM(city)                                AS city,
    TRIM(state)                               AS state,
    TRIM(postal_code)                         AS postal_code,
    TRIM(region)                              AS region,
    TRIM(category)                            AS category,
    TRIM(sub_category)                        AS sub_category,
    TRIM(product_name)                        AS product_name,

    /* Measures – TRY_CAST turns unparseable values into NULL instead of failing */
    TRY_CAST(sales AS DOUBLE)                 AS sales,
    TRY_CAST(quantity AS INT)                 AS quantity,
    TRY_CAST(discount AS DOUBLE)              AS discount,
    TRY_CAST(profit AS DOUBLE)                AS profit
  FROM main.retail.superstore_bronze
)
SELECT
  typed.*,

  /* Derived – computed from the typed columns above so the parsing and casting
     logic lives in exactly one place. */

  /* First instant of the order month. date_trunc returns a TIMESTAMP. */
  date_trunc('month', order_date)           AS order_month,

  /* Profit as a fraction of sales. NULL when sales is NULL or zero, which
     avoids a division by zero. */
  CASE WHEN sales IS NOT NULL
            AND sales <> 0
       THEN profit / sales
  END                                       AS profit_margin
FROM typed;

-- Preview the rebuilt table.
SELECT * FROM main.retail.superstore_silver;

In [0]:
%sql
-- Sanity check on the silver table: row count and the range of order dates.
SELECT
  COUNT(*)        AS rows,
  MIN(order_date) AS min_order_date,
  MAX(order_date) AS max_order_date
FROM main.retail.superstore_silver;



In [0]:
%sql
-- Headline totals across the whole silver table.
SELECT
  SUM(sales)  AS total_sales,
  SUM(profit) AS total_profit
FROM main.retail.superstore_silver;