# Step 1: Build DateDim table

In [0]:
%sql
-- Date bounds of the silver data: earliest order date and latest ship date
-- (falling back to the order date when ship_date is NULL).
select
  cast(min(order_date) as date)                       as d_min,
  cast(max(coalesce(ship_date, order_date)) as date)  as d_max
from
  main.retail.superstore_silver;


## Creating DateDim

In [0]:
%sql

-- DateDim: one row per calendar day.
-- Writers supply only `date`; every other column is a Delta generated column
-- derived from it, so the attributes can never drift from the date value.
create or replace table main.retail_gold.DateDim (

  date DATE NOT NULL
    COMMENT 'Calendar date (day grain)',
  date_sk INT NOT NULL GENERATED ALWAYS AS (CAST(date_format(date,'yyyyMMdd') AS INT))
    COMMENT 'Surrogate key YYYYMMDD, generated from date',
  
  -- dayofweek() numbers days 1=Sun..7=Sat; shift so Monday=1 and Sunday=7.
  day_of_week TINYINT NOT NULL GENERATED ALWAYS AS (
    CAST(pmod(dayofweek(date) + 5, 7) + 1 AS TINYINT)
    )
    COMMENT '1=Mon,..., 7=Sun (ISO)',
  
  dow_name STRING NOT NULL GENERATED ALWAYS AS (date_format(date,'EEE'))
    COMMENT 'Mon, Tue,..., Sun (ISO)',
  
  -- weekofyear() follows ISO-8601 week numbering, while `year` below is the
  -- Gregorian year, so the two can disagree in the first/last days of a year.
  week SMALLINT NOT NULL  GENERATED ALWAYS AS (
    CAST(weekofyear(date) AS SMALLINT)
    )
    COMMENT 'ISO-8601 week number 1 - 53',
  
  month tinyint NOT NULL GENERATED ALWAYS AS (
    CAST(month(date) AS TINYINT)
    )
    COMMENT 'month number 1-12',
  
  -- 'MMMM' renders the full month name, so the comment lists full names.
  month_name STRING NOT NULL GENERATED ALWAYS AS (date_format(date,'MMMM'))
    COMMENT 'January, February,..., December',
  
  quarter tinyint NOT NULL GENERATED ALWAYS AS (CAST(quarter(date) AS TINYINT))
    COMMENT 'quarter number 1-4',
  
  year smallint NOT NULL GENERATED ALWAYS AS (
    CAST(year(date) AS SMALLINT)
    )
    COMMENT 'Gregorian year, not ISO',

  -- trunc() returns a DATE, so the comparison stays DATE = DATE instead of
  -- relying on an implicit DATE <-> TIMESTAMP cast as date_trunc() would.
  is_month_start BOOLEAN NOT NULL GENERATED ALWAYS AS (
    date = trunc(date, 'MM')
    )
    COMMENT 'True if first day of month',

  is_month_end BOOLEAN NOT NULL GENERATED ALWAYS AS (
    date = last_day(date)
    )
    COMMENT 'True if last day of month',
  
  is_quarter_start BOOLEAN NOT NULL GENERATED ALWAYS AS (
    date = trunc(date, 'QUARTER')
    )
    COMMENT 'True if first day of quarter',
  
  -- Last day of a month whose following day falls in a different quarter.
  is_quarter_end BOOLEAN NOT NULL GENERATED ALWAYS AS (
    date = last_day(date) AND quarter(date) <> quarter(date_add(date, 1))
    )
    COMMENT 'True if last day of quarter' 
)

USING DELTA
COMMENT 'Date dimension (authoritative calendar ; one row per day)'
TBLPROPERTIES (
  delta.appendOnly = true
);

-- Preview the freshly (re)created table.
select * from main.retail_gold.datedim limit 10;

## Filling DateDim

In [0]:
%sql

-- Populate DateDim with one row per day.
-- The range is widened to whole calendar years so that it encompasses the
-- source date bounds; days already present in DateDim are skipped, so the
-- statement can be re-run without creating duplicates.
with date_range as (
  select
    date'2011-01-01' as first_day,
    date'2015-12-31' as last_day
),
all_days as (
  select
    explode(sequence(r.first_day, r.last_day, interval 1 day)) as date
  from date_range as r
)

insert into main.retail_gold.DateDim (date)
select
  cal.date
from all_days as cal
where not exists (
  select 1
  from main.retail_gold.datedim as existing
  where existing.date = cal.date
);

-- Preview the loaded rows.
select *
from main.retail_gold.datedim
limit 10;

## Sanity checks on DateDim

In [0]:
%sql
-- Range check: min_date should be 2011-01-01 and max_date 2015-12-31.
select
  min(date) as min_date,
  max(date) as max_date
from
  main.retail_gold.datedim;

In [0]:
%sql
-- Continuity check: expect zero rows.
-- Each date is compared with the date immediately before it; a distance of
-- more than one day means at least one calendar day is missing in between.
with day_pairs as (
  select
    date,
    lag(date) over (order by date) as prev_d
  from
    main.retail_gold.datedim
)
select
  date as gap_starts_after
from
  day_pairs
where
  prev_d is not null          -- the first row has no predecessor
  and datediff(date, prev_d) > 1;

In [0]:
%sql
-- Surrogate-key uniqueness: the row count (n) must equal the number of
-- distinct date_sk values (nd).
select
  count(*)                as n,
  count(distinct date_sk) as nd
from
  main.retail_gold.datedim;

# Step 2: Build ProductDim table

In [0]:
%sql
-- ProductDim: Type-1 product dimension, one row per product_id.
-- row_hash is a generated fingerprint of the business attributes; created_at and
-- updated_at default to the write time (column defaults require the
-- allowColumnDefaults table feature enabled below).
CREATE OR REPLACE TABLE main.retail_gold.ProductDim (
  product_id   STRING NOT NULL COMMENT 'Primary key. Unique identifier for product',
  product_name STRING NOT NULL COMMENT 'The product\'s name',
  category     STRING NOT NULL COMMENT 'Top-level product grouping',
  subcategory  STRING NOT NULL COMMENT 'The product\'s sub-category. Rolls up to category',

  -- Case- and whitespace-insensitive hash over id, name, category, subcategory
  -- (in that order, joined with '||').
  row_hash STRING GENERATED ALWAYS AS (
    sha2(
      concat_ws(
        '||',
        lower(trim(product_id)),
        lower(trim(product_name)),
        lower(trim(category)),
        lower(trim(subcategory))
      ),
      256
    )
  ) COMMENT 'SHA2-256 hash of business attributes for change detection',

  created_at TIMESTAMP NOT NULL DEFAULT current_timestamp() COMMENT 'Timestamp of creation',
  updated_at TIMESTAMP NOT NULL DEFAULT current_timestamp() COMMENT 'Timestamp of last update'
)
USING DELTA
COMMENT 'Product dimension (Type-1; overwrite on change; one row per product)'
TBLPROPERTIES (
  'delta.feature.allowColumnDefaults' = 'supported'
);

-- Show the (initially empty) table.
SELECT *
FROM main.retail_gold.ProductDim;

## Fill ProductDim table

In [0]:
%sql
-- Type-1 upsert of ProductDim from the silver layer.
--
-- * The source is reduced to exactly one row per product_id BEFORE the merge.
--   Filtering on rn inside the WHEN clauses is not enough: on a re-run several
--   source rows would still match the same target row, which Delta rejects.
-- * incoming_hash is built from the same attributes, in the same order and with
--   the same normalisation as the generated ProductDim.row_hash column, so the
--   change-detection predicate is only true when an attribute really changed.
MERGE INTO main.retail_gold.ProductDim AS t

USING (
  SELECT
    product_id,
    product_name,
    category,
    subcategory,
    -- Must mirror the row_hash expression in the ProductDim DDL exactly.
    sha2(
      concat_ws('||',
        lower(trim(product_id)),
        lower(trim(product_name)),
        lower(trim(category)),
        lower(trim(subcategory))
      ),
      256
    ) AS incoming_hash
  FROM (
    SELECT
      UPPER(TRIM(product_id)) AS product_id,
      TRIM(product_name)      AS product_name,
      TRIM(category)          AS category,
      TRIM(sub_category)      AS subcategory,
      -- Extra sort keys make the surviving row deterministic when one
      -- product_id appears with several name/category variants.
      ROW_NUMBER() OVER (
        PARTITION BY UPPER(TRIM(product_id))
        ORDER BY TRIM(product_name), TRIM(category), TRIM(sub_category)
      ) AS rn
    FROM main.retail.superstore_silver
    WHERE product_id IS NOT NULL AND TRIM(product_id) <> ''
  ) AS ranked
  WHERE ranked.rn = 1
) s
ON t.product_id = s.product_id


-- Overwrite attributes only when the fingerprint differs (Type-1).
WHEN MATCHED AND t.row_hash <> s.incoming_hash THEN
  UPDATE SET
    product_name = s.product_name,
    category     = s.category,
    subcategory  = s.subcategory,
    updated_at   = current_timestamp()


-- New product: created_at / updated_at come from the column defaults.
WHEN NOT MATCHED THEN
  INSERT (product_id, product_name, category, subcategory)
  VALUES (s.product_id, s.product_name, s.category, s.subcategory);


In [0]:
%sql
-- Preview a few ProductDim rows.
SELECT *
FROM main.retail_gold.productdim
LIMIT 10

# Step 3: Build GeoDim table

In [0]:
%sql
-- Preview a few rows of the silver source table.
SELECT *
FROM main.retail.superstore_silver
LIMIT 10

In [0]:
%sql
-- List the distinct region values present in the silver source table.
SELECT DISTINCT
  region
FROM
  main.retail.superstore_silver;

In [0]:
%sql

-- GeoDim: Type-1 geography dimension at postal-code grain.
-- geo_id is a generated, deterministic key: the SHA2-256 of the lower-cased,
-- trimmed region, state, city and postal code joined with '||'.
-- created_at / updated_at default to the write time (column defaults require
-- the allowColumnDefaults table feature enabled below).
CREATE OR REPLACE TABLE main.retail_gold.GeoDim (
  geo_id STRING NOT NULL GENERATED ALWAYS AS (
    sha2(
      concat_ws(
        '||',
        lower(trim(region)),
        lower(trim(state)),
        lower(trim(city)),
        lower(trim(postal_code))
      ),
      256
    )
  ) COMMENT 'Deterministic key made of hashed region, state, city, postal code',

  region      STRING NOT NULL COMMENT 'The region of the store. Allowed set: {Central, East, South, West}',
  state       STRING NOT NULL COMMENT 'The full name of the state (e.g., "Washington")',
  city        STRING NOT NULL COMMENT 'The city name (e.g. "Seattle", "St. Louis")',
  postal_code STRING NOT NULL COMMENT 'The postal code (e.g., "98101")',

  created_at TIMESTAMP NOT NULL DEFAULT current_timestamp() COMMENT 'Timestamp of creation',
  updated_at TIMESTAMP NOT NULL DEFAULT current_timestamp() COMMENT 'Timestamp of last update'
)
USING DELTA
COMMENT 'Geo dimension at ZIP grain (Type-1; one row per region/state/city/zip)'
TBLPROPERTIES (
  'delta.feature.allowColumnDefaults' = 'supported'
);

-- Show one row of the (initially empty) table.
SELECT *
FROM main.retail_gold.geodim
LIMIT 1

# HERE

In [0]:
%sql
-- Upsert GeoDim from the silver layer, one row per distinct geo key.
-- The work is staged so that each normalisation and the hash are written once:
--   normalized  -> cleaned attribute values (plus the raw city/state, which are
--                  only used to order rows when picking one per key)
--   keyed       -> adds geo_key, the same hash GeoDim generates as geo_id
--   ranked      -> numbers the rows inside each geo_key
--   one_per_key -> keeps the first row of each geo_key
WITH normalized AS (
  SELECT
    initcap(trim(region)) AS region,
    initcap(trim(state))  AS state,
    initcap(trim(city))   AS city,
    -- Strip non-digits, keep the first five digits, left-pad with zeros.
    lpad(substring(regexp_replace(trim(postal_code), '\\D', ''), 1, 5), 5, '0') AS postal_code,
    city  AS src_city,
    state AS src_state
  FROM main.retail.superstore_silver
  WHERE region IS NOT NULL
    AND state IS NOT NULL
    AND city IS NOT NULL
    AND postal_code IS NOT NULL
),
keyed AS (
  SELECT
    region,
    state,
    city,
    postal_code,
    src_city,
    src_state,
    sha2(
      concat_ws(
        '||',
        lower(trim(region)),
        lower(trim(state)),
        lower(trim(city)),
        postal_code
      ),
      256
    ) AS geo_key
  FROM normalized
),
ranked AS (
  SELECT
    region,
    state,
    city,
    postal_code,
    geo_key,
    row_number() OVER (
      PARTITION BY geo_key
      ORDER BY src_city, src_state
    ) AS rn
  FROM keyed
),
one_per_key AS (
  SELECT
    region,
    state,
    city,
    postal_code,
    geo_key
  FROM ranked
  WHERE rn = 1
)

MERGE INTO main.retail_gold.geodim AS tgt
USING one_per_key AS src
ON tgt.geo_id = src.geo_key
-- NOTE(review): geo_key is derived from these same normalized values, so a
-- matched row appears to always carry identical attributes; this branch may
-- never fire. Confirm before relying on updated_at changing.
WHEN MATCHED AND (
     tgt.region      <> src.region
  OR tgt.state       <> src.state
  OR tgt.city        <> src.city
  OR tgt.postal_code <> src.postal_code
) THEN UPDATE SET
  region      = src.region,
  state       = src.state,
  city        = src.city,
  postal_code = src.postal_code,
  updated_at  = current_timestamp()
-- New location: geo_id is generated; the timestamps use the column defaults.
WHEN NOT MATCHED THEN
  INSERT (region, state, city, postal_code)
  VALUES (src.region, src.state, src.city, src.postal_code);


In [0]:
# HERE FACTS TABLE NEXT