# YANKI E-COMMERCE ETL PROJECT 

## Load libraries and dependencies

In [2]:
# Import dependencies
import pandas as pd
from IPython.display import display
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent)) # Put the parent directory on sys.path before the project import below (presumably where dynamic_etl_pipeline lives — confirm)
# NOTE(review): the wildcard import hides where the helpers used in later cells come from
# (get_db_connection, select_and_load_source, ETL_CONFIG, transform_oltp, ...).
# Presumably they are all exported by dynamic_etl_pipeline; consider importing them by name.
from dynamic_etl_pipeline import *

print("✅ Environment ready!")

✅ Environment ready!


## DB CONNECTION

In [3]:
# Open the database connection for this project; the helper is told which
# environment-variable prefix to use.
conn = get_db_connection(
    env_prefix="YANKI_DB_",
)
# Alternative prefix kept for reference:
#conn = get_db_connection(env_prefix="ZULO_DB_")

[get_db_connection] ✅ Connected to 'user=postgres password=xxx dbname=yanki_ecomm host=localhost port=5432' using prefix 'YANKI_DB_'


## Extract (Load Raw Data)

In [9]:
# Trigger interactive source selection.
# The returned dict is read by the next cell via the keys "raw_df" and "selected_source".
# NOTE(review): presumably those values follow the dropdown choice, so a source must be
# picked before running the following cells — confirm against select_and_load_source.
result = select_and_load_source(ETL_CONFIG)

Dropdown(description='Data Source:', options=('all', 'zulo_bank', 'yanki_ecom', 'csv_ex', 'url_ex'), value='al…

Output()

In [10]:
# Unpack the extract result into the parameters used by the rest of the pipeline:
# the raw frame, the key of the chosen source, and that source's config entry.
raw_df, dataset_key = result["raw_df"], result["selected_source"]
cfg = ETL_CONFIG["data_sources"][dataset_key]

print(dataset_key)

yanki_ecom


## Transform (Data Cleaning & Normalisation)

### Cleaning Steps for Data Modeling & Normalization

| Step                         | Action                                                                                       | Purpose                                                       |
|------------------------------|----------------------------------------------------------------------------------------------|----------------------------------------------------------------|
| 1. Handle Missing Values     | - Drop rows with missing `Order_ID`, `Customer_ID`                      | These are core entity identifiers; nulls would break relationships |
|                              | - Optionally handle or impute 1 missing value in `State`                                    | Tolerable since it's likely a lookup/dimension table           |
| 2. Ensure Correct Data Types | - Convert `Order_Date` to datetime format                                                   | Enables partitioning and relationship with a Date dimension   |
|                              | - Ensure `Postal_Code` is treated as string if needed (for leading zeros)                   | Maintains data integrity, especially in US/UK postal codes     |
| 3. Remove Duplicates         | - Check for and drop duplicates in `Order_ID`, `Product_ID`, `Customer_ID` combinations     | Avoids redundant relationships or many-to-many mapping noise   |
| 4. Standardize Categorical Fields | - Lowercase and strip spaces in fields like `Category`, `Brand`, `Payment_Method`       | Ensures consistency across joins or dimensions                 |
| 5. Verify Key Relationships  | - Ensure 1:many mappings exist: `Customer` → `Order`, `Order` → `Product`                  | For proper normalization and referential integrity             |
| 6. Normalize Derived Fields  | - Confirm `Total_Price = Price * Quantity` or drop `Total_Price` if calculated later        | Avoid storing redundant calculations in normalized schema      |
| 7. Split Compound Columns    | - Split `Customer_Name` into `First_Name` and `Last_Name`                       | Improves normalization (optional but good practice)            |
| 8. Validate Contact Info     | - Basic regex or null check on `Email`, `Phone_Number`                                     | Prevent invalid records from entering DB                       |


### **Data Modelling (OLTP)**

#### **Data Model**
*Splitting the data into normalized tables like:*  `Customers`, `Products`, `Orders`, `Location`, `Payments`

![OLTP data model](db_model.png)

### Clean & Normalise df to 3NF

In [5]:
# Number of distinct values in each column of the raw frame (missing values are not counted by default).
raw_df.nunique()

Order_ID               990
Customer_ID            990
Customer_Name          986
Product_ID            1000
Product_Name           628
Brand                  968
Category                 2
Price                  995
Quantity                10
Total_Price            999
Order_Date            1000
Shipping_Address      1000
City                   969
State                   50
Country                240
Postal_Code            994
Email                  999
Phone_Number          1000
Payment_Method           3
Transaction_Status       2
dtype: int64

In [11]:
# Clean the raw frame and split it into normalised tables; per the pipeline's own
# description this also checks for missing PKs and inserts missing FKs from config.
# Yields the tables plus the primary-, foreign- and surrogate-key mappings.
oltp_tables, pk_dict, fk_dict, sk_dict = transform_oltp(
    dataset_key,
    cfg,
    raw_df,
)


🔁 Processing yanki_ecom...

🧪 Processing source: yanki_ecom
[ensure_correct_dtypes] ✅ Converted 'Order_Date' to datetime (mixed format, normalized).
[remove_duplicates] Removed 20 duplicate row(s) based on all columns.
[handle_missing_critical] Dropped 10 rows with nulls in ['Order_ID', 'Customer_ID']
[split_compound_column] ✅ Split 'Customer_Name' into ['First_Name', 'Last_Name']
[validate_contact_info] Cleaned email in column 'Email'
[validate_contact_info] Cleaned phone in column 'Phone_Number'
[normalize_derived_fields] Created 'Total_Price' from Price * Quantity, rounded to 2dp.
[standardize_categoricals] Standardized fields: ['Order_ID', 'Customer_ID', 'Customer_Name', 'Product_ID', 'Product_Name', 'Brand', 'Category', 'Shipping_Address', 'City', 'State', 'Country', 'Email', 'Phone_Number', 'Payment_Method', 'Transaction_Status', 'First_Name', 'Last_Name']
✅ Finished processing: yanki_ecom — 990 rows, 22 columns
ℹ️ Skipping date_dim — OLAP pipeline or date mapping not configured

In [7]:
# Visualize and inspect all OLTP tables produced by the transform step
print("\n OLTP Tables:")
display(list(oltp_tables.keys()))

# Primary keys per table
print("\n Primary Keys:")
display(pk_dict)

# Foreign keys per table
print("\n Foreign Keys:")
display(fk_dict)

# Surrogate keys per table
print("\n Surrogate  Keys:")
display(sk_dict)

# Per-table preview: shape, first three rows, then the schema summary
for name, df in oltp_tables.items():
    print(f"\n📄 Table: {name} — {df.shape[0]} rows, {df.shape[1]} columns")
    display(df.head(3))
    print("\n📋 Schema Info:")
    # DataFrame.info() prints its report itself and returns None, so it is called
    # directly; wrapping it in display() rendered a stray "None" after every table.
    df.info()


 OLTP Tables:


['customers', 'accounts', 'loans', 'transactions', 'date_dim']


 Primary Keys:


{'customers': ['customer_id'],
 'accounts': ['account_id'],
 'loans': ['loan_id'],
 'transactions': ['transaction_id'],
 'date_dim': ['date_id']}


 Foreign Keys:


{'accounts': [('customer_id', 'customers', 'customer_id')],
 'loans': [('customer_id', 'customers', 'customer_id')],
 'transactions': [('account_id', 'accounts', 'account_id'),
  ('customer_id', 'customers', 'customer_id')]}


 Surrogate  Keys:


{'customers': ['customer_id'],
 'accounts': ['account_id', 'customer_id', 'opening_date_id'],
 'loans': ['customer_id', 'start_date_id', 'end_date_id'],
 'transactions': ['transaction_id',
  'account_id',
  'customer_id',
  'transaction_date_id'],
 'date_dim': ['date_id']}


📄 Table: customers — 87 rows, 5 columns


Unnamed: 0,customer_id,first_name,last_name,email,phone
0,85,carol,miller,yfisher@example.org,6088279027
1,91,geoffrey,banks,gonzalesgeorge@example.net,154685765185359
2,89,eric,phillips,mark13@example.com,699516763882918



📋 Schema Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   customer_id  87 non-null     Int64 
 1   first_name   87 non-null     object
 2   last_name    87 non-null     object
 3   email        87 non-null     object
 4   phone        87 non-null     object
dtypes: Int64(1), object(4)
memory usage: 3.6+ KB


None


📄 Table: accounts — 198 rows, 6 columns


Unnamed: 0,account_id,customer_id,account_type,balance,opening_date,opening_date_id
0,88,85,savings,5652.16,2019-08-12,20190812
1,26,91,credit,2881.24,2019-05-06,20190506
2,152,89,savings,2391.9,2020-09-17,20200917



📋 Schema Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198 entries, 0 to 197
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   account_id       198 non-null    Int64         
 1   customer_id      198 non-null    int64         
 2   account_type     198 non-null    object        
 3   balance          198 non-null    float64       
 4   opening_date     198 non-null    datetime64[ns]
 5   opening_date_id  198 non-null    Int64         
dtypes: Int64(2), datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 9.8+ KB


None


📄 Table: loans — 129 rows, 9 columns


Unnamed: 0,loan_id,customer_id,loan_amount,loan_type,start_date,end_date,interest,start_date_id,end_date_id
0,44,91,32428.9,mortgage,2021-06-24,2050-01-08,68749.27,20210624,20500108
1,48,91,31406.77,personal,2021-02-27,2038-10-12,145413.35,20210227,20381012
2,76,91,27834.0,personal,2019-12-05,2037-08-15,59843.1,20191205,20370815



📋 Schema Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   loan_id        129 non-null    Int64         
 1   customer_id    129 non-null    int64         
 2   loan_amount    129 non-null    float64       
 3   loan_type      129 non-null    object        
 4   start_date     129 non-null    datetime64[ns]
 5   end_date       129 non-null    datetime64[ns]
 6   interest       129 non-null    float64       
 7   start_date_id  129 non-null    Int64         
 8   end_date_id    129 non-null    Int64         
dtypes: Int64(3), datetime64[ns](2), float64(2), int64(1), object(1)
memory usage: 9.6+ KB


None


📄 Table: transactions — 1000 rows, 7 columns


Unnamed: 0,transaction_id,transaction_type,amount,transaction_date,account_id,customer_id,transaction_date_id
0,1,withdrawal,102.15,2023-04-26,88,85,20230426
1,2,withdrawal,358.8,2020-06-13,26,91,20200613
2,3,deposit,112.41,2019-05-17,152,89,20190517



📋 Schema Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   transaction_id       1000 non-null   Int64         
 1   transaction_type     1000 non-null   object        
 2   amount               1000 non-null   float64       
 3   transaction_date     1000 non-null   datetime64[ns]
 4   account_id           1000 non-null   int64         
 5   customer_id          1000 non-null   int64         
 6   transaction_date_id  1000 non-null   Int64         
dtypes: Int64(2), datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 56.8+ KB


None


📄 Table: date_dim — 1094 rows, 11 columns


Unnamed: 0,full_date,date_id,day,month,year,quarter,week,day_name,month_name,is_weekend,is_holiday
379,2019-02-09,20190209,9,2,2019,1,6,Saturday,February,True,False
137,2019-02-11,20190211,11,2,2019,1,7,Monday,February,False,False
1157,2019-02-12,20190212,12,2,2019,1,7,Tuesday,February,False,False



📋 Schema Info:
<class 'pandas.core.frame.DataFrame'>
Index: 1094 entries, 379 to 1017
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   full_date   1094 non-null   datetime64[ns]
 1   date_id     1094 non-null   Int64         
 2   day         1094 non-null   int32         
 3   month       1094 non-null   int32         
 4   year        1094 non-null   int32         
 5   quarter     1094 non-null   int32         
 6   week        1094 non-null   UInt32        
 7   day_name    1094 non-null   object        
 8   month_name  1094 non-null   object        
 9   is_weekend  1094 non-null   bool          
 10  is_holiday  1094 non-null   bool          
dtypes: Int64(1), UInt32(1), bool(2), datetime64[ns](1), int32(4), object(2)
memory usage: 68.4+ KB


None

### Save normalised tables to csv (optional)

In [None]:
# Optional step, disabled by default: uncomment the call below to export the
# normalised tables (presumably one CSV per table under dataset/ — confirm).
#save_tables_to_csv(tables=oltp_tables, export_dir="dataset/")

## Load to PostgreSQL (*connection already made*)

In [12]:
# Run the load step on the already-open connection using the transform outputs.
# The return value gets its own name: assigning it to `result` would overwrite the
# extract-step dict that the parameter cell reads ("raw_df", "selected_source"),
# so that cell could no longer be re-run after this one.
load_result = run_dynamic_etl_pipeline(
    conn, dataset_key, raw_df, cfg, oltp_tables, pk_dict, fk_dict
)
print(load_result)

ERROR:root:[UPSERT] ❌ Error: relation "temp_customers" already exists

ERROR:root:[UPSERT] ❌ Error: missing data for column "city"
CONTEXT:  COPY temp_locations, line 1: "1	e0d6cb3c-c4b0-4cfe-8225-b65d094d2424	"psc 2224, box 2284"



🔀 Table creation order: ['customers', 'products', 'locations', 'orders', 'payments']
⏩ Table yanki_oltp.customers exists - skipping creation
⏩ Table yanki_oltp.products exists - skipping creation
⏩ Table yanki_oltp.locations exists - skipping creation
⏩ Table yanki_oltp.orders exists - skipping creation
⏩ Table yanki_oltp.payments exists - skipping creation
📌 Loading: customers — PK: ['customer_id'], FK: []
[UPSERT] ❌ Failed to upsert 'yanki_oltp.customers': relation "temp_customers" already exists

📌 Loading: products — PK: ['product_id'], FK: []
[UPSERT] ✅ Successfully upserted 990 rows to yanki_oltp.products
📌 Loading: locations — PK: ['location_id'], FK: [('customer_id', 'customers', 'customer_id')]
[UPSERT] ❌ Failed to upsert 'yanki_oltp.locations': missing data for column "city"
CONTEXT:  COPY temp_locations, line 1: "1	e0d6cb3c-c4b0-4cfe-8225-b65d094d2424	"psc 2224, box 2284"

📌 Loading: orders — PK: ['order_id'], FK: [('customer_id', 'customers', 'customer_id'), ('product_id', 

In [None]:
# Create and execute schema + tables.
# Uses the transform outputs (oltp_tables, pk_dict, fk_dict): the names previously
# passed here (normalized_tables, primary_keys, foreign_keys) are not assigned
# anywhere in this notebook, so the call raised NameError on a fresh kernel.
# NOTE(review): the pipeline run earlier reports tables under "yanki_oltp";
# confirm that "yanki" is the schema intended for this manual path.
create_and_execute_schema_and_tables(
    conn=conn,
    schema="yanki",
    tables=oltp_tables,
    primary_keys=pk_dict,
    foreign_keys=fk_dict
)

In [None]:
# Load data to the database tables.
# Uses the transform outputs (oltp_tables, fk_dict): the names previously passed
# here (normalized_tables, foreign_keys) are not assigned anywhere in this
# notebook, so the call raised NameError on a fresh kernel.
load_db(
    conn=conn,
    tables=oltp_tables,
    foreign_keys=fk_dict,
    schema="yanki"
)

In [None]:
# Generate a SQL script with the inserts for the database tables (optional).
# Uses the transform outputs (oltp_tables, fk_dict): the names previously passed
# here (normalized_tables, foreign_keys) are not assigned anywhere in this
# notebook, so the call raised NameError on a fresh kernel.
export_sql_script(
    schema="yanki",
    tables=oltp_tables,
    foreign_keys=fk_dict,
    output_sql_path="dataset/yanki_insert.sql"
)

![ERD](schemaERD.pgerd.png)

In [None]:
# Zulo Bank OLTP → OLAP flow (schemas "zulo_oltp" / "zulo_olap", config key "zulo_bank").
#
# NOTE(review): this cell could not have run as written (it did not parse). It still
# relies on names that are not assigned anywhere in this notebook: dim_date,
# olap_structure, surrogate_keys, fact_transactions_df, fact_loans_df, fact_tables.
# Define them (or confirm the wildcard import supplies them) before executing.
# NOTE(review): oltp_tables / pk_dict / fk_dict come from the transform cell, so this
# presumably only makes sense when the selected source is zulo_bank — confirm.
from config import CONFIG

# Own name on purpose: reassigning `cfg` here would overwrite the config selected
# in the extract step, which the load cell depends on.
zulo_cfg = CONFIG["zulo_bank"]

# 1. Create the OLTP schema and tables
create_and_execute_schema_and_tables(
    conn=conn,
    schema="zulo_oltp",
    tables=oltp_tables,
    primary_keys=pk_dict,
    foreign_keys=fk_dict,
    surrogate_keys={}  # Not needed here for OLTP
)


# 2. Map date ids onto the OLTP tables
date_map = zulo_cfg["oltp"]["date_mapping"]

oltp_with_date_ids = apply_configured_date_mapping(
    tables=oltp_tables,
    date_dim=dim_date,         # Must include 'full_date' and 'date_id' or 'date_sk'
    date_mapping=date_map,
    date_key="date_sk"         # Match your dim_date schema
)


# 3. Create the OLAP schema and tables
create_and_execute_schema_and_tables(
    conn=conn,
    schema="zulo_olap",
    tables=olap_structure,     # built from config
    primary_keys={},           # OLAP doesn't use natural PKs
    foreign_keys={},           # You can add FK logic later
    surrogate_keys=surrogate_keys
)


# 4. Load all OLTP tables (with FK-safe order)
# fk_dict is the foreign-key mapping used for table creation above; the bare name
# `foreign_keys` used here before is never assigned in this notebook.
sorted_tables = topological_sort_tables(oltp_tables, fk_dict)

for table_name in sorted_tables:
    df = oltp_tables[table_name]
    copy_dataframe_to_table(
        conn=conn,
        df=df,
        table_name=table_name,
        schema="zulo_oltp"
    )

# 5. Load dim tables
olap_dims = zulo_cfg["olap"]["dimensions"]

dim_lookups = build_and_load_all_dimensions(
    conn=conn,
    oltp_tables=oltp_tables,       # Your normalized OLTP data
    dimension_defs=olap_dims,
    schema="zulo_olap"
)


# 6. Load fact tables
fact_defs = zulo_cfg["olap"]["facts"]

build_and_load_all_facts(
    conn=conn,
    oltp_tables=oltp_tables,
    fact_defs=fact_defs,
    dim_lookups=dim_lookups,
    schema="zulo_olap"
)


# 7. Validation checks: not-null and FK checks per fact table
passed = qa_runner_with_pass(
    oltp_tables=oltp_tables,
    dim_lookups=dim_lookups,
    fact_tables={
        "fact_transactions": fact_transactions_df,
        "fact_loans": fact_loans_df
    },
    checks={
        "fact_transactions": {
            "check_not_null": ["account_sk", "transaction_date_id"],
            "fk_checks": {"account_sk": "dim_accounts"}
        },
        "fact_loans": {
            "check_not_null": ["customer_sk", "start_date_id"],
            "fk_checks": {"customer_sk": "dim_customers"}
        }
    }
)

if passed:
    print("🎉 All QA checks passed. Pipeline can continue.")
else:
    print("🚫 QA checks failed. Review before proceeding.")


# 8. Index generator
generate_indexes_on_sk_and_date_ids(conn, fact_tables, schema="zulo_olap")


# 9. Materialised view
create_materialized_fact_summary(
    conn=conn,
    fact_table="fact_transactions",
    dim_date="dim_date",
    date_fk="transaction_date_id",
    group_fields=["d.year", "d.month", "f.account_sk"],
    measures={"amount": "SUM", "transaction_sk": "COUNT"},
    schema="zulo_olap"
)


