# YANKI E-COMMERCE ETL PROJECT 

In [18]:
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent)) # Add parent directory
import importlib

import pandas as pd
import numpy as np

from etl_end_to_end import *


## EXTRACT

In [19]:
# Usage examples for read_data (reference only; intentionally left commented out):
# df = read_data("data.csv")  # Local file
# df = read_data("1DdmNsrdBRLfzBdgtvzvFHZ7ejFpLlpwW", "gdrive")  # Google Drive
# df = read_data("https://example.com/data.csv")  # URL (auto-detected)
# df.head()

In [20]:
# Google Drive file ID of the raw Yanki orders export.
source = "1lPmrM-4EJLfM14E_3pWF2aLl_Y7HsoHa"

# Read the raw data from Google Drive and preview the first rows.
yanki_df = read_data(source, "gdrive")
yanki_df.head()

Unnamed: 0,Order_ID,Customer_ID,Customer_Name,Product_ID,Product_Name,Brand,Category,Price,Quantity,Total_Price,Order_Date,Shipping_Address,City,State,Country,Postal_Code,Email,Phone_Number,Payment_Method,Transaction_Status
0,,,,64b689a8-bf03-47d2-a5cc-0723baeb1606,major,Haynes PLC,perfume,52.34,2,104.68,27/05/2020 14:20,"1648 Brown Bridge Apt. 846\nCaitlinland, MI 57992",Davidstad,Utah,China,50247,karen64@example.com,001-217-511-0290x8262,Credit Card,completed
1,,,,64b689a8-bf03-47d2-a5cc-0723baeb1606,major,Haynes PLC,perfume,52.34,2,104.68,27/05/2020 14:20,"1648 Brown Bridge Apt. 846\nCaitlinland, MI 57992",Davidstad,Utah,China,50247,karen64@example.com,001-217-511-0290x8262,Credit Card,completed
2,e32af09a-b6ab-497e-af3a-331e4d4ed6e7,e0d6cb3c-c4b0-4cfe-8225-b65d094d2424,Dominic Buchanan,2ef6e8fa-6a36-4515-b1c2-a0a700abf386,despite,"Lawson, Stone and Campos",perfume oil,250.57,2,501.14,06/03/2020 06:49,"PSC 2224, Box 2284\nAPO AP 65880",Jordanborough,Arkansas,Kuwait,27879,margaret97@example.com,259.603.6134,Debit Card,completed
3,86eb8859-14ab-4d4e-9267-c5826f4e0c8e,fa3ca35a-5540-404b-a7eb-9001cdcbd840,Daniel Allen,3ba38e01-f8e7-4af2-9246-87ef0961d4f5,sea,Washington Group,perfume oil,179.81,6,1078.86,31/07/2020 07:09,"83909 Johnson Mall\nTranberg, MS 99017",Lake Ginatown,Georgia,Saint Vincent and the Grenadines,39336,angela55@example.com,+1-869-659-4272x982,Credit Card,completed
4,7379b560-8897-4623-92a2-523ddcdc43a1,7ad4de53-e6d7-4cd3-99b8-13fb70fe7a34,Daniel Schmidt,a58c53bd-a34b-4541-b926-bec9eb84cac2,suddenly,Rodgers Ltd,perfume oil,600.55,7,4203.85,11/05/2021 04:33,"82101 Johnson Flat\nWest Ryan, MS 29075",Sarahville,Ohio,France,30845,wayne59@example.org,292.840.0975x724,PayPal,pending


In [21]:
# Inspect the raw frame: per-column non-null counts and dtypes, then the column labels.
yanki_df.info()
print(yanki_df.columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1020 entries, 0 to 1019
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Order_ID            1000 non-null   object 
 1   Customer_ID         1000 non-null   object 
 2   Customer_Name       1000 non-null   object 
 3   Product_ID          1020 non-null   object 
 4   Product_Name        1020 non-null   object 
 5   Brand               1020 non-null   object 
 6   Category            1020 non-null   object 
 7   Price               1020 non-null   float64
 8   Quantity            1020 non-null   int64  
 9   Total_Price         1020 non-null   float64
 10  Order_Date          1020 non-null   object 
 11  Shipping_Address    1020 non-null   object 
 12  City                1020 non-null   object 
 13  State               1019 non-null   object 
 14  Country             1020 non-null   object 
 15  Postal_Code         1020 non-null   int64  
 16  Email 

## TRANSFORM 

### **Cleaning Steps for Data Modeling & Normalization**

| Step                         | Action                                                                                       | Purpose                                                       |
|------------------------------|----------------------------------------------------------------------------------------------|----------------------------------------------------------------|
| 1. Handle Missing Values     | - Drop rows with missing `Order_ID`, `Customer_ID`                      | These are core entity identifiers; nulls would break relationships |
|                              | - Optionally handle or impute 1 missing value in `State`                                    | Tolerable since it's likely a lookup/dimension table           |
| 2. Ensure Correct Data Types | - Convert `Order_Date` to datetime format                                                   | Enables partitioning and relationship with a Date dimension   |
|                              | - Ensure `Postal_Code` is treated as string if needed (for leading zeros)                   | Maintains data integrity, especially in US/UK postal codes     |
| 3. Remove Duplicates         | - Check for and drop duplicates in `Order_ID`, `Product_ID`, `Customer_ID` combinations     | Avoids redundant relationships or many-to-many mapping noise   |
| 4. Standardize Categorical Fields | - Lowercase and strip spaces in fields like `Category`, `Brand`, `Payment_Method`       | Ensures consistency across joins or dimensions                 |
| 5. Verify Key Relationships  | - Ensure 1:many mappings exist: `Customer` → `Order`, `Order` → `Product`                  | For proper normalization and referential integrity             |
| 6. Normalize Derived Fields  | - Confirm `Total_Price = Price * Quantity` or drop `Total_Price` if calculated later        | Avoid storing redundant calculations in normalized schema      |
| 7. Split Compound Columns    | - Split `Customer_Name` into `First_Name` and `Last_Name`                       | Improves normalization (optional but good practice)            |
| 8. Validate Contact Info     | - Basic regex or null check on `Email`, `Phone_Number`                                     | Prevent invalid records from entering DB                       |


### **Data Cleaning**

In [22]:
# Work on a copy so the raw frame `yanki_df` stays available for comparison.
yanki_cleaned = yanki_df.copy()

In [23]:
# Run the whole cleaning pipeline from a fresh copy of the raw frame so this
# cell is idempotent. The previous version rebound `yanki_cleaned` from itself
# and ended by lowercasing the column names, so re-running the cell alone fed
# lowercased columns back into helpers that are called with 'Order_ID',
# 'Customer_Name', ... (presumably failing on the missing labels).
yanki_cleaned = (
    yanki_df.copy()
    # Rows without the core identifiers cannot be related to other tables.
    .pipe(handle_missing_critical, ['Order_ID', 'Customer_ID'])
    .pipe(ensure_correct_dtypes, ['Order_Date'])
    .pipe(standardize_categoricals)
    .pipe(split_compound_column, 'Customer_Name', ['First_Name', 'Last_Name'])
    .pipe(validate_contact_info, email_col='Email', phone_col='Phone_Number')
    # NOTE(review): de-duplication (step 3 of the cleaning plan) was disabled
    # in the original cell and is kept disabled here; confirm whether it
    # should be re-enabled.
    # .pipe(remove_duplicates_by_keys, ['Order_ID', 'Product_ID', 'Customer_ID'])
    .pipe(normalize_derived_fields, price_col='Price', qty_col='Quantity')
    .pipe(verify_key_relationships, ['Customer_ID', 'Order_ID', 'Product_ID'])
    # Lowercase the column names last: every step above refers to the
    # original mixed-case labels.
    .pipe(lowercase_columns)
)
yanki_cleaned.info()

[handle_missing_critical] Dropped 20 rows with nulls in ['Order_ID', 'Customer_ID']
[ensure_correct_dtypes] Converted 'Order_Date' to datetime using day-first format.
[standardize_categoricals] Standardized fields: ['Order_ID', 'Customer_ID', 'Customer_Name', 'Product_ID', 'Product_Name', 'Brand', 'Category', 'Shipping_Address', 'City', 'State', 'Country', 'Email', 'Phone_Number', 'Payment_Method', 'Transaction_Status']
[split_compound_column] Split 'Customer_Name' into ['First_Name', 'Last_Name']
[validate_contact_info] Cleaned email in column 'Email'
[validate_contact_info] Cleaned phone in column 'Phone_Number'
[normalize_derived_fields] Created 'computed_total' from Price * Quantity, rounded to 2dp.
[verify_key_relationships] Column 'Customer_ID' has 0 missing foreign key values.
[verify_key_relationships] Column 'Order_ID' has 0 missing foreign key values.
[verify_key_relationships] Column 'Product_ID' has 0 missing foreign key values.
[lowercase_columns] Column names converted to

### **Data Modelling**

#### **Data Model**
*The cleaned data is split into five normalized tables:* `customers`, `products`, `orders`, `locations`, `payments`

![Yanki database model (entity-relationship diagram)](db_model.png)

In [24]:
# Define the table specifications: table name -> columns taken from the cleaned frame.
table_definitions = {
    "customers": ['customer_id', 'first_name', 'last_name', 'email', 'phone_number'],
    "products": ['product_id', 'product_name', 'brand', 'category', 'price'],
    "orders": ['order_id', 'customer_id', 'product_id', 'quantity', 'computed_total', 'order_date'],
    "locations": ['customer_id', 'shipping_address', 'city', 'state', 'country', 'postal_code'],
    "payments": ['order_id', 'payment_method', 'transaction_status']
}
normalized_tables = split_normalized_tables(yanki_cleaned, table_definitions)

# Bind each table to an explicit name instead of `globals().update(...)`, so
# readers and linters can see where `orders`, `locations`, ... come from and a
# missing table fails here with a clear KeyError.
customers = normalized_tables["customers"]
products = normalized_tables["products"]
orders = normalized_tables["orders"]
locations = normalized_tables["locations"]
payments = normalized_tables["payments"]


[split_normalized_tables] Column names normalized to lowercase.
[split_normalized_tables] Created table 'customers' with 990 rows.
[split_normalized_tables] Created table 'products' with 990 rows.
[split_normalized_tables] Created table 'orders' with 990 rows.
[split_normalized_tables] Created table 'locations' with 990 rows.
[split_normalized_tables] Created table 'payments' with 990 rows.


In [25]:
# Add a surrogate key (`location_id`, 1..N) as the first column of the locations table.
# Rebuild from the stored table and drop any existing `location_id` first, so
# re-running this cell does not raise "cannot insert location_id, already exists".
locations = normalized_tables['locations'].drop(columns='location_id', errors='ignore')
locations.insert(0, 'location_id', range(1, len(locations) + 1))
normalized_tables['locations'] = locations

In [26]:
# Add location_id to the orders table by matching on customer_id.
# - Any existing `location_id` is dropped first, so re-running the cell does
#   not produce `location_id_x` / `location_id_y` columns.
# - validate='many_to_one' raises a MergeError if a customer_id appears more
#   than once in `locations`, instead of silently duplicating order rows.
orders = (
    normalized_tables['orders']
    .drop(columns='location_id', errors='ignore')
    .merge(
        locations[['customer_id', 'location_id']],
        on='customer_id',
        how='left',
        validate='many_to_one',
    )
)
normalized_tables['orders'] = orders  # Update the dictionary again

### Save to CSV (optional)

In [27]:
#save_tables_to_csv(tables=normalized_tables, export_dir="dataset/")

## LOAD

In [28]:
# Open a connection to the PostgreSQL database (the helper logs success or failure).
# NOTE(review): no cell visible here closes `conn`; confirm it is released after loading.
conn = get_db_connection()

[get_db_connection] Connection to PostgreSQL successful.


In [29]:
# Infer primary keys (PK) and foreign keys (FK) from the normalized tables.
# Both results are passed to the schema-creation and load steps below.
primary_keys, foreign_keys = infer_keys_from_normalized_tables(normalized_tables)



[infer_keys_from_normalized_tables] 🔍 Starting key inference...

✅ PRIMARY KEY for 'customers': customer_id
✅ PRIMARY KEY for 'products': product_id
✅ PRIMARY KEY for 'orders': order_id
✅ PRIMARY KEY for 'locations': location_id
❌ No PK found in 'payments'
ℹ️ 'customers' has a PK but no FKs.
ℹ️ 'products' has a PK but no FKs.
🔗 FOREIGN KEY in 'orders': customer_id → customers.customer_id
🔗 FOREIGN KEY in 'orders': product_id → products.product_id
🔗 FOREIGN KEY in 'orders': location_id → locations.location_id
🔗 FOREIGN KEY in 'locations': customer_id → customers.customer_id
🔗 FOREIGN KEY in 'payments': order_id → orders.order_id

✅ Inference Summary: 4 PKs and 5 FKs across 3 tables.



In [30]:
# Dependency-aware ordering of the tables, driven by their foreign key references.
creation_order = topological_sort_tables(tables=normalized_tables, foreign_keys=foreign_keys)
print(creation_order)


['customers', 'products', 'locations', 'orders', 'payments']


In [35]:
# Create the `yanki` schema and one table per normalized DataFrame, using the
# inferred primary and foreign keys.
# WARNING: per the helper's logged output, the schema and each table are
# dropped (if they exist) before being re-created, so this cell is destructive
# for an existing `yanki` schema.
create_and_execute_schema_and_tables(
    conn=conn,
    schema="yanki",
    tables=normalized_tables,
    primary_keys=primary_keys,
    foreign_keys=foreign_keys
)

[create_and_execute_schema_and_tables] ❌ Dropping schema 'yanki' if it exists...
[create_and_execute_schema_and_tables] ✅ Creating schema 'yanki'...
[create_and_execute_schema_and_tables] ❌ Dropping table 'customers' if it exists...
[create_and_execute_schema_and_tables] ✅ Creating table 'customers'...
[create_and_execute_schema_and_tables] ❌ Dropping table 'products' if it exists...
[create_and_execute_schema_and_tables] ✅ Creating table 'products'...
[create_and_execute_schema_and_tables] ❌ Dropping table 'locations' if it exists...
[create_and_execute_schema_and_tables] ✅ Creating table 'locations'...
[create_and_execute_schema_and_tables] ❌ Dropping table 'orders' if it exists...
[create_and_execute_schema_and_tables] ✅ Creating table 'orders'...
[create_and_execute_schema_and_tables] ❌ Dropping table 'payments' if it exists...
[create_and_execute_schema_and_tables] ✅ Creating table 'payments'...

[create_and_execute_schema_and_tables] ✅ Execution finished for schema 'yanki'.
📊 Sum

In [36]:
# Load every normalized DataFrame into its table in the `yanki` schema.
# `foreign_keys` is passed along with the tables (presumably so parent tables
# are loaded before the tables that reference them; verify in load_db).
load_db(
    conn=conn,
    tables=normalized_tables,
    foreign_keys=foreign_keys,
    schema="yanki"
)

🚚 Loading 'yanki.customers'...
✅ 990 rows loaded into 'yanki.customers'.
🚚 Loading 'yanki.products'...
✅ 990 rows loaded into 'yanki.products'.
🚚 Loading 'yanki.locations'...
✅ 990 rows loaded into 'yanki.locations'.
🚚 Loading 'yanki.orders'...
✅ 990 rows loaded into 'yanki.orders'.
🚚 Loading 'yanki.payments'...
✅ 990 rows loaded into 'yanki.payments'.

📊 Load complete for all tables.


In [38]:
# Optional: also write the inserts for the database tables out as a SQL script.
export_sql_script(
    schema="yanki",
    tables=normalized_tables,
    foreign_keys=foreign_keys,
    output_sql_path="dataset/yanki_insert.sql",
)

📝 SQL script saved to dataset/yanki_insert.sql
