# ZULO BANK (OLTP & OLAP) ETL PROJECT 

In [1]:
# Import dependencies
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent)) # Add parent directory
import pandas as pd
from db_utils import *
from config import ETL_CONFIG
from IPython.display import display, HTML
import ipywidgets as widgets

print("✅ Environment ready!")


✅ Environment ready!


In [2]:
# Open the database connection (settings are looked up via the "ZULO_DB_" prefix)
conn = get_db_connection(env_prefix="ZULO_DB_")
if conn:
    # Report the DSN of the live connection
    print(f"✅ Connected to {conn.dsn}")
else:
    # A falsy return value means no usable connection; stop the notebook here
    raise Exception("❌ Database connection failed")


[get_db_connection] ✅ Connected to database 'zulobank_db' using prefix 'ZULO_DB_'
✅ Connected to user=postgres password=xxx dbname=zulobank_db host=localhost port=5432


In [3]:
# Interactive source selector
# `selected_source` is a notebook-level variable that later cells read to pick
# the entry of ETL_CONFIG['data_sources'] to process.

def on_source_change(change):
    """Store the dropdown's newly selected value in the notebook-level `selected_source`.

    Parameters
    ----------
    change : the change object delivered by `observe`; only `change.new` is read.
    """
    global selected_source
    selected_source = change.new
    print(f"Selected source: {selected_source}")

# Create dropdown
source_dropdown = widgets.Dropdown(
    options=['all'] + list(ETL_CONFIG['data_sources'].keys()),
    description='Data Source:',
    disabled=False
)
source_dropdown.observe(on_source_change, names='value')

# Seed the variable from the widget's current value so the notebook state always
# matches what the dropdown displays, even before the user changes it.
# (Previously it started as None while the widget showed 'all'.)
selected_source = source_dropdown.value

display(source_dropdown)

Dropdown(description='Data Source:', options=('all', 'zulo_gdrive', 'yanki', 'external_pricing'), value='all')

## Extract (Data Loading)

In [2]:
# Usage examples for read_data (reference only - intentionally left commented out):
# df = read_data("data.csv")  # Local file
# df = read_data("1DdmNsrdBRLfzBdgtvzvFHZ7ejFpLlpwW", "gdrive")  # Google Drive
# df = read_data("https://example.com/data.csv")  # URL (auto-detected)
# df.head()

In [6]:
# Get configuration
print(f"Current selection: {selected_source}")

data_sources = ETL_CONFIG['data_sources']
if selected_source not in data_sources:
    # None (dropdown never touched) and 'all' are not keys of data_sources, so
    # fail with an actionable message instead of an opaque KeyError.
    raise ValueError(
        f"Select a single data source before running this cell; got {selected_source!r}. "
        f"Available sources: {list(data_sources)}"
    )

# Read the selected source; 'type' is optional in the config and falls back to 'auto'
config = data_sources[selected_source]
raw_df = read_data(config['path'], config.get('type', 'auto'))

print("Raw data shape:", raw_df.shape)
display(raw_df.head(3))

Current selection: zulo_gdrive
Raw data shape: (1554, 18)


Unnamed: 0,TransactionID,TransactionType,Amount,TransactionDate,CustomerID,FullName,Email,Phone,AccountID,AccountType,Balance,OpeningDate,LoanID,LoanAmount,LoanType,StartDate,EndDate,InterestRate
0,1,withdrawal,102.15,2023-04-26,85,Carol Miller,yfisher@example.org,6088279027,88,Savings,5652.16,2019-08-12,,,,,,
1,2,withdrawal,358.8,2020-06-13,91,Geoffrey Banks,gonzalesgeorge@example.net,001-546-857-6518x5359,26,Credit,2881.24,2019-05-06,44.0,32428.9,Mortgage,2021-06-24,2050-01-08 04:59:17.907588,2.12
2,2,withdrawal,358.8,2020-06-13,91,Geoffrey Banks,gonzalesgeorge@example.net,001-546-857-6518x5359,26,Credit,2881.24,2019-05-06,48.0,31406.77,Personal,2021-02-27,2038-10-12 04:59:17.907821,4.63


In [5]:
# Schema overview of the raw extract: dtypes and non-null counts, then the column index
raw_df.info()
print(
    raw_df.columns
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1554 entries, 0 to 1553
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionID    1554 non-null   int64  
 1   TransactionType  1554 non-null   object 
 2   Amount           1554 non-null   float64
 3   TransactionDate  1554 non-null   object 
 4   CustomerID       1554 non-null   int64  
 5   FullName         1554 non-null   object 
 6   Email            1554 non-null   object 
 7   Phone            1554 non-null   object 
 8   AccountID        1554 non-null   int64  
 9   AccountType      1554 non-null   object 
 10  Balance          1554 non-null   float64
 11  OpeningDate      1554 non-null   object 
 12  LoanID           1278 non-null   float64
 13  LoanAmount       1278 non-null   float64
 14  LoanType         1278 non-null   object 
 15  StartDate        1278 non-null   object 
 16  EndDate          1278 non-null   object 
 17  InterestRate  

## Transform (Data Cleaning)

### **Cleaning Steps for Data Modeling & Normalization**

*Clean the DataFrame, add the calculated field `interest`, generate a `date_dim` table for OLAP, and map date IDs for all date columns in the DataFrame.*

| Step                         | Action                                                                                   | Columns Affected                                                                 |
|------------------------------|------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------|
| 1. Handle Missing Values     | Drop or isolate rows with missing loan data                                              | `LoanID`, `LoanAmount`, `LoanType`, `StartDate`, `EndDate`, `InterestRate`      |
|                              | Drop rows with missing critical IDs                                                      | `TransactionID`, `CustomerID`, `AccountID`                                      |
| 2. Convert Data Types        | Convert to datetime format                                                               | `TransactionDate`, `OpeningDate`, `StartDate`, `EndDate`                        |
|                              | Ensure numeric/ID fields are proper type                                                 | `TransactionID`, `CustomerID`, `AccountID`, `LoanID`                            |
|                              | Round monetary values to 2 decimal places                                                | `Amount`, `Balance`, `LoanAmount`, `InterestRate`                               |
| 3. Standardize Text Fields   | Strip, lowercase, validate format                                                        | `Email`, `Phone`, `TransactionType`, `AccountType`, `LoanType`                  |
| 4. Deduplicate Records       | Drop exact or near-duplicate entries                                                     | All entity-specific tables based on unique ID or composite keys                 |
| 5. Normalize Derived Fields  | Recompute totals (if needed)                                                             | Not directly applicable here, but common in sales-related datasets              |
| 6. Enforce Data Integrity    | Check FK relationships and referential integrity                                         | `CustomerID`, `AccountID`, `LoanID` across all related tables                   |
| 7. Prepare for Normalization | Split main table into dimension/fact tables                                              | `customer_df`, `account_df`, `transaction_df`, `loan_df`                        |



In [None]:
print(f"🔄 Transforming {selected_source}...")
# Work on a copy so raw_df stays untouched and this cell can be re-run on its own.
# A single name (zulo_cleaned) is used throughout: the later date-dimension and
# normalization cells read zulo_cleaned and expect snake_case column names.
zulo_cleaned = raw_df.copy()

zulo_cleaned = handle_missing_critical(zulo_cleaned, config['critical_columns'])
print("✅ Critical columns validated")

monetary_cols = ['Amount', 'Balance', 'LoanAmount', 'InterestRate']
zulo_cleaned[monetary_cols] = zulo_cleaned[monetary_cols].round(2)
print("✅ Transaction Variables rounded to 2 dp")

zulo_cleaned = validate_contact_info(zulo_cleaned, email_col='Email', phone_col='Phone')
print("✅ Contact Info validated")

zulo_cleaned = compute_interest(zulo_cleaned, principal_col='LoanAmount', rate_col='InterestRate', new_col='Interest')
print("✅ Interest calculated based on interest rate and loan amount")

zulo_cleaned = standardize_categoricals(zulo_cleaned)
print("✅ Categorical values validated to ensure consistency in values")

# Date conversion is optional: only run it when the source config lists date columns
if config.get('date_columns'):
    zulo_cleaned = ensure_correct_dtypes(zulo_cleaned, config['date_columns'])
    print("✅ Date columns converted")

# Apply config-driven column splits (each entry maps a compound column to its new columns)
if config.get('split_columns'):
    for col, new_cols in config['split_columns'].items():
        zulo_cleaned = split_compound_column(zulo_cleaned, col, new_cols)
        print(f"✅ Split {col} into {new_cols}")

# Rename last, so every step above can refer to the source's original column names
zulo_cleaned = standardize_columns(zulo_cleaned)
print("✅ Column names standardized to snake_case format")

display(zulo_cleaned.head(2))
print("Transformed shape:", zulo_cleaned.shape)
zulo_cleaned.info()

In [6]:
#zulo_cleaned = handle_missing_critical(zulo_cleaned, ['TransactionID', 'CustomerID', 'AccountID'])
#zulo_cleaned = drop_fully_null_columns(zulo_cleaned, ['LoanID', 'LoanAmount', 'LoanType', 'StartDate', 'EndDate', 'InterestRate'])
#zulo_cleaned = ensure_correct_dtypes(zulo_cleaned, datetime_cols=['TransactionDate', 'OpeningDate', 'StartDate', 'EndDate'])






[handle_missing_critical] Dropped 0 rows with nulls in ['TransactionID', 'CustomerID', 'AccountID']
[ensure_correct_dtypes] Converted 'TransactionDate' using day-first format.
[ensure_correct_dtypes] Converted 'OpeningDate' using day-first format.
[ensure_correct_dtypes] Converted 'StartDate' using day-first format.
[ensure_correct_dtypes] Converted 'EndDate' using day-first format.
[validate_contact_info] Cleaned email in column 'Email'
[validate_contact_info] Cleaned phone in column 'Phone'
[standardize_categoricals] Standardized fields: ['TransactionType', 'FullName', 'Email', 'Phone', 'AccountType', 'LoanType', 'FirstName', 'LastName']
[split_compound_column] Split 'FullName' into ['FirstName', 'LastName']
[standardize_columns_in_df] Renamed columns to snake_case.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1554 entries, 0 to 1553
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   t

In [8]:
# Build the date dimension from every datetime64[ns] column of the cleaned frame
detected_date_cols = list(
    zulo_cleaned.select_dtypes(include=['datetime64[ns]']).columns
)
date_dim = generate_date_dim(
    zulo_cleaned,
    date_columns=detected_date_cols,
)

[generate_date_dim] ✅ Created date dimension with 1094 unique dates (including holidays).


In [9]:
# Apply date_id mapping
# Passes the cleaned frame and the date dimension built in the previous cell to
# auto_map_date_ids and rebinds zulo_cleaned to the result. The recorded output
# shows one `<date_column>_id` column being mapped per date column.
# NOTE(review): the input name is overwritten - confirm the helper is safe to
# re-run on its own output before executing this cell twice.
zulo_cleaned = auto_map_date_ids(zulo_cleaned, date_dim)

[auto_map_date_ids] ✅ Mapped 'transaction_date' to 'transaction_date_id' and cast to Int64.
[auto_map_date_ids] ✅ Mapped 'opening_date' to 'opening_date_id' and cast to Int64.
[auto_map_date_ids] ✅ Mapped 'start_date' to 'start_date_id' and cast to Int64.
[auto_map_date_ids] ✅ Mapped 'end_date' to 'end_date_id' and cast to Int64.


### **Data Modelling**

#### **Data Model**
*Splitting the data into normalized tables like:*  `customers`, `accounts`, `loans`, `transactions`

![oltpmodel](oltp_db_model.png)

![olapmodel](olap_db_model.png)

### Normalise df to 3NF & Create Dimension tables

- Dimension tables store descriptive attributes, while fact tables hold metrics and foreign keys to dimension tables.

In [11]:
# check unique counts to ensure accurate splits
# Displays the number of distinct values per column; the counts for the ID columns
# can be compared with the row counts reported when the tables are split below.
zulo_cleaned.nunique()

transaction_id         1000
transaction_type          2
amount                  994
transaction_date        779
customer_id              87
full_name                87
email                    87
phone                    87
account_id              198
account_type              3
balance                 198
opening_date            189
loan_id                 129
loan_amount             129
loan_type                 4
start_date              123
end_date                129
interest_rate           117
first_name               69
last_name                77
transaction_date_id     779
opening_date_id         189
start_date_id           123
end_date_id             129
dtype: int64

In [13]:
# Table specifications: target table name -> ordered list of columns taken from zulo_cleaned
table_definitions = dict(
    # --- OLTP Normalized Tables ---
    customers=['customer_id', 'first_name', 'last_name', 'email', 'phone'],
    accounts=['account_id', 'customer_id', 'account_type', 'balance', 'opening_date', 'opening_date_id'],
    loans=[
        'loan_id', 'customer_id', 'loan_amount', 'loan_type',
        'start_date', 'start_date_id',
        'end_date', 'end_date_id',
        'interest_rate',
    ],
    transactions=[
        'transaction_id', 'transaction_type', 'amount',
        'transaction_date', 'transaction_date_id', 'account_id',
    ],

    # --- OLAP Dimension Tables ---
    dim_customer=['customer_id', 'first_name', 'last_name', 'email', 'phone'],
    dim_account=['account_id', 'account_type', 'balance'],
    dim_transaction=['transaction_id', 'transaction_type'],
    dim_loan=['loan_id', 'loan_type', 'interest_rate'],

    # --- OLAP Fact Tables ---
    transaction_fact=['transaction_id', 'account_id', 'opening_date_id', 'transaction_date_id', 'amount'],
    loan_fact=['loan_id', 'customer_id', 'start_date_id', 'end_date_id', 'loan_amount', 'interest'],
)

# Split the cleaned frame into one DataFrame per specification
normalized_tables = split_normalized_tables(zulo_cleaned, table_definitions)

# Also expose each table as a standalone notebook variable named after its key
globals().update(normalized_tables)


[split_normalized_tables] Column names normalized to lowercase.
[split_normalized_tables]  Created table 'customers' with 87 non-null rows.
[split_normalized_tables]  Created table 'accounts' with 198 non-null rows.
[split_normalized_tables]  Created table 'loans' with 129 non-null rows.
[split_normalized_tables]  Created table 'transactions' with 1000 non-null rows.
[split_normalized_tables]  Created table 'dim_customer' with 87 non-null rows.
[split_normalized_tables]  Created table 'dim_account' with 198 non-null rows.
[split_normalized_tables]  Created table 'dim_transaction' with 1000 non-null rows.
[split_normalized_tables]  Created table 'dim_loan' with 129 non-null rows.
[split_normalized_tables]  Created table 'transaction_fact' with 1000 non-null rows.
[split_normalized_tables]  Created table 'loan_fact' with 129 non-null rows.


### Save to csv (optional)

In [62]:
# save_tables_to_csv(tables=normalized_tables, export_dir="dataset/")

## LOAD

In [15]:
# Ensure there is a live connection to the zulobank database before loading.
# Reuse the connection opened at the top of the notebook when it is still open;
# opening a second one here would leave the first unclosed.
#conn = get_db_connection("DB_")
if globals().get("conn") is None or getattr(conn, "closed", True):
    # No connection yet, or it has been closed (a connection without a
    # `closed` attribute is treated as closed): open a fresh one.
    conn = get_db_connection(env_prefix="ZULO_DB_")

if not conn:
    # Fail here rather than inside the schema/load cells further down
    raise ConnectionError("❌ Database connection failed")


[get_db_connection] ✅ Connected to database 'zulobank_db' using prefix 'ZULO_DB_'


In [18]:
# Infer primary, foreign and surrogate keys for the normalized tables,
# then print the human-readable report returned alongside them
(
    primary_keys,
    foreign_keys,
    surrogate_keys,
    summary,
) = infer_keys_extended(normalized_tables)
print(summary)


🔍 **Extended Key Inference Report**

📌 **Primary Keys**
   ✔️ customers            → customer_id
   ✔️ accounts             → account_id
   ✔️ loans                → loan_id
   ✔️ transactions         → transaction_id
   ✔️ dim_customer         → customer_id
   ✔️ dim_account          → account_id
   ✔️ dim_transaction      → transaction_id
   ✔️ dim_loan             → loan_id
   ❌ transaction_fact     → No primary key detected
   ❌ loan_fact            → No primary key detected

📎 **Foreign Keys**
   🔗 customers            → customer_id → dim_customer.customer_id
   🔗 accounts             → account_id → dim_account.account_id
   🔗 accounts             → customer_id → dim_customer.customer_id
   🔗 loans                → loan_id → dim_loan.loan_id
   🔗 loans                → customer_id → dim_customer.customer_id
   🔗 transactions         → transaction_id → dim_transaction.transaction_id
   🔗 transactions         → account_id → dim_account.account_id
   ℹ️ dim_customer         → Has PK

In [19]:
# Dependency-aware ordering of the tables, driven by the inferred foreign-key references
creation_order = topological_sort_tables(tables=normalized_tables, foreign_keys=foreign_keys)
print(creation_order)


['dim_customer', 'dim_account', 'dim_transaction', 'dim_loan', 'customers', 'accounts', 'transactions', 'transaction_fact', 'loan_fact', 'loans']


In [20]:
# Create the "zulo" schema and its tables from the normalized frames and inferred keys.
# NOTE: the recorded output shows the existing schema/tables being dropped first.
create_and_execute_schema_and_tables(
    conn=conn, schema="zulo", tables=normalized_tables,
    primary_keys=primary_keys, foreign_keys=foreign_keys, surrogate_keys=surrogate_keys,
)


[create_and_execute_schema_and_tables] 🚫 Dropping schema 'zulo' if it exists...
[create_and_execute_schema_and_tables] ✅ Creating schema 'zulo'...
[create_and_execute_schema_and_tables] 🚫 Dropping table 'dim_customer' if it exists...
[create_and_execute_schema_and_tables] 🛠️ Creating table 'dim_customer'...
[create_and_execute_schema_and_tables] ✅ Successfully created 'dim_customer'.
[create_and_execute_schema_and_tables] 🚫 Dropping table 'dim_account' if it exists...
[create_and_execute_schema_and_tables] 🛠️ Creating table 'dim_account'...
[create_and_execute_schema_and_tables] ✅ Successfully created 'dim_account'.
[create_and_execute_schema_and_tables] 🚫 Dropping table 'dim_transaction' if it exists...
[create_and_execute_schema_and_tables] 🛠️ Creating table 'dim_transaction'...
[create_and_execute_schema_and_tables] ✅ Successfully created 'dim_transaction'.
[create_and_execute_schema_and_tables] 🚫 Dropping table 'dim_loan' if it exists...
[create_and_execute_schema_and_tables] 🛠️ Cr

In [21]:
# Insert every normalized table into the "zulo" schema over the open connection
load_db(conn=conn, tables=normalized_tables, foreign_keys=foreign_keys, schema="zulo")

🚚 Loading 'zulo.dim_customer'...
✅ 87 rows loaded into 'zulo.dim_customer'.
🚚 Loading 'zulo.dim_account'...
✅ 198 rows loaded into 'zulo.dim_account'.
🚚 Loading 'zulo.dim_transaction'...
✅ 1000 rows loaded into 'zulo.dim_transaction'.
🚚 Loading 'zulo.dim_loan'...
✅ 129 rows loaded into 'zulo.dim_loan'.
🚚 Loading 'zulo.customers'...
✅ 87 rows loaded into 'zulo.customers'.
🚚 Loading 'zulo.accounts'...
✅ 198 rows loaded into 'zulo.accounts'.
🚚 Loading 'zulo.transactions'...
✅ 1000 rows loaded into 'zulo.transactions'.
🚚 Loading 'zulo.transaction_fact'...
✅ 1000 rows loaded into 'zulo.transaction_fact'.
🚚 Loading 'zulo.loan_fact'...
✅ 129 rows loaded into 'zulo.loan_fact'.
🚚 Loading 'zulo.loans'...
✅ 129 rows loaded into 'zulo.loans'.

📊 Load complete for all tables.


In [22]:
# Optional: write the equivalent INSERT statements to a .sql file
export_sql_script(
    schema="zulo",
    tables=normalized_tables,
    foreign_keys=foreign_keys,
    output_sql_path="dataset/zulo_insert.sql",
)

📝 SQL script saved to dataset/zulo_insert.sql


## SCHEMA ERD 

![ERD](schemaERD.pgerd.png)