# BoostCredit ETL Pipeline Demo

This notebook demonstrates the ETL pipeline for processing CSV and JSON data.

## Pipeline Flow:
1. **Extract** → Read data from CSV/JSON files
2. **Transform** → Clean, convert types, mask PII
3. **Store** → Save to object store (Parquet)
4. **Load** → Load from object store to PostgreSQL database

In [None]:
import os
import sys
import importlib
import pandas as pd
from pathlib import Path

# Set environment variables.
# These are throwaway demo values for a local PostgreSQL instance; they are
# assigned before the project modules are (re)imported so that any setting a
# module reads at import time is already in place.
os.environ['STORE_KEY'] = 'demo_data'
os.environ['DB_TYPE'] = 'postgresql'
os.environ['DB_HOST'] = 'localhost'
os.environ['DB_PORT'] = '5432'
os.environ['DB_USER'] = 'etl_user'
os.environ['DB_PASSWORD'] = 'etl_password'
os.environ['DB_NAME'] = 'etl_database'
os.environ['DATA_PATH'] = './data'
os.environ['OBJECT_STORE_PATH'] = './output'

print("✓ Environment variables set")

# Reload modules to ensure we have the latest code (important for notebooks).
# A plain `import` is a no-op once a module is cached in the running kernel,
# so importlib.reload() is what actually re-executes edited source files.
import src.loaders
import src.pipeline
import src.extractors
import src.transformers

# src.pipeline goes last: presumably it imports the other three modules, so
# they should be fresh before it is re-executed (TODO confirm against src/).
for _module in (src.extractors, src.transformers, src.loaders, src.pipeline):
    importlib.reload(_module)

# Bind the class names only after reloading so they point at the fresh classes
# rather than at objects from the previously loaded module versions.
from src.pipeline import Pipeline
from src.extractors import CSVExtractor, JSONExtractor
from src.transformers import CSVTransformer, JSONTransformer

print("✓ Modules reloaded and imports successful")

## Step 1: Initialize Pipeline

The pipeline handles the complete ETL process automatically.

In [None]:
# Build the Pipeline object with its default constructor arguments; the later
# "Run Complete Pipeline" cells call process_csv / process_json on it.
pipeline = Pipeline()
print("✓ Pipeline initialized")

## Step 2: Test Individual Components

Let's test each component separately to understand what they do.

In [None]:
# Test CSV Extractor
# Reads data/test.csv through CSVExtractor and prints its size, column names
# and first row. `sample_data` and `csv_file` are reused by later cells.
csv_extractor = CSVExtractor()
csv_file = Path('data/test.csv')
if csv_file.exists():
    sample_data = csv_extractor.extract(str(csv_file))
    print(f"✓ CSV Extracted: {len(sample_data)} rows")
    print(f"  Columns: {list(sample_data.columns)}")
    print("\n  First row sample:")
    print(sample_data.head(1))
else:
    # Always (re)define sample_data: without this the preview cell below fails
    # with NameError on a fresh kernel, or silently shows a stale frame left
    # over from an earlier run.
    sample_data = pd.DataFrame()
    print("⚠ CSV file not found")

In [None]:
# Rich preview of the first rows of the extracted CSV data.
sample_data.head()

In [None]:
# Test CSV Transformer
# Runs CSVTransformer on the first five extracted rows and prints a
# two-row sample of selected columns. `transformed` is reused by the next cell.
csv_transformer = CSVTransformer()
if csv_file.exists():
    transformed = csv_transformer.transform(sample_data.head(5))
    print("✓ CSV Transformed")
    print("  Data types converted")
    print("  PII masked (name, address)")
    print("\n  Transformed sample:")
    print(transformed[['id', 'name', 'created_at', 'is_claimed', 'paid_amount']].head(2))
else:
    # Keep `transformed` defined so the preview cell below does not raise
    # NameError, and report the skip instead of finishing silently.
    transformed = pd.DataFrame()
    print("⚠ CSV file not found - skipping CSV transformation")

In [None]:
# Rich preview of the first rows of the transformed CSV sample.
transformed.head()

In [None]:
# Test JSON Extractor
# Reads data/test.json through JSONExtractor and prints the record count plus
# a peek at the first record. `json_data` and `json_file` are reused later.
json_extractor = JSONExtractor()
json_file = Path('data/test.json')
if json_file.exists():
    json_data = json_extractor.extract(str(json_file))
    print(f"✓ JSON Extracted: {len(json_data)} records")
    # Only inspect the first record when there is one; indexing [0] on an
    # empty result would raise IndexError.
    if len(json_data) > 0:
        print(f"\n  First record keys: {list(json_data[0].keys())}")
        print(f"  Sample user_id: {json_data[0].get('user_id', 'N/A')}")
else:
    # Keep `json_data` defined (and not stale) for the cells that follow.
    json_data = []
    print("⚠ JSON file not found")

In [None]:
# Test JSON Transformer
# Runs JSONTransformer on the first two extracted records and prints the row
# count of each resulting table. `json_transformed` is shown by the next cell.
json_transformer = JSONTransformer()
if json_file.exists():
    json_transformed = json_transformer.transform(json_data[:2])  # Transform 2 records
    print("✓ JSON Transformed into 3 tables:")
    print(f"  - users: {len(json_transformed['users'])} rows")
    print(f"  - telephone_numbers: {len(json_transformed['telephone_numbers'])} rows")
    print(f"  - jobs_history: {len(json_transformed['jobs_history'])} rows")
    print("\n  Users sample:")
    print(json_transformed['users'][['user_id', 'name', 'username']].head(2))
else:
    # Keep `json_transformed` defined so the display cell below does not raise
    # NameError, and report the skip instead of finishing silently.
    json_transformed = {}
    print("⚠ JSON file not found - skipping JSON transformation")

In [None]:
# Display the full transform result; it is indexed above by the keys
# 'users', 'telephone_numbers' and 'jobs_history'.
json_transformed

## Step 3: Run Complete Pipeline

Now let's run the full pipeline for CSV processing.

In [None]:
# Run the end-to-end CSV flow, or report that it was skipped when the
# input file checked earlier is absent.
if not csv_file.exists():
    print("⚠ CSV file not found - skipping CSV processing")
else:
    # Give this run its own STORE_KEY before invoking the pipeline
    # (presumably read by the pipeline's store step — TODO confirm).
    os.environ['STORE_KEY'] = 'csv_demo'
    pipeline.process_csv('test.csv')
    print(
        "✓ CSV processing completed!",
        "  → Data extracted, transformed, saved to object store, and loaded to database",
        sep="\n",
    )

## Step 4: Process JSON File

Process the JSON data, which creates multiple linked tables.

In [None]:
# Run the end-to-end JSON flow, or report that it was skipped when the
# input file checked earlier is absent.
if not json_file.exists():
    print("⚠ JSON file not found - skipping JSON processing")
else:
    # Give this run its own STORE_KEY before invoking the pipeline
    # (presumably read by the pipeline's store step — TODO confirm).
    os.environ['STORE_KEY'] = 'json_demo'
    pipeline.process_json('test.json')
    print(
        "✓ JSON processing completed!",
        "  → Created 3 tables: users, telephone_numbers, jobs_history",
        "  → All PII masked (emails, phones, national IDs, passwords)",
        sep="\n",
    )

## Step 5: Query Database Tables

Verify that the data was loaded into the database by querying each of the three tables created from the JSON file (`users`, `telephone_numbers`, `jobs_history`).

In [1]:
import os
import pandas as pd
from sqlalchemy import create_engine, text

from urllib.parse import quote_plus

# DB config: read connection settings from the environment, falling back to
# the local demo defaults when a variable is not set.
db_host = os.getenv('DB_HOST', 'localhost')
db_port = os.getenv('DB_PORT', '5432')
db_user = os.getenv('DB_USER', 'etl_user')
db_password = os.getenv('DB_PASSWORD', 'etl_password')
db_name = os.getenv('DB_NAME', 'etl_database')


# User and password are URL-encoded before being placed in the connection
# string; otherwise characters such as '@', ':' or '/' in a credential would
# be parsed as URL delimiters and produce a wrong or invalid connection URL.
# `engine` is reused by the query cells that follow.
engine = create_engine(
    f"postgresql://{quote_plus(db_user)}:{quote_plus(db_password)}"
    f"@{db_host}:{db_port}/{db_name}"
)


query = text("""
SELECT *
FROM users
""")

# The connection is closed automatically when the with-block exits.
with engine.connect() as conn:
    df = pd.read_sql(query, conn)

print(df.head())




                                user_id created_at updated_at  \
0  97d5b68f-91e0-4274-a438-ec511dd6e84e 1991-03-18 1975-04-18   
1  b4d41832-1533-4bbf-ba7f-1b3a6cca9e86 1986-10-05 2013-12-06   
2  a9ad233a-f852-4d32-b400-6d5145b78c48 2018-01-21 2008-05-18   
3  a581e8eb-9510-490f-8944-aa09196cf525 2011-12-11 1992-12-13   
4  bb938800-98e6-47bc-84a2-42d5bc227beb 1998-04-17 1998-06-27   

            logged_at       name         dob  \
0 1980-06-11 03:47:41  E*** G***  2003-08-04   
1 1992-08-29 17:40:56  J*** R***  1996-05-06   
2 1974-12-01 13:51:37  E*** W***        None   
3 2000-11-10 12:39:28  H*** T***        None   
4 2020-05-12 14:31:29  M*** C***        None   

                                             address  \
0           *****************\nSandramouth, IN 16935   
1  ******************************\nEast Carlaview...   
2                                               None   
3                                               None   
4                                       

In [2]:
# Rich preview of the first rows of `df` (the users query result when cells run in order).
df.head()

Unnamed: 0,user_id,created_at,updated_at,logged_at,name,dob,address,username,password,national_id
0,97d5b68f-91e0-4274-a438-ec511dd6e84e,1991-03-18,1975-04-18,1980-06-11 03:47:41,E*** G***,2003-08-04,"*****************\nSandramouth, IN 16935",j******6@gmail.com,**********,*****9634
1,b4d41832-1533-4bbf-ba7f-1b3a6cca9e86,1986-10-05,2013-12-06,1992-08-29 17:40:56,J*** R***,1996-05-06,******************************\nEast Carlaview...,r********h@gmail.com,**********,*****4909
2,a9ad233a-f852-4d32-b400-6d5145b78c48,2018-01-21,2008-05-18,1974-12-01 13:51:37,E*** W***,,,m*********a@murphy.com,**********,*****2133
3,a581e8eb-9510-490f-8944-aa09196cf525,2011-12-11,1992-12-13,2000-11-10 12:39:28,H*** T***,,,e*****x@gmail.com,**********,*****3748
4,bb938800-98e6-47bc-84a2-42d5bc227beb,1998-04-17,1998-06-27,2020-05-12 14:31:29,M*** C***,,,p************n@gmail.com,**********,*****7237


In [3]:
# Read the telephone_numbers table into `df` and print its first five rows.
query = text("\nSELECT *\nFROM telephone_numbers\n")

# The connection is released as soon as the frame has been read.
with engine.connect() as connection:
    df = pd.read_sql(sql=query, con=connection)

print(df.head(n=5))

       id                               user_id    telephone_number
0  233234  e9703a66-6556-4b48-8a0b-0ace129d7a11    ************7268
1  233235  e9703a66-6556-4b48-8a0b-0ace129d7a11  **************5397
2  233236  aa246388-104c-44f7-93f4-4b688dc0baff          ******9845
3  233237  aa246388-104c-44f7-93f4-4b688dc0baff    ************0701
4  233238  86af3e4d-6c57-4245-be99-19b2bfb138c0   *************6286


In [4]:
# Rich preview of the first rows of `df` (the telephone_numbers query result when cells run in order).
df.head()

Unnamed: 0,id,user_id,telephone_number
0,233234,e9703a66-6556-4b48-8a0b-0ace129d7a11,************7268
1,233235,e9703a66-6556-4b48-8a0b-0ace129d7a11,**************5397
2,233236,aa246388-104c-44f7-93f4-4b688dc0baff,******9845
3,233237,aa246388-104c-44f7-93f4-4b688dc0baff,************0701
4,233238,86af3e4d-6c57-4245-be99-19b2bfb138c0,*************6286


In [5]:
# Read the jobs_history table into `df` and print its first five rows.
query = text("\nSELECT *\nFROM jobs_history\n")

# The connection is released as soon as the frame has been read.
with engine.connect() as connection:
    df = pd.read_sql(sql=query, con=connection)

print(df.head(n=5))

                                 job_id                               user_id  \
0  8c48a084-27d7-4f13-98fe-10b802275103  e9703a66-6556-4b48-8a0b-0ace129d7a11   
1  b9d4fc47-0e53-4494-84ae-a39f446be0c9  aa246388-104c-44f7-93f4-4b688dc0baff   
2  818289aa-0a0d-45a5-89ad-0b0e1dea5bb3  86af3e4d-6c57-4245-be99-19b2bfb138c0   
3  17905cdf-2254-44d2-a592-b6c4babcf7f1  a610bdb4-8d67-47b0-b6f6-791a8ea22d86   
4  51c8cffd-98be-4f0d-a85b-3793494fed9d  2bde2981-4bd8-4da0-ab3b-9ff26a4cebd8   

                       occupation  is_fulltime       start         end  \
0                    Set designer        False  1996-12-26  1997-10-02   
1                  Chief of Staff         True  1991-09-12  2015-08-04   
2         Clinical cytogeneticist         True  1970-10-28  2018-03-30   
3  Manufacturing systems engineer         True  2007-05-05  2014-07-01   
4        Counselling psychologist        False  1989-12-24        None   

  employer  
0     None  
1     None  
2     None  
3     None  
4  

In [7]:
# Rich preview of the first rows of `df` (the jobs_history query result when cells run in order).
df.head()

Unnamed: 0,job_id,user_id,occupation,is_fulltime,start,end,employer
0,8c48a084-27d7-4f13-98fe-10b802275103,e9703a66-6556-4b48-8a0b-0ace129d7a11,Set designer,False,1996-12-26,1997-10-02,
1,b9d4fc47-0e53-4494-84ae-a39f446be0c9,aa246388-104c-44f7-93f4-4b688dc0baff,Chief of Staff,True,1991-09-12,2015-08-04,
2,818289aa-0a0d-45a5-89ad-0b0e1dea5bb3,86af3e4d-6c57-4245-be99-19b2bfb138c0,Clinical cytogeneticist,True,1970-10-28,2018-03-30,
3,17905cdf-2254-44d2-a592-b6c4babcf7f1,a610bdb4-8d67-47b0-b6f6-791a8ea22d86,Manufacturing systems engineer,True,2007-05-05,2014-07-01,
4,51c8cffd-98be-4f0d-a85b-3793494fed9d,2bde2981-4bd8-4da0-ab3b-9ff26a4cebd8,Counselling psychologist,False,1989-12-24,,
