# BoostCredit ETL Pipeline Demo

This notebook demonstrates the ETL pipeline for processing CSV and JSON data.

## Pipeline Flow:
1. **Extract** → Read data from CSV/JSON files
2. **Transform** → Clean, convert types, mask PII
3. **Store** → Save to object store (Parquet)
4. **Load** → Load from object store to PostgreSQL database

In [None]:
import os
import pandas as pd
from pathlib import Path
from src.pipeline import Pipeline
from src.extractors import CSVExtractor, JSONExtractor
from src.transformers import CSVTransformer, JSONTransformer

# Set environment variables
os.environ['STORE_KEY'] = 'demo_data'
os.environ['DB_TYPE'] = 'postgresql'
os.environ['DB_HOST'] = 'localhost'
os.environ['DB_PORT'] = '5432'
os.environ['DB_USER'] = 'etl_user'
os.environ['DB_PASSWORD'] = 'etl_password'
os.environ['DB_NAME'] = 'etl_database'
os.environ['DATA_PATH'] = './data'
os.environ['OBJECT_STORE_PATH'] = './output'

print("✓ Environment variables set")
print("✓ Imports successful")

## Step 1: Initialize Pipeline

The pipeline handles the complete ETL process automatically.

In [None]:
pipeline = Pipeline()
print("✓ Pipeline initialized")

## Step 2: Test Individual Components

Let's test each component separately to understand what they do.

In [None]:
# Test CSV Extractor
csv_extractor = CSVExtractor()
csv_file = Path('data/test.csv')
if csv_file.exists():
    sample_data = csv_extractor.extract(str(csv_file))
    print(f"✓ CSV Extracted: {len(sample_data)} rows")
    print(f"  Columns: {list(sample_data.columns)}")
    print(f"\n  First row sample:")
    print(sample_data.head(1))
else:
    print("⚠ CSV file not found")

In [None]:
# Test CSV Transformer
csv_transformer = CSVTransformer()
if csv_file.exists():
    transformed = csv_transformer.transform(sample_data.head(5))
    print("✓ CSV Transformed")
    print(f"  Data types converted")
    print(f"  PII masked (name, address)")
    print(f"\n  Transformed sample:")
    print(transformed[['id', 'name', 'created_at', 'is_claimed', 'paid_amount']].head(2))

In [None]:
# Test JSON Extractor
json_extractor = JSONExtractor()
json_file = Path('data/test.json')
if json_file.exists():
    json_data = json_extractor.extract(str(json_file))
    print(f"✓ JSON Extracted: {len(json_data)} records")
    print(f"\n  First record keys: {list(json_data[0].keys())}")
    print(f"  Sample user_id: {json_data[0].get('user_id', 'N/A')}")
else:
    print("⚠ JSON file not found")

In [None]:
# Test JSON Transformer
json_transformer = JSONTransformer()
if json_file.exists():
    json_transformed = json_transformer.transform(json_data[:2])  # Transform 2 records
    print("✓ JSON Transformed into 3 tables:")
    print(f"  - users: {len(json_transformed['users'])} rows")
    print(f"  - telephone_numbers: {len(json_transformed['telephone_numbers'])} rows")
    print(f"  - jobs_history: {len(json_transformed['jobs_history'])} rows")
    print(f"\n  Users sample:")
    print(json_transformed['users'][['user_id', 'name', 'username']].head(2))

## Step 3: Run Complete Pipeline

Now let's run the full pipeline for CSV processing.

In [None]:
# Process CSV file
if csv_file.exists():
    os.environ['STORE_KEY'] = 'csv_demo'
    pipeline.process_csv('test.csv')
    print("✓ CSV processing completed!")
    print("  → Data extracted, transformed, saved to object store, and loaded to database")
else:
    print("⚠ CSV file not found - skipping CSV processing")

## Step 4: Process JSON File

Process JSON data which creates multiple linked tables.

In [None]:
# Process JSON file
if json_file.exists():
    os.environ['STORE_KEY'] = 'json_demo'
    pipeline.process_json('test.json')
    print("✓ JSON processing completed!")
    print("  → Created 3 tables: users, telephone_numbers, jobs_history")
    print("  → All PII masked (emails, phones, national IDs, passwords)")
else:
    print("⚠ JSON file not found - skipping JSON processing")

## Step 5: Verify Data in Object Store

Check what was saved to the object store (intermediate step).

In [None]:
from src.storage import ObjectStore

store = ObjectStore('./output')

# Check CSV data in store
csv_data = store.load('csv_demo', 'parquet')
if csv_data is not None:
    print(f"✓ CSV data in object store: {len(csv_data)} rows")
    print(f"  Columns: {list(csv_data.columns)}")

# Check JSON data in store
json_data_store = store.load('json_demo', 'parquet')
if json_data_store is not None:
    print(f"\n✓ JSON data in object store:")
    for table_name, df in json_data_store.items():
        print(f"  - {table_name}: {len(df)} rows")

## Step 6: Cleanup

Close the pipeline to release database connections.

In [None]:
pipeline.close()
print("✓ Pipeline closed - database connections released")

# BoostCredit ETL Pipeline 



In [2]:
from src.pipeline import ETLPipeline

# Initialize pipeline with SQLite database
db_connection_string = 'sqlite:///etl_database.db'
pipeline = ETLPipeline(db_connection_string)

print(f"✓ ETL Pipeline initialized")
print(f"✓ Database: {db_connection_string}")


ImportError: cannot import name 'ETLPipeline' from 'src.pipeline' (/home/anser/Downloads/anserGithub/BoostCredit/src/pipeline.py)

In [3]:
# Check CSV file
csv_file = Path('data/test.csv')
print(f"CSV file exists: {csv_file.exists()}")
print(f"CSV file size: {csv_file.stat().st_size / (1024*1024):.2f} MB")

# Preview first few rows
if csv_file.exists():
    df_preview = pd.read_csv(csv_file, nrows=5)
    print("\nFirst 5 rows:")
    print(df_preview)
    print(f"\nColumns: {list(df_preview.columns)}")


CSV file exists: True
CSV file size: 465.68 MB

First 5 rows:
     id              name                                            address  \
0  6311    Jennifer Green  7593 Juan Throughway Apt. 948\nWest Corey, TX ...   
1  3350      Karen Grimes    60975 Jessica Squares\nEast Sallybury, FL 71671   
2  9031       Calvin Cook                   PSC 3989, Box 4719\nAPO AA 42056   
3  1131    Peter Mcdowell                   PSC 1868, Box 4833\nAPO AP 77807   
4  1889  Mr. Ryan Sanchez      352 Simmons Circle\nPort Dustinbury, OK 83627   

    color               created_at  last_login is_claimed  paid_amount  
0    lime  Monday, June 30th, 2013  1202190735       True  5004.671532  
1    lime  Monday, June 30th, 2013   195884769       True   893.404595  
2  silver           1986-06-23TEST   623477862       True   266.600000  
3    aqua           1998-07-17TEST  1244885561       True   674.544127  
4   white      2006-05-09 13:29:58  1293151276      truee          NaN  

Columns: ['id', 'n

In [4]:
df_preview.head()

Unnamed: 0,id,name,address,color,created_at,last_login,is_claimed,paid_amount
0,6311,Jennifer Green,"7593 Juan Throughway Apt. 948\nWest Corey, TX ...",lime,"Monday, June 30th, 2013",1202190735,True,5004.671532
1,3350,Karen Grimes,"60975 Jessica Squares\nEast Sallybury, FL 71671",lime,"Monday, June 30th, 2013",195884769,True,893.404595
2,9031,Calvin Cook,"PSC 3989, Box 4719\nAPO AA 42056",silver,1986-06-23TEST,623477862,True,266.6
3,1131,Peter Mcdowell,"PSC 1868, Box 4833\nAPO AP 77807",aqua,1998-07-17TEST,1244885561,True,674.544127
4,1889,Mr. Ryan Sanchez,"352 Simmons Circle\nPort Dustinbury, OK 83627",white,2006-05-09 13:29:58,1293151276,truee,


In [None]:
# Process CSV file
print("Processing CSV file...")
pipeline.process_csv(str(csv_file), 'test')
print("✓ CSV processing completed!")


In [None]:
# Connect to database and query test table
engine = create_engine(db_connection_string)

with engine.connect() as conn:
    # Get row count
    result = conn.execute(text("SELECT COUNT(*) as count FROM test"))
    row_count = result.fetchone()[0]
    print(f"Total rows in 'test' table: {row_count}")
    
    # Get sample data
    result = conn.execute(text("SELECT * FROM test LIMIT 5"))
    columns = result.keys()
    rows = result.fetchall()
    
    print("\nSample data from 'test' table:")
    df_test = pd.DataFrame(rows, columns=columns)
    print(df_test)
    
    # Check data types
    print("\nData types:")
    result = conn.execute(text("PRAGMA table_info(test)"))
    schema = result.fetchall()
    for col in schema:
        print(f"  {col[1]}: {col[2]}")


In [1]:
# Check JSON file
json_file = Path('data/test.json')
print(f"JSON file exists: {json_file.exists()}")
print(f"JSON file size: {json_file.stat().st_size / (1024*1024):.2f} MB")

# Preview first record
if json_file.exists():
    import json
    with open(json_file, 'r') as f:
        first_line = f.readline()
        first_record = json.loads(first_line)
        print("\nFirst record structure:")
        print(json.dumps(first_record, indent=2))


NameError: name 'Path' is not defined

In [None]:
# Process JSON file
print("Processing JSON file...")
pipeline.process_json(str(json_file))
print("✓ JSON processing completed!")


## Step 5: Verify JSON Data in Database

Let's verify that the three tables (`users`, `telephone_numbers`, `jobs_history`) were created and populated correctly.


In [None]:
# Verify users table
with engine.connect() as conn:
    result = conn.execute(text("SELECT COUNT(*) as count FROM users"))
    user_count = result.fetchone()[0]
    print(f"Total users: {user_count}")
    
    result = conn.execute(text("SELECT * FROM users LIMIT 3"))
    columns = result.keys()
    rows = result.fetchall()
    df_users = pd.DataFrame(rows, columns=columns)
    print("\nSample users (with PII masked):")
    print(df_users[['user_id', 'name', 'username', 'national_id']].to_string())


In [None]:
# Verify telephone_numbers table
with engine.connect() as conn:
    result = conn.execute(text("SELECT COUNT(*) as count FROM telephone_numbers"))
    tel_count = result.fetchone()[0]
    print(f"Total telephone numbers: {tel_count}")
    
    result = conn.execute(text("""
        SELECT tn.*, u.name 
        FROM telephone_numbers tn
        JOIN users u ON tn.user_id = u.user_id
        LIMIT 5
    """))
    columns = result.keys()
    rows = result.fetchall()
    df_tel = pd.DataFrame(rows, columns=columns)
    print("\nSample telephone numbers (with PII masked):")
    print(df_tel[['user_id', 'name', 'telephone_number']].to_string())


In [None]:
# Verify jobs_history table
with engine.connect() as conn:
    result = conn.execute(text("SELECT COUNT(*) as count FROM jobs_history"))
    job_count = result.fetchone()[0]
    print(f"Total job history records: {job_count}")
    
    result = conn.execute(text("""
        SELECT jh.*, u.name 
        FROM jobs_history jh
        JOIN users u ON jh.user_id = u.user_id
        LIMIT 5
    """))
    columns = result.keys()
    rows = result.fetchall()
    df_jobs = pd.DataFrame(rows, columns=columns)
    print("\nSample job history:")
    print(df_jobs[['user_id', 'name', 'occupation', 'start', 'end', 'is_fulltime']].to_string())


## Step 6: Verify Foreign Key Relationships

Let's verify that the foreign key relationships are working correctly.


In [None]:
# Verify relationships
with engine.connect() as conn:
    # Check users with telephone numbers
    result = conn.execute(text("""
        SELECT 
            u.user_id,
            u.name,
            COUNT(tn.telephone_number) as phone_count,
            COUNT(jh.job_id) as job_count
        FROM users u
        LEFT JOIN telephone_numbers tn ON u.user_id = tn.user_id
        LEFT JOIN jobs_history jh ON u.user_id = jh.user_id
        GROUP BY u.user_id, u.name
        LIMIT 10
    """))
    columns = result.keys()
    rows = result.fetchall()
    df_relationships = pd.DataFrame(rows, columns=columns)
    print("Users with their telephone numbers and job counts:")
    print(df_relationships.to_string())
    
    print("\n✓ Foreign key relationships verified!")


## Step 7: Summary Statistics

Let's get a summary of all the data loaded.


In [None]:
# Summary statistics
with engine.connect() as conn:
    print("=" * 60)
    print("ETL Pipeline Summary")
    print("=" * 60)
    
    # Test table
    result = conn.execute(text("SELECT COUNT(*) FROM test"))
    test_count = result.fetchone()[0]
    print(f"\n✓ Test table: {test_count:,} rows")
    
    # Users table
    result = conn.execute(text("SELECT COUNT(*) FROM users"))
    users_count = result.fetchone()[0]
    print(f"✓ Users table: {users_count:,} rows")
    
    # Telephone numbers
    result = conn.execute(text("SELECT COUNT(*) FROM telephone_numbers"))
    tel_count = result.fetchone()[0]
    print(f"✓ Telephone numbers table: {tel_count:,} rows")
    
    # Jobs history
    result = conn.execute(text("SELECT COUNT(*) FROM jobs_history"))
    jobs_count = result.fetchone()[0]
    print(f"✓ Jobs history table: {jobs_count:,} rows")
    
    print("\n" + "=" * 60)
    print("✓ All data successfully loaded and verified!")
    print("=" * 60)


In [None]:
# Close pipeline
pipeline.close()
print("✓ Pipeline closed successfully")
