# ETL/ELT Pipeline - DB-1

This notebook provides a comprehensive ETL/ELT pipeline for database db-1.

## Pipeline Overview
1. **Extract**: Load data from source systems
2. **Transform**: Clean, validate, and transform data
3. **Load**: Load transformed data into target database
4. **Validate**: Verify data quality and completeness
5. **Monitor**: Track pipeline performance and errors

## Section 1: Setup and Configuration

In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import json
import logging
from typing import Dict, List, Optional
import warnings
warnings.filterwarnings('ignore')

# Database connections
try:
    from sqlalchemy import create_engine, text
    SQLALCHEMY_AVAILABLE = True
except ImportError:
    SQLALCHEMY_AVAILABLE = False
    print("Warning: sqlalchemy not available")

# Pandas display options: no column limit, at most 50 rows, width left unset
pd.set_option("display.width", None)
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", None)

# Configure root logging at INFO, then create the logger used by the
# functions in the rest of this notebook
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

print("✓ Imports successful")

In [None]:
# Configuration
# Database label recorded in the pipeline metadata and printed in summaries.
# Kept in sync with the notebook header, which names the target as db-1.
DB_NAME = "db-1"
# Parent of the current working directory; data/ and metadata/ live under it
DB_PATH = Path.cwd().parent

# Database connection strings (configure as needed; leave as None when unused)
# PostgreSQL
POSTGRES_CONNECTION_STRING = None  # "postgresql://user:password@localhost:5432/dbname"

# Databricks
DATABRICKS_CONNECTION_STRING = None  # Configure Databricks connection

# Snowflake
SNOWFLAKE_CONNECTION_STRING = None  # Configure Snowflake connection

# Source data paths
DATA_DIR = DB_PATH / "data"
SCHEMA_FILE = DATA_DIR / "schema.sql"
DATA_FILE = DATA_DIR / "data.sql"

# Echo the resolved configuration so a misconfigured working directory is
# visible before any extraction runs
print(f"Database: {DB_NAME}")
print(f"Data directory: {DATA_DIR}")
print(f"Schema file exists: {SCHEMA_FILE.exists()}")
print(f"Data file exists: {DATA_FILE.exists()}")

## Section 2: Extract - Data Loading

In [None]:
def load_schema_file(schema_path: Path) -> Optional[str]:
    """Read the database schema SQL file and return its text.

    Args:
        schema_path: Location of the schema SQL file.

    Returns:
        The file contents decoded as UTF-8, or ``None`` when the file does
        not exist (logged as a warning) or cannot be read (logged as an
        error).
    """
    try:
        if not schema_path.exists():
            logger.warning(f"Schema file not found: {schema_path}")
            return None
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        with open(schema_path, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        # Best-effort loader: report the failure and let the caller continue.
        logger.error(f"Error loading schema: {e}")
        return None

def load_data_file(data_path: Path) -> Optional[str]:
    """Read the data SQL file and return its text.

    Args:
        data_path: Location of the data SQL file.

    Returns:
        The file contents decoded as UTF-8, or ``None`` when the file does
        not exist (logged as a warning) or cannot be read (logged as an
        error).
    """
    try:
        if not data_path.exists():
            logger.warning(f"Data file not found: {data_path}")
            return None
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        with open(data_path, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        # Best-effort loader: report the failure and let the caller continue.
        logger.error(f"Error loading data: {e}")
        return None

# Read both SQL sources; each loader returns None when its file is missing
# or unreadable
schema_sql = load_schema_file(SCHEMA_FILE)
data_sql = load_data_file(DATA_FILE)

# Report sizes only for sources that produced non-empty text
if schema_sql:
    schema_size = len(schema_sql)
    print(f"✓ Schema loaded ({schema_size} characters)")
if data_sql:
    data_size = len(data_sql)
    print(f"✓ Data loaded ({data_size} characters)")

In [None]:
def extract_from_csv(csv_path: Path) -> Optional[pd.DataFrame]:
    """Read a CSV file into a DataFrame.

    Args:
        csv_path: Location of the CSV file.

    Returns:
        The parsed DataFrame, or ``None`` when the path does not exist
        (skipped without logging) or reading fails (logged as an error).
    """
    try:
        if not csv_path.exists():
            return None
        frame = pd.read_csv(csv_path)
        logger.info(f"Loaded {len(frame)} rows from {csv_path.name}")
        return frame
    except Exception as exc:
        logger.error(f"Error loading CSV {csv_path}: {exc}")
        return None

def extract_from_json(json_path: Path) -> Optional[Dict]:
    """Parse a JSON file and return its decoded content.

    Args:
        json_path: Location of the JSON file.

    Returns:
        The value produced by ``json.load``, or ``None`` when the path does
        not exist (skipped without logging) or reading/parsing fails (logged
        as an error).
    """
    try:
        if not json_path.exists():
            return None
        # Read as UTF-8 explicitly; otherwise non-ASCII text is decoded with
        # the platform's locale encoding and can be corrupted or rejected.
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        logger.info(f"Loaded JSON from {json_path.name}")
        return data
    except Exception as e:
        # Best-effort extractor: report the failure and let the caller continue.
        logger.error(f"Error loading JSON {json_path}: {e}")
        return None

# Discover candidate source files in the data directory
csv_files = list(DATA_DIR.glob("*.csv"))
json_files = list(DATA_DIR.glob("*.json"))

# Extracted payloads keyed by file stem. CSV files are processed first, so a
# JSON file with the same stem replaces the CSV entry.
extracted_data = {}

for source_files, extractor in (
    (csv_files, extract_from_csv),
    (json_files, extract_from_json),
):
    for source_file in source_files:
        payload = extractor(source_file)
        if payload is None:
            # Extraction failed or the file vanished; skip this source
            continue
        extracted_data[source_file.stem] = payload

print(f"✓ Extracted {len(extracted_data)} data sources")

## Section 3: Transform - Data Cleaning and Transformation

In [None]:
def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a de-duplicated frame with missing values filled in.

    Numeric columns are filled with their median; object (text) columns are
    filled with their first mode, or an empty string when the column has no
    mode. ``None`` and empty frames are returned unchanged.

    Args:
        df: Frame to clean.

    Returns:
        The cleaned frame (a new object produced by ``drop_duplicates``).
    """
    if df is None or df.empty:
        return df

    # Drop exact duplicate rows and remember how many went away
    rows_before = len(df)
    df = df.drop_duplicates()
    duplicates_removed = rows_before - len(df)

    nulls_before = df.isnull().sum().sum()

    # Numeric columns: fill gaps with the column median
    for column in df.select_dtypes(include=[np.number]).columns:
        df[column] = df[column].fillna(df[column].median())

    # Text columns: fill gaps with the most frequent value, if there is one
    for column in df.select_dtypes(include=['object']).columns:
        modes = df[column].mode()
        fill_value = modes[0] if len(modes) > 0 else ''
        df[column] = df[column].fillna(fill_value)

    nulls_after = df.isnull().sum().sum()

    logger.info(f"Cleaned data: removed {duplicates_removed} duplicates, filled {nulls_before - nulls_after} missing values")
    return df

# Clean every extracted DataFrame; non-tabular payloads (e.g. parsed JSON)
# are carried over untouched
cleaned_data = {}
for name, data in extracted_data.items():
    cleaned_data[name] = clean_dataframe(data) if isinstance(data, pd.DataFrame) else data

print(f"✓ Cleaned {len(cleaned_data)} data sources")

In [None]:
def validate_dataframe(df: pd.DataFrame, required_columns: Optional[List[str]] = None) -> Dict:
    """Validate DataFrame structure and summarise basic data quality.

    Args:
        df: Frame to validate; may be ``None``.
        required_columns: Column names that must be present. When omitted or
            empty, no column check is performed.

    Returns:
        A dict with the keys ``valid``, ``row_count``, ``column_count``,
        ``missing_values`` (per-column null counts), ``duplicate_rows`` and
        ``errors``. The same keys are present for empty or ``None`` input,
        where ``valid`` is ``False``. Counts are plain Python ints so the
        result serialises to JSON as numbers.
    """
    frame_missing = df is None
    if frame_missing or df.empty:
        # Keep the same keys as the success path so callers that report
        # row/column counts do not hit a KeyError on empty sources.
        return {
            'valid': False,
            'row_count': 0 if frame_missing else len(df),
            'column_count': 0 if frame_missing else len(df.columns),
            'missing_values': {},
            'duplicate_rows': 0,
            'errors': ['DataFrame is empty or None'],
        }

    validation_results = {
        'valid': True,
        'row_count': len(df),
        'column_count': len(df.columns),
        # Cast numpy integers to int; json.dump(default=str) would otherwise
        # write them out as strings.
        'missing_values': {col: int(count) for col, count in df.isnull().sum().items()},
        'duplicate_rows': int(df.duplicated().sum()),
        'errors': []
    }

    # Check required columns
    if required_columns:
        missing_cols = [col for col in required_columns if col not in df.columns]
        if missing_cols:
            validation_results['valid'] = False
            validation_results['errors'].append(f"Missing required columns: {missing_cols}")

    return validation_results

# Validate cleaned data (only tabular sources are validated)
validation_results = {}
for name, data in cleaned_data.items():
    if isinstance(data, pd.DataFrame):
        validation_results[name] = validate_dataframe(data)

# Display validation results
for name, results in validation_results.items():
    status = "✓" if results['valid'] else "✗"
    # Results for empty sources may omit the size keys; fall back to 0
    # instead of raising KeyError in the middle of the report.
    row_count = results.get('row_count', 0)
    column_count = results.get('column_count', 0)
    print(f"{status} {name}: {row_count} rows, {column_count} columns")
    for error in results.get('errors', []):
        print(f"  Error: {error}")

## Section 4: Load - Data Loading to Target Database

In [None]:
def load_to_postgresql(df: pd.DataFrame, table_name: str, connection_string: str, if_exists: str = 'replace') -> bool:
    """Load a DataFrame into a PostgreSQL table.

    Args:
        df: Frame to write.
        table_name: Destination table name.
        connection_string: SQLAlchemy connection URL; ``None`` skips the load.
        if_exists: Passed through to ``DataFrame.to_sql`` to control what
            happens when the table already exists.

    Returns:
        ``True`` when the write succeeded; ``False`` when SQLAlchemy or the
        connection string is unavailable, or when the write raised (the
        error is logged).
    """
    if not SQLALCHEMY_AVAILABLE or connection_string is None:
        logger.warning("PostgreSQL connection not available")
        return False

    engine = None
    try:
        engine = create_engine(connection_string)
        df.to_sql(table_name, engine, if_exists=if_exists, index=False)
        logger.info(f"Loaded {len(df)} rows to PostgreSQL table {table_name}")
        return True
    except Exception as e:
        logger.error(f"Error loading to PostgreSQL: {e}")
        return False
    finally:
        # A new engine is created on every call, so release its connection
        # pool here instead of leaking one pool per loaded table.
        if engine is not None:
            engine.dispose()

def load_to_databricks(df: pd.DataFrame, table_name: str, connection_string: str, if_exists: str = 'replace') -> bool:
    """Load a DataFrame into a Databricks table.

    Args:
        df: Frame to write.
        table_name: Destination table name.
        connection_string: SQLAlchemy connection URL; ``None`` skips the load.
        if_exists: Passed through to ``DataFrame.to_sql`` to control what
            happens when the table already exists. Defaults to ``'replace'``,
            the value this loader previously hard-coded.

    Returns:
        ``True`` when the write succeeded; ``False`` when SQLAlchemy or the
        connection string is unavailable, or when the write raised (the
        error is logged).
    """
    if not SQLALCHEMY_AVAILABLE or connection_string is None:
        logger.warning("Databricks connection not available")
        return False

    engine = None
    try:
        engine = create_engine(connection_string)
        df.to_sql(table_name, engine, if_exists=if_exists, index=False)
        logger.info(f"Loaded {len(df)} rows to Databricks table {table_name}")
        return True
    except Exception as e:
        logger.error(f"Error loading to Databricks: {e}")
        return False
    finally:
        # A new engine is created on every call, so release its connection
        # pool here instead of leaking one pool per loaded table.
        if engine is not None:
            engine.dispose()

# Load data to target databases
# Maps "<source>_<target>" to the boolean outcome of each attempted load
load_results = {}

for name, data in cleaned_data.items():
    # Only non-empty tabular sources are loaded
    if isinstance(data, pd.DataFrame) and not data.empty:
        table_name = name.lower().replace(' ', '_')

        # PostgreSQL
        if POSTGRES_CONNECTION_STRING:
            load_results[f"{name}_postgres"] = load_to_postgresql(
                data, table_name, POSTGRES_CONNECTION_STRING
            )

        # Databricks: gate on and pass the Databricks connection string
        # (the Snowflake string was being used here by mistake)
        if DATABRICKS_CONNECTION_STRING:
            load_results[f"{name}_databricks"] = load_to_databricks(
                data, table_name, DATABRICKS_CONNECTION_STRING
            )

# True counts as 1, so the sum is the number of successful loads
print(f"✓ Loaded {sum(load_results.values())} datasets to target databases")

## Section 5: Validate - Data Quality Checks

In [None]:
def generate_data_quality_report(df: pd.DataFrame, table_name: str) -> Dict:
    """Generate a data quality report for one table.

    Args:
        df: Frame to profile; may be ``None``.
        table_name: Name recorded under the ``table`` key.

    Returns:
        A dict with ``table``, ``row_count``, ``column_count``,
        ``missing_values``, ``missing_percentage``, ``duplicate_rows``,
        ``data_types``, ``numeric_stats`` and ``timestamp``. For empty or
        ``None`` input the same keys are present with zeroed metrics, plus
        ``status`` set to ``'empty'``.
    """
    if df is None or df.empty:
        has_frame = df is not None
        # Emit the full key set so consumers that print the metrics do not
        # raise KeyError for empty sources.
        return {
            'table': table_name,
            'status': 'empty',
            'row_count': len(df) if has_frame else 0,
            'column_count': len(df.columns) if has_frame else 0,
            'missing_values': 0,
            'missing_percentage': 0.0,
            'duplicate_rows': 0,
            'data_types': df.dtypes.astype(str).to_dict() if has_frame else {},
            'numeric_stats': {},
            'timestamp': datetime.now().isoformat()
        }

    # Count nulls once and reuse the total for both metrics
    total_missing = int(df.isnull().sum().sum())
    # Non-zero here: an empty frame (no rows or no columns) returned above
    total_cells = len(df) * len(df.columns)

    report = {
        'table': table_name,
        'row_count': len(df),
        'column_count': len(df.columns),
        'missing_values': total_missing,
        'missing_percentage': float((total_missing / total_cells) * 100),
        'duplicate_rows': int(df.duplicated().sum()),
        'data_types': df.dtypes.astype(str).to_dict(),
        'numeric_stats': {},
        'timestamp': datetime.now().isoformat()
    }

    # Add describe() statistics for numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        report['numeric_stats'] = df[numeric_cols].describe().to_dict()

    return report

# Generate quality reports (only tabular sources are profiled)
quality_reports = {}
for name, data in cleaned_data.items():
    if isinstance(data, pd.DataFrame):
        quality_reports[name] = generate_data_quality_report(data, name)

# Display quality reports
for name, report in quality_reports.items():
    print(f"\n=== {name} ===")
    if report.get('status') == 'empty':
        # Reports for empty sources may carry no metrics; say so instead of
        # raising KeyError on the metric lookups below.
        print("No data (empty dataset)")
        continue
    print(f"Rows: {report['row_count']}")
    print(f"Columns: {report['column_count']}")
    print(f"Missing values: {report['missing_values']} ({report['missing_percentage']:.2f}%)")
    print(f"Duplicate rows: {report['duplicate_rows']}")

## Section 6: Monitor - Pipeline Monitoring and Logging

In [None]:
# Gather everything this run produced into a single record
pipeline_metadata = dict(
    database=DB_NAME,
    execution_timestamp=datetime.now().isoformat(),
    data_sources=list(extracted_data.keys()),
    extracted_count=len(extracted_data),
    cleaned_count=len(cleaned_data),
    validation_results=validation_results,
    load_results=load_results,
    quality_reports=quality_reports,
    status='completed',
)

# Persist the record as JSON under <DB_PATH>/metadata, creating the
# directory when needed
metadata_file = DB_PATH / "metadata" / "pipeline_metadata.json"
metadata_file.parent.mkdir(parents=True, exist_ok=True)

with metadata_file.open('w') as f:
    # default=str stringifies any value json cannot encode natively
    json.dump(pipeline_metadata, f, indent=2, default=str)

print(f"✓ Pipeline metadata saved to {metadata_file}")

# Print a short run summary between two 80-character rules
successful_loads = sum(pipeline_metadata['load_results'].values())
summary_lines = [
    "\n" + "="*80,
    "PIPELINE EXECUTION SUMMARY",
    "="*80,
    f"Database: {pipeline_metadata['database']}",
    f"Execution time: {pipeline_metadata['execution_timestamp']}",
    f"Data sources extracted: {pipeline_metadata['extracted_count']}",
    f"Datasets cleaned: {pipeline_metadata['cleaned_count']}",
    f"Successful loads: {successful_loads}",
    f"Status: {pipeline_metadata['status']}",
]
print("\n".join(summary_lines))