## ETL Products Data Pipeline
In this notebook, we will build a robust ETL pipeline to process product data from JSON files. The pipeline will:
* Extract product data from multiple JSON files.
* Transform the data to normalize fields, handle missing values, and calculate additional metrics.
* Load the cleaned data into a PostgreSQL database with a staging-merge pattern.
* Log all steps and errors for traceability.

### 1. Importing Required Libraries


In [164]:
import pandas as pd
from sqlalchemy import create_engine
import logging
import json

### 2. Logging

In [165]:
# Logging: send every message at INFO level or above to a log file, with a
# timestamp and severity in front of each entry.
_log_settings = {
    "filename": "etl_pipeline.log",
    "level": logging.INFO,
    "format": "%(asctime)s - %(levelname)s - %(message)s",
}
logging.basicConfig(**_log_settings)

# Mark the beginning of this run in the log.
logging.info("ETL Pipeline started")

## 3. Defining Functions


### 3.1 Extract Function
* Reads multiple JSON files one by one.
* Converts the data from each file into a pandas DataFrame.
* If a specific key exists in the JSON, it extracts only that part; otherwise, it uses the entire data.
* Logs any errors from missing files or invalid JSON, while allowing the process to continue.
* Combines all successfully processed DataFrames into a single raw data DataFrame.

In [166]:
def extract(batch_files, key):
    """Read a batch of JSON files and combine their records into one DataFrame.

    Every file is parsed independently. A file that is missing, contains
    invalid JSON, or cannot be turned into a DataFrame is logged and skipped
    so the remaining files are still processed.

    Args:
        batch_files: Sized collection of paths to the JSON files to read.
        key: Name of the entry holding the records. When the parsed JSON
            contains ``key``, only ``data[key]`` is used; otherwise the whole
            parsed document is passed to ``pd.DataFrame``.

    Returns:
        A single DataFrame with the rows of all successfully read files,
        re-indexed from 0.

    Raises:
        ValueError: If not a single file could be loaded, so there is
            nothing to combine.
    """
    frames = []

    for file in batch_files:
        try:
            # JSON is UTF-8 by specification; do not depend on the platform's
            # default text encoding.
            with open(file, "r", encoding="utf-8") as f:
                data = json.load(f)

            records = data[key] if key in data else data
            frames.append(pd.DataFrame(records))
            logging.info(f"Successfully processed {file}")

        except FileNotFoundError:
            logging.error(f"{file} not found")
        except json.JSONDecodeError as e:
            logging.error(f"{file} has invalid JSON: {e}")
        except Exception as e:
            # Best effort: one bad file must not abort the whole batch.
            logging.error(f"Unexpected error with {file}: {e}")

    logging.info(f"Processed {len(frames)}/{len(batch_files)} files successfully")

    if not frames:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # fail with the same exception type but an actionable message.
        raise ValueError(
            f"No input files could be loaded (0/{len(batch_files)} succeeded); "
            "nothing to extract"
        )

    return pd.concat(frames, ignore_index=True)

### 3.2 Transform Function
Transform data to clean, normalized format:
* Explode reviews into separate rows
* Extract review ratings and drop reviews column
* Drop missing data (null IDs, titles, prices, review ratings)
* Validate data (positive prices, discounts 0-100%)
* Calculate price_with_discount
* Convert types and reorder columns

In [167]:
def transform(df):
    """Clean raw product records into one row per (product, review rating).

    Steps, in order:
      1. Explode the ``reviews`` list so each review gets its own row and
         keep only the review's ``rating`` as ``review_rating``.
      2. Drop rows missing any of ``id``, ``title``, ``price`` or
         ``review_rating``.
      3. Keep only rows with ``price > 0`` and ``discountPercentage`` within
         0-100 inclusive (rows with a missing discount fail this check).
      4. Add ``price_with_discount``, rounded to 2 decimals.
      5. Cast numeric columns, select the output columns, lower-case the
         column names and reset the index.

    Args:
        df: Raw DataFrame containing at least the columns ``id``, ``title``,
            ``category``, ``price``, ``discountPercentage``, ``rating``,
            ``brand`` and ``reviews``. It is not modified.

    Returns:
        A new DataFrame with columns ``id, title, category, price,
        discountpercentage, rating, brand, review_rating,
        price_with_discount``.
    """
    output_columns = [
        "id", "title", "category", "price", "discountPercentage",
        "rating", "brand", "review_rating", "price_with_discount",
    ]

    # explode() returns a new frame, so the caller's DataFrame stays untouched.
    df = df.explode("reviews", ignore_index=True)
    # Products without reviews explode to NaN; a review dict lacking "rating"
    # yields None. Both are removed by the dropna below.
    df["review_rating"] = df["reviews"].apply(
        lambda review: review.get("rating") if isinstance(review, dict) else None
    )
    df = df.drop(columns=["reviews"])

    # A row is unusable if ANY of these is missing (a null id must never
    # reach the database, even when the title is present).
    df = df.dropna(subset=["id", "title", "price", "review_rating"])

    is_valid = (df["price"] > 0) & df["discountPercentage"].between(0, 100)
    # Explicit copy so the column assignment below works on an independent
    # frame instead of a filtered view (avoids SettingWithCopyWarning).
    df = df[is_valid].copy()

    df["price_with_discount"] = (
        df["price"] * (1 - df["discountPercentage"] / 100)
    ).round(2)

    df = df.astype({
        "price": float,
        "discountPercentage": float,
        "rating": float,
        "review_rating": int,
        "price_with_discount": float,
    })

    df = df[output_columns]
    df.columns = df.columns.str.lower()

    return df.reset_index(drop=True)




### 3.3 Load Function

* Loads data into staging table
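* Appends every staged row to the target table with `INSERT ... SELECT`, stamping `created_at` and `updated_at` (note: the current query is a plain insert, not an UPSERT, so existing records are not updated by ID)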
* Transaction-managed with automatic rollback on errors
* Drops staging table after successful merge

In [168]:
def load(df, sql_connection, table_name, drop_staging=True):
    """Write ``df`` to a staging table and append its rows to the target table.

    All statements run inside the single transaction opened by
    ``sql_connection.begin()``. The staging table is (re)created in the
    ``etl_schema`` schema, its rows are copied into
    ``etl_schema.<table_name>`` with ``created_at`` / ``updated_at`` set to
    ``CURRENT_TIMESTAMP``, and the staging table is optionally dropped.

    Args:
        df: DataFrame whose columns match the column list of the INSERT
            statement below.
        sql_connection: Object whose ``begin()`` is a context manager
            yielding a connection (the usage cell passes a SQLAlchemy
            engine).
        table_name: Name of the target table; the staging table is named
            ``<table_name>_staging``.
        drop_staging: When true, drop the staging table after the insert.
    """
    from sqlalchemy import text

    schema = 'etl_schema'
    staging_name = f"{table_name}_staging"
    target_ref = f"{schema}.{table_name}"
    staging_ref = f"{schema}.{staging_name}"

    # Copy every staged row into the target table, stamping both timestamps.
    insert_from_staging = text(f"""
            INSERT INTO {target_ref} (
                id, title, category, price, discountPercentage,
                rating, brand, review_rating, price_with_discount,
                created_at, updated_at
            )
            SELECT id, title, category, price, discountPercentage, rating, brand,
                review_rating, price_with_discount,
                CURRENT_TIMESTAMP, CURRENT_TIMESTAMP
            FROM {staging_ref};
        """)

    with sql_connection.begin() as tx_conn:
        # Step 1: replace the staging table with the current batch.
        df.to_sql(
            staging_name,
            tx_conn,
            schema=schema,
            if_exists='replace',
            index=False,
        )
        logging.info(f"Loaded {len(df)} records to {staging_ref}")

        # Step 2: append the staged rows to the target table.
        tx_conn.execute(insert_from_staging)
        logging.info(f"Inserted data to {target_ref}")

        # Step 3: clean up, unless the caller wants to inspect the staging data.
        if not drop_staging:
            return
        tx_conn.execute(text(f"DROP TABLE IF EXISTS {staging_ref}"))
        logging.info("Dropped staging table")

### 4. Usage 

In [169]:
# Run the full pipeline: extract the paginated JSON files, transform them and
# load the result into PostgreSQL.
import os

# Batch files
batch_files = [
    f"/Users/buse/Desktop/NumPy/ETL-pipeline/json-pagination/{skip}.json"
    for skip in range(20)
]

# PostgreSQL Connection
# The URL embeds a password, so allow it to be supplied through the
# ETL_DATABASE_URL environment variable instead of editing the notebook;
# the previous local URL remains the default.
engine = create_engine(
    os.environ.get(
        "ETL_DATABASE_URL",
        'postgresql://etl_user:sifre@localhost:5432/etl_pipeline',
    )
)

try:
    # Extract
    df = extract(batch_files, "products")
    logging.info(f"Extract completed: {len(df)} raw records")

    # Transform
    df_t = transform(df)
    logging.info(f"Transform completed: {len(df_t)} clean records")

    # Load
    load(df_t, engine, table_name="products", drop_staging=True)

    logging.info("=== ETL Pipeline completed successfully! ===")

except Exception as e:
    # Record the traceback as well as the message so the failing step can be
    # identified from the log file alone, then let the error propagate.
    logging.critical(f"Pipeline failed: {e}", exc_info=True)
    raise

finally:
    # Always release pooled connections, whether the run succeeded or not.
    engine.dispose()
    logging.info("Database connection closed")