# 🧪 ETL Pipeline using Pandas & Scikit-learn
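This notebook performs an end-to-end ETL process: it extracts raw data from a CSV file, transforms it with a scikit-learn preprocessing pipeline (imputation, scaling, and one-hot encoding), and loads the result into a processed CSV file alongside the saved pipeline.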

In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import os

In [None]:
# Location of the raw input CSV -- point this at your own file.
DATA_PATH = "raw_data.csv"

# Extract step: read the CSV into a DataFrame, then preview the first rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head(n=5)

In [None]:
# Identify numerical and categorical columns.
# "number" matches every numeric dtype (e.g. int32, float32, unsigned ints),
# not just int64/float64, so down-cast numeric columns are not left out of
# the numerical list.
num_cols = df.select_dtypes(include="number").columns.tolist()
cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()

print("Numerical Columns:", num_cols)
print("Categorical Columns:", cat_cols)

# Columns that land in neither list (e.g. bool or datetime) would otherwise
# vanish from the column selection without any notice, so report them.
selected_cols = set(num_cols) | set(cat_cols)
unassigned_cols = [col for col in df.columns if col not in selected_cols]
if unassigned_cols:
    print("Unassigned Columns (neither numerical nor categorical):", unassigned_cols)

In [None]:
# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fitting are ignored at transform time.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Numerical features: fill gaps with the column mean, then standardize.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Route each column group through its own sub-pipeline; the numerical block
# is listed first, followed by the categorical block.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols),
    ]
)

In [None]:
# Fit the preprocessing pipeline on the loaded frame and transform it.
transformed_data = full_pipeline.fit_transform(df)

# The transformer output may be a sparse matrix (anything exposing
# ``toarray``); densify it so it can be wrapped in a regular DataFrame.
dense_data = (
    transformed_data.toarray()
    if hasattr(transformed_data, "toarray")
    else transformed_data
)

# Recover readable column labels so the saved CSV does not end up with a
# meaningless 0..N-1 header.
try:
    feature_names = full_pipeline.get_feature_names_out()
except AttributeError:
    # Older scikit-learn releases do not provide get_feature_names_out on
    # every estimator; fall back to pandas' default positional labels.
    feature_names = None

# Convert to DataFrame, keeping the original row index so rows stay aligned
# with the source data.
transformed_df = pd.DataFrame(dense_data, columns=feature_names, index=df.index)

transformed_df.head()

In [None]:
# Load step: write the transformed frame to a CSV file, omitting the row index.
OUTPUT_PATH = "processed_data.csv"
transformed_df.to_csv(path_or_buf=OUTPUT_PATH, index=False)

# Confirm where the processed data was written.
print(f"✅ Processed data saved to: {OUTPUT_PATH}")

In [None]:
import joblib

# Destination of the serialized pipeline; defined once so the dump target and
# the confirmation message cannot drift apart.
PIPELINE_PATH = "etl_pipeline_model.pkl"

# Save the fitted pipeline object so the same preprocessing can be reapplied
# later without refitting.
# NOTE: joblib files are pickle-based -- only load them from trusted sources.
joblib.dump(full_pipeline, PIPELINE_PATH)
print(f"💾 Pipeline model saved as '{PIPELINE_PATH}'")