# 📘 Notebook 02d – Feature Cleaning & Encoding

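🎯 **Objective:**
Clean and preprocess the per-session features:
- Handle missing values (the cell below currently only reports per-column null counts)
- Encode categorical columns (not yet implemented in the cell below)
- Scale numeric features if needed (currently `avg_price` only, standardized into `avg_price_scaled`)
- Save the cleaned dataset and write a JSON summary log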

---

In [None]:
import pandas as pd
import numpy as np
import os
import json
from sklearn.preprocessing import StandardScaler
from google.cloud import storage

# ✅ Step 1: Download session_features.csv from GCS
# gcs_path is the object key inside the bucket; local_path is where the copy lands.
gcs_path = "clickstream/session_features.csv"
local_path = "../data/session_features.csv"

# The destination directory has to exist before the blob is written to disk.
os.makedirs(os.path.dirname(local_path), exist_ok=True)

# `bucket` is reused further down for the upload of the cleaned file.
client = storage.Client()
bucket = client.bucket("boothill2001-dataset")
blob = bucket.blob(gcs_path)
blob.download_to_filename(local_path)

print(f"✅ Downloaded session_features.csv to {local_path}")

# ✅ Step 2: Load DataFrame
df = pd.read_csv(local_path)

# ✅ Step 3: Missing-value report
# Only counts nulls per column (stored in the feature log as "missing_before");
# nothing is imputed or dropped here.
null_counts = df.isna().sum()
missing_report = null_counts.to_dict()

# ✅ Step 4: Scale numeric features (if applicable)
# Standardizes `avg_price` into a new `avg_price_scaled` column (the original
# column is kept). `scaled_mean` / `scaled_std` are sanity-check values for the
# feature log and stay None when the column is absent.
if 'avg_price' in df.columns:
    scaler = StandardScaler()
    df['avg_price_scaled'] = scaler.fit_transform(df[['avg_price']])
    scaled_mean = float(df['avg_price_scaled'].mean())
    # StandardScaler divides by the population standard deviation (ddof=0),
    # while pandas' Series.std() defaults to the sample estimator (ddof=1).
    # Using ddof=0 makes the logged value match what the scaler normalised to
    # (1.0 instead of ~1.00005) and avoids NaN for a single-row frame, which
    # json.dump would otherwise write as the non-standard token `NaN`.
    scaled_std = float(df['avg_price_scaled'].std(ddof=0))
else:
    scaled_mean = None
    scaled_std = None

# ✅ Step 5: Save cleaned dataset
# Written without the index column; clean_path is reused by the upload step.
clean_path = "../data/session_features_clean.csv"
df.to_csv(path_or_buf=clean_path, index=False)

print(f"✅ Saved cleaned dataset to {clean_path}")

# ✅ Step 6: Log feature transformation summary
# Summary of this run: null counts taken right after loading, the final column
# list (including any column added by scaling), and the scaled-column stats.
feature_log = dict(
    missing_before=missing_report,
    columns=[*df.columns],
    avg_price_scaled_mean=scaled_mean,
    avg_price_scaled_std=scaled_std,
)

# Persist the summary as pretty-printed JSON, creating the folder on first run.
os.makedirs("../outputs", exist_ok=True)
with open("../outputs/feature_log.json", "w") as log_file:
    json.dump(feature_log, log_file, indent=2)

# Echo the same summary into the notebook output.
print("\n✅ Feature log saved to ../outputs/feature_log.json")
print(feature_log)

# ✅ Optional: Upload cleaned file to GCS
# Pushes the cleaned CSV written in Step 5 back to the bucket the raw file was
# downloaded from, under a separate object key.
upload_path = "clickstream/session_features_clean.csv"
blob_clean = bucket.blob(upload_path)
blob_clean.upload_from_filename(clean_path)
# Take the bucket name from the handle instead of repeating the literal, so the
# message always names the bucket that was actually written to.
print(f"✅ Uploaded cleaned dataset to GCS → gs://{bucket.name}/{upload_path}")


✅ Downloaded session_features.csv to ../data/session_features.csv
✅ Saved cleaned dataset to ../data/session_features_clean.csv

✅ Feature log saved to ../outputs/feature_log.json
{'missing_before': {'user_session': 0, 'num_events': 0, 'num_views': 0, 'num_carts': 0, 'num_purchases': 0, 'avg_price': 0, 'max_price': 0, 'num_categories': 0, 'num_brands': 0, 'session_duration': 0, 'conversion': 0}, 'columns': ['user_session', 'num_events', 'num_views', 'num_carts', 'num_purchases', 'avg_price', 'max_price', 'num_categories', 'num_brands', 'session_duration', 'conversion', 'avg_price_scaled'], 'avg_price_scaled_mean': 8.810729923425242e-17, 'avg_price_scaled_std': 1.0000500037503157}
✅ Uploaded cleaned dataset to GCS → gs://boothill2001-dataset/clickstream/session_features_clean.csv
