# 📘 Notebook 02c – Dataset Summary Logging

🎯 **Objective:**
Log summary statistics for `session_features.csv` after feature engineering: the number of sessions, the number of features, the conversion rate, the numbers of converted and non-converted sessions, and a timestamp. Save the output to `../outputs/data_summary.json` for future use in modeling, reporting, and tracking.

---

In [1]:
from google.colab import auth

# 🔐 Authenticate GCP
# Runs the Colab authentication flow for the current user. Presumably this is
# what lets the Cloud Storage client created later in the notebook access the
# bucket — confirm before running this notebook outside of Colab.
auth.authenticate_user()

In [2]:
import pandas as pd
import json
from datetime import datetime
import os
from google.cloud import storage

# ✅ Pull session_features.csv out of GCS into the local data folder
bucket_name = "boothill2001-dataset"
gcs_path = "clickstream/session_features.csv"
local_path = "../data/session_features.csv"

# Make sure the directory that will hold the downloaded file exists.
os.makedirs(os.path.dirname(local_path), exist_ok=True)

# Resolve the bucket and object, then copy the object to the local path.
client = storage.Client()
bucket = client.bucket(bucket_name)
blob = bucket.blob(gcs_path)
blob.download_to_filename(local_path)
print(f"✅ Downloaded session_features from GCS to {local_path}")

# ✅ Load processed session features
df = pd.read_csv(local_path)

# ✅ Calculate summary statistics
# Every row of the CSV is counted as one session. The feature count is the
# number of columns minus one, i.e. everything except the "conversion" label.
n_sessions, n_columns = df.shape
n_features = n_columns - 1  # exclude label
n_converted = int(df["conversion"].sum())
n_not_converted = int(n_sessions - n_converted)

# A file with a header but no rows would otherwise raise ZeroDivisionError;
# report a rate of 0.0 for an empty dataset instead of crashing.
conversion_rate = round(n_converted / n_sessions, 4) if n_sessions else 0.0

# NOTE(review): datetime.now() is called without a timezone, so this is a
# naive local-time timestamp of whatever machine runs the notebook.
timestamp = datetime.now().isoformat()

summary = {
    "n_sessions": n_sessions,
    "n_features": n_features,
    "conversion_rate": conversion_rate,
    "n_converted": n_converted,
    "n_not_converted": n_not_converted,
    "timestamp": timestamp,
}

# ✅ Persist the summary as pretty-printed JSON
os.makedirs("../outputs", exist_ok=True)

# Serialize first, then write the finished text to the output file.
summary_json = json.dumps(summary, indent=2)
with open("../outputs/data_summary.json", "w") as f:
    f.write(summary_json)

# Echo the destination and the logged values in the cell output.
print("\n✅ Summary logged to ../outputs/data_summary.json")
print(summary)


✅ Downloaded session_features from GCS to ../data/session_features.csv

✅ Summary logged to ../outputs/data_summary.json
{'n_sessions': 10000, 'n_features': 10, 'conversion_rate': 0.0545, 'n_converted': 545, 'n_not_converted': 9455, 'timestamp': '2025-03-26T08:47:54.030357'}
