# 📘 Notebook 03 – Modeling Baseline: Logistic Regression & XGBoost

🎯 **Objective:**
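Train and evaluate two baseline classifiers (Logistic Regression and XGBoost) on `session_features_clean.csv` to predict conversion, then save the evaluation summary and the trained models locally and upload the models to GCS.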

---

In [2]:
from google.colab import auth

# 🔐 Authenticate GCP
# Starts Colab's sign-in flow for the current user. The `google.colab` package
# is provided by the Colab runtime, so this cell is expected to run there.
# NOTE(review): presumably this is what lets the later `storage.Client()` call
# pick up credentials — confirm before running outside Colab.
auth.authenticate_user()

In [3]:
import pandas as pd
import numpy as np
import os
import json
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
from google.cloud import storage


# Single seed shared by the train/test split and XGBoost so the run is repeatable.
SEED = 42


def evaluate_model(model, X_eval, y_eval):
    """Score an already-fitted classifier on a held-out set.

    Parameters:
        model: fitted estimator exposing ``predict`` and ``predict_proba``.
        X_eval: feature matrix to score.
        y_eval: true labels aligned with ``X_eval``.

    Returns:
        ``(labels, scores, metrics)`` where ``labels`` is ``model.predict(X_eval)``,
        ``scores`` is column 1 of ``model.predict_proba(X_eval)``, and ``metrics``
        is a dict with keys ``"report"`` (``classification_report`` as a dict)
        and ``"roc_auc"``.
    """
    labels = model.predict(X_eval)
    # Column 1 is the probability of the second class in `model.classes_`.
    # NOTE(review): assumes `conversion` is binary with the positive class
    # ordered last (e.g. 0/1) — confirm against the dataset.
    scores = model.predict_proba(X_eval)[:, 1]
    metrics = {
        "report": classification_report(y_eval, labels, output_dict=True),
        "roc_auc": roc_auc_score(y_eval, scores),
    }
    return labels, scores, metrics


# ✅ Step 1: Download cleaned dataset from GCS
gcs_path = "clickstream/session_features_clean.csv"
local_path = "../data/session_features_clean.csv"
# Derive the directory from the target path so the two cannot drift apart;
# exist_ok keeps the cell safe to re-run.
os.makedirs(os.path.dirname(local_path), exist_ok=True)

client = storage.Client()
# `bucket` is reused by the model-upload cell further down.
bucket = client.bucket("boothill2001-dataset")
blob = bucket.blob(gcs_path)
blob.download_to_filename(local_path)
print(f"✅ Downloaded cleaned dataset to {local_path}")

# ✅ Step 2: Load data
df = pd.read_csv(local_path)

# ✅ Step 3: Define features and target
# `user_session` is dropped together with the target; every remaining column
# is passed to the models as a feature.
X = df.drop(columns=["user_session", "conversion"])
y = df["conversion"]

# ✅ Step 4: Train/test split
# Stratify on the target so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=SEED
)

# ✅ Step 5: Logistic Regression
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred_log, y_prob_log, logreg_metrics = evaluate_model(logreg, X_test, y_test)

# ✅ Step 6: XGBoost
# `use_label_encoder` is intentionally not passed: the recorded run of this cell
# shows XGBoost warning that the parameter is not used.
xgb = XGBClassifier(eval_metric="logloss", random_state=SEED)
xgb.fit(X_train, y_train)
y_pred_xgb, y_prob_xgb, xgb_metrics = evaluate_model(xgb, X_test, y_test)

# ✅ Step 7: Evaluation
results = {
    "logistic_regression": logreg_metrics,
    "xgboost": xgb_metrics,
}

# ✅ Save evaluation summary
os.makedirs("../outputs", exist_ok=True)
with open("../outputs/model_baseline_summary.json", "w") as f:
    json.dump(results, f, indent=2)

print("\n✅ Baseline model evaluation saved to ../outputs/model_baseline_summary.json")


✅ Downloaded cleaned dataset to ../data/session_features_clean.csv

✅ Baseline model evaluation saved to ../outputs/model_baseline_summary.json


Parameters: { "use_label_encoder" } are not used.



In [4]:
import joblib

# ✅ Make sure the local artifact directory exists (safe to re-run)
os.makedirs("../models", exist_ok=True)

# Logistic Regression → joblib pickle file
joblib.dump(logreg, filename="../models/logreg_model.pkl")

# XGBoost → model file written by the estimator itself (as JSON for compatibility)
xgb.save_model("../models/xgb_model.json")

print("✅ Models saved locally to ../models/")


✅ Models saved locally to ../models/


In [5]:
# ✅ Upload models to GCS
# Maps a display name (used only in the log line) to the destination object
# path inside the bucket.
gcs_model_paths = {
    "logreg": "clickstream/models/logreg_model.pkl",
    "xgboost": "clickstream/models/xgb_model.json"
}

# `bucket` is the handle created in the dataset-download cell; that cell must
# have run in this kernel before this one.
for name, path in gcs_model_paths.items():
    # The local artifact shares its file name with the GCS object.
    local_model_path = f"../models/{os.path.basename(path)}"
    blob = bucket.blob(path)
    blob.upload_from_filename(local_model_path)
    # Report the bucket actually written to instead of a hard-coded name, so
    # the message stays correct if the bucket handle ever changes.
    print(f"✅ Uploaded {name} model to GCS → gs://{bucket.name}/{path}")


✅ Uploaded logreg model to GCS → gs://boothill2001-dataset/clickstream/models/logreg_model.pkl
✅ Uploaded xgboost model to GCS → gs://boothill2001-dataset/clickstream/models/xgb_model.json
