# 📘 Notebook 03b – Advanced Modeling with CatBoost & Optuna

🎯 **Objective:**
Train and tune a CatBoost model using Optuna on session-level data to predict conversion. Log performance metrics and best parameters.

---

In [2]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.1-py3-none-any.whl.metadata (7.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.1-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.8/231.8 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.1 colorlog-6.9.0 optuna-4.2.1


In [4]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting numpy<2.0,>=1.16.0 (from catboost)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m88.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy, catboost
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
Successfully in

In [6]:
!pip install numpy==1.23.5  # Replace with compatible version
!pip install catboost==1.2.0  # Replace with compatible version

Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m81.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.23.5 which is incompatible.
treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.23.5 which is incompatible.
albucore 0.0.23 requires numpy>=1.24.4, but you have numpy 1.23.5 which is incompatible.
blosc2 3.2.0 requires 

Collecting catboost==1.2.0
  Downloading catboost-1.2-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2-cp311-cp311-manylinux2014_x86_64.whl (98.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
  Attempting uninstall: catboost
    Found existing installation: catboost 1.2.7
    Uninstalling catboost-1.2.7:
      Successfully uninstalled catboost-1.2.7
Successfully installed catboost-1.2


In [1]:
import pandas as pd
import numpy as np
import os
import json
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score
from catboost import CatBoostClassifier, Pool
from google.cloud import storage

from google.colab import auth

# 🔐 Authenticate GCP
auth.authenticate_user()

# ✅ Step 1: Download cleaned dataset from GCS
gcs_path = "clickstream/session_features_clean.csv"
local_path = "../data/session_features_clean.csv"
os.makedirs("../data", exist_ok=True)

client = storage.Client()
bucket = client.bucket("boothill2001-dataset")
blob = bucket.blob(gcs_path)
blob.download_to_filename(local_path)
print(f"✅ Downloaded cleaned dataset to {local_path}")

# ✅ Step 2: Load data
df = pd.read_csv(local_path)
X = df.drop(columns=["user_session", "conversion"])
y = df["conversion"]

# ✅ Step 3: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)

# ✅ Step 4: Define Optuna objective

def objective(trial):
    params = {
        "iterations": 300,
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.3),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 10),
        "random_seed": 42,
        "verbose": 0,
        "loss_function": "Logloss",
        "eval_metric": "AUC"
    }
    model = CatBoostClassifier(**params)
    model.fit(train_pool, eval_set=test_pool, early_stopping_rounds=30, verbose=0)
    preds = model.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, preds)

# ✅ Step 5: Run Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

print("\n✅ Best trial:")
print(study.best_trial)
best_params = study.best_trial.params

# ✅ Step 6: Train final model with best params
final_model = CatBoostClassifier(
    iterations=500,
    random_seed=42,
    verbose=100,
    **best_params
)
final_model.fit(train_pool, eval_set=test_pool)

# ✅ Step 7: Evaluate final model
preds_final = final_model.predict_proba(X_test)[:, 1]
preds_label = final_model.predict(X_test)
auc = roc_auc_score(y_test, preds_final)
f1 = f1_score(y_test, preds_label)

metrics = {
    "roc_auc": auc,
    "f1_score": f1,
    "best_params": best_params
}

os.makedirs("../outputs", exist_ok=True)
with open("../outputs/catboost_optuna_summary.json", "w") as f:
    json.dump(metrics, f, indent=2)

# ✅ Save model
os.makedirs("../models", exist_ok=True)
model_path = "../models/catboost_model.cbm"
final_model.save_model(model_path)

# ✅ Upload model to GCS
gcs_model_path = "clickstream/models/catboost_model.cbm"
blob_model = bucket.blob(gcs_model_path)
blob_model.upload_from_filename(model_path)
print(f"✅ Uploaded CatBoost model to GCS → gs://boothill2001-dataset/{gcs_model_path}")

print("\n✅ Advanced modeling complete. Metrics:")
print(metrics)


[I 2025-03-26 09:42:49,766] A new study created in memory with name: no-name-17a90602-ac82-4a47-adff-efbad2c9152d


✅ Downloaded cleaned dataset to ../data/session_features_clean.csv


  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.3),
  "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 10),
[I 2025-03-26 09:42:50,350] Trial 0 finished with value: 1.0 and parameters: {'learning_rate': 0.05185916743101991, 'depth': 7, 'l2_leaf_reg': 4.950278393536486}. Best is trial 0 with value: 1.0.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.3),
  "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 10),
[I 2025-03-26 09:42:50,840] Trial 1 finished with value: 1.0 and parameters: {'learning_rate': 0.06470012257124624, 'depth': 8, 'l2_leaf_reg': 5.4040657348753465}. Best is trial 0 with value: 1.0.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.3),
  "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 10),
[I 2025-03-26 09:42:51,556] Trial 2 finished with value: 1.0 and parameters: {'learning_rate': 0.28962478897054966, 'depth': 9, 'l2_leaf_reg': 1.7770110564427586}. Best is trial 0 with value: 1


✅ Best trial:
FrozenTrial(number=0, state=1, values=[1.0], datetime_start=datetime.datetime(2025, 3, 26, 9, 42, 49, 771423), datetime_complete=datetime.datetime(2025, 3, 26, 9, 42, 50, 349828), params={'learning_rate': 0.05185916743101991, 'depth': 7, 'l2_leaf_reg': 4.950278393536486}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.3, log=True, low=0.01, step=None), 'depth': IntDistribution(high=10, log=False, low=4, step=1), 'l2_leaf_reg': FloatDistribution(high=10.0, log=True, low=1.0, step=None)}, trial_id=0, value=None)
0:	learn: 0.4994757	test: 0.4997434	best: 0.4997434 (0)	total: 2.79ms	remaining: 1.39s
100:	learn: 0.0002440	test: 0.0002881	best: 0.0002881 (100)	total: 455ms	remaining: 1.8s
200:	learn: 0.0001124	test: 0.0001476	best: 0.0001476 (200)	total: 1.38s	remaining: 2.05s
300:	learn: 0.0001040	test: 0.0001380	best: 0.0001380 (300)	total: 1.83s	remaining: 1.21s
400:	learn: 0.0001026	test: 0.0001364	best: 0.0