# Imports and Setup

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
import glob
import re

# Models
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Tools
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression

# Setup
sns.set_style("whitegrid")

# Create the model output folder up front. The path matches the directory the
# "Saving the model" cell writes to ("../models"); exist_ok=True makes the call
# safe to re-run and avoids the check-then-create race.
os.makedirs("../models", exist_ok=True)

## 1. Data Loading & Cleaning
We load the cleaned **Advertising (User)** dataset.
* **Target:** `conversion_rate`
* **Action:** We drop the leaky `ROI` column, one-hot encode any remaining non-numeric columns, and fill missing values with 0.

In [13]:
# FIND FILE AUTOMATICALLY
# glob.glob returns an empty list (it does not raise) when nothing matches, so we
# check explicitly and fail with a readable message instead of an IndexError.
DATA_PATTERN = "../Cleaned_Datasets/Advertising/advertising_v6_full_time_features.csv"
files = sorted(glob.glob(DATA_PATTERN))  # sorted -> deterministic pick if the pattern ever uses wildcards
if not files:
    raise FileNotFoundError(f"No dataset found matching: {DATA_PATTERN}")
print(f"Loading file: {files[0]}")
df = pd.read_csv(files[0])

# DEFINE TARGET
TARGET = 'conversion_rate'
if TARGET not in df.columns:
    # Fail here rather than several cells later when X / y are built.
    raise KeyError(f"Target column '{TARGET}' not found in {files[0]}")

# PRE-PROCESSING
# Drop columns that must not be used as predictors.
# ROI is calculated AFTER conversion, so it's a cheat/leak.
drop_cols = ['ROI']
df = df.drop(columns=drop_cols, errors='ignore')  # ignore -> no error if a column is already absent

# Basic Encoding (if any object columns remain): one-hot encode them, dropping the
# first level of each to avoid a redundant dummy column.
df = pd.get_dummies(df, drop_first=True)

# Replace every remaining missing value with 0.
# NOTE(review): this also zero-fills missing values in the TARGET column, if any
# exist - confirm that is intended rather than dropping those rows.
df = df.fillna(0)

print(f"Final Data Shape: {df.shape}")

Loading file: ../Cleaned_Datasets/Advertising/advertising_v6_full_time_features.csv
Final Data Shape: (1000, 33)


## 2. Feature Selection
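To improve model performance and reduce noise, we experiment with feature selection using:
1.  **Correlation Matrix:** Visualizing linear relationships.
2.  **SelectKBest:** A univariate statistical test (`f_regression`) that can pick the top 10 most influential features.

**Note:** In this baseline experiment `SelectKBest` is run with `k='all'`, so every feature is kept, and the correlation matrix is not plotted in the cell below.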

In [14]:
# --- CELL 5: FEATURE SELECTION (BASELINE = ALL) ---

# 1. DEFINE X and Y
X = df.drop(TARGET, axis=1)
y = df[TARGET]

# 2. SANITIZE COLUMN NAMES (Critical for LightGBM)
# Every character that is not alphanumeric (brackets, colons, spaces, dots, ...)
# is replaced with an underscore. Two different raw names can collapse to the same
# sanitized name (e.g. "a b" and "a:b"), so collisions get a numeric suffix to keep
# the column names unique.
sanitized_names = []
used_names = set()
for raw_name in X.columns:
    base_name = "".join(ch if ch.isalnum() else "_" for ch in str(raw_name))
    unique_name = base_name
    suffix = 1
    while unique_name in used_names:
        unique_name = f"{base_name}_{suffix}"
        suffix += 1
    used_names.add(unique_name)
    sanitized_names.append(unique_name)
X.columns = sanitized_names
print("✅ Column names sanitized.")

# 3. SELECT FEATURES
# For Baseline Experiment, we use k='all' to see how the model handles full noise
# NOTE(review): the selector is fitted on the full dataset, before the train/test
# split. That is harmless with k='all', but with a smaller k the test rows would
# influence which features are chosen - fit on the training split in that case.
print("Selecting ALL features for Baseline...")
selector = SelectKBest(score_func=f_regression, k='all')
selector.fit(X, y)

selected_features = X.columns[selector.get_support()]
print(f"✅ Features ready: {len(selected_features)}")

✅ Column names sanitized.
Selecting ALL features for Baseline...
✅ Features ready: 32


## 3. Baseline Model Comparison
We train three "classical" models with default parameters to establish a baseline.
* Random Forest
* XGBoost
* LightGBM

In [15]:
# --- CELL 7: TRAIN BASELINE MODELS ---
# Goal: Compare Train vs Test R2 to detect Overfitting

# Train only on the columns kept by the feature-selection step. With k='all' this
# is every column (same result as before), but a smaller k is now honoured here
# instead of being silently ignored. The boolean mask is used rather than the
# column labels so the lookup does not depend on label uniqueness.
X_selected = X.loc[:, selector.get_support()]
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42
)

# Default hyper-parameters; a fixed seed keeps the comparison reproducible.
models = {
    'RandomForest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42, objective='reg:squarederror'),
    'LightGBM': LGBMRegressor(random_state=42, verbose=-1)
}

print(f"{'Model':<15} | {'Test R2':<10} | {'Train R2':<10} | {'RMSE':<10} | {'Gap (Train-Test)'}")
print("-" * 80)

for name, model in models.items():
    # Train
    model.fit(X_train, y_train)

    # Predict on both splits so train and test scores can be compared
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)

    # Calculate Metrics (RMSE is reported on the test split)
    r2_test = r2_score(y_test, y_pred_test)
    r2_train = r2_score(y_train, y_pred_train)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

    # Gap > 0.10 usually means Overfitting
    gap = r2_train - r2_test

    # Values are padded to the same widths as the header so the columns line up,
    # including when R2 is negative.
    print(f"{name:<15} | {r2_test:<10.4f} | {r2_train:<10.4f} | {rmse:<10.4f} | {gap:.4f}")

print("-" * 80)

Model           | Test R2    | Train R2   | RMSE       | Gap (Train-Test)
--------------------------------------------------------------------------------
RandomForest    | -0.0032     | 0.8565     | 0.0396     | 0.8597
XGBoost         | -0.2495     | 0.9990     | 0.0442     | 1.2486
LightGBM        | -0.0978     | 0.9237     | 0.0414     | 1.0215
--------------------------------------------------------------------------------


## 4. Hyperparameter Optimization
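The cell below re-runs the baseline comparison and records the winner (the model with the highest test R²); this winning model is the one we take forward to optimize using `RandomizedSearchCV`.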

In [16]:
# --- TRAIN BASELINE MODELS & FIND WINNER ---
# Goal: Compare Train vs Test R2 AND save the best model for the next step.

# Train only on the columns kept by the feature-selection step. With k='all' this
# is every column (same result as before), but a smaller k is now honoured here
# instead of being silently ignored. The boolean mask is used rather than the
# column labels so the lookup does not depend on label uniqueness.
X_selected = X.loc[:, selector.get_support()]
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42
)

# Default hyper-parameters; a fixed seed keeps the comparison reproducible.
models = {
    'RandomForest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42, objective='reg:squarederror'),
    'LightGBM': LGBMRegressor(random_state=42, verbose=-1)
}

# --- TRACKING VARIABLES (Crucial for next steps) ---
# Start at -inf so that any finite score, including a negative R2, replaces it.
best_r2 = -float('inf')
best_model_name = None
best_base_model = None

print(f"{'Model':<15} | {'Test R2':<10} | {'Train R2':<10} | {'RMSE':<10} | {'Gap (Train-Test)'}")
print("-" * 80)

for name, model in models.items():
    # Train
    model.fit(X_train, y_train)

    # Predict on both splits so train and test scores can be compared
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)

    # Calculate Metrics (RMSE is reported on the test split)
    r2_test = r2_score(y_test, y_pred_test)
    r2_train = r2_score(y_train, y_pred_train)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

    gap = r2_train - r2_test

    # Values are padded to the same widths as the header so the columns line up,
    # including when R2 is negative.
    print(f"{name:<15} | {r2_test:<10.4f} | {r2_train:<10.4f} | {rmse:<10.4f} | {gap:.4f}")

    # --- LOGIC TO SAVE THE WINNER ---
    # The winner is the model with the highest TEST R2.
    # NOTE(review): choosing the winner on the test split makes the reported score
    # optimistic, and a winner with R2 <= 0 is no better than predicting the mean.
    if r2_test > best_r2:
        best_r2 = r2_test
        best_model_name = name
        best_base_model = model

print("-" * 80)
print(f"🏆 Winner: {best_model_name} (R2: {best_r2:.4f})")

Model           | Test R2    | Train R2   | RMSE       | Gap (Train-Test)
--------------------------------------------------------------------------------
RandomForest    | -0.0032     | 0.8565     | 0.0396     | 0.8597
XGBoost         | -0.2495     | 0.9990     | 0.0442     | 1.2486
LightGBM        | -0.0978     | 0.9237     | 0.0414     | 1.0215
--------------------------------------------------------------------------------
🏆 Winner: RandomForest (R2: -0.0032)


# Saving the model

In [17]:
from datetime import datetime

# 1. Setup Timestamp (minute resolution: two saves within the same minute share a name)
timestamp = datetime.now().strftime("%Y%m%d_%H%M")

# 2. Check variables
# We use 'best_base_model' because we haven't optimized yet (Exp 1)
if best_base_model is None:
    print("❌ Error: No model to save. Did Cell 7 run correctly?")
else:
    # 3. Create Filename
    # 'Exp1' marks this as the Baseline experiment.
    # A negative R2 yields a double dash in the name (e.g. "R2--0.003").
    filename = f"Exp1_User_{best_model_name}_R2-{best_r2:.3f}_{timestamp}.joblib"

    # 4. Define Path (Assumes notebook is in 'src/')
    save_dir = "../models"
    # exist_ok=True: no error if the folder is already there, and no
    # check-then-create race.
    os.makedirs(save_dir, exist_ok=True)

    save_path = os.path.join(save_dir, filename)

    # 5. Save the fitted estimator to disk
    joblib.dump(best_base_model, save_path)
    print(f"✅ Baseline Model saved: {save_path}")

✅ Baseline Model saved: ../models\Exp1_User_RandomForest_R2--0.003_20260111_1832.joblib
