# Imports and Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
import glob
import re

# Models
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Tools
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression

# Setup
sns.set_style("whitegrid")
if not os.path.exists("../../models"):
    os.makedirs("../../models")

## 1. Data Loading & Cleaning
We load the cleaned **Advertising (User)** dataset.
* **Target:** `conversion_rate`
* **Action:** We remove any non-numeric columns and ensure missing values are handled.

In [2]:
# FIND FILE AUTOMATICALLY
files = glob.glob("../Cleaned_Datasets/Advertising/advertising_v6_full_time_features.csv")
print(f"Loading file: {files[0]}")
df = pd.read_csv(files[0])

# DEFINE TARGET
TARGET = 'conversion_rate'

# PRE-PROCESSING
drop_cols = ['ROI']
df = df.drop([c for c in drop_cols if c in df.columns], axis=1)

# Basic Encoding (if any object columns remain)
df = pd.get_dummies(df, drop_first=True)
df = df.fillna(0)

print(f"Final Data Shape: {df.shape}")

Loading file: ../Cleaned_Datasets/Advertising/advertising_v6_full_time_features.csv
Final Data Shape: (1000, 38)


## 2. Feature Selection
To improve model performance and reduce noise, we "play" with features using:
1.  **Correlation Matrix:** Visualizing linear relationships.
2.  **SelectKBest:** Statistical test to pick the top 10 most influential features.

In [3]:
import re
from sklearn.feature_selection import SelectKBest, f_regression

# 1. DEFINE X and Y
X = df.drop(TARGET, axis=1)
y = df[TARGET]

# 2. SANITIZE COLUMN NAMES
X.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X.columns]
print("‚úÖ Column names sanitized.")

# 3. SELECT TOP 10 FEATURES
print("Selecting TOP 10 features for Experiment 4...")
selector = SelectKBest(score_func=f_regression, k=10) 
selector.fit(X, y)

# Get the names of the selected columns
cols = selector.get_support(indices=True)
X_selected = X.iloc[:, cols]

# OVERWRITE X with the reduced version
X = X_selected

print(f"‚úÖ Features ready: {X.shape[1]}")
print(f"   Selected: {X.columns.tolist()}")

‚úÖ Column names sanitized.
Selecting TOP 10 features for Experiment 4...
‚úÖ Features ready: 10
   Selected: ['Unnamed__0', 'click_through_rate', 'view_time', 'cost_per_click', 'age_group_encoded', 'engagement_level_encoded', 'content_type_Text', 'content_type_Video', 'ad_topic_Finance', 'ad_target_audience_Students']


## 3. Baseline Model Comparison
We train three "classical" models with default parameters to establish a baseline.
* Random Forest
* XGBoost
* LightGBM

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    'RandomForest': RandomForestRegressor(random_state=42), 
    'XGBoost': XGBRegressor(random_state=42, objective='reg:squarederror'),
    'LightGBM': LGBMRegressor(random_state=42, verbose=-1)
}

print(f"{'Model':<15} | {'Test R2':<10} | {'Train R2':<10} | {'RMSE':<10} | {'Gap (Train-Test)'}")
print("-" * 80)

for name, model in models.items():
    # Train
    model.fit(X_train, y_train)
    
    # Predict
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train) 
    
    # Calculate Metrics
    r2_test = r2_score(y_test, y_pred_test)
    r2_train = r2_score(y_train, y_pred_train)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    
    # Gap > 0.10 usually means Overfitting
    gap = r2_train - r2_test
    
    print(f"{name:<15} | {r2_test:.4f}     | {r2_train:.4f}     | {rmse:.4f}     | {gap:.4f}")

print("-" * 80)

Model           | Test R2    | Train R2   | RMSE       | Gap (Train-Test)
--------------------------------------------------------------------------------
RandomForest    | 0.7424     | 0.9551     | 0.0104     | 0.2127
XGBoost         | 0.6857     | 0.9913     | 0.0115     | 0.3056
LightGBM        | 0.7047     | 0.9390     | 0.0111     | 0.2343
--------------------------------------------------------------------------------


## 4. Hyperparameter Optimization
We take the winning model and optimize it using `RandomizedSearchCV`.

In [5]:
from sklearn.model_selection import RandomizedSearchCV

# 1. Define Parameter Grids
param_grids = {
    'RandomForest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10]
    },
    'XGBoost': {
        'n_estimators': [100, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0]
    },
    'LightGBM': {
        'n_estimators': [100, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'num_leaves': [31, 50]
    }
}

print(f"{'Model':<15} | {'Test R2':<10} | {'Train R2':<10} | {'Gap':<10} | {'CV Score':<10}")
print("-" * 80)

# Variables to track the "Best of the Best" for saving
best_opt_r2 = -float('inf')
best_opt_model = None
best_model_name = None 

# 2. LOOP THROUGH ALL MODELS
for name, model in models.items():
    if name in param_grids:
        search = RandomizedSearchCV(
            estimator=model,
            param_distributions=param_grids[name],
            n_iter=5,
            cv=3, 
            scoring='r2', 
            n_jobs=-1, 
            random_state=42
        )
        
        search.fit(X_train, y_train)
        
        # Get Best Optimized Model
        opt_model = search.best_estimator_
        
        # Predict
        y_pred_test = opt_model.predict(X_test)
        y_pred_train = opt_model.predict(X_train)
        
        # Metrics
        r2_test = r2_score(y_test, y_pred_test)
        r2_train = r2_score(y_train, y_pred_train)
        gap = r2_train - r2_test
        cv_score = search.best_score_
        
        print(f"{name:<15} | {r2_test:.4f}     | {r2_train:.4f}     | {gap:.4f}     | {cv_score:.4f}")
        
        # Track the ultimate winner
        if r2_test > best_opt_r2:
            best_opt_r2 = r2_test
            best_opt_model = opt_model
            best_model_name = name

print("-" * 80)
print(f"üèÜ Ultimate Winner: {best_model_name} (R2: {best_opt_r2:.4f})")
opt_r2 = best_opt_r2

Model           | Test R2    | Train R2   | Gap        | CV Score  
--------------------------------------------------------------------------------
RandomForest    | 0.7460     | 0.8947     | 0.1487     | 0.6822
XGBoost         | 0.7542     | 0.7791     | 0.0249     | 0.6905
LightGBM        | 0.7290     | 0.8807     | 0.1517     | 0.6631
--------------------------------------------------------------------------------
üèÜ Ultimate Winner: XGBoost (R2: 0.7542)


# Saving the model

In [6]:
# --- SAVE CELL ---
from datetime import datetime
import os
import joblib

timestamp = datetime.now().strftime("%Y%m%d_%H%M")

if 'best_opt_model' in locals():
    filename = f"Exp4_User_{best_model_name}_Opt_R2-{opt_r2:.3f}_{timestamp}.joblib"
    save_dir = "../src/experiments"
    
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
        
    save_path = os.path.join(save_dir, filename)
    joblib.dump(best_opt_model, save_path)
    print(f"‚úÖ Saved Exp4 Model: {save_path}")
else:
    print("‚ùå optimization didn't run. Model not saved.")

‚úÖ Saved Exp4 Model: ../src/experiments\Exp4_User_XGBoost_Opt_R2-0.754_20260112_2116.joblib
