# Imports and Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
import glob
import re
from datetime import datetime


# Models
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Sklearn Tools
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression

# Setup plotting: use seaborn's "whitegrid" style for the figures in this notebook
sns.set_style("whitegrid")

# Create models folder if it doesn't exist.
# exist_ok=True makes this idempotent on re-runs and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs("../../models", exist_ok=True)

## 1. Target Definition & Cleaning
We are predicting **Overcost** (Market Efficiency).

* **Goal:** Predict if a campaign will be under or over budget based on its settings.
* **Formula:** `Overcost = approved_budget - media_cost_usd`
    * **Positive (+):** Under Budget (Good efficiency).
    * **Negative (-):** Over Budget (Bad efficiency).
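* **Critical Step:** We MUST drop `approved_budget` and `media_cost_usd` from the input features (X). If we include them, the model will "cheat" by simply doing the subtraction itself (Data Leakage). The code below also drops `campaign_budget_usd` (listed with the leakage columns) and the identifier column `campaign_item_id`.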

In [2]:
# --- LOAD DATA, BUILD TARGET, ENCODE FEATURES ---
# Defines two notebook-level names used by the following cells:
#   df     -> DataFrame holding the encoded features plus the target column
#   TARGET -> name of the target column
TARGET = 'Overcost'

# FIND FILE AUTOMATICALLY
# glob returns matches in arbitrary, OS-dependent order; sorting makes the
# choice of files[0] reproducible when several CSVs are present.
files = sorted(glob.glob("../Cleaned_Datasets/Marketing/*.csv"))

if len(files) == 0:
    # Fail loudly instead of printing and carrying on: otherwise the next cell
    # dies with a NameError on `df`, or silently reuses a stale `df` that is
    # still alive in the kernel from an earlier run.
    raise FileNotFoundError(
        "No files found matching ../Cleaned_Datasets/Marketing/*.csv. "
        "Check your directory path!"
    )

print(f"Loading file: {files[0]}")
df = pd.read_csv(files[0])

# 1. CREATE TARGET (if not exists)
if TARGET not in df.columns:
    print("Creating 'Overcost' column...")
    budget_col = 'approved_budget'
    cost_col = 'media_cost_usd'

    # Force numeric conversion to avoid string errors.
    # errors='coerce' turns unparseable values into NaN.
    df[budget_col] = pd.to_numeric(df[budget_col], errors='coerce')
    df[cost_col] = pd.to_numeric(df[cost_col], errors='coerce')

    # Calculate Target: positive = under budget, negative = over budget
    df[TARGET] = df[budget_col] - df[cost_col]

# Rows without a target cannot be used for supervised training. They must be
# removed BEFORE the blanket fillna(0) below, which would otherwise relabel
# them as "Overcost = 0" (exactly on budget) and feed wrong labels to the models.
n_missing_target = int(df[TARGET].isna().sum())
if n_missing_target:
    print(f"Dropping {n_missing_target} rows with missing '{TARGET}'")
    df = df.dropna(subset=[TARGET])

# 2. DROP LEAKAGE & IDs
# The budget/cost columns define the target; the item id is only an identifier.
leakage = ['approved_budget', 'campaign_budget_usd', 'media_cost_usd', 'campaign_item_id']
df = df.drop(columns=[c for c in leakage if c in df.columns])

# Encode Categoricals (one-hot, dropping the first level of each category)
df = pd.get_dummies(df, drop_first=True)
# Remaining NaNs are in feature columns only; fill them with 0
df = df.fillna(0)

print(f"Final Data Shape: {df.shape}")

Loading file: ../Cleaned_Datasets/Marketing\marketing_cleaned_prepared.csv
Creating 'Overcost' column...
Final Data Shape: (72206, 240)


## 2. Feature Selection
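We analyze which campaign settings (Day, Category, Platform, etc.) have the biggest impact on `Overcost`.
* **Visual Check:** Correlation Heatmap.
* **Statistical Check:** `SelectKBest` scores.

**Note:** the cell below only fits `SelectKBest` with `k='all'`, so every feature is kept for the baseline; it does not draw the heatmap or print the scores.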

In [3]:
# --- CELL 5: FEATURE SELECTION (BASELINE = ALL) ---

def _sanitize_column_names(columns):
    """Return model-safe, unique column names.

    Every character that is not alphanumeric (per ``str.isalnum``) is replaced
    by ``_``. Two different raw names can collapse to the same sanitized name
    (e.g. ``"a b"`` and ``"a-b"`` both become ``"a_b"``); later duplicates get
    a numeric suffix (``"a_b_1"``, ``"a_b_2"``, ...) so that every returned
    name is distinct. Names that do not collide are returned exactly as the
    plain character replacement produces them.

    Args:
        columns: iterable of raw column labels (converted with ``str``).

    Returns:
        list[str]: sanitized names, same length and order as ``columns``.
    """
    used = set()
    clean_names = []
    for raw in columns:
        base = "".join(ch if ch.isalnum() else "_" for ch in str(raw))
        candidate = base
        suffix = 1
        # Keep bumping the suffix until the name is unused
        while candidate in used:
            candidate = f"{base}_{suffix}"
            suffix += 1
        used.add(candidate)
        clean_names.append(candidate)
    return clean_names


# 1. DEFINE X and Y
X = df.drop(TARGET, axis=1)
y = df[TARGET]

# 2. SANITIZE COLUMN NAMES
X.columns = _sanitize_column_names(X.columns)
print("‚úÖ Column names sanitized.")

# 3. SELECT FEATURES
# k='all' keeps every column, so for the baseline the selector is a pass-through
# and `selected_features` lists all columns of X.
# NOTE(review): the selector is fitted on the full data set; if k is ever lowered,
# fit it on the training split only to avoid leaking test information.
print("Selecting ALL features for Baseline...")
selector = SelectKBest(score_func=f_regression, k='all') 
selector.fit(X, y)

selected_features = X.columns[selector.get_support()]
print(f"‚úÖ Features ready: {len(selected_features)}")

‚úÖ Column names sanitized.
Selecting ALL features for Baseline...
‚úÖ Features ready: 239


## 3. Baseline Model Comparison
We train our three regressors to predict the `Overcost`.
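* **Metric:** We use **RMSE (Root Mean Squared Error)**, and also report R² on the train and test sets to spot overfitting.
* **Interpretation:** Lower RMSE is better. It is expressed in dollars ($) and measures the typical size of a prediction error; because errors are squared before averaging, large misses weigh more heavily than they would in a plain average of absolute errors (MAE).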

In [4]:
# --- CELL 7: TRAIN BASELINE MODELS ---
# Goal: Compare Train vs Test R2 to detect Overfitting

# 80/20 split with a fixed seed; these four names are reused by the tuning cell
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Default-configured regressors, keyed by the label shown in the report.
# This dict is reused by the tuning cell.
models = dict(
    RandomForest=RandomForestRegressor(random_state=42),
    XGBoost=XGBRegressor(random_state=42, objective='reg:squarederror'),
    LightGBM=LGBMRegressor(random_state=42, verbose=-1),
)

divider = "-" * 80
print(f"{'Model':<15} | {'Test R2':<10} | {'Train R2':<10} | {'RMSE':<10} | {'Gap (Train-Test)'}")
print(divider)

for name, model in models.items():
    model.fit(X_train, y_train)

    # Held-out performance: R2 and RMSE on the test split
    holdout_pred = model.predict(X_test)
    r2_test = r2_score(y_test, holdout_pred)
    rmse = np.sqrt(mean_squared_error(y_test, holdout_pred))

    # In-sample R2; a large train-test gap signals overfitting
    r2_train = r2_score(y_train, model.predict(X_train))
    gap = r2_train - r2_test

    print(f"{name:<15} | {r2_test:.4f}     | {r2_train:.4f}     | {rmse:.4f}     | {gap:.4f}")

print(divider)

Model           | Test R2    | Train R2   | RMSE       | Gap (Train-Test)
--------------------------------------------------------------------------------
RandomForest    | 0.9534     | 0.9949     | 118623.2481     | 0.0416
XGBoost         | 0.7916     | 0.9481     | 250796.5554     | 0.1565
LightGBM        | 0.8270     | 0.9196     | 228474.4521     | 0.0926
--------------------------------------------------------------------------------


## 4. Optimization & Saving
We tune the hyperparameters of all three models with `RandomizedSearchCV` to see if we can reduce the error (RMSE) further, and keep the best one.
Finally, we save that model using `joblib` with a timestamped filename that also records the model name and its RMSE.

In [5]:
# --- CELL 8: OPTIMIZE ALL MODELS (MARKET) ---
# Tunes every model in `models` that has an entry in `param_grids`, using a
# randomized search with 3-fold cross-validation on the training split.
#
# Model selection uses the cross-validated RMSE, NOT the test RMSE: picking the
# winner on the test set would make the reported test RMSE optimistically
# biased. The test split is only used to report the held-out error.
#
# Defines for the save cell: best_opt_model, best_model_name, opt_rmse.
from sklearn.model_selection import RandomizedSearchCV

# Candidate hyperparameter values sampled by the randomized search
param_grids = {
    'RandomForest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5]
    },
    'XGBoost': {
        'n_estimators': [100, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },
    'LightGBM': {
        'n_estimators': [100, 300],
        'learning_rate': [0.01, 0.1],
        'num_leaves': [31, 50]
    }
}

print(f"{'Model':<15} | {'Test R2':<10} | {'RMSE':<10} | {'CV RMSE':<10}")
print("-" * 65)

best_cv_rmse = float('inf')    # selection criterion (cross-validated RMSE)
best_opt_rmse = float('inf')   # held-out test RMSE of the selected model
best_opt_model = None
best_model_name = None 

for name, model in models.items():
    if name not in param_grids:
        continue

    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grids[name],
        n_iter=5, 
        cv=3, 
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1, 
        random_state=42
    )
    search.fit(X_train, y_train)

    # Best configuration, refit on the whole training split by the search
    opt_model = search.best_estimator_

    # Metrics
    y_pred_test = opt_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    r2_test = r2_score(y_test, y_pred_test)
    # best_score_ is the negated RMSE, so take the absolute value
    cv_rmse = abs(search.best_score_)

    print(f"{name:<15} | {r2_test:.4f}     | {rmse:.0f}     | {cv_rmse:.0f}")

    # Track the ultimate winner (lowest cross-validated RMSE)
    if cv_rmse < best_cv_rmse:
        best_cv_rmse = cv_rmse
        best_opt_rmse = rmse
        best_opt_model = opt_model
        best_model_name = name

print("-" * 65)
print(f"üèÜ Ultimate Winner: {best_model_name} (RMSE: {best_opt_rmse:.0f})")
# Important: We set 'opt_rmse' so the Save Cell works correctly
opt_rmse = best_opt_rmse

Model           | Test R2    | RMSE       | CV RMSE   
-----------------------------------------------------------------
RandomForest    | 0.9546     | 116999     | 116823
XGBoost         | 0.8043     | 243021     | 233975
LightGBM        | 0.8455     | 215920     | 214927
-----------------------------------------------------------------
üèÜ Ultimate Winner: RandomForest (RMSE: 116999)


In [6]:
# --- SAVE CELL (Market Engine) ---
# Persists the tuned winner with joblib under a timestamped filename that also
# records the model name and its RMSE.
from datetime import datetime
import os
import joblib

timestamp = datetime.now().strftime("%Y%m%d_%H%M")

# The optimization cell initializes best_opt_model to None, so checking that the
# name merely exists is not enough: a None value means no model was tuned, and
# dumping it would write a useless "model" file. Check the value instead.
if globals().get('best_opt_model') is not None:
    # Save as Exp3 (Optimized)
    filename = f"Exp3_Market_{best_model_name}_Opt_RMSE-{opt_rmse:.0f}_{timestamp}.joblib"
    save_dir = "../src/experiments"

    # Idempotent directory creation (no separate exists() check needed)
    os.makedirs(save_dir, exist_ok=True)

    save_path = os.path.join(save_dir, filename)
    joblib.dump(best_opt_model, save_path)
    print(f"‚úÖ Saved Exp3 Market Model: {save_path}")
else:
    print("‚ùå optimization didn't run.")

‚úÖ Saved Exp3 Market Model: ../src/experiments\Exp3_Market_RandomForest_Opt_RMSE-116999_20260111_2116.joblib
