# Imports and Setup

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
import glob
import re
from datetime import datetime


# Models
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Sklearn Tools
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression

# Setup plotting
sns.set_style("whitegrid")

# Create models folder if it doesn't exist
if not os.path.exists("../../models"):
    os.makedirs("../../models")

## 1. Target Definition & Cleaning
We are predicting **Overcost** (Market Efficiency).

* **Goal:** Predict if a campaign will be under or over budget based on its settings.
* **Formula:** `Overcost = approved_budget - media_cost_usd`
    * **Positive (+):** Under Budget (Good efficiency).
    * **Negative (-):** Over Budget (Bad efficiency).
* **Critical Step:** We MUST drop `approved_budget` and `media_cost_usd` from the input features (X). If we include them, the model will "cheat" by simply doing the subtraction itself (Data Leakage).

In [8]:
# FIND FILE AUTOMATICALLY
files = glob.glob("../Cleaned_Datasets/Marketing/*.csv")

if len(files) == 0:
    print("‚ùå Error: No files found. Check your directory path!")
else:
    print(f"Loading file: {files[0]}")
    df = pd.read_csv(files[0])

    # 1. CREATE TARGET (if not exists)
    if 'Overcost' not in df.columns:
        print("Creating 'Overcost' column...")
        budget_col = 'approved_budget' 
        cost_col = 'media_cost_usd'
        
        # Force numeric conversion to avoid string errors
        df[budget_col] = pd.to_numeric(df[budget_col], errors='coerce')
        df[cost_col] = pd.to_numeric(df[cost_col], errors='coerce')
        
        # Calculate Target
        df['Overcost'] = df[budget_col] - df[cost_col]

    TARGET = 'Overcost'

    # 2. DROP LEAKAGE & IDs
    leakage = ['approved_budget', 'campaign_budget_usd', 'media_cost_usd', 'campaign_item_id']
    df = df.drop([c for c in leakage if c in df.columns], axis=1)

    # Encode Categoricals
    df = pd.get_dummies(df, drop_first=True)
    df = df.fillna(0)

    print(f"Final Data Shape: {df.shape}")

Loading file: ../Cleaned_Datasets/Marketing\marketing_cleaned_prepared.csv
Creating 'Overcost' column...
Final Data Shape: (72206, 240)


## 2. Feature Selection
We analyze which campaign settings (Day, Category, Platform, etc.) have the biggest impact on the cost.
* **Visual Check:** Correlation Heatmap.
* **Statistical Check:** `SelectKBest` scores.

In [None]:
import re
from sklearn.feature_selection import SelectKBest, f_regression

# 1. DEFINE X and Y
X = df.drop(TARGET, axis=1)
y = df[TARGET]

# 2. SANITIZE COLUMN NAMES
X.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X.columns]
print("‚úÖ Column names sanitized.")

# 3. SELECT TOP 50 FEATURES
print("Selecting TOP 50 features for Experiment 4...")

selector = SelectKBest(score_func=f_regression, k=50) 
selector.fit(X, y)

# Get the names of the selected columns
cols = selector.get_support(indices=True)
X_selected = X.iloc[:, cols]

# OVERWRITE X with the reduced version
X = X_selected

print(f"‚úÖ Features ready: {X.shape[1]}")
print(f"   Selected: {X.columns.tolist()}")

‚úÖ Column names sanitized.
Selecting TOP 20 features for Experiment 4...
‚úÖ Features ready: 50
   Selected: ['Unnamed__0', 'no_of_days', 'impressions', 'clicks', 'search_tag_cat_Retargeting', 'search_tag_cat_Youtube', 'time_2022_08_05', 'time_2022_08_08', 'time_2022_08_09', 'time_2022_08_10', 'time_2022_08_11', 'time_2022_08_12', 'time_2022_08_13', 'time_2022_08_14', 'time_2022_08_15', 'time_2022_08_16', 'time_2022_08_17', 'time_2022_08_18', 'time_2022_08_19', 'time_2022_08_20', 'time_2022_08_21', 'time_2022_08_22', 'time_2022_08_23', 'time_2022_08_24', 'time_2022_08_25', 'time_2022_08_26', 'time_2022_08_27', 'time_2022_08_28', 'time_2022_08_29', 'time_2022_08_30', 'time_2022_08_31', 'time_2022_09_01', 'time_2022_09_02', 'time_2022_09_03', 'time_2022_09_04', 'time_2022_09_05', 'time_2022_09_06', 'time_2022_09_07', 'time_2022_09_08', 'time_2022_09_09', 'time_2022_09_10', 'time_2022_09_11', 'time_2022_09_12', 'time_2022_09_13', 'time_2022_09_14', 'time_2022_09_15', 'time_2022_09_16', '

## 3. Baseline Model Comparison
We train our three regressors to predict the `Overcost`.
* **Metric:** We use **RMSE (Root Mean Squared Error)**.
* **Interpretation:** Lower RMSE is better. It represents how many dollars ($) our prediction is off by on average.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    'RandomForest': RandomForestRegressor(random_state=42), 
    'XGBoost': XGBRegressor(random_state=42, objective='reg:squarederror'),
    'LightGBM': LGBMRegressor(random_state=42, verbose=-1)
}

print(f"{'Model':<15} | {'Test R2':<10} | {'Train R2':<10} | {'RMSE':<10} | {'Gap (Train-Test)'}")
print("-" * 80)

for name, model in models.items():
    model.fit(X_train, y_train)
    
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train) 
    
    r2_test = r2_score(y_test, y_pred_test)
    r2_train = r2_score(y_train, y_pred_train)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    
    gap = r2_train - r2_test
    
    print(f"{name:<15} | {r2_test:.4f}     | {r2_train:.4f}     | {rmse:.4f}     | {gap:.4f}")

print("-" * 80)

Model           | Test R2    | Train R2   | RMSE       | Gap (Train-Test)
--------------------------------------------------------------------------------
RandomForest    | 0.9722     | 0.9960     | 91665.8600     | 0.0238
XGBoost         | 0.8177     | 0.9427     | 234578.5560     | 0.1251
LightGBM        | 0.8278     | 0.9166     | 228000.4128     | 0.0888
--------------------------------------------------------------------------------


## 4. Optimization & Saving
We take the winning model and tune its hyperparameters to see if we can reduce the error (RMSE) further.
Finally, we save the model using `joblib` with a versioned filename.

In [None]:
from sklearn.model_selection import RandomizedSearchCV

param_grids = {
    'RandomForest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5]
    },
    'XGBoost': {
        'n_estimators': [100, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },
    'LightGBM': {
        'n_estimators': [100, 300],
        'learning_rate': [0.01, 0.1],
        'num_leaves': [31, 50]
    }
}

print(f"{'Model':<15} | {'Test R2':<10} | {'RMSE':<10} | {'CV RMSE':<10}")
print("-" * 65)

best_opt_rmse = float('inf')
best_opt_model = None
best_model_name = None 

for name, model in models.items():
    if name in param_grids:
        search = RandomizedSearchCV(
            estimator=model,
            param_distributions=param_grids[name],
            n_iter=5, 
            cv=3, 
            scoring='neg_root_mean_squared_error', 
            n_jobs=-1, 
            random_state=42
        )
        
        search.fit(X_train, y_train)
        
        opt_model = search.best_estimator_
        
        # Metrics
        y_pred_test = opt_model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
        r2_test = r2_score(y_test, y_pred_test)
        cv_rmse = abs(search.best_score_)
        
        print(f"{name:<15} | {r2_test:.4f}     | {rmse:.0f}     | {cv_rmse:.0f}")
        
        if rmse < best_opt_rmse:
            best_opt_rmse = rmse
            best_opt_model = opt_model
            best_model_name = name

print("-" * 65)
print(f"üèÜ Ultimate Winner: {best_model_name} (RMSE: {best_opt_rmse:.0f})")
opt_rmse = best_opt_rmse

Model           | Test R2    | RMSE       | CV RMSE   
-----------------------------------------------------------------
RandomForest    | 0.9723     | 91390     | 106678
XGBoost         | 0.8181     | 234319     | 227209
LightGBM        | 0.8515     | 211713     | 208463
-----------------------------------------------------------------
üèÜ Ultimate Winner: RandomForest (RMSE: 91390)


In [None]:
# --- SAVE CELL (Market Engine) ---
from datetime import datetime
import os
import joblib

timestamp = datetime.now().strftime("%Y%m%d_%H%M")

if 'best_opt_model' in locals():
    # Save as Exp3 (Optimized)
    filename = f"Exp4_Market_{best_model_name}_Opt_RMSE-{opt_rmse:.0f}_{timestamp}.joblib"
    save_dir = "../src/models"
    
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
        
    save_path = os.path.join(save_dir, filename)
    joblib.dump(best_opt_model, save_path)
    print(f"‚úÖ Saved Exp4 Market Model: {save_path}")
else:
    print("‚ùå optimization didn't run.")

‚úÖ Saved Exp4 Market Model: ../src/experiments\Exp4_Market_RandomForest_Opt_RMSE-91390_20260112_1528.joblib
