## Model Training

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
df = pd.read_csv("preprocessed_forecast_data.csv")

In [3]:
df.head()

Unnamed: 0,Marketing_Spend,Product_Price,Competitor_Price,Region,Product_Category,Channel_Type,Festival_Week,Store_Type,Weekly_Sales
0,4370.86,35.43,33.77,North,Electronics,Online,No,Flagship,682
1,9556.43,51.28,54.78,South,Furniture,In-store,No,Flagship,864
2,7587.95,18.93,13.97,East,Grocery,In-store,No,Flagship,1005
3,6387.93,50.22,49.4,North,Grocery,Online,No,Online-only,770
4,2404.17,28.28,34.36,North,Electronics,In-store,No,Franchise,596


Let us prepare the data for model training

### One Hot Encoding categorical features

In [4]:
df_encoded = pd.get_dummies(df, columns=['Region', 'Product_Category', 'Store_Type', 'Channel_Type', 'Festival_Week'], drop_first=True, dtype=int)

In [5]:
df_encoded.head()

Unnamed: 0,Marketing_Spend,Product_Price,Competitor_Price,Weekly_Sales,Region_North,Region_South,Region_West,Product_Category_Electronics,Product_Category_Furniture,Product_Category_Grocery,Product_Category_Toys,Store_Type_Franchise,Store_Type_Online-only,Channel_Type_Online,Festival_Week_Yes
0,4370.86,35.43,33.77,682,1,0,0,1,0,0,0,0,0,1,0
1,9556.43,51.28,54.78,864,0,1,0,0,1,0,0,0,0,0,0
2,7587.95,18.93,13.97,1005,0,0,0,0,0,1,0,0,0,0,0
3,6387.93,50.22,49.4,770,1,0,0,0,0,1,0,0,1,1,0
4,2404.17,28.28,34.36,596,1,0,0,1,0,0,0,1,0,0,0


Before we move-on to model training, let us scale the numerical data using standard scaling

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

numeric_cols = ['Marketing_Spend','Product_Price','Competitor_Price']
numeric_cols_scaled = scaler.fit_transform(df_encoded[numeric_cols])
df_encoded[numeric_cols] = numeric_cols_scaled

In [7]:
df_encoded.head()

Unnamed: 0,Marketing_Spend,Product_Price,Competitor_Price,Weekly_Sales,Region_North,Region_South,Region_West,Product_Category_Electronics,Product_Category_Furniture,Product_Category_Grocery,Product_Category_Toys,Store_Type_Franchise,Store_Type_Online-only,Channel_Type_Online,Festival_Week_Yes
0,-0.433337,-0.755065,-0.804858,682,1,0,0,1,0,0,0,0,0,1,0
1,1.56491,-0.145279,-0.010589,864,0,1,0,0,1,0,0,0,0,0,0
2,0.806361,-1.389858,-1.553384,1005,0,0,0,0,0,1,0,0,0,0,0
3,0.343936,-0.186059,-0.213976,770,1,0,0,0,0,1,0,0,1,1,0
4,-1.191196,-1.030142,-0.782554,596,1,0,0,1,0,0,0,1,0,0,0


Now, we can separte the features and target

In [8]:
X = df_encoded.drop(columns = ["Weekly_Sales"], axis=1)
y = df_encoded["Weekly_Sales"]

### Train Test Split

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Model Building

#### Training Baseline Models

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor


#### Evaliation Metrics

In [11]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [12]:
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "AdaBoost": AdaBoostRegressor(),
    "SVR": SVR(),
    "XGBoost": XGBRegressor(tree_method='gpu_hist', gpu_id=0, single_precision_histogram=True),
    "Decision Tree": DecisionTreeRegressor()
}

In [13]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"📊 {name}")
    print(f"MSE: {mean_squared_error(y_test, y_pred):.2f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.2f}")
    print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
    print(f"R2 Score: {r2_score(y_test, y_pred):.2f}")
    print("=" * 50)

📊 Linear Regression
MSE: 2534.71
RMSE: 50.35
MAE: 40.35
R2 Score: 0.89
📊 Random Forest
MSE: 2741.82
RMSE: 52.36
MAE: 41.95
R2 Score: 0.88
📊 Gradient Boosting
MSE: 2563.85
RMSE: 50.63
MAE: 40.62
R2 Score: 0.88
📊 AdaBoost
MSE: 2799.21
RMSE: 52.91
MAE: 42.36
R2 Score: 0.87
📊 SVR
MSE: 2640.37
RMSE: 51.38
MAE: 41.14
R2 Score: 0.88
📊 XGBoost
MSE: 2619.32
RMSE: 51.18
MAE: 41.07
R2 Score: 0.88
📊 Decision Tree
MSE: 5422.28
RMSE: 73.64
MAE: 58.83
R2 Score: 0.76


Even though the above results appear promissing, it is the best practice to perform hyperparameter tuning to fine-tune the model and achieve the best performance.

Let us hyperparameter tune:

- Gradient Boosting Regressor

- XGBRegressor

- SVR

- Random Forest

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

In [15]:
param_grid = [
    {
        'model': [GradientBoostingRegressor()],
        'model__n_estimators': [100, 200, 300],
        'model__learning_rate': [0.01, 0.05, 0.1],
        'model__max_depth': [3, 5, 7],
        'model__subsample': [0.8, 1.0],
    },
    {
        'model': [XGBRegressor(tree_method='gpu_hist', gpu_id=0, single_precision_histogram=True)],
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.01, 0.1],
        'model__max_depth': [3, 5, 7],
        'model__subsample': [0.8, 1.0],
        'model__colsample_bytree': [0.8, 1.0]
    },
    {
        'model': [SVR()],
        'model__C': [0.1, 1, 10],
        'model__gamma': ['scale', 'auto'],
        'model__kernel': ['rbf', 'linear']
    },
    {
        'model': [RandomForestRegressor()],
        'model__n_estimators': [100, 200],
        'model__max_depth': [None, 10, 20],
        'model__min_samples_split': [2, 5],
        'model__min_samples_leaf': [1, 2]
    }
]


In [16]:
from sklearn.dummy import DummyRegressor

# Base pipeline with a dummy placeholder
pipeline = Pipeline([
    ('model', DummyRegressor())  # Placeholder
])

In [None]:
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=30,  # Adjust depending on time/resources
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
    n_jobs=-1,
    random_state=42
)


: 

In [None]:
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[CV] END model=XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, gpu_id=0, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=None,
             n_jobs=None, ...), model__colsample_bytree=1.0, model__learning_rate=0.01, model__max_depth=7, model__n_estimators=200, model__subsample=1.0; total time=   8.6s
[CV] END model=XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=

In [None]:
print("Best Model: ", random_search.best_estimator_)
print("Best Score (MSE): ", -random_search.best_score_)
print("Best R2-Score: ", random_search.best_estimator_.score(X_test, y_test))