In [1]:
# Imports and setup
import numpy as np
import pandas as pd
import xgboost as xgb

from pathlib import Path
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, train_test_split

root = Path.cwd().parent

In [2]:
# Load the processed data CSV (without 'Date' column)
df = pd.read_csv(root/'data'/'processed_data.csv')

# Quick check of the data
print(f"Data shape: {df.shape}")
df.info()
df.head()


Data shape: (1017209, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 21 columns):
 #   Column                      Non-Null Count    Dtype  
---  ------                      --------------    -----  
 0   Store                       1017209 non-null  int64  
 1   DayOfWeek                   1017209 non-null  int64  
 2   Sales                       1017209 non-null  int64  
 3   Open                        1017209 non-null  int64  
 4   Promo                       1017209 non-null  int64  
 5   StateHoliday                1017209 non-null  int64  
 6   SchoolHoliday               1017209 non-null  int64  
 7   StoreType                   1017209 non-null  int64  
 8   Assortment                  1017209 non-null  int64  
 9   CompetitionDistance         1017209 non-null  float64
 10  Promo2                      1017209 non-null  int64  
 11  Promo2SinceWeek             1017209 non-null  int64  
 12  Promo2SinceYear             10

Unnamed: 0,Store,DayOfWeek,Sales,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,...,Promo2SinceWeek,Promo2SinceYear,CompetitionDistanceMissing,CompetitionOpenMissing,Year,Month,MonthsSinceCompOpened,IsPromoIntervalActive,WeekOfYear,DaysSinceStart
0,1,4,5263,1,1,0,1,2,0,1270.0,...,0,0,0,0,2015,7,82,0,31,941
1,2,4,6064,1,1,0,1,0,0,570.0,...,13,2010,0,0,2015,7,92,1,31,941
2,3,4,8314,1,1,0,1,0,0,14130.0,...,14,2011,0,0,2015,7,103,1,31,941
3,4,4,13995,1,1,0,1,2,2,620.0,...,0,0,0,0,2015,7,70,0,31,941
4,5,4,4822,1,1,0,1,0,0,29910.0,...,0,0,0,0,2015,7,3,0,31,941


In [3]:
# Drop columns that are not features
X = df.drop(columns=['Sales'])

y = df['Sales']


In [4]:
# Split X and y into training and test sets
# shuffle=False keeps data in time order to avoid leakage in time series split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=False
) 


# ðŸ¤– Machine Learning Pipeline

We perform grid search using `GridSearchCV` to find the optimal combination of hyperparameters for our XGBoost model. Below is an explanation of the key parameters involved:

## GridSearchCV Parameters

- **`estimator`**  
  The machine learning model to be trained (in our case, an XGBoost regressor).

- **`param_grid`**  
  A dictionary containing the hyperparameters we want to tune and the list of values to try for each.

- **`scoring`**  
  The metric used to evaluate model performance. For regression, common options include:  
  - `'neg_root_mean_squared_error'` (RMSE)  
  - `'neg_mean_absolute_error'` (MAE)

- **`cv`**  
  The number of cross-validation folds. The data is split into `cv` subsets. The model is trained on `cv-1` and validated on the remaining one, iterating through all combinations.

- **`verbose`**  
  Controls the level of logging output during the search. Higher values give more details (`2` means detailed output).

- **`n_jobs`**  
  Number of CPU cores to use in parallel. `-1` means use all available cores.

## Hyperparameters in `param_grid`

- **`n_estimators`**  
  Number of trees (boosting rounds). More trees can improve performance but increase training time.

- **`max_depth`**  
  Maximum depth of each tree. Controls model complexity and overfitting (deeper trees can capture more complex patterns but might overfit).

- **`learning_rate`**  
  Step size shrinkage used in updates to prevent overfitting. Lower values make learning slower but often improve generalization.

- **`subsample`**  
  Fraction of training data used to grow each tree. Values less than 1.0 can help prevent overfitting, but too low may underfit.

- **`colsample_bytree`**  
  Fraction of features used when building each tree. Like `subsample`, helps reduce overfitting.


In [5]:
# Hyperparameters for XGBoost
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Grid search for hyperparameter tuning
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',  # or 'neg_mean_absolute_error'
    cv=3,  # 3-fold cross-validation
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best RMSE:", -grid_search.best_score_)

# Use best model to predict
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_truncated = np.maximum(y_pred, 0)


Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 9, 'n_estimators': 200, 'subsample': 1.0}
Best RMSE: 1204.8248494466145


In [6]:
def rmspe(y_true, y_pred):
    # Mask: ignore entries where y_true is zero
    mask = y_true > 0
    y_true_filtered = y_true[mask]
    y_pred_filtered = y_pred[mask]
    
    return np.sqrt(np.mean(np.square((y_true_filtered - y_pred_filtered) / y_true_filtered)))

# Evaluate
rmspe_score = rmspe(y_test.values, y_pred_truncated)
r2 = r2_score(y_test, y_pred_truncated)

print(f'Test RMSPE: {rmspe_score:.4f}')
print(f'Test RÂ²: {r2:.3f}')


Test RMSPE: 0.3152
Test RÂ²: 0.907


### Export results:

In [7]:
import json
import joblib

# Save model
joblib.dump(best_model, root/'results'/'xgb_model.pkl')

# Save metrics
metrics = {
    "R2": r2,
    "RMSPE": rmspe_score
}
with open(root/'results'/'metrics.json', 'w') as f:
    json.dump(metrics, f, indent=4)
