Import Libraries

In [114]:
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go


In [115]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor


In [116]:
# Load the enhanced train and test tables from the local data/ folder.
train_path, test_path = 'data/enhanced_train.csv', 'data/enhanced_test.csv'
df, df_test = pd.read_csv(train_path), pd.read_csv(test_path)

In [117]:
# Target is the log-transformed sales column; the raw sales column, the target
# itself and the leftover CSV index column are excluded from the feature frame.
target_col = 'Log_Sales'
drop_cols = ['Unnamed: 0', 'Item_Outlet_Sales', target_col]
y = df[target_col]
X = df.drop(columns=drop_cols)


Train-Test Split

In [118]:

# 1. Re-attach the target (defensive) and make the stratification label a string.
df['Log_Sales'] = y
df['Freq_Bin_Item_Cluster'] = df['Freq_Bin_Item_Cluster'].astype(str)

# 2. One stratified shuffle split (20% validation) on Freq_Bin_Item_Cluster.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(splitter.split(df, df['Freq_Bin_Item_Cluster']))
df_train_split = df.iloc[train_idx].copy()
df_val_split = df.iloc[val_idx].copy()

# 3. Find Item_Identifiers that ended up only in the validation part.
train_unique_items = set(df_train_split['Item_Identifier'])
val_unique_items = set(df_val_split['Item_Identifier'])
missing_items = val_unique_items - train_unique_items
print(f"Missing Item_Identifiers in train: {missing_items}")

# 4. Move (not delete) those rows from validation into train, so every item
#    scored during validation has been seen while fitting.
val_only_mask = df_val_split['Item_Identifier'].isin(missing_items)
rows_to_move = df_val_split[val_only_mask]
df_train_split = pd.concat([df_train_split, rows_to_move], axis=0)
df_val_split = df_val_split[~val_only_mask]


Missing Item_Identifiers in train: {'FDN52', 'DRE01', 'NCS41', 'FDE52', 'FDK57'}


In [119]:
# Split each partition into features and the log-scale target.
non_feature_cols = ['Item_Outlet_Sales', 'Log_Sales', 'Unnamed: 0']

X_train, y_train = (
    df_train_split.drop(columns=non_feature_cols),
    df_train_split['Log_Sales'],
)
X_val, y_val = (
    df_val_split.drop(columns=non_feature_cols),
    df_val_split['Log_Sales'],
)


In [120]:

# Sanity check: every Item_Profile present in validation should also be in train.
profiles = df['Item_Profile']
train_profiles = set(profiles.loc[X_train.index])
val_profiles = set(profiles.loc[X_val.index])
missing_profiles = val_profiles.difference(train_profiles)
if len(missing_profiles) > 0:
    print(f"Warning: Some item profiles are only in validation: {missing_profiles}")

In [121]:

# One-hot encode both partitions, then force the validation matrix onto the
# training columns: join='left' keeps exactly the train columns, and dummy
# columns absent from validation are filled with 0.
X_train_encoded, X_val_encoded = pd.get_dummies(X_train).align(
    pd.get_dummies(X_val),
    join='left',
    axis=1,
    fill_value=0,
)


In [122]:

# Base learners for the blend. Hyper-parameters are collected in dicts so they
# can be inspected or logged separately from the estimator objects.

# Linear baseline.
ridge_params = dict(
    alpha=10.0,           # stronger regularization to reduce overfitting
    solver='auto',        # let scikit-learn choose the solver
    random_state=42,
)
ridge = Ridge(**ridge_params)

# Bagged trees.
rf_params = dict(
    n_estimators=500,       # more trees for more stable averages
    max_depth=15,           # fairly deep, but capped
    min_samples_split=10,   # do not split very small nodes
    min_samples_leaf=4,     # avoid tiny leaves
    max_features='sqrt',    # sqrt(n_features) candidates per split
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
)
rf = RandomForestRegressor(**rf_params)

# Gradient-boosted trees: many small steps plus explicit regularization.
xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=1000,       # many rounds to pair with the small learning rate
    learning_rate=0.01,      # small step size
    max_depth=7,             # moderate tree depth
    subsample=0.8,           # row subsampling
    colsample_bytree=0.8,    # column subsampling per tree
    min_child_weight=3,      # limits splits on few samples
    gamma=0.1,               # minimum loss reduction required to split
    reg_alpha=1.0,           # L1 regularization
    reg_lambda=2.0,          # L2 regularization
    n_jobs=-1,
    random_state=42,
)
xgb = XGBRegressor(**xgb_params)


In [123]:
# Fit base models, one per cell so each estimator repr is shown as cell output.
# Ridge: trained on the one-hot-encoded training features against the
# log-scale target (y_train is the Log_Sales column).
ridge.fit(X_train_encoded, y_train)

Ridge(alpha=10.0, random_state=42)

In [124]:
# Random Forest: same encoded training matrix and log-scale target as Ridge.
rf.fit(X_train_encoded, y_train)

RandomForestRegressor(max_depth=15, max_features='sqrt', min_samples_leaf=4,
                      min_samples_split=10, n_estimators=500, n_jobs=-1,
                      random_state=42)

In [125]:
# XGBoost: same encoded training matrix and log-scale target as the other models.
# No eval_set is passed here, so all configured boosting rounds are trained.
xgb.fit(X_train_encoded, y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.8, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=0.1, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.01, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=7, max_leaves=None,
             min_child_weight=3, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=1000, n_jobs=-1,
             num_parallel_tree=None, random_state=42, ...)

Prediction on Validation Set 

In [126]:
# Log-scale validation predictions from each fitted base model.
ridge_preds, rf_preds, xgb_preds = (
    model.predict(X_val_encoded) for model in (ridge, rf, xgb)
)


In [127]:
# Ridge — RMSE on the original sales scale.
# Targets and predictions live in log space, so both are mapped back with
# expm1 before the error is computed.
y_val_true = np.expm1(y_val)
ridge_preds = ridge.predict(X_val_encoded)
ridge_sales_preds = np.expm1(ridge_preds)
mse_ridge = mean_squared_error(y_val_true, ridge_sales_preds)
rmse_ridge = np.sqrt(mse_ridge)
print(f"Ridge RMSE: {rmse_ridge:.4f}")

Ridge RMSE: 1100.5463


In [128]:
# Random Forest — RMSE on the original sales scale (predictions mapped back with expm1).
rf_preds = rf.predict(X_val_encoded)
rf_sales_preds = np.expm1(rf_preds)
mse_rf = mean_squared_error(y_val_true, rf_sales_preds)
rmse_rf = np.sqrt(mse_rf)
print(f" Random Forest RMSE: {rmse_rf:.4f}")

 Random Forest RMSE: 1377.7045


In [129]:
# XGBoost — RMSE on the original sales scale (predictions mapped back with expm1).
xgb_preds = xgb.predict(X_val_encoded)
xgb_sales_preds = np.expm1(xgb_preds)
mse_xgb = mean_squared_error(y_val_true, xgb_sales_preds)
rmse_xgb = np.sqrt(mse_xgb)
print(f" XGBoost RMSE: {rmse_xgb:.4f}")

 XGBoost RMSE: 1116.4576


Average Ensemble

In [130]:
from sklearn.metrics import mean_squared_error

# Equal-weight blend: average the three models in log space, then map the
# blend and the targets back to the sales scale before scoring.
y_val_actual = np.expm1(y_val)
avg_preds_log = (ridge_preds + rf_preds + xgb_preds) / 3
avg_preds = np.expm1(avg_preds_log)

ensemble_mse = mean_squared_error(y_val_actual, avg_preds)
rmse = np.sqrt(ensemble_mse)
print(f"Ensemble Validation RMSE: {rmse:.4f}")


Ensemble Validation RMSE: 1157.2310


In [131]:
# Hand-picked blend in log space: XGBoost dominant, Ridge minor, and Random
# Forest switched off (weight 0.0).
w_xgb, w_rf, w_ridge = 0.775, 0.0, 0.225
weighted_preds_log = (w_xgb * xgb_preds) + (w_rf * rf_preds) + (w_ridge * ridge_preds)
weighted_preds = np.expm1(weighted_preds_log)

weighted_mse = mean_squared_error(y_val_actual, weighted_preds)
rmse_weighted = np.sqrt(weighted_mse)
print(f"Weighted Ensemble RMSE: {rmse_weighted:.4f}")


Weighted Ensemble RMSE: 1107.6767


In [132]:
# Build the test design matrix with the SAME preprocessing as the training split.
#
# The training features keep Item_Identifier and Outlet_Identifier (they are
# one-hot encoded along with the other categoricals) and Freq_Bin_Item_Cluster
# is cast to str before encoding. Dropping the identifiers here, as was done
# before, leaves every Item_Identifier_* / Outlet_Identifier_* dummy at 0 after
# the reindex, so the models would be scored on inputs unlike anything they
# were fitted on.
test_features = df_test.drop(columns=['Unnamed: 0'], errors='ignore')
if 'Freq_Bin_Item_Cluster' in test_features.columns:
    # NOTE(review): assumes the test file stores this column in the same raw
    # form as the train file, so str() yields matching labels — confirm.
    test_features['Freq_Bin_Item_Cluster'] = test_features['Freq_Bin_Item_Cluster'].astype(str)

X_test = pd.get_dummies(test_features)

# Encoded columns that training never produced cannot be used by the models;
# report them instead of dropping them silently.
unseen_cols = X_test.columns.difference(X_train_encoded.columns)
if len(unseen_cols) > 0:
    print(f"Warning: {len(unseen_cols)} encoded test columns were not seen in training and are ignored")

# Same columns, same order as the training matrix; dummies absent in test become 0.
X_test = X_test.reindex(columns=X_train_encoded.columns, fill_value=0)

# Predict using trained models (all outputs are in log space)
ridge_test_preds_log = ridge.predict(X_test)
rf_test_preds_log = rf.predict(X_test)
xgb_test_preds_log = xgb.predict(X_test)

# Average predictions (in log space)
ensemble_test_preds_log = (ridge_test_preds_log + rf_test_preds_log + xgb_test_preds_log) / 3

# Inverse log1p to get final predictions on the sales scale
ensemble_test_preds = np.expm1(ensemble_test_preds_log)

# Create submission file
submission = df_test[['Item_Identifier', 'Outlet_Identifier']].copy()
submission['Item_Outlet_Sales'] = ensemble_test_preds

# Save to CSV
submission.to_csv('ensemble_submission_1.csv', index=False)
print("Saved ensemble_submission_1.csv")


Saved ensemble_submission_1.csv


In [133]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Grid-search the blend weights (XGBoost, RF, Ridge) on the validation set.
# Weights are multiples of 0.025 and sum to 1.
#
# The grid is walked with integer step counts. Computing the Ridge weight as
# `1.0 - w1 - w2` on floats can yield about -1e-16 when the first two weights
# already sum to 1 (e.g. 0.3 + 0.7), which made the old `w3 < 0` guard skip
# valid boundary combinations. With integer counts the remainder is exact.
n_steps = 40  # 1 / 0.025
weight_range = np.linspace(0.0, 1.0, n_steps + 1)

# Loop-invariant: validation targets on the original sales scale.
y_val_sales = np.expm1(y_val)

best_rmse = float('inf')
best_weights = (0.0, 0.0, 0.0)

for i in range(n_steps + 1):              # XGBoost weight index
    w1 = weight_range[i]
    for j in range(n_steps + 1 - i):      # Random Forest weight index
        w2 = weight_range[j]
        w3 = weight_range[n_steps - i - j]  # Ridge takes the exact remainder
        ensemble_log_preds = w1 * xgb_preds + w2 * rf_preds + w3 * ridge_preds
        rmse = np.sqrt(mean_squared_error(y_val_sales, np.expm1(ensemble_log_preds)))
        if rmse < best_rmse:
            best_rmse = rmse
            best_weights = (w1, w2, w3)

print(f"✅ Best RMSE: {best_rmse:.4f}")
print(f"🏋️‍♂️ Optimal Weights → XGBoost: {best_weights[0]:.3f}, RF: {best_weights[1]:.3f}, Ridge: {best_weights[2]:.3f}")


✅ Best RMSE: 1098.7319
🏋️‍♂️ Optimal Weights → XGBoost: 0.225, RF: 0.000, Ridge: 0.775
