In [1]:
## Sales Prediction for Big Mart Outlets

import pandas as pd
import numpy as np
import os
import sys
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# Anchor the process to the directory the kernel is currently in; the CSV
# paths below are relative to it.
script_dir = os.getcwd()
os.chdir(script_dir)

# Read the train/test tables and the submission template, in that order.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train_df.csv', 'test_df.csv', 'sample_submission_df.csv')
)

# Show the first rows of each modelling table as a quick sanity check.
for heading, frame in (("Train Data Sample:", train), ("\nTest Data Sample:", test)):
    print(heading)
    print(frame.head())

# --- Feature Engineering ---
# Every step is guarded by a column check on `train` and is applied to `train`
# first, then `test`, so both frames gain the derived columns in the same order.

if 'Item_Visibility' in train:
    # Both frames are scaled by the TRAIN mean of the (still raw) visibility.
    visibility_mean = train['Item_Visibility'].mean()
    for frame in (train, test):
        frame['Item_Visibility_MeanRatio'] = frame['Item_Visibility'] / visibility_mean

if 'Outlet_Establishment_Year' in train:
    # Outlet age relative to the fixed reference year 2025.
    for frame in (train, test):
        frame['Outlet_Years'] = 2025 - frame['Outlet_Establishment_Year']

if 'Item_Outlet_Sales' in train:
    # The target only exists in `train`; from here on it is stored as log1p(sales).
    train['Item_Outlet_Sales'] = np.log1p(train['Item_Outlet_Sales'])

if 'Item_Visibility' in train:
    # Overwrite visibility with its log1p; the ratio above was taken before this.
    for frame in (train, test):
        frame['Item_Visibility'] = np.log1p(frame['Item_Visibility'])

if 'Item_Type' in train and 'Outlet_Type' in train:
    # Interaction label "<Item_Type>_<Outlet_Type>".
    for frame in (train, test):
        frame['Item_Outlet_Combo'] = frame['Item_Type'] + '_' + frame['Outlet_Type']

def preprocess_data(df):
    """Impute missing values and one-hot encode a feature frame.

    Numeric columns are filled with their column mean and object columns with
    their most frequent value; the frame is then expanded with
    ``pd.get_dummies(..., drop_first=True)``.

    The caller's frame is not modified: all work is done on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to preprocess.

    Returns
    -------
    pandas.DataFrame
        Imputed, one-hot encoded copy of ``df``.
    """
    # Work on a copy so the frame passed in keeps its original values.
    df = df.copy()

    for col in df.select_dtypes(include=[np.number]).columns:
        # Assign the result back instead of calling fillna(inplace=True) on
        # df[col]: that form acts on a temporary Series and does not update
        # `df` when pandas Copy-on-Write is active.
        df[col] = df[col].fillna(df[col].mean())

    for col in df.select_dtypes(include=['object']).columns:
        modes = df[col].mode()
        # mode() is empty when the whole column is missing; there is nothing
        # to impute with, so leave the column alone instead of raising.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

    return pd.get_dummies(df, drop_first=True)

# Impute and one-hot encode each frame on its own (train first, then test).
train_processed, test_processed = preprocess_data(train), preprocess_data(test)

# Split the processed training frame into target and feature matrix.
y = train_processed['Item_Outlet_Sales']
X = train_processed.drop(columns='Item_Outlet_Sales')

# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



Train Data Sample:
  Item_Identifier  Item_Weight Item_Fat_Content  Item_Visibility  \
0           FDA15         9.30          Low Fat         0.016047   
1           DRC01         5.92          Regular         0.019278   
2           FDN15        17.50          Low Fat         0.016760   
3           FDX07        19.20          Regular         0.000000   
4           NCD19         8.93          Low Fat         0.000000   

               Item_Type  Item_MRP Outlet_Identifier  \
0                  Dairy  249.8092            OUT049   
1            Soft Drinks   48.2692            OUT018   
2                   Meat  141.6180            OUT049   
3  Fruits and Vegetables  182.0950            OUT010   
4              Household   53.8614            OUT013   

   Outlet_Establishment_Year Outlet_Size Outlet_Location_Type  \
0                       1999      Medium               Tier 1   
1                       2009      Medium               Tier 3   
2                       1999      Medium

In [2]:
# --- Advanced Models: RandomForest & XGBoost ---
# The target was replaced by log1p(sales) during feature engineering, so every
# MSE printed here is measured on that log scale.

# RandomForest: 5-fold CV on the training split, then a hold-out check.
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf_scores = cross_val_score(
    rf, X_train, y_train, scoring='neg_mean_squared_error', cv=5
)
print(f"RandomForest CV Mean Squared Error: {-rf_scores.mean():.2f}")
rf_val_pred = rf.fit(X_train, y_train).predict(X_val)
rf_mse = mean_squared_error(y_val, rf_val_pred)
print(f"RandomForest Validation MSE: {rf_mse:.2f}")

# XGBoost: same protocol as above.
xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=-1, n_estimators=100)
xgb_scores = cross_val_score(
    xgb_model, X_train, y_train, scoring='neg_mean_squared_error', cv=5
)
print(f"XGBoost CV Mean Squared Error: {-xgb_scores.mean():.2f}")
xgb_val_pred = xgb_model.fit(X_train, y_train).predict(X_val)
xgb_mse = mean_squared_error(y_val, xgb_val_pred)
print(f"XGBoost Validation MSE: {xgb_mse:.2f}")

# Give the test frame exactly the training feature columns, in the same order;
# columns the test frame lacks are created and filled with 0.
test_processed_aligned = test_processed.reindex(
    columns=X_train.columns,
    fill_value=0,
)




RandomForest CV Mean Squared Error: 0.32
RandomForest Validation MSE: 0.30
XGBoost CV Mean Squared Error: 0.29
XGBoost Validation MSE: 0.28


In [3]:
# --- Final Submission with XGBoost ---
# The model predicts log1p(sales): expm1 maps back to sales and the clip
# floors the result at 0.
xgb_test_pred = np.clip(
    np.expm1(xgb_model.predict(test_processed_aligned)), 0, None
)

# Fill the prediction column on a copy of the submission template and save it.
submission = sample_submission.assign(Item_Outlet_Sales=xgb_test_pred)
submission.to_csv('submission.csv', index=False)
print("Submission file created with XGBoost predictions: submission.csv")

Submission file created with XGBoost predictions: submission.csv


# Hyperparameter Tuning and Stacking Ensemble
Tune XGBoost hyperparameters and build a stacking ensemble for improved performance.

In [4]:
# Hyperparameter tuning for XGBoost
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each hyperparameter; the search below samples 20
# combinations from this grid.
xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}

# The search template gets its own name. Binding it to `xgb_model` would
# replace the already-fitted model used for the XGBoost submission with an
# unfitted estimator (the search fits clones, never the object passed in),
# so re-running that submission step afterwards would fail.
xgb_base = xgb.XGBRegressor(random_state=42, n_jobs=-1)
random_search = RandomizedSearchCV(
    xgb_base,
    xgb_params,
    n_iter=20,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    random_state=42,
)
random_search.fit(X_train, y_train)
print('Best XGBoost Params:', random_search.best_params_)
# best_score_ is a negated MSE (see `scoring`), so flip the sign for reporting.
print('Best XGBoost CV Score:', -random_search.best_score_)

# Stacking ensemble
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge

# The base learners are built inline rather than through a module-level `rf`
# variable. StackingRegressor fits clones of its estimators, so such a variable
# would stay unfitted while overwriting the fitted RandomForest from the
# earlier model-comparison cell.
stack = StackingRegressor(
    estimators=[
        # XGBoost configured with the best parameters found by the random search.
        ('xgb', xgb.XGBRegressor(**random_search.best_params_, random_state=42, n_jobs=-1)),
        ('rf', RandomForestRegressor(n_estimators=100, random_state=42))
    ],
    # Ridge combines the base learners' predictions.
    final_estimator=Ridge(random_state=42)
)
stack.fit(X_train, y_train)
y_pred_stack = stack.predict(X_val)
# Same scale as the other validation scores in this notebook (log1p target).
stack_mse = mean_squared_error(y_val, y_pred_stack)
print(f'Stacking Validation MSE: {stack_mse:.2f}')

# Predict on test set using stacking ensemble
# Predictions are on the log1p scale: invert with expm1, then floor at 0.
test_pred_stack = np.clip(
    np.expm1(stack.predict(test_processed_aligned)), 0, None
)

# Write the predictions into a copy of the submission template.
submission_stack = sample_submission.assign(Item_Outlet_Sales=test_pred_stack)
submission_stack.to_csv('submission_stack.csv', index=False)
print('Submission file created with stacking predictions: submission_stack.csv')

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best XGBoost Params: {'subsample': 0.8, 'reg_lambda': 2, 'reg_alpha': 0, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
Best XGBoost CV Score: 0.2806818058648225
Stacking Validation MSE: 0.27
Submission file created with stacking predictions: submission_stack.csv


In [6]:
# Feature Selection using XGBoost Feature Importances
# Pair each training column with the importance reported by the best estimator
# of the random search, ordered from most to least important.
feature_names = X_train.columns
importances = random_search.best_estimator_.feature_importances_
feature_importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': importances}
).sort_values(by='importance', ascending=False)
print('Top 20 XGBoost Features:')
print(feature_importance_df.head(20))

# Select top N important features
top_features = feature_importance_df.head(20)['feature'].tolist()

# Restrict every split (and the aligned test frame) to those columns.
X_train_top, X_val_top = X_train[top_features], X_val[top_features]
test_top = test_processed_aligned[top_features]

# Retrain stacking on selected features
# Same ensemble layout as the full-feature stack: tuned XGBoost plus a
# RandomForest, combined by a Ridge meta-learner.
stack_top = StackingRegressor(
    final_estimator=Ridge(random_state=42),
    estimators=[
        ('xgb', xgb.XGBRegressor(random_state=42, n_jobs=-1, **random_search.best_params_)),
        ('rf', RandomForestRegressor(random_state=42, n_estimators=100)),
    ],
)

# Fit on the reduced training matrix and score on the reduced validation matrix.
y_pred_stack_top = stack_top.fit(X_train_top, y_train).predict(X_val_top)
stack_top_mse = mean_squared_error(y_val, y_pred_stack_top)
print(f'Stacking (Top Features) Validation MSE: {stack_top_mse:.2f}')

# Predict on the reduced test matrix; outputs are on the log1p scale, so
# invert with expm1 and floor at 0.
test_pred_stack_top = np.clip(np.expm1(stack_top.predict(test_top)), 0, None)

# Write the predictions into a copy of the submission template.
submission_stack_top = sample_submission.assign(Item_Outlet_Sales=test_pred_stack_top)
submission_stack_top.to_csv('submission_stack_top.csv', index=False)
print('Submission file created with stacking predictions (top features): submission_stack_top.csv')

Top 20 XGBoost Features:
                                                feature  importance
1586                           Outlet_Identifier_OUT019    0.083036
1585                           Outlet_Identifier_OUT018    0.075102
1598                      Outlet_Type_Supermarket Type3    0.039044
5                                          Outlet_Years    0.037980
1596                      Outlet_Type_Supermarket Type1    0.037894
1592                                 Outlet_Size_Medium    0.034382
2                                              Item_MRP    0.031452
1587                           Outlet_Identifier_OUT027    0.029830
1622  Item_Outlet_Combo_Fruits and Vegetables_Grocer...    0.029265
3                             Outlet_Establishment_Year    0.027078
1634          Item_Outlet_Combo_Household_Grocery Store    0.026009
1618       Item_Outlet_Combo_Frozen Foods_Grocery Store    0.020963
1610             Item_Outlet_Combo_Canned_Grocery Store    0.009451
1650        Item_Outlet

# Exploratory Data Analysis (EDA) and Feature Engineering Rationale
This section provides an overview of the EDA performed on the Big Mart sales dataset, including key insights and the reasoning behind each feature engineering transformation.

In [None]:
# EDA: Data Overview and Initial Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# `train` / `test` have been modified by the earlier cells: the target and
# Item_Visibility were replaced by their log1p, derived columns were added, and
# preprocessing may have imputed missing values in place. Exploring them would
# misreport the raw data, so the CSVs are re-read into separate frames that
# are used only for this overview.
train_raw = pd.read_csv('train_df.csv')
test_raw = pd.read_csv('test_df.csv')

# Display basic info
print('Train shape:', train_raw.shape)
print('Test shape:', test_raw.shape)
print('Train columns:', train_raw.columns.tolist())
print('Test columns:', test_raw.columns.tolist())

# Missing values (as delivered, before any imputation)
print('Missing values in train:')
print(train_raw.isnull().sum())
print('\nMissing values in test:')
print(test_raw.isnull().sum())

# Target distribution on the original sales scale
plt.figure(figsize=(8,4))
sns.histplot(train_raw['Item_Outlet_Sales'], bins=50, kde=True)
plt.title('Target Distribution: Item_Outlet_Sales')
plt.show()

# Feature distributions (untransformed values)
num_features = ['Item_Weight', 'Item_Visibility', 'Item_MRP']
for col in num_features:
    plt.figure(figsize=(6,3))
    sns.histplot(train_raw[col], bins=40, kde=True)
    plt.title(f'Distribution of {col}')
    plt.show()

# Categorical feature counts
cat_features = ['Item_Type', 'Outlet_Type', 'Outlet_Size', 'Outlet_Location_Type']
for col in cat_features:
    plt.figure(figsize=(8,3))
    sns.countplot(y=train_raw[col])
    plt.title(f'Counts of {col}')
    plt.show()