# Big Mart Sale Prediction Part 2

In [65]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics

In [None]:
# loading dataset
big_mart_data = pd.read_csv('big_mart_data.csv')
big_mart_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [67]:
# checking missing value
big_mart_data.isnull().sum()

Unnamed: 0,0
Item_Identifier,0
Item_Weight,1463
Item_Fat_Content,0
Item_Visibility,0
Item_Type,0
Item_MRP,0
Outlet_Identifier,0
Outlet_Establishment_Year,0
Outlet_Size,2410
Outlet_Location_Type,0


In [68]:
big_mart_data_processed = big_mart_data.copy()

In [69]:
big_mart_data_processed.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


### Preprocessing

**Handling null values**

In [73]:
big_mart_data_processed['Item_Weight'].mean()

np.float64(12.857645184135976)

In [76]:
# Filling the null value by mean
big_mart_data_processed['Item_Weight'].fillna(big_mart_data_processed['Item_Weight'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  big_mart_data_processed['Item_Weight'].fillna(big_mart_data_processed['Item_Weight'].mean(), inplace=True)


In [77]:
big_mart_data_processed.isnull().sum()

Unnamed: 0,0
Item_Identifier,0
Item_Weight,0
Item_Fat_Content,0
Item_Visibility,0
Item_Type,0
Item_MRP,0
Outlet_Identifier,0
Outlet_Establishment_Year,0
Outlet_Size,2410
Outlet_Location_Type,0


In [79]:
missing_values = big_mart_data_processed['Outlet_Size'].isnull()
missing_values

Unnamed: 0,Outlet_Size
0,False
1,False
2,False
3,True
4,False
...,...
8518,False
8519,True
8520,False
8521,False


In [80]:
mode_outlet_size = big_mart_data_processed.pivot_table(values='Outlet_Size', columns='Outlet_Type', aggfunc=(lambda x: x.mode()[0]))
mode_outlet_size

Outlet_Type,Grocery Store,Supermarket Type1,Supermarket Type2,Supermarket Type3
Outlet_Size,Small,Small,Medium,Medium


In [81]:
# Handling missing valye by mode
big_mart_data_processed.loc[missing_values, 'Outlet_Size'] = big_mart_data_processed.loc[missing_values, 'Outlet_Type'].apply(lambda x: mode_outlet_size[x])

In [82]:
big_mart_data_processed.isnull().sum()

Unnamed: 0,0
Item_Identifier,0
Item_Weight,0
Item_Fat_Content,0
Item_Visibility,0
Item_Type,0
Item_MRP,0
Outlet_Identifier,0
Outlet_Establishment_Year,0
Outlet_Size,0
Outlet_Location_Type,0


In [83]:
big_mart_data_processed['Item_Fat_Content'].value_counts()

Unnamed: 0_level_0,count
Item_Fat_Content,Unnamed: 1_level_1
Low Fat,5089
Regular,2889
LF,316
reg,117
low fat,112


Here Low Fat, LF and low fat are same thing and regular and reg are same

In [84]:
big_mart_data_processed.replace({'Item_Fat_Content' : {'LF':'Low Fat', 'low fat':'Low Fat', 'reg':'Regular'}}, inplace=True)

In [85]:
big_mart_data_processed['Item_Fat_Content'].value_counts()

Unnamed: 0_level_0,count
Item_Fat_Content,Unnamed: 1_level_1
Low Fat,5517
Regular,3006


In [86]:
big_mart_data_processed.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Small,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


Train-test split

In [87]:
X = big_mart_data_processed.drop(columns='Item_Outlet_Sales', axis=1)
Y = big_mart_data_processed['Item_Outlet_Sales']

In [88]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, random_state=2)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((6818, 11), (1705, 11), (6818,), (1705,))

In [89]:
le = LabelEncoder()

In [90]:
X_train['Item_Identifier'] = le.fit_transform(X_train['Item_Identifier'])
X_test['Item_Identifier'] = le.transform(X_test['Item_Identifier'])

X_train['Item_Fat_Content'] = le.fit_transform(X_train['Item_Fat_Content'])
X_test['Item_Fat_Content'] = le.transform(X_test['Item_Fat_Content'])

X_train['Item_Type'] = le.fit_transform(X_train['Item_Type'])
X_test['Item_Type'] = le.transform(X_test['Item_Type'])

X_train['Outlet_Identifier'] = le.fit_transform(X_train['Outlet_Identifier'])
X_test['Outlet_Identifier'] = le.transform(X_test['Outlet_Identifier'])

X_train['Outlet_Size'] = le.fit_transform(X_train['Outlet_Size'])
X_test['Outlet_Size'] = le.transform(X_test['Outlet_Size'])

X_train['Outlet_Location_Type'] = le.fit_transform(X_train['Outlet_Location_Type'])
X_test['Outlet_Location_Type'] = le.transform(X_test['Outlet_Location_Type'])

X_train['Outlet_Type'] = le.fit_transform(X_train['Outlet_Type'])
X_test['Outlet_Type'] = le.transform(X_test['Outlet_Type'])

In [91]:
X_train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
7173,627,11.8,1,0.057422,10,149.9366,8,1997,2,0,1
3315,996,12.857645,0,0.0,0,100.1384,5,1985,1,2,3
5932,1127,7.725,0,0.047783,3,249.1092,8,1997,2,0,1
7872,439,10.5,0,0.052555,5,89.683,8,1997,2,0,1
5946,993,12.857645,1,0.235859,13,46.1402,4,1985,2,0,0


In [92]:
X_test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
1112,496,12.857645,0,0.099747,6,75.2328,5,1985,1,2,3
1751,382,12.15,0,0.131446,5,246.846,7,2002,2,1,1
7648,1207,5.8,0,0.131221,4,87.8172,7,2002,2,1,1
7362,1415,17.0,0,0.087388,8,125.073,0,1998,2,2,0
5332,910,12.857645,0,0.009585,5,102.5016,4,1985,2,0,0


In [93]:
# Model training
model = XGBRegressor(
    n_estimators=100,        # Number of trees
    learning_rate=0.1,       # Shrinks each tree's contribution
    max_depth=3,             # Limit tree depth
    subsample=0.8,           # Use 80% of rows
    colsample_bytree=0.8,    # Use 80% of columns
    reg_alpha=0.5,           # L1 regularization
    reg_lambda=1             # L2 regularization
)

In [94]:
model.fit(X_train, Y_train)

In [95]:
# prediction on training data
training_data_prediction = model.predict(X_train)

In [96]:
# R squared Value
r2_train = metrics.r2_score(Y_train, training_data_prediction)

In [97]:
print('R Squared value = ', r2_train)

R Squared value =  0.6339265862830405


In [98]:
# prediction on test data
test_data_prediction = model.predict(X_test)

In [99]:
# R squared Value
r2_test = metrics.r2_score(Y_test, test_data_prediction)

In [100]:
print('R Squared value = ', r2_test)

R Squared value =  0.5863058051276349


# Conclusion :
**Training Data R Squared value : 0.6339265862830405**<br>
**Testing data R Squared value : 0.5863058051276349**

**2. Enhancing model performance through data cleaning and tuning**

In [101]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

In [102]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
    'reg_alpha': [0, 0.5],
    'reg_lambda': [1, 2]
}

In [103]:
xgb = XGBRegressor(random_state=42)

grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=3,                   # 3-fold cross-validation
    scoring='r2',           # Use R² as the scoring metric
    n_jobs=-1,              # Use all CPU cores
    verbose=1
)

In [104]:
grid_search.fit(X_train, Y_train)

Fitting 3 folds for each of 128 candidates, totalling 384 fits


In [105]:
print("Best Parameters:", grid_search.best_params_)

best_model = grid_search.best_estimator_

r2_train = r2_score(Y_train, best_model.predict(X_train))
r2_test = r2_score(Y_test, best_model.predict(X_test))

print('Train R²:', r2_train)
print('Test R²:', r2_test)

Best Parameters: {'colsample_bytree': 1, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 0.5, 'reg_lambda': 2, 'subsample': 0.8}
Train R²: 0.6213861134058403
Test R²: 0.588941240171204


# Conclusion :
**Training Data R Squared value : 0.6213861134058403**<br>
**Testing data R Squared value : 0.588941240171204**

**3. Enhancing model performance through data cleaning**

In [106]:
big_mart_data_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                8523 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                8523 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [107]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
from xgboost import XGBRegressor


big_mart_data_processed2 = big_mart_data_processed.copy()

# Creating 'Item_MRP_Range' feature
def categorize_mrp(mrp):
    if mrp < 70:
        return 'Low'
    elif mrp < 140:
        return 'Medium'
    elif mrp < 200:
        return 'High'
    else:
        return 'Very High'

big_mart_data_processed2['Item_MRP_Range'] = big_mart_data_processed2['Item_MRP'].apply(categorize_mrp)

# Label Encoding of Categorical Features
label_enc_cols = [
    'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier',
    'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type',
    'Item_MRP_Range'
]

le = LabelEncoder()
for col in label_enc_cols:
    big_mart_data_processed2[col] = le.fit_transform(big_mart_data_processed2[col])

# Preparing Features and Target
X = big_mart_data_processed2.drop(columns=['Item_Outlet_Sales', 'Item_Identifier'])  # Remove target & ID
y = big_mart_data_processed2['Item_Outlet_Sales']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train XGBRegressor
model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.5,
    reg_lambda=1,
    random_state=42
)

model.fit(X_train, y_train)

# Predictions and Evaluation
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print('Train R²:', r2_train)
print('Test R²:', r2_test)


Train R²: 0.6267400947998292
Test R²: 0.6127822133156682


# Conclusion :
**Training Data R Squared value : 0.6267400947998292** <br>
**Testing data R Squared value : 0.6127822133156682**

**4. Enhancing model performance through data cleaning**


In [38]:
model.feature_importances_

array([0.00953291, 0.00583924, 0.01110536, 0.009952  , 0.1717108 ,
       0.09307231, 0.08709519, 0.01559769, 0.01819743, 0.31080136,
       0.26709566], dtype=float32)

In [118]:
big_mart_data_processed3 = big_mart_data_processed.copy()

In [119]:
big_mart_data_processed3.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Small,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [120]:
big_mart_data_processed3.drop(columns='Item_Identifier', inplace=True)
big_mart_data_processed3

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,Small,Tier 3,Grocery Store,732.3800
4,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...
8518,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,Small,Tier 2,Supermarket Type1,549.2850
8520,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


In [121]:
big_mart_data_processed3['Item_Visibility'].isnull().sum()

np.int64(0)

In [122]:
(big_mart_data_processed3['Item_Visibility'] == 0).sum()

np.int64(526)

In [123]:
big_mart_data_processed3['Item_Visibility'] = big_mart_data_processed3['Item_Visibility'].replace(0, np.nan)
big_mart_data_processed3['Item_Visibility'].fillna(big_mart_data_processed3['Item_Visibility'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  big_mart_data_processed3['Item_Visibility'].fillna(big_mart_data_processed3['Item_Visibility'].median(), inplace=True)


In [124]:
(big_mart_data_processed3['Item_Visibility'] == 0).sum()

np.int64(0)

In [125]:
big_mart_data_processed3.drop(columns='Outlet_Identifier', inplace=True)
big_mart_data_processed3.tail()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
8518,6.865,Low Fat,0.056783,Snack Foods,214.5218,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,8.38,Regular,0.046982,Baking Goods,108.157,2002,Small,Tier 2,Supermarket Type1,549.285
8520,10.6,Low Fat,0.035186,Health and Hygiene,85.1224,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,7.21,Regular,0.145221,Snack Foods,103.1332,2009,Medium,Tier 3,Supermarket Type2,1845.5976
8522,14.8,Low Fat,0.044878,Soft Drinks,75.467,1997,Small,Tier 1,Supermarket Type1,765.67


In [126]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
from xgboost import XGBRegressor


# Creating 'Item_MRP_Range' feature
def categorize_mrp(mrp):
    if mrp < 70:
        return 'Low'
    elif mrp < 140:
        return 'Medium'
    elif mrp < 200:
        return 'High'
    else:
        return 'Very High'

big_mart_data_processed3['Item_MRP_Range'] = big_mart_data_processed3['Item_MRP'].apply(categorize_mrp)
big_mart_data_processed3['Outlet_Age'] = 2025 - big_mart_data_processed['Outlet_Establishment_Year']

big_mart_data_processed3.drop(columns=['Outlet_Establishment_Year'], inplace=True)

# Preparing Features and Target
X = big_mart_data_processed3.drop(columns=['Item_Outlet_Sales'])
y = big_mart_data_processed3['Item_Outlet_Sales']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

le = LabelEncoder()

X_train['Item_Fat_Content'] = le.fit_transform(X_train['Item_Fat_Content'])
X_test['Item_Fat_Content'] = le.transform(X_test['Item_Fat_Content'])

X_train['Item_Type'] = le.fit_transform(X_train['Item_Type'])
X_test['Item_Type'] = le.transform(X_test['Item_Type'])

X_train['Outlet_Size'] = le.fit_transform(X_train['Outlet_Size'])
X_test['Outlet_Size'] = le.transform(X_test['Outlet_Size'])

X_train['Outlet_Location_Type'] = le.fit_transform(X_train['Outlet_Location_Type'])
X_test['Outlet_Location_Type'] = le.transform(X_test['Outlet_Location_Type'])

X_train['Outlet_Type'] = le.fit_transform(X_train['Outlet_Type'])
X_test['Outlet_Type'] = le.transform(X_test['Outlet_Type'])

X_train['Item_MRP_Range'] = le.fit_transform(X_train['Item_MRP_Range'])
X_test['Item_MRP_Range'] = le.transform(X_test['Item_MRP_Range'])

# Train XGBRegressor
model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.5,
    reg_lambda=1,
    random_state=42
)

model.fit(X_train, y_train)

# Predictions and Evaluation
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print('Train R²:', r2_train)
print('Test R²:', r2_test)


Train R²: 0.6263448080687015
Test R²: 0.6133963127194171


# Conclusion:
**Train R²: 0.6263448080687015 <br>
Test R²: 0.613396312719417**

# The best model is Model 3 and Model 4 of this file.
### Train R square: 0.6263448080687015 <br>
### Test R square: 0.613396312719417