In [48]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRFRegressor
from sklearn import metrics

In [49]:
# Load the dataset from 'Train.csv' into a DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
big_mart_data = pd.read_csv('Train.csv')

In [50]:
# Count the missing entries in every column (isna is the alias of isnull).
null_mask = big_mart_data.isna()
null_mask.sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [51]:
# Impute missing 'Item_Weight' entries with the mean of the observed weights.
mean_item_weight = big_mart_data['Item_Weight'].mean()
big_mart_data['Item_Weight'] = big_mart_data['Item_Weight'].fillna(mean_item_weight)

In [52]:
# Build a lookup table: for each 'Outlet_Type', the most frequent 'Outlet_Size'.
def _most_frequent(values):
    """Return the first mode of ``values`` (ties resolved by ``Series.mode`` order)."""
    return values.mode()[0]


mode_outlet_size = big_mart_data.pivot_table(
    index='Outlet_Type',
    values='Outlet_Size',
    aggfunc=_most_frequent,
)

In [53]:
# Display the per-'Outlet_Type' modal 'Outlet_Size' table computed above.
mode_outlet_size

Unnamed: 0_level_0,Outlet_Size
Outlet_Type,Unnamed: 1_level_1
Grocery Store,Small
Supermarket Type1,Small
Supermarket Type2,Medium
Supermarket Type3,Medium


In [54]:
# Boolean mask: True on the rows whose 'Outlet_Size' is missing.
outlet_size_column = big_mart_data['Outlet_Size']
missing_values = outlet_size_column.isna()
print(missing_values)

0       False
1       False
2       False
3        True
4       False
        ...  
8518    False
8519     True
8520    False
8521    False
8522    False
Name: Outlet_Size, Length: 8523, dtype: bool


In [55]:
# Fill missing 'Outlet_Size' values with the modal size of the row's 'Outlet_Type'.
# `mode_outlet_size` is indexed by 'Outlet_Type', so mapping each outlet type through
# its 'Outlet_Size' column yields one scalar per row. (Looking up a whole row with
# `.loc[x]` would return a Series per row and make `apply` build a DataFrame, which
# then only fits a single-column assignment through implicit alignment.)
big_mart_data.loc[missing_values, 'Outlet_Size'] = (
    big_mart_data.loc[missing_values, 'Outlet_Type'].map(mode_outlet_size['Outlet_Size'])
)

In [56]:
# Summary statistics (count, mean, std, quartiles, min/max) of the DataFrame
# after the missing-value imputation steps.
big_mart_data.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,8523.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.226124,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,9.31,0.026989,93.8265,1987.0,834.2474
50%,12.857645,0.053931,143.0128,1999.0,1794.331
75%,16.0,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


In [57]:
# Collapse inconsistent spellings in 'Item_Fat_Content' onto two canonical labels.
fat_content_aliases = {
    'low fat': 'Low Fat',
    'LF': 'Low Fat',
    'reg': 'Regular',
}
big_mart_data = big_mart_data.replace({'Item_Fat_Content': fat_content_aliases})

In [58]:
# Summary statistics of the DataFrame after the 'Item_Fat_Content' relabelling.
# NOTE(review): the recorded output lists only numeric columns and is identical to the
# earlier describe() output, so it does not show the effect of the relabelling;
# something like big_mart_data['Item_Fat_Content'].value_counts() may be what was
# intended here - confirm.
big_mart_data.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,8523.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.226124,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,9.31,0.026989,93.8265,1987.0,834.2474
50%,12.857645,0.053931,143.0128,1999.0,1794.331
75%,16.0,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


In [59]:
# Create the LabelEncoder instance used by the encoding loop in the next cell
# to turn categorical string values into integer labels.
encoder = LabelEncoder()

In [60]:
# Encode the categorical columns as integer labels.
columns_to_encode = ['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']

# Keep one fitted encoder per column. Refitting a single shared encoder overwrites its
# learned classes on every iteration, which makes it impossible to inspect or invert
# the mapping (via `inverse_transform`) for any column except the last one.
label_encoders = {}

for col in columns_to_encode:
    # A fresh encoder per column; `encoder` still ends up fitted on the last column,
    # exactly as it did when a single instance was reused.
    encoder = LabelEncoder()
    # Values are cast to str before fitting so every entry is encoded as a string.
    big_mart_data[col] = encoder.fit_transform(big_mart_data[col].astype(str))
    label_encoders[col] = encoder

In [61]:
# Split the frame into the target (Y) and the feature matrix (X, every other column).
target_column = 'Item_Outlet_Sales'
Y = big_mart_data[target_column]
X = big_mart_data.drop(columns=[target_column])

In [62]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# partition reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=2,
)


In [63]:
# Instantiate xgboost's XGBRFRegressor with default hyperparameters;
# random_state is fixed so that training is repeatable.
model = XGBRFRegressor(random_state=2)

In [64]:
# Training the model on the training split only (the test split stays unseen)
model.fit(X_train, Y_train)

In [65]:
# Predicting the sales on the training data (used below for the training R^2 score)
training_data_prediction = model.predict(X_train)

In [66]:
# Calculating the R^2 score of the predictions on the training data
r2_train = metrics.r2_score(y_true=Y_train, y_pred=training_data_prediction)

In [67]:
# Print the training R^2 score rounded to two decimal places.
print(f'{r2_train:.2f}')

0.62


In [68]:
# Predicting the sales on the held-out test data
test_data_prediction = model.predict(X_test)

In [69]:
# Calculating the R^2 score of the predictions on the test data
r2_test = metrics.r2_score(y_true=Y_test, y_pred=test_data_prediction)

In [70]:
# Print the test R^2 score rounded to two decimal places.
print(f'{r2_test:.2f}')

0.60
