<a href="https://colab.research.google.com/github/EliAckah/Machine-Learning-Projects/blob/main/Big_Mart_Sales_using_XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics

Data Collection and Analysis

In [None]:
# Read the Big Mart training CSV into a pandas DataFrame.
# NOTE(review): the path looks like a Colab Google Drive mount; no mount step
# is visible in this notebook - confirm the drive is mounted before running.
TRAIN_CSV = '/content/drive/MyDrive/Machine Learning/Train.csv'
bigmart_data = pd.read_csv(TRAIN_CSV)

In [None]:
# preview the first five rows of the raw data
bigmart_data.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [None]:
# (n_rows, n_columns) of the loaded frame; the column count includes the
# target Item_Outlet_Sales, so it is one more than the number of input columns
bigmart_data.shape

(8523, 12)

In [None]:
# per-column dtype and non-null count; columns whose non-null count is below
# the row count contain missing values
bigmart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [None]:
# number of missing entries in each column
bigmart_data.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

Categorical Variables

In [None]:
# Boolean mask over the columns: True where the column dtype is 'object'
s = bigmart_data.dtypes.eq('object')
# Names of the object-typed columns, in frame order
object_cols = [col for col, is_object in s.items() if is_object]

print("Categorical variables:", object_cols, sep="\n")

Categorical variables:
['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']


Categorical columns with fewer than 6 unique values

In [None]:
# Object-typed columns with fewer than 6 distinct values
categorical_cols_3 = [
    col
    for col, col_dtype in bigmart_data.dtypes.items()
    if col_dtype == "object" and bigmart_data[col].nunique() < 6
]

In [None]:
# show which object columns have fewer than 6 distinct values
print(categorical_cols_3)

['Item_Fat_Content', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']


Clean Inconsistent Labels and Handle Missing Values

In [None]:
# Normalise the inconsistent spellings of the fat-content labels.
# The result is assigned back rather than calling an in-place method on
# bigmart_data['Item_Fat_Content']: that selection is a temporary object, so
# pandas 2.2+ warns about the chained in-place call and, with Copy-on-Write
# enabled, the replacement never reaches the frame.
fat_content_aliases = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
bigmart_data['Item_Fat_Content'] = bigmart_data['Item_Fat_Content'].replace(fat_content_aliases)

In [None]:
# number of rows for each Outlet_Type label
bigmart_data['Outlet_Type'].value_counts()

Supermarket Type1    5577
Grocery Store        1083
Supermarket Type3     935
Supermarket Type2     928
Name: Outlet_Type, dtype: int64

In [None]:
# imputer for the numerical column Item_Weight: missing entries are replaced
# by the mean of the observed values
num_imputer = SimpleImputer(strategy = 'mean')


In [None]:
# SimpleImputer expects a 2-D input, so the column is reshaped to (n_rows, 1)
item_weight_2d = bigmart_data['Item_Weight'].to_numpy().reshape(-1, 1)
# fill the missing weights using num_imputer and write the column back
bigmart_data['Item_Weight'] = num_imputer.fit_transform(item_weight_2d)

In [None]:
# sanity check: no missing Item_Weight values should remain
bigmart_data['Item_Weight'].isna().sum()

0

In [None]:
# Imputer for the categorical column Outlet_Size: missing entries are replaced
# by the most frequent label. `np.nan` is used because the `np.NaN` alias was
# removed in NumPy 2.0 and raises AttributeError there.
cat_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [None]:
# SimpleImputer expects a 2-D input, so the column is reshaped to (n_rows, 1)
outlet_size_2d = bigmart_data['Outlet_Size'].to_numpy().reshape(-1, 1)
# fill the missing sizes using cat_imputer and write the column back
bigmart_data['Outlet_Size'] = cat_imputer.fit_transform(outlet_size_2d)

In [None]:
# sanity check: no missing Outlet_Size values should remain
bigmart_data['Outlet_Size'].isna().sum()

0

In [None]:
# preview the first five rows after imputation
bigmart_data.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Medium,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [None]:
# label frequencies of Outlet_Size after the missing entries were filled
bigmart_data['Outlet_Size'].value_counts()

Medium    5203
Small     2388
High       932
Name: Outlet_Size, dtype: int64

Encoding the Categorical Columns



*   Ordinal Encoding
*   One Hot Encoding



In [None]:
# object columns with fewer than 6 unique values (the `< 6` filter used to
# build categorical_cols_3); these are the columns ordinal-encoded below
print(categorical_cols_3)

['Item_Fat_Content', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']


In [None]:
# Integer-encode the four low-cardinality categorical columns.
#
# OrdinalEncoder sorts the labels alphabetically when no categories are given,
# which ranks Outlet_Size as High < Medium < Small. Outlet_Size therefore gets
# an explicit Small < Medium < High order so the codes follow the real size
# order. The other three columns keep the alphabetical order they had before.
#
# A fresh encoder is fitted per column, in the same column order as before, so
# `ord_encoder` is left fitted on Outlet_Location_Type. The (n_rows, 1) result
# is flattened before it is written back as a column.
#
# Re-running this cell on already-encoded data raises for Outlet_Size (the
# numeric codes are not among the listed labels) instead of silently
# re-encoding them.
explicit_category_order = {'Outlet_Size': [['Small', 'Medium', 'High']]}

for col in ['Item_Fat_Content', 'Outlet_Size', 'Outlet_Type', 'Outlet_Location_Type']:
    ord_encoder = OrdinalEncoder(categories=explicit_category_order.get(col, 'auto'))
    bigmart_data[col] = ord_encoder.fit_transform(bigmart_data[col].values.reshape(-1, 1)).ravel()

In [None]:
# frequencies of the encoded Outlet_Location_Type codes
bigmart_data['Outlet_Location_Type'].value_counts()

2.0    3350
1.0    2785
0.0    2388
Name: Outlet_Location_Type, dtype: int64

Categorical columns with more than 4 unique values

In [None]:
# Object-typed columns with more than 4 distinct values; the columns encoded
# above are numeric by now and no longer match the dtype test
categorical_cols_4 = [
    col
    for col, col_dtype in bigmart_data.dtypes.items()
    if col_dtype == "object" and bigmart_data[col].nunique() > 4
]
print(categorical_cols_4)

['Item_Identifier', 'Item_Type', 'Outlet_Identifier']


One-hot encode the unordered categorical columns with more than 4 unique values

In [None]:
# one-hot encoder for the columns in categorical_cols_4; sparse_output=False
# yields a dense array, which the next cell wraps in a DataFrame
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output = False)

In [None]:
# one-hot encode the columns listed in categorical_cols_4 ...
one_hot_values = OH_encoder.fit_transform(bigmart_data.loc[:, categorical_cols_4])
# ... and wrap the resulting array in a DataFrame (integer column labels)
OH_cat_cols = pd.DataFrame(data=one_hot_values)

In [None]:
# The one-hot frame was built from a bare array, so give it the same index as
# the source frame before joining the two
OH_cat_cols.index = bigmart_data.index

# Every column except the ones that were one-hot encoded
num_bigmart_data = bigmart_data.drop(columns=categorical_cols_4)

# Column-wise join of the remaining columns and the one-hot columns
OH_bigmartdata = pd.concat((num_bigmart_data, OH_cat_cols), axis='columns')

# The one-hot columns carry integer labels; convert every label to a string
OH_bigmartdata.columns = OH_bigmartdata.columns.astype(str)

In [None]:
# preview bigmart_data after ordinal encoding; the one-hot columns are in
# OH_bigmartdata, not in this frame
bigmart_data.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,0.0,0.016047,Dairy,249.8092,OUT049,1999,1.0,0.0,1.0,3735.138
1,DRC01,5.92,1.0,0.019278,Soft Drinks,48.2692,OUT018,2009,1.0,2.0,2.0,443.4228
2,FDN15,17.5,0.0,0.01676,Meat,141.618,OUT049,1999,1.0,0.0,1.0,2097.27
3,FDX07,19.2,1.0,0.0,Fruits and Vegetables,182.095,OUT010,1998,1.0,2.0,0.0,732.38
4,NCD19,8.93,0.0,0.0,Household,53.8614,OUT013,1987,0.0,2.0,1.0,994.7052


In [None]:
# Split the encoded frame into the target vector and the feature matrix
TARGET = 'Item_Outlet_Sales'
Y = OH_bigmartdata[TARGET]
X = OH_bigmartdata.drop(columns=[TARGET])

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable
(X_train, X_test,
 Y_train, Y_test) = train_test_split(X, Y, random_state=2, test_size=0.2)

In [None]:
# shapes of the full, training and test feature matrices
print(X.shape,X_train.shape,X_test.shape)

(8523, 1593) (6818, 1593) (1705, 1593)


In [None]:
# baseline model: XGBRegressor with the library's default hyper-parameters
bregressor = XGBRegressor()

In [None]:
# train the baseline model on the training split
bregressor.fit(X_train, Y_train)

In [None]:
# baseline predictions on the training split (used for the training error)
bigmart_pred = bregressor.predict(X_train)

In [None]:
# Mean absolute error of the baseline model on the training split.
# This is NOT R squared: the value is in the units of Item_Outlet_Sales and
# lower is better. The `r2_train` name is kept because the next cell prints it.
# Arguments follow scikit-learn's (y_true, y_pred) convention.
r2_train = mean_absolute_error(Y_train, bigmart_pred)

In [None]:
# training-set mean absolute error of the baseline model (despite the name,
# r2_train is computed with mean_absolute_error)
print(r2_train)

641.0434328492119


In [None]:
# baseline predictions on the held-out test split
bigmart_test_pred = bregressor.predict(X_test)


In [None]:
# Mean absolute error of the baseline model on the held-out test split.
# This is NOT R squared: the value is in the units of Item_Outlet_Sales and
# lower is better. The `r2_test` name is kept so existing references still work.
# Arguments follow scikit-learn's (y_true, y_pred) convention.
r2_test = mean_absolute_error(Y_test, bigmart_test_pred)
print(r2_test)

806.6037072873365


In [None]:
# second model: up to 1000 boosting rounds with a learning rate of 0.033
my_model_2 = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.033,
)



In [None]:
# Fit the second model with early stopping.
# - `early_stopping_rounds` is set on the estimator: passing it to fit() was
#   deprecated in XGBoost 1.6 and removed in 2.0, where it raises TypeError.
# - Early stopping watches a validation split carved out of the training
#   data. Using (X_test, Y_test) for it would let the test set choose the
#   number of boosting rounds and bias the test error reported afterwards.
X_fit, X_val, Y_fit, Y_val = train_test_split(
    X_train, Y_train, test_size=0.2, random_state=2
)
my_model_2.set_params(early_stopping_rounds=8)
my_model_2.fit(X_fit, Y_fit, eval_set=[(X_val, Y_val)], verbose=False)



In [None]:
# second model's predictions on the training split; this overwrites the
# baseline's `bigmart_pred` from earlier in the notebook
bigmart_pred = my_model_2.predict(X_train)

In [None]:
# training-set mean absolute error of the second model, with the arguments in
# (y_true, y_pred) order; MAE is symmetric, so the value is unchanged
r1_train = mean_absolute_error(Y_train, bigmart_pred)

In [None]:
# training-set mean absolute error of the second model
print(r1_train)

705.4000785224824


In [None]:
# second model's predictions on the held-out test split
# (note the double "t" in the name: this is distinct from `bigmart_pred`)
bigmartt_pred = my_model_2.predict(X_test)

In [None]:
# test-set mean absolute error of the second model, with the arguments in
# (y_true, y_pred) order; MAE is symmetric, so the value is unchanged
r1_test = mean_absolute_error(Y_test, bigmartt_pred)

In [None]:
# test-set mean absolute error of the second model
print(r1_test)

792.692583688256
