Dataset link -- https://www.kaggle.com/datasets/brijbhushannanda1979/bigmart-sales-data

# Workflow
1) Data
2) Data Preprocessing
3) Data Analysis
4) Train Test Split
5) XGBoost Regressor
6) Evaluation

In [None]:
#Importing the dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics ## for finding the performance of the model

In [None]:
# Read the BigMart training CSV from its local path and preview the first rows.
train_csv_path = 'D:\\Data Science\\Datasets\\Machine Learning\\Projects\\big_mart_sales\\train.csv'
bigmart_data = pd.read_csv(train_csv_path)
bigmart_data.head()

In [None]:
# Dimensions of the loaded DataFrame as (rows, columns)
bigmart_data.shape

In [None]:
# Print a per-column summary of the DataFrame (non-null counts and dtypes)
bigmart_data.info()

In [None]:
# Count the missing entries in every column
bigmart_data.isna().sum()

In [None]:
# Mean of the "Item_Weight" column (displayed for inspection)
bigmart_data['Item_Weight'].mean()

In [None]:
# Fill the missing values in the "Item_Weight" column with the column mean.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selected column: that chained in-place form
# emits a FutureWarning in pandas 2.x and leaves the DataFrame unchanged
# once Copy-on-Write is enabled.
item_weight_mean = bigmart_data['Item_Weight'].mean()
bigmart_data['Item_Weight'] = bigmart_data['Item_Weight'].fillna(item_weight_mean)

In [None]:
# Most frequent value(s) of the "Outlet_Size" column across the whole dataset
bigmart_data['Outlet_Size'].mode()

In [None]:
# For each "Outlet_Type", find the most frequent "Outlet_Size"
# (the first entry of the mode when several values tie).
mode_of_outlet_size = bigmart_data.pivot_table(
    values='Outlet_Size',
    columns='Outlet_Type',
    aggfunc=lambda sizes: sizes.mode()[0],
)
mode_of_outlet_size

In [None]:
# Boolean mask marking the rows whose "Outlet_Size" is missing
outlet_size_column = bigmart_data['Outlet_Size']
miss_values = outlet_size_column.isna()
miss_values

In [None]:
# Fill each missing "Outlet_Size" with the most frequent size for that row's
# "Outlet_Type".
# The single 'Outlet_Size' row of the pivot table is flattened into a plain
# Outlet_Type -> size lookup and applied with Series.map, which yields one
# scalar per row. The previous apply() call returned a one-element Series per
# row (producing a DataFrame) and depended on implicit alignment when that
# frame was assigned to a single column.
outlet_size_by_type = mode_of_outlet_size.loc['Outlet_Size']
bigmart_data.loc[miss_values, 'Outlet_Size'] = (
    bigmart_data.loc[miss_values, 'Outlet_Type'].map(outlet_size_by_type)
)

In [None]:
# Re-count missing entries per column to check the result of the imputation
bigmart_data.isnull().sum()

In [None]:
## Data Analysis
# Summary statistics for the columns of the DataFrame
bigmart_data.describe() ## Statistical measures about the data

In [None]:
# Apply seaborn's default theme to all subsequent plots
sns.set_theme()

In [None]:
# Item weight distribution (histogram with a KDE overlay)
fig, ax = plt.subplots(figsize=(5, 5))
sns.histplot(data=bigmart_data, x='Item_Weight', kde=True, ax=ax)
plt.show()

In [None]:
# Item visibility distribution (histogram with a KDE overlay)
fig, ax = plt.subplots(figsize=(5, 5))
sns.histplot(data=bigmart_data, x='Item_Visibility', kde=True, ax=ax)
plt.show()

In [None]:
# Item MRP distribution (histogram with a KDE overlay)
fig, ax = plt.subplots(figsize=(5, 5))
sns.histplot(data=bigmart_data, x='Item_MRP', kde=True, ax=ax)
plt.show()

In [None]:
# Item outlet sales distribution (histogram with a KDE overlay)
fig, ax = plt.subplots(figsize=(5, 5))
sns.histplot(data=bigmart_data, x='Item_Outlet_Sales', kde=True, ax=ax)
plt.show()

In [None]:
# Number of rows per outlet establishment year
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(data=bigmart_data, x='Outlet_Establishment_Year', ax=ax)
plt.show()

In [None]:
## Categorical features
# Number of rows per "Item_Fat_Content" label
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(data=bigmart_data, x='Item_Fat_Content', ax=ax)
plt.show()

In [None]:
# Number of rows per "Item_Type"; the figure is extra wide so that the
# many category labels fit along the x-axis.
fig, ax = plt.subplots(figsize=(30, 6))
sns.countplot(data=bigmart_data, x='Item_Type', ax=ax)
plt.show()

In [None]:
# Number of rows per "Outlet_Size" category
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(data=bigmart_data, x='Outlet_Size', ax=ax)
plt.show()

# Pre-processing

In [None]:
# Preview the first rows before cleaning the categorical columns
bigmart_data.head()

In [None]:
# Frequency of each distinct "Item_Fat_Content" label
bigmart_data['Item_Fat_Content'].value_counts()

In [None]:
# Collapse the alternative spellings of the fat-content labels onto the two
# canonical values "Low Fat" and "Regular".
fat_content_aliases = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
bigmart_data['Item_Fat_Content'] = bigmart_data['Item_Fat_Content'].replace(fat_content_aliases)

In [None]:
# Frequency of each "Item_Fat_Content" label, to verify the label clean-up
bigmart_data['Item_Fat_Content'].value_counts()

## Label Encoding

In [None]:
# Label encoder instance used to turn the categorical columns into integer codes
encoder = LabelEncoder()

In [None]:
# Replace every categorical column with integer codes. The single encoder is
# re-fitted on each column in turn, so afterwards it only holds the classes
# of the last column in the list.
categorical_columns = [
    'Item_Identifier',
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
]
for column_name in categorical_columns:
    bigmart_data[column_name] = encoder.fit_transform(bigmart_data[column_name])

In [None]:
# Preview the first rows after label encoding
bigmart_data.head()

In [None]:
## Splitting the dataset
X = bigmart_data.drop(columns='Item_Outlet_Sales', axis=1)
Y = bigmart_data['Item_Outlet_Sales']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Show the shapes of the full, training and test feature sets
print(*(features.shape for features in (X, X_train, X_test)))

# Machine Learning Model Training

In [None]:
## XGBoost regressor
regressor = XGBRegressor()

In [None]:
# Train the regressor on the training split
regressor.fit(X_train, Y_train)

In [None]:
# Evaluation
training_data_prediction = regressor.predict(X_train)

In [None]:
# R squared of the model on the training split
r2_train = metrics.r2_score(y_true=Y_train, y_pred=training_data_prediction)

In [None]:
# Report the R squared value for the training split
print('R Squared value = ', r2_train)

In [None]:
# Predictions on the held-out test split
test_data_prediction = regressor.predict(X_test)

In [None]:
# R squared of the model on the held-out test split
r2_test = metrics.r2_score(y_true=Y_test, y_pred=test_data_prediction)

In [None]:
# Report the R squared value for the test split
print('R Squared value = ', r2_test)

In [None]:
# Score the fitted model on the test split via the estimator's own method.
# NOTE(review): for scikit-learn-style regressors score() reports R squared,
# so this presumably matches r2_test - confirm.
regressor.score(X_test, Y_test)