In [54]:
import numpy as np                                        # for creating numpy arrays
import pandas as pd                                       # for creating pandas dataframe
import matplotlib.pyplot as plt                           # for Visualization
import seaborn as sns                                     # for Visualization
from sklearn.preprocessing import LabelEncoder            # for Label Encoding on categorical data
from sklearn.model_selection import train_test_split      # for splitting data
from xgboost import XGBRegressor                          # ML Model
from sklearn.linear_model import LinearRegression         # ML Model
from sklearn import metrics                               # to find performance of ML model

In [2]:
# Load the training data; 'Train.csv' is resolved relative to the notebook's working directory.
df = pd.read_csv('Train.csv')

In [3]:
# Display the loaded frame; the recorded output is the truncated first/last-rows preview.
df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


In [4]:
# Peek at the first 5 rows to check the column names and the raw value formats
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [5]:
# (rows, columns) of the raw frame — the recorded run shows 8523 rows and 12 columns
df.shape

(8523, 12)

In [6]:
# Column dtypes and non-null counts; in the recorded output only Item_Weight and
# Outlet_Size have fewer non-null entries than the 8523 rows
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [7]:
# Count the missing entries in each column
df.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.643456,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


In [63]:
# Drop every row that contains at least one missing value.
# Reassigning instead of using inplace=True avoids mutating the frame object that
# earlier cells displayed and keeps the step chainable.
# NOTE(review): the recorded shape after this step, (7060, 12), equals the non-null
# count of Item_Weight alone, yet Outlet_Size also has 2410 missing values and rows
# with an empty Outlet_Size (e.g. index 3) still appear in later outputs. A fresh
# top-to-bottom run will likely drop more rows than the recorded output shows —
# confirm whether only Item_Weight should be checked (dropna(subset=[...])).
df = df.dropna()

In [64]:
# Shape of the frame after dropping the rows with missing values
df.shape

(7060, 12)

In [65]:
#Data Pre-Processing

In [66]:
# Inspect the frame before encoding.
# NOTE(review): the recorded output already shows integer-coded categorical columns even
# though the encoding cell comes later — the output appears to come from an out-of-order
# run; re-execute from a fresh kernel to refresh it.
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,156,9.3,1,0.016047,4,249.8092,9,1999,1,0,1,3735.138
1,8,5.92,2,0.019278,14,48.2692,3,2009,1,2,2,443.4228
2,662,17.5,1,0.01676,10,141.618,9,1999,1,0,1,2097.27
3,1121,19.2,2,0.0,6,182.095,0,1998,3,2,0,732.38
4,1297,8.93,1,0.0,9,53.8614,1,1987,0,2,1,994.7052


In [67]:
# Frequency of each Item_Fat_Content category.
# NOTE(review): the recorded output lists five distinct codes, while the raw preview only
# shows 'Low Fat' and 'Regular' — the extra categories may be spelling variants that
# should be merged before encoding; TODO confirm against the raw labels.
df['Item_Fat_Content'].value_counts()

1    4222
2    2388
0     260
4     106
3      84
Name: Item_Fat_Content, dtype: int64

In [68]:
# Label encoder used to turn the text-valued columns into integer codes
encoder = LabelEncoder()

In [69]:
# Integer-encode every text-valued column with its own LabelEncoder.
# A single shared encoder would be refitted on each column, so only the mapping of the
# last column would survive; keeping one fitted encoder per column in `encoders` lets
# the codes be decoded with inverse_transform or applied to new data later.
categorical_columns = [
    'Item_Identifier',
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
]

encoders = {}
for column in categorical_columns:
    # `encoder` ends up bound to the Outlet_Type encoder, as it did before this change.
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    encoders[column] = encoder

In [70]:
# Confirm that the categorical columns now hold integer codes
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,156,9.3,1,0.016047,4,249.8092,7,1999,1,0,1,3735.138
1,8,5.92,2,0.019278,14,48.2692,3,2009,1,2,2,443.4228
2,660,17.5,1,0.01676,10,141.618,7,1999,1,0,1,2097.27
3,1117,19.2,2,0.0,6,182.095,0,1998,3,2,0,732.38
4,1293,8.93,1,0.0,9,53.8614,1,1987,0,2,1,994.7052


In [71]:
# Separate the target (Item_Outlet_Sales) from the predictor columns
Y = df['Item_Outlet_Sales']
X = df.drop(columns=['Item_Outlet_Sales'])

In [72]:
# Plain-text dump of the feature matrix (the wide frame is wrapped across several column blocks)
print(X)

      Item_Identifier  Item_Weight  Item_Fat_Content  Item_Visibility  \
0                 156        9.300                 1         0.016047   
1                   8        5.920                 2         0.019278   
2                 660       17.500                 1         0.016760   
3                1117       19.200                 2         0.000000   
4                1293        8.930                 1         0.000000   
...               ...          ...               ...              ...   
8518              369        6.865                 1         0.056783   
8519              893        8.380                 2         0.046982   
8520             1353       10.600                 1         0.035186   
8521              679        7.210                 2         0.145221   
8522               50       14.800                 1         0.044878   

      Item_Type  Item_MRP  Outlet_Identifier  Outlet_Establishment_Year  \
0             4  249.8092                  7    

In [73]:
# Plain-text dump of the target series
print(Y)

0       3735.1380
1        443.4228
2       2097.2700
3        732.3800
4        994.7052
          ...    
8518    2778.3834
8519     549.2850
8520    1193.1136
8521    1845.5976
8522     765.6700
Name: Item_Outlet_Sales, Length: 7060, dtype: float64


In [74]:
#Machine Learning Model Training

In [75]:
#Linear Regressor

In [76]:
# Baseline model: linear regression with default settings
regressor = LinearRegression()

In [77]:

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [78]:
# Fit the linear model on the training split only
regressor.fit(X_train, Y_train)

In [79]:
# NOTE(review): this builds a new, unfitted estimator and discards it — it looks like the
# repr printed by the preceding fit() call was pasted in as code; the cell can be removed.
LinearRegression()

In [80]:
# Predict on the same rows the linear model was fitted on (used for the training-set score)
training_data_prediction = regressor.predict(X_train)

In [81]:
# Coefficient of determination (R²) of the linear model on the training split
r2_train = metrics.r2_score(
    y_true=Y_train,
    y_pred=training_data_prediction,
)

In [82]:
# Report the training-set R² of the linear model
print('R Squared value = ', r2_train)

R Squared value =  0.4647954241224679


In [83]:
# Predict on the held-out rows the linear model has not seen during fitting
test_data_prediction = regressor.predict(X_test)

In [84]:
# Coefficient of determination (R²) of the linear model on the held-out split
r2_test = metrics.r2_score(
    y_true=Y_test,
    y_pred=test_data_prediction,
)

In [85]:
# Report the test-set R² of the linear model
print('R Squared value = ', r2_test)

R Squared value =  0.4673130731538986


In [95]:

!pip install xgboost


Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/11/6f/419545a6a344cfd1358a80c36a06431881d607830483ef63d7c38905cd22/xgboost-2.0.1-py3-none-win_amd64.whl.metadata
  Downloading xgboost-2.0.1-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.1-py3-none-win_amd64.whl (99.7 MB)
   ---------------------------------------- 0.0/99.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.7 MB 1.4 MB/s eta 0:01:14
   ---------------------------------------- 0.0/99.7 MB 1.4 MB/s eta 0:01:14
   ---------------------------------------- 0.1/99.7 MB 660.6 kB/s eta 0:02:31
   ---------------------------------------- 0.1/99.7 MB 939.4 kB/s eta 0:01:47
   ---------------------------------------- 0.2/99.7 MB 919.0 kB/s eta 0:01:49
   ---------------------------------------- 0.2/99.7 MB 1.1 MB/s eta 0:01:30
   ---------------------------------------- 0.3/99.7 MB 1.3 MB/s eta 0:01:16
   ------------------------------


[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [96]:
from xgboost import XGBRegressor

In [97]:
#XGBoost Regressor

In [98]:
# Gradient-boosted regressor with default hyperparameters.
# Note: this rebinds `regressor`, replacing the linear model fitted earlier.
regressor = XGBRegressor()

In [99]:
# Fit the boosted model on the same training split that the linear model used
regressor.fit(X_train, Y_train)

In [100]:
# NOTE(review): this builds a new, unfitted estimator and discards it — it looks like the
# repr printed by the preceding fit() call was pasted in as code; the cell can be removed.
XGBRegressor()

In [101]:
# Predict on the same rows the boosted model was fitted on (used for the training-set score)
training_data_prediction = regressor.predict(X_train)

In [102]:
# Coefficient of determination (R²) of the boosted model on the training split
r2_train = metrics.r2_score(
    y_true=Y_train,
    y_pred=training_data_prediction,
)

In [103]:
# Report the training-set R² of the boosted model
print('R Squared value = ', r2_train)

R Squared value =  0.8636824834453669


In [104]:
# Predict on the held-out rows the boosted model has not seen during fitting
test_data_prediction = regressor.predict(X_test)

In [105]:
# Coefficient of determination (R²) of the boosted model on the held-out split
r2_test = metrics.r2_score(
    y_true=Y_test,
    y_pred=test_data_prediction,
)

In [106]:
# Report the test-set R² of the boosted model
print('R Squared value = ', r2_test)

R Squared value =  0.431570074014475


In [107]:
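# Conclusion: Linear Regression generalizes slightly better on the test set
# (R² ≈ 0.467 vs. ≈ 0.432 for XGBoost). XGBoost's much higher training R² (≈ 0.864)
# compared with its test R² indicates that the default XGBoost model is overfitting.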