In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.linear_model as lm
import statsmodels.formula.api as sm
from sklearn.preprocessing import scale
import seaborn as sns
import statsmodels.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold, cross_val_score
from sklearn.metrics import mean_squared_error

In [47]:
mart = pd.read_csv('BigMart.csv')
mart

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,9.300,Low_Fat,0.016047,Dairy,249.8092,Medium,Tier_1,Supermarket_Type1,3735.1380
1,5.920,Regular,0.019278,Soft_Drinks,48.2692,Medium,Tier_3,Supermarket_Type2,443.4228
2,17.500,Low_Fat,0.016760,Meat,141.6180,Medium,Tier_1,Supermarket_Type1,2097.2700
3,19.200,Regular,0.000000,Fruits_and_Vegetables,182.0950,,Tier_3,Grocery_Store,732.3800
4,8.930,Low_Fat,0.000000,Household,53.8614,High,Tier_3,Supermarket_Type1,994.7052
...,...,...,...,...,...,...,...,...,...
8518,6.865,Low_Fat,0.056783,Snack_Foods,214.5218,High,Tier_3,Supermarket_Type1,2778.3834
8519,8.380,Regular,0.046982,Baking_Goods,108.1570,,Tier_2,Supermarket_Type1,549.2850
8520,10.600,Low_Fat,0.035186,Health_and_Hygiene,85.1224,Small,Tier_2,Supermarket_Type1,1193.1136
8521,7.210,Regular,0.145221,Snack_Foods,103.1332,Medium,Tier_3,Supermarket_Type2,1845.5976


In [48]:
mart.Item_Weight.fillna(mart.Item_Weight.mean(skipna=True), inplace=True)
mart.Outlet_Size.fillna(mart.Outlet_Size.mode()[0], inplace=True)

In [49]:
mart.Item_Fat_Content.unique()

array(['Low_Fat', 'Regular', 'low_fat', 'LF', 'reg'], dtype=object)

In [50]:
ItemFatContent2 = mart.Item_Fat_Content.map({'low_fat': 'Low_Fat', 'LF': 'Low_Fat', 'Regular': 'Regular', 'Low_Fat': 'Low_Fat', 'reg' : 'Regular'})

In [51]:
mart['Item_Fat_Content2'] = ItemFatContent2

In [52]:
mart.Item_Fat_Content2.unique()

array(['Low_Fat', 'Regular'], dtype=object)

In [53]:
from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [54]:
mart['Item_Weight_s'] = scale(mart.Item_Weight, with_mean=True, with_std=True)

In [55]:
mart['Item_Visibility_s'] = scale(mart.Item_Visibility, with_mean=True, with_std=True)
mart['Item_MRP_s'] = scale(mart.Item_Visibility, with_mean=True, with_std=True)
mart.columns

Index(['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type',
       'Item_MRP', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type',
       'Item_Outlet_Sales', 'Item_Fat_Content2', 'Item_Weight_s',
       'Item_Visibility_s', 'Item_MRP_s'],
      dtype='object')

In [56]:
X = add_constant(mart.drop(columns=['Item_Fat_Content', 'Item_Visibility_s', 'Item_MRP_s', 'Item_Weight_s', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type','Item_Fat_Content2', 'Item_Type']))

In [57]:
X

Unnamed: 0,const,Item_Weight,Item_Visibility,Item_MRP,Item_Outlet_Sales
0,1.0,9.300,0.016047,249.8092,3735.1380
1,1.0,5.920,0.019278,48.2692,443.4228
2,1.0,17.500,0.016760,141.6180,2097.2700
3,1.0,19.200,0.000000,182.0950,732.3800
4,1.0,8.930,0.000000,53.8614,994.7052
...,...,...,...,...,...
8518,1.0,6.865,0.056783,214.5218,2778.3834
8519,1.0,8.380,0.046982,108.1570,549.2850
8520,1.0,10.600,0.035186,85.1224,1193.1136
8521,1.0,7.210,0.145221,103.1332,1845.5976


In [58]:
vif = pd.DataFrame()
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]

In [59]:
vif

Unnamed: 0,Features,VIF
0,const,16.839234
1,Item_Weight,1.000783
2,Item_Visibility,1.024886
3,Item_MRP,1.48749
4,Item_Outlet_Sales,1.511739


In [60]:
mart

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_Fat_Content2,Item_Weight_s,Item_Visibility_s,Item_MRP_s
0,9.300,Low_Fat,0.016047,Dairy,249.8092,Medium,Tier_1,Supermarket_Type1,3735.1380,Low_Fat,-0.841872,-0.970732,-0.970732
1,5.920,Regular,0.019278,Soft_Drinks,48.2692,Medium,Tier_3,Supermarket_Type2,443.4228,Regular,-1.641706,-0.908111,-0.908111
2,17.500,Low_Fat,0.016760,Meat,141.6180,Medium,Tier_1,Supermarket_Type1,2097.2700,Low_Fat,1.098554,-0.956917,-0.956917
3,19.200,Regular,0.000000,Fruits_and_Vegetables,182.0950,Medium,Tier_3,Grocery_Store,732.3800,Regular,1.500838,-1.281758,-1.281758
4,8.930,Low_Fat,0.000000,Household,53.8614,High,Tier_3,Supermarket_Type1,994.7052,Low_Fat,-0.929428,-1.281758,-1.281758
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,6.865,Low_Fat,0.056783,Snack_Foods,214.5218,High,Tier_3,Supermarket_Type1,2778.3834,Low_Fat,-1.418084,-0.181193,-0.181193
8519,8.380,Regular,0.046982,Baking_Goods,108.1570,Medium,Tier_2,Supermarket_Type1,549.2850,Regular,-1.059578,-0.371154,-0.371154
8520,10.600,Low_Fat,0.035186,Health_and_Hygiene,85.1224,Small,Tier_2,Supermarket_Type1,1193.1136,Low_Fat,-0.534243,-0.599784,-0.599784
8521,7.210,Regular,0.145221,Snack_Foods,103.1332,Medium,Tier_3,Supermarket_Type2,1845.5976,Regular,-1.336444,1.532880,1.532880


In [111]:
ohe = pd.get_dummies(data= mart, columns=['Item_Fat_Content2', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'])

In [112]:
ohe.columns

Index(['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type',
       'Item_MRP', 'Item_Outlet_Sales', 'Item_Weight_s', 'Item_Visibility_s',
       'Item_MRP_s', 'Item_Fat_Content2_Low_Fat', 'Item_Fat_Content2_Regular',
       'Outlet_Size_High', 'Outlet_Size_Medium', 'Outlet_Size_Small',
       'Outlet_Location_Type_Tier_1', 'Outlet_Location_Type_Tier_2',
       'Outlet_Location_Type_Tier_3', 'Outlet_Type_Grocery_Store',
       'Outlet_Type_Supermarket_Type1', 'Outlet_Type_Supermarket_Type2',
       'Outlet_Type_Supermarket_Type3'],
      dtype='object')

In [113]:
ohefinal = ohe.drop(columns=X.drop(columns=['const']))

In [114]:
ohefinal.drop(columns=['Item_Fat_Content'], inplace=True)

In [119]:
ohefinal['Sales'] = ohe['Item_Outlet_Sales']
ohefinal

Unnamed: 0,Item_Type,Item_Weight_s,Item_Visibility_s,Item_MRP_s,Item_Fat_Content2_Low_Fat,Item_Fat_Content2_Regular,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier_1,Outlet_Location_Type_Tier_2,Outlet_Location_Type_Tier_3,Outlet_Type_Grocery_Store,Outlet_Type_Supermarket_Type1,Outlet_Type_Supermarket_Type2,Outlet_Type_Supermarket_Type3,Sales
0,Dairy,-0.841872,-0.970732,-0.970732,1,0,0,1,0,1,0,0,0,1,0,0,3735.1380
1,Soft_Drinks,-1.641706,-0.908111,-0.908111,0,1,0,1,0,0,0,1,0,0,1,0,443.4228
2,Meat,1.098554,-0.956917,-0.956917,1,0,0,1,0,1,0,0,0,1,0,0,2097.2700
3,Fruits_and_Vegetables,1.500838,-1.281758,-1.281758,0,1,0,1,0,0,0,1,1,0,0,0,732.3800
4,Household,-0.929428,-1.281758,-1.281758,1,0,1,0,0,0,0,1,0,1,0,0,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,Snack_Foods,-1.418084,-0.181193,-0.181193,1,0,1,0,0,0,0,1,0,1,0,0,2778.3834
8519,Baking_Goods,-1.059578,-0.371154,-0.371154,0,1,0,1,0,0,1,0,0,1,0,0,549.2850
8520,Health_and_Hygiene,-0.534243,-0.599784,-0.599784,1,0,0,0,1,0,1,0,0,1,0,0,1193.1136
8521,Snack_Foods,-1.336444,1.532880,1.532880,0,1,0,1,0,0,0,1,0,0,1,0,1845.5976


In [116]:
string = ''
for a in ohefinal.columns:
    string+='+'+a

In [117]:
string

'+Item_Type+Item_Weight_s+Item_Visibility_s+Item_MRP_s+Item_Fat_Content2_Low_Fat+Item_Fat_Content2_Regular+Outlet_Size_High+Outlet_Size_Medium+Outlet_Size_Small+Outlet_Location_Type_Tier_1+Outlet_Location_Type_Tier_2+Outlet_Location_Type_Tier_3+Outlet_Type_Grocery_Store+Outlet_Type_Supermarket_Type1+Outlet_Type_Supermarket_Type2+Outlet_Type_Supermarket_Type3+Sales'

In [170]:
fit1 = sm.ols('Sales~Item_Weight_s+Item_Visibility_s+Item_MRP_s+Item_Fat_Content2_Low_Fat+Item_Fat_Content2_Regular+Outlet_Size_High+Outlet_Size_Medium+Outlet_Size_Small+Outlet_Location_Type_Tier_1+Outlet_Location_Type_Tier_2+Outlet_Location_Type_Tier_3+Outlet_Type_Grocery_Store+Outlet_Type_Supermarket_Type1+Outlet_Type_Supermarket_Type2+Outlet_Type_Supermarket_Type3', data=ohefinal).fit()

In [171]:
fit1.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.244
Model:,OLS,Adj. R-squared:,0.242
Method:,Least Squares,F-statistic:,109.8
Date:,"Tue, 04 Apr 2023",Prob (F-statistic):,0.0
Time:,14:52:49,Log-Likelihood:,-74330.0
No. Observations:,8523,AIC:,148700.0
Df Residuals:,8497,BIC:,148900.0
Df Model:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-2.583e+15,2.82e+15,-0.916,0.359,-8.11e+15,2.94e+15
Item_Type[T.Breads],228.7115,110.631,2.067,0.039,11.848,445.575
Item_Type[T.Breakfast],243.8418,153.521,1.588,0.112,-57.097,544.780
Item_Type[T.Canned],235.0598,82.534,2.848,0.004,73.274,396.846
Item_Type[T.Dairy],299.9962,81.739,3.670,0.000,139.767,460.225
Item_Type[T.Frozen_Foods],159.5004,77.410,2.060,0.039,7.759,311.242
Item_Type[T.Fruits_and_Vegetables],309.1686,72.178,4.283,0.000,167.682,450.656
Item_Type[T.Hard_Drinks],176.4186,118.641,1.487,0.137,-56.146,408.984
Item_Type[T.Health_and_Hygiene],63.0979,89.501,0.705,0.481,-112.347,238.542

0,1,2,3
Omnibus:,1486.643,Durbin-Watson:,2.004
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2825.929
Skew:,1.079,Prob(JB):,0.0
Kurtosis:,4.818,Cond. No.,5060000000000000.0
