In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn import set_config
set_config(display='diagram')

In [None]:
def eval_regression(true, pred):
  mae = mean_absolute_error(true, pred)
  mse = mean_squared_error(true, pred)
  rmse = np.sqrt(mse)
  r2 = r2_score(true, pred)

  print(f'MAE {mae},\n MSE {mse},\n RMSE: {rmse},\n R^2: {r2} ')

In [None]:
df = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vT3wxApuGhpoZ-dZHQpSGuIUXmwilDVxt6rg6AaF25cWC7RFOUKg0vO5BYI3N3ebIwg2wO2xVcUuM8k/pub?gid=1150377214&single=true&output=csv')
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [None]:
df2 = df.copy()

In [None]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [None]:
df2.duplicated().any()

False

In [None]:
df2.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [None]:
df2.dropna(inplace=True)

In [None]:
df2 = df2.drop(columns = ['Item_Weight','Outlet_Size', 'Item_Identifier', 'Outlet_Identifier'])

In [None]:
print('unique Item_Types', df2['Item_Type'].unique())
print('\n')
print('unique Item_Fat_Contents', df2['Item_Fat_Content'].unique())
print('\n')
print('unique Outlet_Types', df2['Outlet_Type'].unique())
print('\n')

unique Item_Types ['Dairy' 'Soft Drinks' 'Meat' 'Household' 'Baking Goods' 'Snack Foods'
 'Fruits and Vegetables' 'Breakfast' 'Health and Hygiene' 'Frozen Foods'
 'Hard Drinks' 'Canned' 'Starchy Foods' 'Breads' 'Others' 'Seafood']


unique Item_Fat_Contents ['Low Fat' 'Regular' 'low fat' 'reg' 'LF']


unique Outlet_Types ['Supermarket Type1' 'Supermarket Type2']




In [None]:
df2.replace(['LF', 'low fat'], ['Low Fat', 'Low Fat'], inplace=True) 
df2['Item_Fat_Content'].value_counts()

Low Fat    3004
Regular    1575
reg          71
Name: Item_Fat_Content, dtype: int64

In [None]:
df2.replace(['reg'], ['Regular'], inplace=True)
df2['Item_Fat_Content'].value_counts()

Low Fat    3004
Regular    1646
Name: Item_Fat_Content, dtype: int64

In [None]:
y= df2['Item_Outlet_Sales']
X= df2.drop(columns=['Item_Outlet_Sales'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)

In [None]:
len(X_train)

3487

In [None]:
len(X_test)

1163

In [None]:
mean_imp = SimpleImputer(strategy='mean')
most_freq_imp = SimpleImputer(strategy='most_frequent')

In [None]:
cat_sel = make_column_selector(dtype_include='object')
num_sel = make_column_selector(dtype_include='number')

In [None]:
OHE = OneHotEncoder(handle_unknown='ignore', sparse= False)
Teas_gone_cold = StandardScaler()

In [None]:
npipe= make_pipeline(mean_imp, Teas_gone_cold)
npipe

In [None]:
cpipe= make_pipeline(most_freq_imp, OHE)
cpipe

In [None]:
npipe_tup= (npipe, num_sel)
cpipe_tup= (cpipe, cat_sel)

In [None]:
prep = make_column_transformer(npipe_tup, cpipe_tup, remainder= 'passthrough')
prep

In [None]:
prep.fit(X_train)

In [None]:
xtrainpro= prep.transform(X_train)
xtestpro= prep.transform(X_test)

In [None]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4650 entries, 0 to 8522
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Fat_Content           4650 non-null   object 
 1   Item_Visibility            4650 non-null   float64
 2   Item_Type                  4650 non-null   object 
 3   Item_MRP                   4650 non-null   float64
 4   Outlet_Establishment_Year  4650 non-null   int64  
 5   Outlet_Location_Type       4650 non-null   object 
 6   Outlet_Type                4650 non-null   object 
 7   Item_Outlet_Sales          4650 non-null   float64
dtypes: float64(3), int64(1), object(4)
memory usage: 327.0+ KB


In [None]:
print(np.isnan(xtrainpro).sum().sum(), 'missing values in training data')
print(np.isnan(xtestpro).sum().sum(), 'missing values in testing data')
print('\n')
print('All data in xtrainpro are', xtrainpro.dtype)
print('All data in xtestpro are', xtestpro.dtype)
print('\n')
print('shape of data is', xtrainpro.shape)
print('\n')
xtrainpro

0 missing values in training data
0 missing values in testing data


All data in xtrainpro are float64
All data in xtestpro are float64


shape of data is (3487, 26)




array([[-0.80142192,  1.55615978, -0.02828611, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.06509895,  0.31746464, -0.02828611, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.49877652, -0.31978717,  1.32101179, ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [ 0.43911456, -0.98967997,  1.32101179, ...,  1.        ,
         0.        ,  1.        ],
       [-0.69794226,  1.19135799, -0.02828611, ...,  0.        ,
         1.        ,  0.        ],
       [-0.28145496, -1.30541276, -1.6474436 , ...,  1.        ,
         1.        ,  0.        ]])

In [None]:
lin_reg= LinearRegression()

In [None]:
regg_pipe= make_pipeline(prep, lin_reg)
regg_pipe.fit(X_train, y_train)

In [None]:
train_pred = regg_pipe.predict(X_train)
test_pred = regg_pipe.predict(X_test)

In [None]:
rmse_train= np.mean(np.abs(train_pred-y_train)**2)
print(f'Model Training RMSE: {rmse_train}')

Model Training RMSE: 1197377.5570395964


In [None]:
rmse_test= np.mean(np.abs(test_pred-y_test)**2)
print(f'Model Testing RMSE: {rmse_test}')

Model Testing RMSE: 1145721.25976226


In [None]:
r2_train = np.corrcoef(y_train,train_pred)[0][1]**2
r2_test = np.corrcoef(y_test, test_pred)[0][1]**2

print(f'Model Training R2: {r2_train}')
print(f'Model Testing R2: {r2_test}')

Model Training R2: 0.4680085326384319
Model Testing R2: 0.4843745004424538


In [None]:
d_tree= DecisionTreeRegressor(random_state = 42)

In [None]:
d_tree_p= make_pipeline(prep, d_tree)

In [None]:
d_tree_p.fit(X_train, y_train)

In [None]:
predtrain= d_tree_p.predict(X_train)
predtest= d_tree_p.predict(X_test)

In [None]:
train_score= d_tree_p.score(X_train, y_train)
print(train_score)

1.0


In [None]:
test_score= d_tree_p.score(X_test, y_test)
print(test_score)

-0.1433379249141009


In [None]:
d_tree.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 42,
 'splitter': 'best'}

In [None]:
d_tree.get_depth()

33

In [None]:
d_tree.get_n_leaves()

3450