In [1]:
#Import
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import set_config
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import joblib
# Render sklearn estimators as diagrams (a later cell switches the display to 'text')
set_config(display='diagram')

# Load the sales data from a path relative to the working directory
filename = "Data/sales_predictions.csv"
df = pd.read_csv(filename)


In [2]:

## One fixed seed, reused by NumPy here and passed as random_state further down
SEED = 321
np.random.seed(SEED)
## Allow pandas to show up to 50 columns before truncating wide frames
pd.options.display.max_columns = 50



In [3]:
# Report the Python and scikit-learn versions this notebook is running on
import sys
import sklearn

# sys.version describes the kernel's own interpreter; a shell `python` call can
# resolve to a different installation than the one executing this notebook.
print(f"Python {sys.version.split()[0]}")
print(f"sklearn version: {sklearn.__version__}")

Python 3.9.15
sklearn version: 1.1.3


In [4]:
# Show sklearn estimators as plain text; this overrides the 'diagram' display chosen in the first cell
from sklearn import set_config
set_config(display='text')


In [5]:
def evaluate_regression(model, X_train, y_train, X_test, y_test):
    """Print R-squared and RMSE of a fitted regressor on training and test data.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_train, y_train : features and target used for the "Training Data" line
    X_test, y_test : features and target used for the "Test Data" line

    Returns
    -------
    None
        One report line per split is printed; nothing is returned.
    """
    splits = (("Training Data", X_train, y_train),
              ("Test Data", X_test, y_test))
    for label, X_split, y_split in splits:
        y_pred = model.predict(X_split)
        r2 = metrics.r2_score(y_split, y_pred)
        # RMSE is derived from the per-output MSE rather than by passing
        # ``squared=False``: that keyword was deprecated and later removed
        # from mean_squared_error, whereas this form runs on old and new
        # scikit-learn releases. Averaging the per-output roots reproduces
        # what ``squared=False`` returned.
        mse_per_output = metrics.mean_squared_error(y_split, y_pred,
                                                    multioutput='raw_values')
        rmse = np.sqrt(mse_per_output).mean()
        print(f"{label}:\tR^2= {r2:.2f}\tRMSE= {rmse:.2f}")


In [6]:
def get_coeffs_linreg(lin_reg, feature_names=None, sort=True, ascending=True,
                      name='LinearRegression Coefficients'):
    """Return a fitted linear model's coefficients and intercept as a Series.

    Parameters
    ----------
    lin_reg : fitted linear model exposing ``coef_`` and ``intercept_``
    feature_names : sequence of labels, optional
        Index labels for the coefficients. Defaults to
        ``lin_reg.feature_names_in_``.
    sort : bool, default True
        Sort the result by value.
    ascending : bool, default True
        Sort direction, used only when ``sort`` is true.
    name : str
        Name given to the returned Series.

    Returns
    -------
    pandas.Series
        One entry per feature plus an ``'intercept'`` entry.
    """
    if feature_names is None:
        feature_names = lin_reg.feature_names_in_
    # Apply `name` so the Series is labelled when displayed or placed in a frame
    coeffs = pd.Series(lin_reg.coef_, index=feature_names, name=name)
    coeffs['intercept'] = lin_reg.intercept_
    if sort:
        coeffs = coeffs.sort_values(ascending=ascending)
    return coeffs



In [7]:
def get_importances(model, feature_names=None, name='Feature Importance',
                    sort=False, ascending=True):
    """Return a fitted model's ``feature_importances_`` as a labelled Series.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``
    feature_names : sequence of labels, optional
        Index labels for the importances. Defaults to
        ``model.feature_names_in_``.
    name : str
        Name given to the returned Series.
    sort : bool, default False
        Sort the result by value.
    ascending : bool, default True
        Sort direction, used only when ``sort`` is true.

    Returns
    -------
    pandas.Series
    """
    # Identity check on purpose: comparing a NumPy array of names to None with
    # `==` is element-wise and cannot be used as an `if` condition.
    if feature_names is None:
        feature_names = model.feature_names_in_

    importances = pd.Series(model.feature_importances_, index=feature_names,
                            name=name)

    if sort:
        importances = importances.sort_values(ascending=ascending)

    return importances



In [8]:
def plot_importance(importances, top_n=None, figsize=(8,6)):
    """Plot importances as a horizontal bar chart and return the Axes.

    Parameters
    ----------
    importances : pandas.Series
        Values to plot, indexed by feature name.
    top_n : int, optional
        Plot only the ``top_n`` largest values. All values are plotted when
        omitted.
    figsize : tuple, default (8, 6)
        Figure size forwarded to the pandas plot call.

    Returns
    -------
    The Axes returned by the pandas plot call, so the caller can keep
    customising the figure.
    """
    # Sort ascending so the largest values sit at the tail of the Series
    ranked = importances.sort_values()
    if top_n is None:
        plot_vals = ranked
        title = "All Features - Ranked by Importance"
    else:
        # the largest top_n values are the last top_n rows of the sorted Series
        plot_vals = ranked.tail(top_n)
        title = f"Top {top_n} Most Important Features"

    ax = plot_vals.plot(kind='barh', figsize=figsize)
    ax.set(xlabel='Importance',
           ylabel='Feature Names',
           title=title)
    return ax



- Check for missing values.
- Then clean the data (standardize the `Item_Fat_Content` labels and drop the rows that contain missing values) and confirm the change.


In [9]:
# Inspect column dtypes and non-null counts, then preview the first rows
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [10]:
# Collapse the inconsistent spellings of the fat-content labels into 'Low Fat' / 'Regular'
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(
    {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'})
# Drop every row that still has a missing value (Item_Weight and Outlet_Size contain NaNs)
df = df.dropna()
# Last expression of the cell, so the cleaned label counts are actually displayed
df['Item_Fat_Content'].value_counts()

In [11]:
# Boolean row mask: True where both the item weight and the item visibility are positive
filter_df = df["Item_Weight"].gt(0) & df["Item_Visibility"].gt(0)

In [12]:
# Keep only the rows selected by the mask
df = df[filter_df]
# Display the filtered frame directly: indexing it with filter_df a second time
# would apply a mask whose index no longer matches df.
df

  df[filter_df]


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
6,FDO10,13.650,Regular,0.012741,Snack Foods,57.6588,OUT013,1987,High,Tier 3,Supermarket Type1,343.5528
11,FDA03,18.500,Regular,0.045464,Dairy,144.1102,OUT046,1997,Small,Tier 1,Supermarket Type1,2187.1530
...,...,...,...,...,...,...,...,...,...,...,...,...
8517,FDF53,20.750,Regular,0.083607,Frozen Foods,178.8318,OUT046,1997,Small,Tier 1,Supermarket Type1,3608.6360
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


In [13]:
# Confirm the row count and that no column has missing values after cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4358 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            4358 non-null   object 
 1   Item_Weight                4358 non-null   float64
 2   Item_Fat_Content           4358 non-null   object 
 3   Item_Visibility            4358 non-null   float64
 4   Item_Type                  4358 non-null   object 
 5   Item_MRP                   4358 non-null   float64
 6   Outlet_Identifier          4358 non-null   object 
 7   Outlet_Establishment_Year  4358 non-null   int64  
 8   Outlet_Size                4358 non-null   object 
 9   Outlet_Location_Type       4358 non-null   object 
 10  Outlet_Type                4358 non-null   object 
 11  Item_Outlet_Sales          4358 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 442.6+ KB


In [14]:
# Define X and y: the features exclude the target and the two identifier columns
X = df.drop(columns = ["Item_Outlet_Sales", "Outlet_Identifier", "Item_Identifier"]).copy()

# Target column
y = df["Item_Outlet_Sales"].copy()


# Train/test split (default split proportions); the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED)
# Sanity check: True would flag a column that still contains a missing value
X_train.isna().any()


Item_Weight                  False
Item_Fat_Content             False
Item_Visibility              False
Item_Type                    False
Item_MRP                     False
Outlet_Establishment_Year    False
Outlet_Size                  False
Outlet_Location_Type         False
Outlet_Type                  False
dtype: bool

In [15]:
## Selector for the object-dtype (categorical) columns; calling it on X_train lists the columns it picks
cat_sel = make_column_selector(dtype_include='object')
cat_sel(X_train)

['Item_Fat_Content',
 'Item_Type',
 'Outlet_Size',
 'Outlet_Location_Type',
 'Outlet_Type']

In [16]:
## Selector for the numeric columns; calling it on X_train lists the columns it picks
num_sel = make_column_selector(dtype_include='number')
num_sel(X_train)

['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year']

In [17]:
## make pipelines for categorical vs numeric data
# Categorical: impute with the most frequent category, then one-hot encode.
# No fill_value is passed because SimpleImputer only uses it with strategy='constant'.
cat_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'),
                         OneHotEncoder(drop='first', handle_unknown='ignore', sparse=False))
# Numeric: impute with the median, then standardise
num_pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())



In [18]:
## make the preprocessing column transformer: numeric pipeline on the numeric
## columns, categorical pipeline on the object columns.
## verbose_feature_names_out=False keeps the output names free of a transformer prefix.
preprocessor = make_column_transformer((num_pipe, num_sel),
                                       (cat_pipe,cat_sel),                                      
                                       verbose_feature_names_out=False)
preprocessor

ColumnTransformer(transformers=[('pipeline-1',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x000002B0A6436100>),
                                ('pipeline-2',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(fill_value='MISSING',
                                                                strategy='most_frequent')),
                                                 ('onehotencoder',
                                                  OneHotEncoder(drop='first',
                                                                handle_unknown='ignore',
             

In [19]:
## fit the column transformer on the training data only, then collect the
## names of the columns it outputs (get_feature_names_out)
preprocessor.fit(X_train)
feature_names = preprocessor.get_feature_names_out()
feature_names

array(['Item_Weight', 'Item_Visibility', 'Item_MRP',
       'Outlet_Establishment_Year', 'Item_Fat_Content_Regular',
       'Item_Type_Breads', 'Item_Type_Breakfast', 'Item_Type_Canned',
       'Item_Type_Dairy', 'Item_Type_Frozen Foods',
       'Item_Type_Fruits and Vegetables', 'Item_Type_Hard Drinks',
       'Item_Type_Health and Hygiene', 'Item_Type_Household',
       'Item_Type_Meat', 'Item_Type_Others', 'Item_Type_Seafood',
       'Item_Type_Snack Foods', 'Item_Type_Soft Drinks',
       'Item_Type_Starchy Foods', 'Outlet_Size_Medium',
       'Outlet_Size_Small', 'Outlet_Location_Type_Tier 2',
       'Outlet_Location_Type_Tier 3', 'Outlet_Type_Supermarket Type2'],
      dtype=object)

In [20]:
# Wrap the transformed training data in a DataFrame, restoring the column names and the original row index
X_train_df = pd.DataFrame(preprocessor.transform(X_train),
                           columns = feature_names, index = X_train.index)
X_train_df.head(3)

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Fat_Content_Regular,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Supermarket Type2
6436,-0.937701,-0.880575,0.157173,0.652306,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
1547,0.597684,2.238408,-1.331809,1.327034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
5432,0.822218,-1.318794,-0.599906,0.652306,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [21]:
# Transform the test data with the preprocessor fitted on the training data, keeping the names and row index
X_test_df = pd.DataFrame(preprocessor.transform(X_test),
                           columns = feature_names, index = X_test.index)
X_test_df.head(3)

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Fat_Content_Regular,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Supermarket Type2
6305,-0.08661,0.210815,-0.850904,1.327034,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
2103,1.570665,-0.976131,1.587135,-0.292313,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
105,0.897063,-0.42258,-0.007298,-1.641769,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [22]:
## confirm that the index of the first 3 rows of y_test matches the index of X_test_df
y_test.head(3)

6305    1133.8574
2103    5768.4912
105     2247.7408
Name: Item_Outlet_Sales, dtype: float64

In [23]:
## fitting a linear regression model on the processed training data and
## reporting R^2 / RMSE for both splits
lin_reg = LinearRegression()
lin_reg.fit(X_train_df, y_train)
evaluate_regression(lin_reg, X_train_df, y_train, X_test_df,y_test)


Training Data:	R^2= 0.48	RMSE= 1087.39
Test Data:	R^2= 0.45	RMSE= 1107.24


In [24]:
## Saving the coefficients as a Series labelled with the feature names,
## then appending the model intercept as an extra 'intercept' entry
coeffs = pd.Series(lin_reg.coef_, index= feature_names)
coeffs['intercept'] = lin_reg.intercept_
coeffs

Item_Weight                         -20.924450
Item_Visibility                      -5.933081
Item_MRP                           1031.125315
Outlet_Establishment_Year           -44.836197
Item_Fat_Content_Regular            -24.660029
Item_Type_Breads                     61.342520
Item_Type_Breakfast                 121.483570
Item_Type_Canned                    -59.659978
Item_Type_Dairy                     -71.785348
Item_Type_Frozen Foods              -50.641434
Item_Type_Fruits and Vegetables     -56.863395
Item_Type_Hard Drinks               -64.812830
Item_Type_Health and Hygiene        -94.738596
Item_Type_Household                -116.690466
Item_Type_Meat                     -120.936659
Item_Type_Others                    -71.696221
Item_Type_Seafood                   473.254442
Item_Type_Snack Foods               -32.207499
Item_Type_Soft Drinks               -75.466474
Item_Type_Starchy Foods              20.997483
Outlet_Size_Medium                   47.409228
Outlet_Size_S

In [25]:
# Raw coefficient array of the fitted model (unlabelled)
lin_reg.coef_

array([ -20.92444964,   -5.93308081, 1031.12531458,  -44.83619653,
        -24.66002872,   61.34251953,  121.48356975,  -59.65997804,
        -71.78534805,  -50.64143362,  -56.86339462,  -64.81283034,
        -94.73859591, -116.69046615, -120.93665851,  -71.69622136,
        473.25444199,  -32.20749945,  -75.46647426,   20.99748306,
         47.40922811,  -86.47870249,  222.90024005, -120.59845526,
       -159.66792964])

In [26]:
## Checking the number of coeffs matches the # of feature names
## (the first count is printed, the second is the cell's displayed value)
print(len(lin_reg.coef_))
len(feature_names)

25


25

In [27]:
## Saving the coefficients as a named Series (without the intercept at this point)
# NOTE(review): the Series is named 'Feature Importance' although it holds
# linear-regression coefficients — confirm this label is intended.
coeffs = pd.Series(lin_reg.coef_, index= feature_names, name='Feature Importance')
coeffs

Item_Weight                         -20.924450
Item_Visibility                      -5.933081
Item_MRP                           1031.125315
Outlet_Establishment_Year           -44.836197
Item_Fat_Content_Regular            -24.660029
Item_Type_Breads                     61.342520
Item_Type_Breakfast                 121.483570
Item_Type_Canned                    -59.659978
Item_Type_Dairy                     -71.785348
Item_Type_Frozen Foods              -50.641434
Item_Type_Fruits and Vegetables     -56.863395
Item_Type_Hard Drinks               -64.812830
Item_Type_Health and Hygiene        -94.738596
Item_Type_Household                -116.690466
Item_Type_Meat                     -120.936659
Item_Type_Others                    -71.696221
Item_Type_Seafood                   473.254442
Item_Type_Snack Foods               -32.207499
Item_Type_Soft Drinks               -75.466474
Item_Type_Starchy Foods              20.997483
Outlet_Size_Medium                   47.409228
Outlet_Size_S

In [28]:
# using .loc to add the intercept to the series as a new 'intercept' entry
coeffs.loc['intercept'] = lin_reg.intercept_
coeffs

Item_Weight                         -20.924450
Item_Visibility                      -5.933081
Item_MRP                           1031.125315
Outlet_Establishment_Year           -44.836197
Item_Fat_Content_Regular            -24.660029
Item_Type_Breads                     61.342520
Item_Type_Breakfast                 121.483570
Item_Type_Canned                    -59.659978
Item_Type_Dairy                     -71.785348
Item_Type_Frozen Foods              -50.641434
Item_Type_Fruits and Vegetables     -56.863395
Item_Type_Hard Drinks               -64.812830
Item_Type_Health and Hygiene        -94.738596
Item_Type_Household                -116.690466
Item_Type_Meat                     -120.936659
Item_Type_Others                    -71.696221
Item_Type_Seafood                   473.254442
Item_Type_Snack Foods               -32.207499
Item_Type_Soft Drinks               -75.466474
Item_Type_Starchy Foods              20.997483
Outlet_Size_Medium                   47.409228
Outlet_Size_S

In [29]:
# Sum of every entry in coeffs; at this point that includes the 'intercept' entry added above
coeffs.sum()

3083.336845059055

In [30]:
# From here on pandas renders floats with thousands separators and two decimals
pd.set_option('display.float_format', "{:,.2f}".format)
coeffs

Item_Weight                         -20.92
Item_Visibility                      -5.93
Item_MRP                          1,031.13
Outlet_Establishment_Year           -44.84
Item_Fat_Content_Regular            -24.66
Item_Type_Breads                     61.34
Item_Type_Breakfast                 121.48
Item_Type_Canned                    -59.66
Item_Type_Dairy                     -71.79
Item_Type_Frozen Foods              -50.64
Item_Type_Fruits and Vegetables     -56.86
Item_Type_Hard Drinks               -64.81
Item_Type_Health and Hygiene        -94.74
Item_Type_Household                -116.69
Item_Type_Meat                     -120.94
Item_Type_Others                    -71.70
Item_Type_Seafood                   473.25
Item_Type_Snack Foods               -32.21
Item_Type_Soft Drinks               -75.47
Item_Type_Starchy Foods              21.00
Outlet_Size_Medium                   47.41
Outlet_Size_Small                   -86.48
Outlet_Location_Type_Tier 2         222.90
Outlet_Loca

In [31]:
# Number of distinct values in each object (categorical) column
df.select_dtypes('object').nunique()

Item_Identifier         1523
Item_Fat_Content           2
Item_Type                 16
Outlet_Identifier          5
Outlet_Size                3
Outlet_Location_Type       3
Outlet_Type                2
dtype: int64

In [32]:
## Make x and y variables (same feature/target definition as the earlier split cell)
X = df.drop(columns = ["Item_Outlet_Sales", "Outlet_Identifier", "Item_Identifier"]).copy()

y = df["Item_Outlet_Sales"].copy()
## train-test-split with random state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=SEED)
X_train.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
6436,8.52,Low Fat,0.03,Soft Drinks,151.57,2004,Small,Tier 2,Supermarket Type1
1547,15.7,Low Fat,0.16,Household,57.56,2009,Medium,Tier 3,Supermarket Type2
5432,16.75,Regular,0.01,Canned,103.77,2004,Small,Tier 2,Supermarket Type1
4512,19.35,Low Fat,0.03,Health and Hygiene,65.62,2009,Medium,Tier 3,Supermarket Type2
3075,20.35,Low Fat,0.06,Baking Goods,233.36,1997,Small,Tier 1,Supermarket Type1


In [33]:
## make pipelines for categorical vs numeric data
# Categorical: missing values become their own 'MISSING' category; only binary
# features drop a column. handle_unknown='ignore' matches the first categorical
# pipeline, so categories not seen during fit do not make transform raise.
cat_pipe = make_pipeline(SimpleImputer(strategy='constant',
                                       fill_value='MISSING'),
                         OneHotEncoder(drop='if_binary', handle_unknown='ignore',
                                       sparse=False))
# Numeric: median imputation only, so these columns keep their original units (no scaling)
num_pipe = make_pipeline(SimpleImputer(strategy='median'))
## make the preprocessing column transformer
preprocessor = make_column_transformer((num_pipe, num_sel),
                                       (cat_pipe,cat_sel),
                                      verbose_feature_names_out=False)
## fit column transformer on the training data and run get_feature_names_out
preprocessor.fit(X_train)
feature_names = preprocessor.get_feature_names_out()
## rebuild the processed train/test frames with the new column names and the original row index
X_train_df = pd.DataFrame(preprocessor.transform(X_train),
                           columns = feature_names, index = X_train.index)
X_test_df = pd.DataFrame(preprocessor.transform(X_test),
                           columns = feature_names, index = X_test.index)
X_test_df.head(3)



Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Fat_Content_Regular,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Supermarket Type2
6305,12.5,0.07,87.92,2009.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
2103,20.25,0.02,241.85,1997.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
105,17.1,0.05,141.18,1987.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [34]:
## fitting a linear regression model on the re-processed features and
## reporting R^2 / RMSE for both splits
lin_reg = LinearRegression()
lin_reg.fit(X_train_df, y_train)
evaluate_regression(lin_reg, X_train_df, y_train, X_test_df,y_test)

Training Data:	R^2= 0.48	RMSE= 1087.39
Test Data:	R^2= 0.45	RMSE= 1107.24


In [35]:
## Saving the coefficients of the refitted model, labelled with the new feature names (no intercept entry)
coeffs = pd.Series(lin_reg.coef_, index= feature_names)

In [36]:
# Rank the features by absolute coefficient size; the highest rank is the largest magnitude.
# NOTE(review): the numeric columns are not scaled by this second preprocessor, so
# coefficient magnitudes of numeric and one-hot features are on different scales —
# confirm the ranking is meant to be read that way.
coeff_rank = coeffs.abs().rank().sort_values(ascending=False)
coeff_rank

Item_Type_Seafood                 28.00
Outlet_Location_Type_Tier 2       27.00
Outlet_Location_Type_Tier 3       26.00
Outlet_Type_Supermarket Type2     25.00
Item_Visibility                   24.00
Item_Type_Breakfast               23.00
Item_Type_Meat                    22.00
Item_Type_Household               21.00
Item_Type_Health and Hygiene      20.00
Item_Type_Breads                  19.00
Outlet_Size_Medium                18.00
Outlet_Size_Small                 17.00
Item_Type_Soft Drinks             16.00
Item_Type_Dairy                   15.00
Item_Type_Others                  14.00
Item_Type_Hard Drinks             13.00
Item_Type_Canned                  12.00
Item_Type_Fruits and Vegetables   11.00
Item_Type_Frozen Foods            10.00
Outlet_Location_Type_Tier 1        9.00
Item_Type_Starchy Foods            8.00
Item_Fat_Content_Regular           7.00
Item_Type_Snack Foods              6.00
Item_MRP                           5.00
Item_Type_Baking Goods             4.00


In [52]:
# Keep the three highest-ranked features (the values are ranks, not coefficient sizes)
top_n_features = coeff_rank.head(3)
top_n_features

Item_Type_Seafood             28.00
Outlet_Location_Type_Tier 2   27.00
Outlet_Location_Type_Tier 3   26.00
dtype: float64

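- `Item_Type_Seafood` has the largest absolute coefficient, so it is ranked as the most important feature for the target. `Outlet_Location_Type_Tier 2` and `Outlet_Location_Type_Tier 3` follow directly behind it in rank; note that the values above are ranks, so their closeness does not by itself mean the coefficients are almost equal in size.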

In [38]:
## Slicing out the actual coefficient values for the top_n_features names to visualize
plot_vals = coeffs.loc[top_n_features.index]
plot_vals.head(3)

Item_Type_Seafood              481.91
Outlet_Location_Type_Tier 2    193.33
Outlet_Location_Type_Tier 3   -153.70
dtype: float64

In [39]:
# Fit a random forest (default hyperparameters, fixed seed) on the same processed
# features and report R^2 / RMSE for both splits
reg = RandomForestRegressor(random_state=SEED)
reg.fit(X_train_df,y_train)
evaluate_regression(reg, X_train_df, y_train, X_test_df, y_test)

Training Data:	R^2= 0.92	RMSE= 438.11
Test Data:	R^2= 0.39	RMSE= 1163.44


In [40]:
## Checking the number of feature importances matches the # of columns
## (both counts are printed; the comparison is the cell's displayed value)
print(len(reg.feature_importances_))
print(len(X_train_df.columns))
len(reg.feature_importances_) == len(X_train_df.columns)

28
28


True

In [41]:
## Saving the random forest's feature importances as a Series labelled with the feature names
importances = pd.Series(reg.feature_importances_, index= feature_names,
                       name='Feature Importance')
importances



Item_Weight                       0.09
Item_Visibility                   0.13
Item_MRP                          0.60
Outlet_Establishment_Year         0.02
Item_Fat_Content_Regular          0.01
Item_Type_Baking Goods            0.01
Item_Type_Breads                  0.00
Item_Type_Breakfast               0.00
Item_Type_Canned                  0.01
Item_Type_Dairy                   0.01
Item_Type_Frozen Foods            0.01
Item_Type_Fruits and Vegetables   0.01
Item_Type_Hard Drinks             0.00
Item_Type_Health and Hygiene      0.00
Item_Type_Household               0.01
Item_Type_Meat                    0.01
Item_Type_Others                  0.00
Item_Type_Seafood                 0.00
Item_Type_Snack Foods             0.01
Item_Type_Soft Drinks             0.00
Item_Type_Starchy Foods           0.00
Outlet_Size_High                  0.00
Outlet_Size_Medium                0.01
Outlet_Size_Small                 0.01
Outlet_Location_Type_Tier 1       0.01
Outlet_Location_Type_Tier

In [42]:
# Total of all feature importances
importances.sum()

1.0

In [43]:
## Saving the feature importances sorted from smallest to largest (sort_values defaults to ascending=True)
sorted_importance = importances.sort_values()
sorted_importance

Item_Type_Others                  0.00
Item_Type_Seafood                 0.00
Item_Type_Starchy Foods           0.00
Item_Type_Hard Drinks             0.00
Item_Type_Breads                  0.00
Outlet_Size_High                  0.00
Item_Type_Health and Hygiene      0.00
Item_Type_Breakfast               0.00
Item_Type_Soft Drinks             0.00
Item_Type_Meat                    0.01
Outlet_Type_Supermarket Type2     0.01
Item_Type_Canned                  0.01
Item_Type_Baking Goods            0.01
Outlet_Size_Medium                0.01
Item_Type_Dairy                   0.01
Outlet_Location_Type_Tier 3       0.01
Outlet_Location_Type_Tier 2       0.01
Item_Type_Household               0.01
Item_Type_Frozen Foods            0.01
Outlet_Location_Type_Tier 1       0.01
Outlet_Size_Small                 0.01
Item_Type_Fruits and Vegetables   0.01
Item_Type_Snack Foods             0.01
Item_Fat_Content_Regular          0.01
Outlet_Establishment_Year         0.02
Item_Weight              

In [47]:
# The five largest feature importances, largest first
importances.sort_values(ascending=False).head(5)

Item_MRP                    0.60
Item_Visibility             0.13
Item_Weight                 0.09
Outlet_Establishment_Year   0.02
Item_Fat_Content_Regular    0.01
Name: Feature Importance, dtype: float64

- These feature importances show that `Item_MRP` is by far the most important feature for predicting the target. These are the 5 most important features according to the random forest model.

In [51]:
## saving variables for next lesson/notebook
import joblib
## bundle the processed splits, the fitted preprocessor and both fitted models in one dictionary
export = dict(
    X_train=X_train_df,
    y_train=y_train,
    X_test=X_test_df,
    y_test=y_test,
    preprocessor=preprocessor,
    LinearRegression=lin_reg,
    RandomForestRegressor=reg,
)
## write the bundle next to the notebook
joblib.dump(export, 'random_forest_l01.joblib')



['random_forest_l01.joblib']