# Create Project

In [1]:
#import basic Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer



## Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

## Models
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import ElasticNet

## Regression Metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

## Set global scikit-learn configuration 
from sklearn import set_config

## Display estimators as a diagram
set_config(display='diagram') # 'text' or 'diagram'}


## Functions

In [2]:
## Create a function to take the true and predicted values
## and print MAE, MSE, RMSE, and R2 metrics for a model
def model_metrics(pipe, x_train, y_train, x_test, y_test, 
                       model_name='Regression Model', ):
  ## Train
  mae = round(mean_absolute_error(y_train, pipe.predict(x_train)),4)
  mse = round(mean_squared_error(y_train, pipe.predict(x_train)),4)
  rmse = round(np.sqrt(mean_squared_error(y_train, pipe.predict(x_train))),4)
  r2 = round(r2_score(y_train, pipe.predict(x_train)),6)
  print(f'{model_name} Train Scores')
  print(f'MAE: {mae:,.4f} \nMSE: {mse:,.4f} \nRMSE: {rmse:,.4f} \nR2: {r2:.4f}\n')

  ## Test
  mae = round(mean_absolute_error(y_test, pipe.predict(x_test)),4)
  mse = round(mean_squared_error(y_test, pipe.predict(x_test)),4)
  rmse = round(np.sqrt(mean_squared_error(y_test, pipe.predict(x_test))),4)
  r2 = round(r2_score(y_test, pipe.predict(x_test)),6)

  # Display the metrics for the model
  print(f'{model_name} Test Scores')
  print(f'MAE: {mae:,.4f} \nMSE: {mse:,.4f} \nRMSE: {rmse:,.4f} \nR2: {r2:.4f}\n')

## Load Data

In [3]:
#create pandas dataframe
filename='Data/sales_predictions - sales_predictions (1).csv'
df = pd.read_csv(filename)

In [4]:
## Display the first (5) rows of the dataframe
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


- The data has loaded properly.

In [5]:
## Display the number of rows and columns for the dataframe
df.shape
print(f'There are {df.shape[0]} rows, and {df.shape[1]} columns.')
print(f'The rows represent {df.shape[0]} observations, and the columns represent \
{df.shape[1]-1} features and 1 target variable.')

There are 8523 rows, and 12 columns.
The rows represent 8523 observations, and the columns represent 11 features and 1 target variable.


In [6]:
## Display the column names and datatypes for each column
df.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [7]:
## Display the column names, count of non-null values, and their datatypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [8]:
## Display the descriptive statistics for the numeric columns
df.describe(include="number") # or 'object'

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.643456,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


In [9]:
## Display the descriptive statistics for the non-numeric columns
df.describe(exclude="number") # or 'object'

Unnamed: 0,Item_Identifier,Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type
count,8523,8523,8523,8523,6113,8523,8523
unique,1559,5,16,10,3,3,4
top,FDW13,Low Fat,Fruits and Vegetables,OUT027,Medium,Tier 3,Supermarket Type1
freq,10,5089,1232,935,2793,3350,5577


# Clean the Data

- There are no unnecessary rows or columns. 

In [10]:
## Display the number of duplicate rows in the dataset
print(f'There are {df.duplicated().sum()} duplicate rows.')

There are 0 duplicate rows.


## Missing Values

In [11]:
#check for missing values
df.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [12]:
## Display the percentage of missing values by column
print(df.isna().sum()/len(df)*100)

Item_Identifier               0.000000
Item_Weight                  17.165317
Item_Fat_Content              0.000000
Item_Visibility               0.000000
Item_Type                     0.000000
Item_MRP                      0.000000
Outlet_Identifier             0.000000
Outlet_Establishment_Year     0.000000
Outlet_Size                  28.276428
Outlet_Location_Type          0.000000
Outlet_Type                   0.000000
Item_Outlet_Sales             0.000000
dtype: float64


## Check column data types

In [13]:
## Display column names and datatypes
df.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

- All columns appear to have the correct data types

In [14]:
## Display column names
df.columns

Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'],
      dtype='object')

- All columns appear to have the correct names.

In [15]:
## Display the descriptive statistics for the non-numeric columns
df.describe(include="number")

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.643456,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


In [16]:
## Print the unique values for the column
print('Unique Fat Content:\n', df['Item_Fat_Content'].unique())
print('\n')
## Print the unique values for the column
print('Unique Item Type:\n', df['Item_Type'].unique())
print('\n')
## Print the unique values for the column
print('Unique Location:\n', df['Outlet_Location_Type'].unique())
print('\n')
## Print the unique values for the column
print('Unique Outlet Type:\n', df['Outlet_Type'].unique())
print('\n')
## Print the unique values for the column
print('Unique Outlet:\n', df['Outlet_Identifier'].unique())
print('\n')
## Print the unique values for the column
print('Unique Outlet Size:\n', df['Outlet_Size'].unique())
print('\n')

Unique Fat Content:
 ['Low Fat' 'Regular' 'low fat' 'LF' 'reg']


Unique Item Type:
 ['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood']


Unique Location:
 ['Tier 1' 'Tier 3' 'Tier 2']


Unique Outlet Type:
 ['Supermarket Type1' 'Supermarket Type2' 'Grocery Store'
 'Supermarket Type3']


Unique Outlet:
 ['OUT049' 'OUT018' 'OUT010' 'OUT013' 'OUT027' 'OUT045' 'OUT017' 'OUT046'
 'OUT035' 'OUT019']


Unique Outlet Size:
 ['Medium' nan 'High' 'Small']




- Before test/train split need to address Fat Content values.
- No other values need to be addressed

In [17]:
# fix the inconsistencies (as noted in earlier projects)
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace('LF', 'Low Fat')
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace('low fat', 'Low Fat')
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace('reg', 'Regular')
#check the column
df['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [18]:
# Checking for problems in categorical data
for i in df.columns:
  if df[i].dtype == 'object' or df[i].nunique() <= 10:
    print(i.upper(), '\n********\n', df[i].unique(), '\n----->', 
          str(df[i].dtype).upper(), '\n********\n')

ITEM_IDENTIFIER 
********
 ['FDA15' 'DRC01' 'FDN15' ... 'NCF55' 'NCW30' 'NCW05'] 
-----> OBJECT 
********

ITEM_FAT_CONTENT 
********
 ['Low Fat' 'Regular'] 
-----> OBJECT 
********

ITEM_TYPE 
********
 ['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood'] 
-----> OBJECT 
********

OUTLET_IDENTIFIER 
********
 ['OUT049' 'OUT018' 'OUT010' 'OUT013' 'OUT027' 'OUT045' 'OUT017' 'OUT046'
 'OUT035' 'OUT019'] 
-----> OBJECT 
********

OUTLET_ESTABLISHMENT_YEAR 
********
 [1999 2009 1998 1987 1985 2002 2007 1997 2004] 
-----> INT64 
********

OUTLET_SIZE 
********
 ['Medium' nan 'High' 'Small'] 
-----> OBJECT 
********

OUTLET_LOCATION_TYPE 
********
 ['Tier 1' 'Tier 3' 'Tier 2'] 
-----> OBJECT 
********

OUTLET_TYPE 
********
 ['Supermarket Type1' 'Supermarket Type2' 'Grocery Store'
 'Supermarket Type3'] 
-----> OBJECT 
********



# Split the Data

## Define Target and Split

In [19]:
# #define X and y
target = 'Item_Outlet_Sales'
X = df.drop(columns = target)
y = df[target]

In [20]:
## Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Prepare the  Data

### Identify Types

**Ordinal:** None

**Numeric:** 'year,	'item sales', 'item weight','MRP',	'item visibility', 'item id'

**Nominal:** 'outlet size', 'Location', 'outlet size'

**Categorical:** 'fat content', 'outlet type', 'item type'

### Imputers
- need to impute missing values

In [21]:
# instantiate the Standard Scaler and Imputers
scaler = StandardScaler()
ohe = OneHotEncoder(sparse = False, handle_unknown= 'ignore')

freq_imputer = SimpleImputer(strategy='most_frequent')
mean_imputer = SimpleImputer(strategy='mean')

### Column Selector

In [22]:
## Instantiate the column selectors
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

### Create pipelines

In [23]:
# Numeric pipeline
numeric_pipe = make_pipeline(mean_imputer, scaler)
numeric_pipe
# Categorical pipeline
categorical_pipe = make_pipeline(freq_imputer, ohe)
categorical_pipe

### Create Tuples

In [24]:
# Tuples for Column Transformer
number_tuple = (numeric_pipe, num_selector)
category_tuple = (categorical_pipe, cat_selector)

### Column Transformer

In [25]:
## Create the preprocessor using make_column_transformer
preprocessor = make_column_transformer(number_tuple, 
                                       category_tuple, 
                                       remainder='drop')

# Create Models

## Baseline Model: Dummy Regressor

In [26]:
## Create an instance of the model
dummy = DummyRegressor(strategy='mean')

## Create a model pipeline
dummy_pipe = make_pipeline(preprocessor, dummy)

## Fit the model
dummy_pipe.fit(X_train, y_train)

In [27]:
## Display model performance metrics using a function
model_metrics(dummy_pipe, x_train=X_train, y_train=y_train, 
                          x_test=X_test, y_test=y_test, 
                           model_name='Dummy Model')

Dummy Model Train Scores
MAE: 1,360.2184 
MSE: 2,959,455.7045 
RMSE: 1,720.3069 
R2: 0.0000

Dummy Model Test Scores
MAE: 1,326.1210 
MSE: 2,772,144.4627 
RMSE: 1,664.9758 
R2: -0.0048



## Linear Regression Model

In [28]:
## Create an instance of the model
lin_reg = LinearRegression()

## Create a model pipeline
lin_reg_pipe = make_pipeline(preprocessor, lin_reg)

## Fit the model
lin_reg_pipe.fit(X_train, y_train)

In [29]:
# Make predictions using the training and testing data
training_predictions = lin_reg_pipe.predict(X_train)
test_predictions = lin_reg_pipe.predict(X_test)
training_predictions[:10]

array([2999.5, 3788.5, 2206.5, 1245.5, 2219. , -100. , 1602. , 4377. ,
       3687.5, 1618. ])

In [30]:
#calculating R-squared
train_r2 = np.corrcoef(y_train, training_predictions)[0][1]**2
test_r2 = np.corrcoef(y_test, test_predictions)[0][1]**2

print(f'Model Training R2: {train_r2}')
print(f'Model Testing R2: {test_r2}')

Model Training R2: 0.671687051140189
Model Testing R2: 0.0006045050387265348


In [31]:
## Display model performance metrics using a function
model_metrics(lin_reg_pipe, x_train=X_train, y_train=y_train, 
                          x_test=X_test, y_test=y_test, 
                           model_name='Linear Regression')

Linear Regression Train Scores
MAE: 735.7464 
MSE: 971,627.7568 
RMSE: 985.7118 
R2: 0.6717

Linear Regression Test Scores
MAE: 266,657,332,420.3554 
MSE: 14,648,544,673,698,199,726,194,688.0000 
RMSE: 3,827,341,724,186.4097 
R2: -5309411113472258048.0000



## Decision Tree Regressor

In [32]:
## Create an instance of the model
dec_tree = DecisionTreeRegressor()

## Create a model pipeline
dec_tree_pipe = make_pipeline(preprocessor, dec_tree)

## Fit the model
dec_tree_pipe.fit(X_train, y_train)

In [33]:
## Display model performance metrics using a function
model_metrics(dec_tree_pipe, x_train=X_train, y_train=y_train, 
                          x_test=X_test, y_test=y_test, 
                           model_name='Decision Tree Model')

Decision Tree Model Train Scores
MAE: 0.0000 
MSE: 0.0000 
RMSE: 0.0000 
R2: 1.0000

Decision Tree Model Test Scores
MAE: 993.7748 
MSE: 2,151,106.3431 
RMSE: 1,466.6650 
R2: 0.2203



## Bagged Tree Regressor

In [34]:
# Create an instance of the model
bag_tree = BaggingRegressor()

# Create a model pipeline
bag_tree_pipe = make_pipeline(preprocessor, bag_tree)

# Fit the model
bag_tree_pipe.fit(X_train, y_train)

In [35]:
## Display model performance metrics using a function
model_metrics(bag_tree_pipe, x_train=X_train, y_train=y_train, 
                          x_test=X_test, y_test=y_test, 
                           model_name='Bagged Tree Model')

Bagged Tree Model Train Scores
MAE: 322.7040 
MSE: 248,385.8188 
RMSE: 498.3832 
R2: 0.9161

Bagged Tree Model Test Scores
MAE: 805.9095 
MSE: 1,353,793.9709 
RMSE: 1,163.5265 
R2: 0.5093



## Random Forest Regressor

In [36]:
## Create an instance of the model
ran_for = RandomForestRegressor()

## Create a model pipeline
ran_for_pipe = make_pipeline(preprocessor, ran_for)

## Fit the model
ran_for_pipe.fit(X_train, y_train)

In [37]:
## Display model performance metrics using a function
model_metrics(ran_for_pipe, x_train=X_train, y_train=y_train, 
                          x_test=X_test, y_test=y_test, 
                           model_name='Random Forest Model')

Random Forest Model Train Scores
MAE: 297.4197 
MSE: 183,008.0968 
RMSE: 427.7945 
R2: 0.9382

Random Forest Model Test Scores
MAE: 770.5958 
MSE: 1,234,716.0109 
RMSE: 1,111.1778 
R2: 0.5525



## K Neighbors Regressor

In [38]:
## Create an instance of the model
knn = KNeighborsRegressor()

## Create a model pipeline
knn_pipe = make_pipeline(preprocessor, knn)

## Fit the model
knn_pipe.fit(X_train, y_train)

In [39]:
## Display model performance metrics using a function
model_metrics(knn_pipe, x_train=X_train, y_train=y_train, 
                          x_test=X_test, y_test=y_test, 
                           model_name='K-Nearest Neighbors Model')


K-Nearest Neighbors Model Train Scores
MAE: 712.1288 
MSE: 988,437.7991 
RMSE: 994.2021 
R2: 0.6660

K-Nearest Neighbors Model Test Scores
MAE: 836.4104 
MSE: 1,398,919.9402 
RMSE: 1,182.7595 
R2: 0.4930



## Elastic Net

In [40]:
## Create an instance of the model
ela_net = ElasticNet()

## Create a model pipeline
ela_net_pipe = make_pipeline(preprocessor, ela_net)

## Fit the model
ela_net_pipe.fit(X_train, y_train)

In [41]:
## Display model performance metrics using a function
model_metrics(ela_net_pipe, x_train=X_train, y_train=y_train, 
                          x_test=X_test, y_test=y_test, 
                           model_name='Elastic Net Model')

Elastic Net Model Train Scores
MAE: 990.8233 
MSE: 1,722,537.2167 
RMSE: 1,312.4547 
R2: 0.4180

Elastic Net Model Test Scores
MAE: 954.6704 
MSE: 1,590,455.8267 
RMSE: 1,261.1328 
R2: 0.4235



# Recommendations

**Model recommended:** Random Forest

- It had the lowest error scores for MAE, MSE and RMSE, and it had the hightest R2 on the Test dataset.

**Random Forest Model Test Scores**
- MAE: 776.0182 
- MSE: 1,253,923.2481 
- RMSE: 1,119.7871 
- R2: 0.5455
