<a href="https://colab.research.google.com/github/Elispreng/Project-1-Grocery-Sales/blob/main/Spreng_Project1_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load the Data and Import Packages

In [57]:
#load the pandas and google drive
import pandas as pd
import numpy as np

from google.colab import drive

# Import packages for machine learning
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

from sklearn import set_config
set_config(display='diagram')

## Regression Metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# import packages for linear regression
from sklearn.linear_model import LinearRegression

# import packages for a regression tree
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor

#import packages for bagged trees
from sklearn.ensemble import BaggingRegressor

In [54]:
#  create a model metrics fucnction for true and predicted values
## print MAE, MSE, RMSE, and R2 metrics
def model_metrics(pipe, x_train, y_train, x_test, y_test, 
                       model_name='Regression Model', ):
  ## Train
  mae = round(mean_absolute_error(y_train, pipe.predict(x_train)),4)
  mse = round(mean_squared_error(y_train, pipe.predict(x_train)),4)
  rmse = round(np.sqrt(mean_squared_error(y_train, pipe.predict(x_train))),4)
  r2 = round(r2_score(y_train, pipe.predict(x_train)),7)
  print(f'{model_name} Train Scores')
  print(f'MAE: {mae} \nMSE: {mse} \nRMSE: {rmse} \nR2: {r2}\n')

  ## Test
  mae = round(mean_absolute_error(y_test, pipe.predict(x_test)),4)
  mse = round(mean_squared_error(y_test, pipe.predict(x_test)),4)
  rmse = round(np.sqrt(mean_squared_error(y_test, pipe.predict(x_test))),4)
  r2 = round(r2_score(y_test, pipe.predict(x_test)),7)

  ## Display the metrics for the model
  print(f'{model_name} Test Scores')
  print(f'MAE: {mae} \nMSE: {mse} \nRMSE: {rmse} \nR2: {r2}\n')


In [28]:
#load data and create the pandas dataframe
filename = '/content/drive/MyDrive/Data Fundamental/#3 Exploratory Viz/sales_predictions - sales_predictions.csv'
df_grocery = pd.read_csv(filename)

# Inspect the Data

In [29]:
#look at the dataframe columns and the first  five lines of the data 
df_grocery. head()


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [30]:
df_grocery.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


# Data Cleaning and Initial Data Preperation

In [31]:
# Remove Unnecesary Columns if there are any
# There are no unnecessary columns

In [32]:
# check for duplicate rows
df_grocery.duplicated().sum()

0

In [33]:
#check for missing values
df_grocery.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [34]:
# check for inconsistencies in the fat content
df_grocery['Item_Fat_Content'].unique()


array(['Low Fat', 'Regular', 'low fat', 'LF', 'reg'], dtype=object)

In [35]:
# fix the inconsistencies
df_grocery['Item_Fat_Content'] = df_grocery['Item_Fat_Content'].replace('LF', 'Low Fat')
df_grocery['Item_Fat_Content'] = df_grocery['Item_Fat_Content'].replace('low fat', 'Low Fat')
df_grocery['Item_Fat_Content'] = df_grocery['Item_Fat_Content'].replace('reg', 'Regular')
#check the column
df_grocery['Item_Fat_Content'].value_counts()


Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [36]:
# check the missing values for irregularities
df_grocery['Item_Weight'].describe()

count    7060.000000
mean       12.857645
std         4.643456
min         4.555000
25%         8.773750
50%        12.600000
75%        16.850000
max        21.350000
Name: Item_Weight, dtype: float64

# Split the Data

In [38]:
# #define X and y
target = 'Item_Outlet_Sales'
X = df_grocery.drop(columns =[target]).copy()
y = df_grocery[target].copy()

In [39]:
#create the test/train split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

# Instantiate the Transformers and Column Selectors

In [40]:
# instantiate the Standard Scaler and Imputers
scaler = StandardScaler()
ohe = OneHotEncoder(sparse = False, handle_unknown= 'ignore')

freq_imputer = SimpleImputer(strategy='most_frequent')
mean_imputer = SimpleImputer(strategy='mean')

In [41]:
# Selectors
cat_selector = make_column_selector(dtype_include='object')
num_selector = make_column_selector(dtype_include='number')

# Create Preprocessing Pipeline for Numeric Values and Categorical Data

In [42]:
# Numeric pipeline
numeric_pipe = make_pipeline(mean_imputer, scaler)
numeric_pipe

In [43]:
# Categorical pipeline
categorical_pipe = make_pipeline(freq_imputer, ohe)
categorical_pipe

# Combine Piplines Using Column Transformer

In [44]:
# Tuples for Column Transformer
number_tuple = (numeric_pipe, num_selector)
category_tuple = (categorical_pipe, cat_selector)
# ColumnTransformer
preprocessor = make_column_transformer(number_tuple, category_tuple, remainder='drop')
preprocessor

# Create linear regression model
1.   Create a pipeline and fit on the training data
2.  Evaluate acccoring to R^2 metrics
3. Evaluate  through RMSE

## Create a pipeline using linear regression

In [45]:
# set up the linear regression model
linreg = LinearRegression()

In [46]:
# Combine the preprocessing ColumnTransformer and the linear regression model in a Pipeline
linreg_pipe = make_pipeline(preprocessor, linreg)
linreg_pipe

In [47]:
# Fit the model pipeline on the training data
linreg_pipe.fit(X_train, y_train)
# Make predictions using the training and testing data
training_predictions = linreg_pipe.predict(X_train)
test_predictions = linreg_pipe.predict(X_test)
training_predictions[:10]

array([2984., 3796., 2180., 1292., 2236., -124., 1596., 4348., 3700.,
       1628.])

##  Evaluate acccoring to R^2 metrics

In [48]:
#calculating R-squared
train_r2 = np.corrcoef(y_train, training_predictions)[0][1]**2
test_r2 = np.corrcoef(y_test, test_predictions)[0][1]**2

print(f'Model Training R2: {train_r2}')
print(f'Model Testing R2: {test_r2}')

Model Training R2: 0.6714424977432687
Model Testing R2: 2.0994337117833997e-05


- This model can account for about 67% of the variation in the y_test 
using the features in the X_test.

## Evaluate using RMSE

In [49]:
# Calculating RMSE
RMSE_train = np.sqrt(np.mean(np.abs(training_predictions - y_train)**2))
RMSE_test = np.sqrt(np.mean(np.abs(test_predictions - y_test)**2))

print(f'Training RMSE:{RMSE_train}')
print(f'Test RMSE:{RMSE_test}')

Training RMSE:986.0858114271118
Test RMSE:6762579228318.999


# Create a regression tree model
1. Create a regression tree pipeline
2. Evaluate according to R^2 and  RMSE

In [50]:
# set up the  regression tree model
dec_tree = DecisionTreeRegressor(random_state=42)
#make the pipeline
dec_tree_pipeline = make_pipeline(preprocessor, dec_tree)
#fit the pipeline
dec_tree_pipeline.fit(X_train, y_train)



In [55]:
# display the model metrics
model_metrics(dec_tree_pipeline, x_train=X_train, y_train=y_train, 
                          x_test=X_test, y_test=y_test, 
                           model_name='Decision Tree Model')

Decision Tree Model Train Scores
MAE: 0.0 
MSE: 0.0 
RMSE: 0.0 
R2: 1.0

Decision Tree Model Test Scores
MAE: 992.5956 
MSE: 2103175.4804 
RMSE: 1450.2329 
R2: 0.2376974



# Create a Bagged Tree Model
 1. Create a pipleine
 2. Evaluate according to R2 and RMSE

In [58]:
# create an instance of the model
bag_tree = BaggingRegressor(random_state = 42)
# create a pipleine
bag_tree_pipe = make_pipeline(preprocessor, bag_tree)
# fit the pipeline
bag_tree_pipe.fit(X_train, y_train)

In [60]:
# evaluate using model metrics
model_metrics(bag_tree_pipe, x_train=X_train, y_train=y_train, 
                          x_test=X_test, y_test=y_test, 
                           model_name='Bagged Trees Model')

Bagged Trees Model Train Scores
MAE: 316.5676 
MSE: 240010.0091 
RMSE: 489.9082 
R2: 0.9189006

Bagged Trees Model Test Scores
MAE: 785.6302 
MSE: 1279423.2806 
RMSE: 1131.1159 
R2: 0.536269

