<a href="https://colab.research.google.com/github/AshikSathiya/Prediction-of-Product-Sales/blob/main/Prediction_of_Product_Sales_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Mount Google Drive so files stored there are readable under /content/drive
# (the google.colab module is provided by the Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Import pandas and numpy for data handling
import pandas as pd
import numpy as np


# Preprocessing building blocks: pipelines, imputers, scalers and encoders
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

# Train/test splitting and column-wise preprocessing
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, ColumnTransformer

# Make transformers return pandas DataFrames so column names survive preprocessing
from sklearn import set_config
set_config(transform_output='pandas')

In [5]:
# Load the sales data from the mounted Drive folder
# NOTE(review): the path is specific to one Drive layout — adjust before running elsewhere
fpath = '/content/drive/MyDrive/CodingDojo/01-Fundamentals/Week02/Data/sales_predictions_2023.csv'
df = pd.read_csv(fpath)
# Display the loaded frame
df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


# Cleaning Process

## Drop Duplicates

In [6]:
# Boolean mask: True for every row that exactly repeats an earlier row
duplicated_rows = df.duplicated()

# True counts as 1, so the sum of the mask is the number of duplicated rows
duplicated_rows.sum()

0

There are no duplicates

## Inconsistencies

In [7]:
# Names of the string (object-dtype) columns, which are the ones checked for inconsistent labels
object_cols = df.select_dtypes('object').columns


In [8]:
# Count the distinct values in each object column
# (columns with few distinct values are easy to inspect for inconsistent labels)
object_nunique = df[object_cols].nunique()
object_nunique

Item_Identifier         1559
Item_Fat_Content           5
Item_Type                 16
Outlet_Identifier         10
Outlet_Size                3
Outlet_Location_Type       3
Outlet_Type                4
dtype: int64

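Item_Fat_Content, Outlet_Size, Outlet_Location_Type, and Outlet_Type each have only a small number of unique values, so we will inspect these columns more closely. The remaining object columns (Item_Identifier, Item_Type, and Outlet_Identifier) are dropped in the next cell.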

In [9]:
# Drop the object columns that are NOT inspected further:
# Item_Identifier, Item_Type and Outlet_Identifier.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(
    columns=["Item_Identifier", "Item_Type", "Outlet_Identifier"],
    errors="ignore",
)
df.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,9.3,Low Fat,0.016047,249.8092,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,5.92,Regular,0.019278,48.2692,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,17.5,Low Fat,0.01676,141.618,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,19.2,Regular,0.0,182.095,1998,,Tier 3,Grocery Store,732.38
4,8.93,Low Fat,0.0,53.8614,1987,High,Tier 3,Supermarket Type1,994.7052


In [10]:
# Rebuild the list of string columns (some were dropped in the previous cell)
string_columns = df.select_dtypes(include='object').columns

# Print the category frequencies of each string column, followed by a blank line
for col in string_columns:
    value_counts = df[col].value_counts()
    print(value_counts, end='\n\n')

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

Medium    2793
Small     2388
High       932
Name: Outlet_Size, dtype: int64

Tier 3    3350
Tier 2    2785
Tier 1    2388
Name: Outlet_Location_Type, dtype: int64

Supermarket Type1    5577
Grocery Store        1083
Supermarket Type3     935
Supermarket Type2     928
Name: Outlet_Type, dtype: int64



Inconsistent values are present in the Item_Fat_Content column.

In [11]:
# Check for inconsistencies by inspecting the value_counts for Item_Fat_Content
# (different spellings of the same label show up as separate rows)
Item_Fat_Content_counts = df['Item_Fat_Content'].value_counts()
print(Item_Fat_Content_counts)

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64



Rather than having 5 values, we should only have 2 (Low Fat and Regular)

In [12]:
# Standardize the values in the Item_Fat_Content column.
# One replace call maps every variant spelling onto its canonical label.
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(
    {"LF": "Low Fat", "low fat": "Low Fat", "reg": "Regular"}
)

# Check the value counts again to confirm only "Low Fat" and "Regular" remain
Item_Fat_Content_counts = df['Item_Fat_Content'].value_counts()
print(Item_Fat_Content_counts)

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64


## Split Data

In [13]:
# Features are every column except the sales figure; the sales figure is the target
X = df.drop(columns='Item_Outlet_Sales')
y = df['Item_Outlet_Sales']

# Show the target first, then the feature matrix
display(y, X)

0       3735.1380
1        443.4228
2       2097.2700
3        732.3800
4        994.7052
          ...    
8518    2778.3834
8519     549.2850
8520    1193.1136
8521    1845.5976
8522     765.6700
Name: Item_Outlet_Sales, Length: 8523, dtype: float64

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,9.300,Low Fat,0.016047,249.8092,1999,Medium,Tier 1,Supermarket Type1
1,5.920,Regular,0.019278,48.2692,2009,Medium,Tier 3,Supermarket Type2
2,17.500,Low Fat,0.016760,141.6180,1999,Medium,Tier 1,Supermarket Type1
3,19.200,Regular,0.000000,182.0950,1998,,Tier 3,Grocery Store
4,8.930,Low Fat,0.000000,53.8614,1987,High,Tier 3,Supermarket Type1
...,...,...,...,...,...,...,...,...
8518,6.865,Low Fat,0.056783,214.5218,1987,High,Tier 3,Supermarket Type1
8519,8.380,Regular,0.046982,108.1570,2002,,Tier 2,Supermarket Type1
8520,10.600,Low Fat,0.035186,85.1224,2004,Small,Tier 2,Supermarket Type1
8521,7.210,Regular,0.145221,103.1332,2009,Medium,Tier 3,Supermarket Type2


In [14]:
# Train test split the data (fixed random_state so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
# Summarize dtypes and non-null counts of the training features and target
X_train.info()
y_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6392 entries, 4776 to 7270
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Weight                5285 non-null   float64
 1   Item_Fat_Content           6392 non-null   object 
 2   Item_Visibility            6392 non-null   float64
 3   Item_MRP                   6392 non-null   float64
 4   Outlet_Establishment_Year  6392 non-null   int64  
 5   Outlet_Size                4580 non-null   object 
 6   Outlet_Location_Type       6392 non-null   object 
 7   Outlet_Type                6392 non-null   object 
dtypes: float64(3), int64(1), object(4)
memory usage: 449.4+ KB
<class 'pandas.core.series.Series'>
Int64Index: 6392 entries, 4776 to 7270
Series name: Item_Outlet_Sales
Non-Null Count  Dtype  
--------------  -----  
6392 non-null   float64
dtypes: float64(1)
memory usage: 99.9 KB


## Create a preprocessing object to prepare the dataset for Machine Learning

In [15]:
# Look at the string-valued training columns to decide which are ordinal and which are nominal
X_train.select_dtypes('object')

Unnamed: 0,Item_Fat_Content,Outlet_Size,Outlet_Location_Type,Outlet_Type
4776,Low Fat,Medium,Tier 3,Supermarket Type2
7510,Regular,Medium,Tier 3,Supermarket Type2
5828,Regular,Medium,Tier 1,Supermarket Type1
5327,Low Fat,Small,Tier 2,Supermarket Type1
4810,Low Fat,,Tier 2,Supermarket Type1
...,...,...,...,...
5734,Regular,,Tier 3,Grocery Store
5191,Low Fat,,Tier 2,Supermarket Type1
5390,Low Fat,,Tier 2,Supermarket Type1
860,Low Fat,,Tier 2,Supermarket Type1


It appears that Outlet_Size and Outlet_Location_Type are our ordinal features.

### Ordinal Pipeline

In [16]:
# Number of distinct (non-missing) Outlet_Size categories, to know how many levels need ordering
# NOTE(review): this inspects the full df rather than X_train — confirm that is intended
df['Outlet_Size'].nunique()


3

In [17]:
# Columns whose categories are treated as ranked from low to high
ord_col = ['Outlet_Size', 'Outlet_Location_Type']

# Category rankings, given in the same order as the columns in ord_col
size_order = ['Small', 'Medium', 'High']
loc_order = ['Tier 1', 'Tier 2', 'Tier 3']

# Pipeline steps: fill gaps with the most frequent category,
# map each category to its rank, then standardize the ranks
impute_common = SimpleImputer(strategy='most_frequent')
ord_encoder = OrdinalEncoder(categories=[size_order, loc_order])
scaler = StandardScaler()

# Assemble the ordinal pipeline and display it
ord_pipe = make_pipeline(impute_common, ord_encoder, scaler)
ord_pipe


### Categorical Pipeline

In [18]:
# Nominal columns: every string column that is not already handled as ordinal
cat_cols = X_train.select_dtypes('object').drop(columns=ord_col).columns

# Step 1: replace missing entries with the literal category "MISSING"
impute_missing = SimpleImputer(strategy='constant', fill_value='MISSING')

# Step 2: one-hot encode into dense columns, ignoring categories not seen during fit
ohe_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Assemble the categorical pipeline and display it
ohe_pipe = make_pipeline(impute_missing, ohe_encoder)
ohe_pipe

### Numeric Pipeline

In [19]:
# Numeric features: every column with a numeric dtype
num_cols = X_train.select_dtypes('number').columns

# Pipeline steps: fill missing values with the column mean, then standardize
mean_imputer = SimpleImputer(strategy="mean")
scaler = StandardScaler()

# Assemble the numeric pipeline and display it
numeric_pipe = make_pipeline(mean_imputer, scaler)
numeric_pipe

## Create a Column Transformer

### Create Tuples

In [20]:
# One (name, pipeline, columns) tuple per feature group, in the form ColumnTransformer takes
num_tuple = ('numeric', numeric_pipe, num_cols)      # numeric features
ohe_tuple = ('categorical', ohe_pipe, cat_cols)      # nominal features (one-hot encoded)
ord_tuple = ('ordinal', ord_pipe, ord_col)           # ordinal features (rank encoded)


### Create Column Transformer

In [21]:
# Route each feature group through its own pipeline.
# verbose_feature_names_out=False keeps output column names free of the transformer-name prefix.
col_transformer = ColumnTransformer(
    transformers=[num_tuple, ord_tuple, ohe_tuple],
    verbose_feature_names_out=False,
)

col_transformer

### Transform

In [22]:
# Fit the column transformer on the training data only,
# so nothing from the test set influences the imputers, encoders or scalers
col_transformer.fit(X_train)

In [23]:
# Transform the training data with the fitted transformer and preview the result
X_train_processed = col_transformer.transform(X_train)
X_train_processed.head()


Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
4776,0.817249,-0.712775,1.828109,1.327849,0.287374,1.084948,1.0,0.0,0.0,0.0,1.0,0.0
7510,0.55634,-1.291052,0.603369,1.327849,0.287374,1.084948,0.0,1.0,0.0,0.0,1.0,0.0
5828,-0.131512,1.813319,0.244541,0.136187,0.287374,-1.384777,0.0,1.0,0.0,1.0,0.0,0.0
5327,-1.169219,-1.004931,-0.952591,0.732018,-1.384048,-0.149914,1.0,0.0,0.0,1.0,0.0,0.0
4810,1.528819,-0.965484,-0.33646,0.493686,0.287374,-0.149914,1.0,0.0,0.0,1.0,0.0,0.0


In [24]:
# Transform the testing data with the transformer fitted on the training data (no refit)
X_test_processed = col_transformer.transform(X_test)
X_test_processed.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
7503,0.3310089,-0.776646,-0.998816,-1.293807,1.958796,1.084948,1.0,0.0,0.0,1.0,0.0,0.0
2957,-1.179892,0.100317,-1.585194,-0.102145,-1.384048,-1.384777,1.0,0.0,0.0,1.0,0.0,0.0
7031,0.3784469,-0.482994,-1.595784,0.136187,0.287374,-1.384777,0.0,1.0,0.0,1.0,0.0,0.0
1084,4.213344e-16,-0.41544,0.506592,-1.532139,0.287374,1.084948,0.0,1.0,0.0,0.0,0.0,1.0
856,-0.6426567,-1.047426,0.886725,0.732018,-1.384048,-0.149914,0.0,1.0,0.0,1.0,0.0,0.0


# Regression Modeling

## Imports

In [38]:
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV


In [26]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
def regression_metrics(y_true, y_pred, label='', verbose = True, output_dict=False):
  """Compute MAE, MSE, RMSE and R^2 for one set of predictions.

  Args:
    y_true: Observed target values.
    y_pred: Predicted target values, aligned with y_true.
    label: Text shown in the printed header and stored under 'Label'.
    verbose: When truthy, print the four metrics under a header.
    output_dict: When truthy, return the metrics as a dictionary.

  Returns:
    A dict with keys 'Label', 'MAE', 'MSE', 'RMSE' and 'R^2' when output_dict
    is truthy; otherwise None.
  """
  # Get metrics
  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  # Derive RMSE from the MSE already computed rather than calling
  # mean_squared_error(..., squared=False): that keyword was deprecated in
  # scikit-learn 1.4 and later removed. For a single target the value is identical.
  rmse = mse ** 0.5
  r_squared = r2_score(y_true, y_pred)
  if verbose:
    # Print Result with Label and Header
    header = "-"*60
    print(header, f"Regression Metrics: {label}", header, sep='\n')
    print(f"- MAE = {mae:,.3f}")
    print(f"- MSE = {mse:,.3f}")
    print(f"- RMSE = {rmse:,.3f}")
    print(f"- R^2 = {r_squared:,.3f}")
  if output_dict:
    return {'Label': label, 'MAE': mae,
            'MSE': mse, 'RMSE': rmse, 'R^2': r_squared}
  return None
def evaluate_regression(reg, X_train, y_train, X_test, y_test, verbose = True,
                        output_frame=False):
  """Evaluate a fitted regressor on both the training and the test data.

  Args:
    reg: Fitted estimator (or pipeline) exposing predict().
    X_train: Training features passed to reg.predict.
    y_train: Training target values.
    X_test: Test features passed to reg.predict.
    y_test: Test target values.
    verbose: When truthy, print the metrics for both splits.
    output_frame: When truthy, return the metrics as a DataFrame.

  Returns:
    A DataFrame rounded to 3 decimals with rows 'Training Data' and
    'Test Data' when output_frame is truthy; otherwise None.
  """
  # Get predictions for training data and score them
  y_train_pred = reg.predict(X_train)
  results_train = regression_metrics(y_train, y_train_pred, verbose = verbose,
                                     output_dict=output_frame,
                                     label='Training Data')
  # Blank line between the two printed reports; skipped when printing is off
  # so that verbose=False produces no output at all
  if verbose:
    print()
  # Get predictions for test data and score them
  y_test_pred = reg.predict(X_test)
  results_test = regression_metrics(y_test, y_test_pred, verbose = verbose,
                                    output_dict=output_frame,
                                    label='Test Data')
  # Store results in a dataframe if output_frame is True
  if output_frame:
    # One row per split, indexed by the split label
    results_df = pd.DataFrame([results_train, results_test]).set_index('Label')
    # Set index.name to None to get a cleaner looking result
    results_df.index.name = None
    return results_df.round(3)
  return None

## 1. Your first task is to build a linear regression model to predict sales.



### Build a linear regression model.


In [28]:
# Instantiate linear regression model
linreg = LinearRegression()


# Chain the preprocessing ColumnTransformer and the model into one pipeline,
# so raw (unprocessed) features can be passed straight to fit/predict
linreg_pipe = make_pipeline(col_transformer, linreg)
linreg_pipe

In [30]:
# Fit the model pipeline (preprocessing + linear regression) on the raw training data
linreg_pipe.fit(X_train, y_train)

### Use the custom evaluation function to get the metrics for your model (on training and test data).

In [32]:
# Model evaluation: print MAE, MSE, RMSE and R^2 for the training and test data
evaluate_regression(linreg_pipe, X_train, y_train, X_test, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 848.394
- MSE = 1,302,670.477
- RMSE = 1,141.346
- R^2 = 0.560

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 803.991
- MSE = 1,190,707.288
- RMSE = 1,091.195
- R^2 = 0.568


### Compare the training vs. test R-squared values and answer the question: to what extent is this model overfit/underfit?






**Mean Absolute Error (MAE):** For the training data, the MAE is 848.394, and for the test data, it is 803.991. This means that on average, the model's predictions are off by approximately 848 units for the training data and 804 units for the test data.


**Mean Squared Error (MSE):** The MSE is 1,302,670.477 for the training data and 1,190,707.288 for the test data. The slightly higher MSE on the training data means the squared prediction errors are, on average, somewhat larger on the training data than on the test data.

**Root Mean Squared Error (RMSE)**: RMSE is the square root of MSE and so it provides a similar measure of accuracy. The RMSE is 1,141.346 for the training data and 1,091.195 for the test data. Similar to MSE, the higher RMSE for the training data suggests larger errors compared to the testing data.

**R^2 (Coefficient of Determination)**: R^2 represents the proportion of the variance in sales that is predictable from the independent variables used in the model. The R^2 is 0.560 for the training data and 0.568 for the test data. This suggests that approximately 56% of the variance in sales can be explained by the model for the training data, and 57% for the test data.

**Overfitting/Underfitting**:

Overfitting occurs when a model performs well on the training data but poorly on unseen test data. In this case, the model's performance on the training and test data is similar across all metrics, with the test metrics actually being slightly better. This indicates that the model is not overfit.

Underfitting occurs when a model is unable to capture the underlying patterns in the data. The R^2 values of 0.560 for the training data and 0.568 for the test data suggest that the model is somewhat underfit: it leaves roughly 43–44% of the variance in sales unexplained.



## 2. Your second task is to build a Random Forest model to predict sales.



### Build a default Random Forest model.


In [33]:
# Instantiate a default random forest (fixed random_state so results are reproducible)
rf = RandomForestRegressor(random_state = 42)

In [34]:
# Create pipeline for default model: preprocessing ColumnTransformer followed by the random forest
rf_pipe = make_pipeline(col_transformer, rf)

In [35]:
# Fit the default model pipeline on the raw training data
rf_pipe.fit(X_train, y_train)

### Use the custom evaluation function to get the metrics for your model (on training and test data).

In [36]:
# Use custom function to predict and evaluate on both the training and test data
evaluate_regression(rf_pipe, X_train, y_train, X_test, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 298.133
- MSE = 184,157.037
- RMSE = 429.135
- R^2 = 0.938

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 774.841
- MSE = 1,235,256.823
- RMSE = 1,111.421
- R^2 = 0.552


### Compare the training vs. test R-squared values and answer the question: to what extent is this model overfit/underfit?



The R-squared value of 0.938 from the training data indicates that approximately 93.8% of the variance in the target variable can be explained by the independent variables in the model. This is a reasonably good fit to the training data.

However, the test metrics were considerably poorer. The R-squared value of 0.552 for the test data indicates that only about 55.2% of the variance in sales can be explained by the model on unseen data. This is far lower than the training R-squared of 0.938, which indicates that the model is overfit.

Overfitting occurs when a model performs significantly better on the training data compared to the test data, indicating that it may have memorized the training data's patterns instead of learning the underlying general patterns.





### Compare this model's performance to the linear regression model: which model has the best test scores?


Comparing the test metrics of the two models, the linear regression model performed slightly better overall. It has the higher test R^2 (0.568 vs. 0.552 for the random forest) and the lower test RMSE (1,091.195 vs. 1,111.421), although the random forest has a slightly lower test MAE (774.841 vs. 803.991).

## 3. Use GridSearchCV to tune at least two hyperparameters for a Random Forest model.



In [37]:
# List every parameter of the pipeline to find the exact names usable as grid-search keys
rf_pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('numeric',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    Index(['Item_Weight', 'Item_Visibility', 'Item_MRP',
          'Outlet_Establishment_Year'],
         dtype='object')),
                                   ('ordinal',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('ordinalencoder',
                                                     OrdinalEncoder(categ...
                                                    ('standardscaler',
                              

In [39]:
# Hyperparameter grid for the random forest step of rf_pipe.
# Keys follow the "<step name>__<parameter>" convention used by pipelines.
# Every key appears exactly once: a dict literal keeps only the LAST value given
# for a repeated key, so the earlier max_depth / max_features lists were silently
# discarded (the discarded max_features list also held floats above 1.0).
params = {
    'randomforestregressor__max_depth': [None, 10, 15, 20, 25],
    'randomforestregressor__max_features': ['sqrt', 'log2', None],
    'randomforestregressor__n_estimators': [50, 100, 150, 200],
    # NOTE(review): oob_score and warm_start look like run-time switches rather than
    # model-quality hyperparameters, yet they multiply the grid size by 4 —
    # consider removing them to cut the search time.
    'randomforestregressor__oob_score': [False, True],
    'randomforestregressor__warm_start': [False, True],
}


In [42]:
# Instantiate the grid search: 5-fold cross-validation over every combination in params.
# n_jobs=-1 runs the fits in parallel; verbose=1 prints a progress summary.
gridsearch = GridSearchCV(rf_pipe, params, n_jobs=-1, cv = 5, verbose=1)
gridsearch

In [None]:
# Fit the gridsearch on the training data
# (long-running: one pipeline fit per candidate per fold)
gridsearch.fit(X_train, y_train)

# Obtain the best parameters from the gridsearch
gridsearch.best_params_

Fitting 5 folds for each of 240 candidates, totalling 1200 fits
