<a href="https://colab.research.google.com/github/BasmallahM/Prediction-of-Product-Sales/blob/main/Project_1_Part_6_ML(Core)_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# mounting drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# imports
import pandas as pd
import numpy as np
import seaborn as sns
pd.set_option('display.max_columns',100)


# import modeling tools
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import os

# Set pandas as the default output for sklearn
from sklearn import set_config
set_config(transform_output='pandas')

In [None]:
# Define the custom functions for regressoin evaluation
def regression_metrics(y_true, y_pred, label='', verbose = True, output_dict=False):
  # Get metrics
  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  rmse = np.sqrt(mse) # Calculate RMSE from MSE
  r_squared = r2_score(y_true, y_pred)
  if verbose == True:
    # Print Result with Label and Header
    header = "-"*60
    print(header, f"Regression Metrics: {label}", header, sep='\n')
    print(f"- MAE = {mae:,.3f}")
    print(f"- MSE = {mse:,.3f}")
    print(f"- RMSE = {rmse:,.3f}")
    print(f"- R^2 = {r_squared:,.3f}")
  if output_dict == True:
      metrics = {'Label':label, 'MAE':mae,
                 'MSE':mse, 'RMSE':rmse, 'R^2':r_squared}
      return metrics

def evaluate_regression(reg, X_train, y_train, X_test, y_test, verbose = True,
                        output_frame=False):
  # Get predictions for training data
  y_train_pred = reg.predict(X_train)

  # Call the helper function to obtain regression metrics for training data
  results_train = regression_metrics(y_train, y_train_pred, verbose = verbose,
                                     output_dict=output_frame,
                                     label='Training Data')
  print()
  # Get predictions for test data
  y_test_pred = reg.predict(X_test)
  # Call the helper function to obtain regression metrics for test data
  results_test = regression_metrics(y_test, y_test_pred, verbose = verbose,
                                  output_dict=output_frame,
                                    label='Test Data' )

  # Store results in a dataframe if ouput_frame is True
  if output_frame:
    results_df = pd.DataFrame([results_train,results_test])
    # Set the label as the index
    results_df = results_df.set_index('Label')
    # Set index.name to none to get a cleaner looking result
    results_df.index.name=None
    # Return the dataframe
    return results_df.round(3)

In [None]:
fpath = '/content/drive/MyDrive/sales_predictions_2023.csv'
df = pd.read_csv(fpath)
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [None]:
df.isna().sum()

Unnamed: 0,0
Item_Identifier,0
Item_Weight,1463
Item_Fat_Content,0
Item_Visibility,0
Item_Type,0
Item_MRP,0
Outlet_Identifier,0
Outlet_Establishment_Year,0
Outlet_Size,2410
Outlet_Location_Type,0


In [None]:
# checking any duplicated data :
duplicated_rows = df.duplicated()
df.duplicated().sum()


np.int64(0)

In [None]:
# addrissing the unique data
df.nunique()
string_cols = df.select_dtypes('object').columns
string_cols

df['Item_Fat_Content'].value_counts()

Unnamed: 0_level_0,count
Item_Fat_Content,Unnamed: 1_level_1
Low Fat,5089
Regular,2889
LF,316
reg,117
low fat,112


In [None]:
#Find and fix any inconsistent categories of data
string_cols = df.select_dtypes('object').columns
for col in string_cols:
   print(f'{col}: {df[col].unique()}')

   print('\n')

# replacing data
   df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'})
df['Item_Fat_Content'].unique()

Item_Identifier: ['FDA15' 'DRC01' 'FDN15' ... 'NCF55' 'NCW30' 'NCW05']


Item_Fat_Content: ['Low Fat' 'Regular']


Item_Type: ['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood']


Outlet_Identifier: ['OUT049' 'OUT018' 'OUT010' 'OUT013' 'OUT027' 'OUT045' 'OUT017' 'OUT046'
 'OUT035' 'OUT019']


Outlet_Size: ['Medium' nan 'High' 'Small']


Outlet_Location_Type: ['Tier 1' 'Tier 3' 'Tier 2']


Outlet_Type: ['Supermarket Type1' 'Supermarket Type2' 'Grocery Store'
 'Supermarket Type3']




array(['Low Fat', 'Regular'], dtype=object)

In [None]:
# dropping the "Item_Identifier" feature due to its high cardinality.
df.drop(columns = 'Item_Identifier', inplace = True)

In [None]:
# split data
y = df['Item_Outlet_Sales']
X = df.drop(columns = 'Item_Outlet_Sales')

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
4776,16.35,Low Fat,0.029565,Household,256.4646,OUT018,2009,Medium,Tier 3,Supermarket Type2
7510,15.25,Regular,0.0,Snack Foods,179.766,OUT018,2009,Medium,Tier 3,Supermarket Type2
5828,12.35,Regular,0.158716,Meat,157.2946,OUT049,1999,Medium,Tier 1,Supermarket Type1
5327,7.975,Low Fat,0.014628,Baking Goods,82.325,OUT035,2004,Small,Tier 2,Supermarket Type1
4810,19.35,Low Fat,0.016645,Frozen Foods,120.9098,OUT045,2002,,Tier 2,Supermarket Type1


In [None]:
# inspecting the features
# null values
X_train.isna().sum()

Unnamed: 0,0
Item_Weight,1107
Item_Fat_Content,0
Item_Visibility,0
Item_Type,0
Item_MRP,0
Outlet_Identifier,0
Outlet_Establishment_Year,0
Outlet_Size,1812
Outlet_Location_Type,0
Outlet_Type,0


In [None]:
# summary statistics
X_train.describe().round(2)

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year
count,5285.0,6392.0,6392.0,6392.0
mean,12.9,0.07,141.98,1997.86
std,4.64,0.05,62.63,8.39
min,4.56,0.0,31.29,1985.0
25%,8.9,0.03,94.15,1987.0
50%,12.65,0.05,144.11,1999.0
75%,17.0,0.09,186.9,2004.0
max,21.35,0.33,266.89,2009.0


In [None]:
# Defining lists of types of features
numeric_features = X_train.select_dtypes("number").columns
numeric_features

Index(['Item_Weight', 'Item_Visibility', 'Item_MRP',
       'Outlet_Establishment_Year'],
      dtype='object')

In [None]:
# ordinal list
ordinal_features = ['Outlet_Size']

In [None]:
# Defining lists of categorial features
categorical_features = X.select_dtypes(include=['object']).columns

# Drop a specific column
categorical_features = categorical_features.drop('Outlet_Size')


categorical_features

Index(['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier',
       'Outlet_Location_Type', 'Outlet_Type'],
      dtype='object')

In [None]:
# numeric features pipeline
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy = 'median')),
    ('scaler', StandardScaler())
])

numeric_pipeline

In [None]:
# Categorical (Nominal) Pipeline
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='MISSING')),
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore' ))
])

categorical_pipeline

In [None]:
# ordinal pipeline
ordinal_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder()),
    ('scaler', StandardScaler())
])

ordinal_pipeline

In [None]:
# making ordinal tuple for columntransfer
ordinal_tuple = ('ordinal_pipline', ordinal_pipeline, ordinal_features)

In [None]:
# making categorial tuple for columntrasfer
categorical_tuple = ('categorical_pipeline', categorical_pipeline, categorical_features)
categorical_tuple

('categorical_pipeline',
 Pipeline(steps=[('imputer',
                  SimpleImputer(fill_value='MISSING', strategy='constant')),
                 ('encoder',
                  OneHotEncoder(handle_unknown='ignore', sparse_output=False))]),
 Index(['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier',
        'Outlet_Location_Type', 'Outlet_Type'],
       dtype='object'))

In [None]:
# Making a numeric tuple for ColumnTransformer
numeric_tuple = ('numeric_pipeline', numeric_pipeline, numeric_features)
numeric_tuple

('numeric_pipeline',
 Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                 ('scaler', StandardScaler())]),
 Index(['Item_Weight', 'Item_Visibility', 'Item_MRP',
        'Outlet_Establishment_Year'],
       dtype='object'))

In [None]:
# Create ColumnTransformer
preprocessor  = ColumnTransformer([numeric_tuple, categorical_tuple, ordinal_tuple],
                                    verbose_feature_names_out=False
                                    )
preprocessor

In [None]:
# Fit the Transformer on Training Data
preprocessor.fit(X_train)

In [None]:
# Transform the training and test sets
X_train_processed = (preprocessor.transform(X_train))
X_test_processed = (preprocessor.transform(X_test))

X_train_processed.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods,Outlet_Identifier_OUT010,Outlet_Identifier_OUT013,Outlet_Identifier_OUT017,Outlet_Identifier_OUT018,Outlet_Identifier_OUT019,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3,Outlet_Size
4776,0.827485,-0.712775,1.828109,1.327849,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-0.287374
7510,0.566644,-1.291052,0.603369,1.327849,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-0.287374
5828,-0.121028,1.813319,0.244541,0.136187,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.287374
5327,-1.158464,-1.004931,-0.952591,0.732018,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.384048
4810,1.53887,-0.965484,-0.33646,0.493686,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,-0.287374


In [None]:
# to check missing data for X_train_processes
X_test_processed.isna().sum().sum()

np.int64(0)

# Project 1 - Part 6
## CRISP-DM Phase 4 - Modeling

### Linear Regression Model

In [None]:
# Build a linear regression model.
lin_reg = LinearRegression()
# Fit linear regression
lin_reg.fit(X_train_processed, y_train)

In [None]:
# evaluat the metrics
evaluate_regression(lin_reg, X_train_processed, y_train, X_test_processed, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 847.129
- MSE = 1,297,558.136
- RMSE = 1,139.104
- R^2 = 0.562

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 804.120
- MSE = 1,194,349.715
- RMSE = 1,092.863
- R^2 = 0.567


 Compare the training vs. test R-squared values and answer the question: to what extent is this model overfit/underfit?

- The model generalizes similarly on both seen and unseen data, The model is neither overfit nor underfit,

- The training and test R² values are very similar (0.562 vs. 0.567).


- Error metrics (MAE, RMSE) are also close and similar for training and test sets,

- though its performance is moderate — it captures only about 56–57% of the variance.


## Random Forest model

In [None]:
# Instantiate default random forest model
rf = RandomForestRegressor(random_state = 42)

In [None]:
# Model Pipeline
rf_pipe = make_pipeline(preprocessor, rf)

# Fit the model pipeline on the training data only
rf_pipe.fit(X_train, y_train)
rf_pipe

In [None]:
# Use custom function to evaluate default model
evaluate_regression(rf_pipe, X_train, y_train, X_test, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 295.806
- MSE = 182,234.375
- RMSE = 426.889
- R^2 = 0.938

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 766.617
- MSE = 1,214,654.579
- RMSE = 1,102.114
- R^2 = 0.560


Compare the training vs. test R-squared values and answer the question: to what extent is this model overfit/underfit?

- The training R² is very high (0.938), but the test R² drops to 0.560.

- Large gap in error metrics: RMSE increases by over 600 units.

- This model is clearly overfit — it performs extremely well on training data but performs poorly to unseen data.

Compare this model's performance to the linear regression model: which model has the best test scores?

- linear regression model performs more consistently across training and test data and has slightly better test R² and RMSE, indicating better performance

- Random Forest model is overfitting it learns the training data too well but performs poorly on new data

** Therefore, linear regression model is the better choice, it is more balanced and stable in real-world prediction scenarios.

## Use GridSearchCV to tune at least two hyperparameters for a Random Forest model.

In [None]:
# Parameters for tuning
rf_pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('numeric_pipeline',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(strategy='median')),
                                                    ('scaler', StandardScaler())]),
                                    Index(['Item_Weight', 'Item_Visibility', 'Item_MRP',
          'Outlet_Establishment_Year'],
         dtype='object')),
                                   ('categorical_pipeline',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(fill_value='MISSING',
                                                                   strategy='constant')),
                                                    ('e...
                                                     OneHotEncoder(handle_unknown='ignore',
                                                

In [None]:
# Define param grid with options to try
params = {'randomforestregressor__max_depth': [None,10,15,20],
          'randomforestregressor__n_estimators':[10,100,150,200],
          'randomforestregressor__min_samples_leaf':[2,3,4],
          'randomforestregressor__max_features':['sqrt','log2',None],
          'randomforestregressor__oob_score':[True,False],
          }

In [None]:
# Instantiate the gridsearch
gridsearch = GridSearchCV(rf_pipe, params, n_jobs=-1, cv = 3, verbose=1)
gridsearch

In [None]:
# Fit the gridsearch on training data
gridsearch.fit(X_train, y_train)

Fitting 3 folds for each of 288 candidates, totalling 864 fits


In [46]:
# Obtain best parameters
gridsearch.best_params_

{'randomforestregressor__max_depth': 10,
 'randomforestregressor__max_features': None,
 'randomforestregressor__min_samples_leaf': 3,
 'randomforestregressor__n_estimators': 100,
 'randomforestregressor__oob_score': True}

In [47]:
# Define and refit best model
best_rf = gridsearch.best_estimator_
evaluate_regression(best_rf, X_train, y_train, X_test, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 653.621
- MSE = 868,731.344
- RMSE = 932.058
- R^2 = 0.706

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 734.518
- MSE = 1,118,752.629
- RMSE = 1,057.711
- R^2 = 0.595


Compare your tuned model to your default Random Forest: did the performance improve?

- Tuned model generalizes better. The drop in training accuracy is acceptable because it comes with better test performance—showing that overfitting was reduced.

- Performance improved overall, especially for real-world (test) data. So yes, the tuning was successful.

# CRISP-DM Phase 5 - Evaluation

### Recommended Model
* Overall, which model do you recommend?

the tuned random forest model shows a strong training R2, meaning it captures a significant amount of variance in the training data. although its testing R2 is lower it still significantly outperforms the Linear regression model, which struggles with a negative R2 on the testing set, this suggests the random forest model has better predictive capabilities for new data.

### Explanation for Stakeholders
* Interpreting Model Performance (R-squared) for Stakeholders

R-squared (R²) tells us how much of the variation in our target variable (the value we're trying to predict) is explained by the model. For the tuned model, an R² of approximately 0.595 means:

"Our model can explain about 60% of the differences we see in the outcome, based on the input features. While not perfect, this shows a good level of understanding of the factors influencing the result."

* selected metric: RMSE
I also use RMSE, which on the test set is about 1,058. RMSE tells us, on average, how much our predictions differ from the actual values in the same units as the target variable. Lower RMSE means more accurate predictions.

I chose RMSE because:

- It is easy to interpret since it’s in the same scale as the prediction target.

- It penalizes larger errors more, helping to highlight when predictions are significantly off.

- It complements R² by showing the average prediction error size, not just the proportion of variance explained.



# Overfitting vs. Underfitting comparision


Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 653.621
- MSE = 868,731.344
- RMSE = 932.058
- R^2 = 0.706


------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 734.518
- MSE = 1,118,752.629
- RMSE = 1,057.711
- R^2 = 0.595



### Conclusion
* The training R² is 0.706 and test R² is 0.595.

- This gap indicates a moderate amount of overfitting—the model performs better on training data than on new data, but not excessively so.

- The tuning process helped reduce overfitting compared to the default model,Performance improved overall