<a href="https://colab.research.google.com/github/Agrave1/Food-Sales-Predictions/blob/main/Project1_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [64]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn import set_config
set_config(display = "diagram")

In [65]:
sales_df = pd.read_csv("/content/drive/MyDrive/Portfolio Projects/sales_predictions (1).csv")
sales_df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [66]:
#Check for duplicates
sales_df.duplicated().sum()

0

In [67]:
#Check for missing values
sales_df.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [68]:
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [69]:
sales_df2 = sales_df.copy()

In [70]:
#Define X and y
X = sales_df2.drop(columns = ["Item_Outlet_Sales", "Item_Identifier", "Outlet_Identifier"])
y = sales_df2["Item_Outlet_Sales"]
#Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

## Preprocessing

In [71]:
#Selectors
cat_selector = make_column_selector(dtype_include = "object")
num_selector = make_column_selector(dtype_include = "number")

In [80]:
#Create a subset of data for only categorical columns
train_cat_data = X_train[cat_selector(X_train)]
test_cat_data = X_test[cat_selector(X_test)]

In [81]:
#Imputers
freq_imputer = SimpleImputer(strategy = "most_frequent")
mean_imputer = SimpleImputer(strategy = "mean")
#Scaler
scaler = StandardScaler()
#One Hot Encoder
ohe = OneHotEncoder(handle_unknown = "ignore", sparse = False)

In [88]:
ohe.fit(train_cat_data)
#transform both the training and the testing data
train_ohe = ohe.transform(train_cat_data)
test_ohe = ohe.transform(test_cat_data)
train_ohe

array([[0., 1., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 1., 0.],
       [0., 0., 1., ..., 1., 0., 0.],
       ...,
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.]])

In [82]:
#Pipelines
cat_pipe = make_pipeline(freq_imputer, ohe)
num_pipe = make_pipeline(mean_imputer, scaler)

In [83]:
#Tuples for column transfers
cat_tuple = (cat_pipe, cat_selector)
num_tuple = (num_pipe, num_selector)
#Columntransformer
preprocessor = make_column_transformer(cat_tuple, num_tuple)

In [84]:
#Fit the data
preprocessor.fit(X_train)

In [92]:
#Transform data
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [99]:
print(X_train_processed.shape)
print(X_train_processed.dtype)
print(X_test_processed.dtype)
X_train_processed

(6392, 35)
float64
float64


array([[ 0.        ,  1.        ,  0.        , ..., -0.71277507,
         1.82810922,  1.32784893],
       [ 0.        ,  0.        ,  1.        , ..., -1.29105225,
         0.60336888,  1.32784893],
       [ 0.        ,  0.        ,  1.        , ...,  1.81331864,
         0.24454056,  0.13618724],
       ...,
       [ 0.        ,  1.        ,  0.        , ..., -0.92052713,
         1.52302674,  0.49368575],
       [ 0.        ,  0.        ,  0.        , ..., -0.2277552 ,
        -0.38377708,  1.0895166 ],
       [ 1.        ,  0.        ,  0.        , ..., -0.95867683,
        -0.73836105, -0.10214509]])

## Linear Regression Model

In [100]:
#Instantiate model
reg = LinearRegression()

In [103]:
#Create pipeline
reg_pipe = make_pipeline(scaler, reg)

In [105]:
#Train model
reg_pipe.fit(X_train_processed, y_train)

In [108]:
#Create predictions
train_pred = reg_pipe.predict(X_train_processed)
test_pred = reg_pipe.predict(X_test_processed)

In [113]:
#Evaluate R2
train_r2 = r2_score(y_train, train_pred)
test_r2 = r2_score(y_test, test_pred)

print(f"Training R2 Score: {train_r2:.3f}")
print(f"testing R2 Score: {test_r2:.3f}")

Training R2 Score: 0.561
testing R2 Score: 0.567


In [111]:
#Evaluate RMSE
train_mse = mean_squared_error(y_train, train_pred)
train_rmse = np.sqrt(train_mse)
test_mse = mean_squared_error(y_test, test_pred)
test_rmse = np.sqrt(test_mse)

print(f"Training RMSE: {train_rmse:.3f}")
print(f"Testing RMSE: {test_rmse:.3f}")

Training RMSE: 1140.341
Testing RMSE: 1092.998


## Regression Tree

In [115]:
#Instantiate model
dec_tree = DecisionTreeRegressor(random_state = 42)

In [119]:
#Fit Model
dec_tree.fit(X_train_processed, y_train)

In [121]:
#Create predictions
train_pred2 = dec_tree.predict(X_train_processed)
test_pred2 = dec_tree.predict(X_test_processed)

In [125]:
#Evaluate model
train_score = dec_tree.score(X_train_processed, y_train)
test_score = dec_tree.score(X_test_processed, y_test)

print(f"Training R2 Score: {train_score:.3f}")
print(f"Testing R2 Score: {test_score:.3f}")

Training R2 Score: 1.000
Testing R2 Score: 0.228


In [127]:
#Get depth
dec_tree.get_depth()

45

In [129]:
#Create list of depths
depths = list(range(2,45))
#Dataframe to store each score at each given depth
scores = pd.DataFrame(index = depths, columns = ("Test Score", "Train Score"))
#For loop to get each score at each given depth
for depth in depths:
  dec_tree = DecisionTreeRegressor(max_depth = depth, random_state = 42)
  dec_tree.fit(X_train_processed, y_train)
  train_score = dec_tree.score(X_train_processed, y_train)
  test_score = dec_tree.score(X_test_processed, y_test)
  scores.loc[depth, "Train Score"] = train_score
  scores.loc[depth, "Test Score"] = test_score

In [131]:
#Display sorted scores to find best depth
sorted_scores = scores.sort_values(by = "Test Score", ascending = False)
sorted_scores.head()

Unnamed: 0,Test Score,Train Score
5,0.59472,0.603932
6,0.584507,0.615139
4,0.584005,0.582625
7,0.578042,0.626773
8,0.567225,0.642701


In [141]:
best_model = DecisionTreeRegressor(max_depth = 9, random_state = 42)
best_model.fit(X_train_processed, y_train)
train_pred3 = best_model.predict(X_train_processed)
test_pred3 = best_model.predict(X_test_processed)

train_mse2 = mean_squared_error(y_train, train_pred3)
train_rmse2 = np.sqrt(train_mse2)
test_mse2 = mean_squared_error(y_test, test_pred3)
test_rmse2 = np.sqrt(test_mse2)

print(f"Training RMSE: {train_rmse2:.3f}")
print(f"Testing RMSE: {test_rmse2:.3f}")

Training RMSE: 1001.993
Testing RMSE: 1112.254


## I recommend the decision tree model because the R2 value is slightly higher and the RMSE values are more comparative between the training and testing sets.