<a href="https://colab.research.google.com/github/BradenAnderson/sales-predictions/blob/main/03_Modeling_Part_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Braden Anderson
### Sales Predictions Project - Modeling Part 1
### Coding Dojo
### Data Science and Machine Learning Bootcamp


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The CSV and pickle paths used later in this notebook all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import missingno
import pickle
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import BayesianRidge, LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, BaggingRegressor 
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, PowerTransformer
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import r2_score, mean_squared_error, SCORERS, mean_absolute_error
from sklearn.pipeline import Pipeline

%matplotlib inline

In [None]:
# Location of the cleaned sales CSV on the mounted Drive.
filename = (
    '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Sales_Project/Current/sales_predictions_clean.csv'
)

# index_col=0: the first CSV column is used as the row index rather than loaded as a data column.
sales_df = pd.read_csv(filepath_or_buffer=filename, index_col=0)

In [None]:
# Distinct values found in the first data column of sales_df.
a = sales_df.iloc[:, 0].unique()

# A notebook cell only renders its LAST expression, so the bare `len(a)` that used to sit
# here was evaluated and silently discarded. Print the count so it is actually visible.
print(f"Distinct values in column '{sales_df.columns[0]}': {len(a)}")

# Preview the first rows of the loaded data.
sales_df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,missing,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [None]:
#------------------------------------------------------------------------------------------------------------------------------------------------
# Model-selection grid search (expensive).
#
# This cell used to be disabled by wrapping the whole thing in a triple-quoted string. That left
# dead code that only echoed its own source, and a fresh kernel had no way to regenerate the
# pickle that the next cell loads. The search now lives in a function and is executed only when
# the pickled result is missing, so "Restart & Run All" is fast when the cache exists and still
# works when it does not.
#
# NOTE(review): `sparse=False` (OneHotEncoder) and `base_estimator` (BaggingRegressor) are kept
# exactly as originally written; newer scikit-learn releases renamed these parameters
# (`sparse_output` / `estimator`) - adjust if the environment is upgraded.
#------------------------------------------------------------------------------------------------------------------------------------------------

def run_model_grid_search(sales_df, cv=5, n_jobs=-1):
    """Grid-search linear, KNN, random-forest and bagged-tree regressors on the sales data.

    Parameters
    ----------
    sales_df : pandas.DataFrame
        Training data. 'Item_Outlet_Sales' is used as the target; every other column except
        'Item_Identifier' is used as a feature.
    cv : int, default 5
        Number of cross-validation folds passed to GridSearchCV.
    n_jobs : int, default -1
        Parallelism passed to GridSearchCV.

    Returns
    -------
    GridSearchCV
        The fitted search. It scores every candidate with r2 and negative MSE and is refit
        on the full data using the candidate with the best negative MSE.
    """
    numeric_feat = ['Item_Weight', 'Item_Visibility', 'Item_MRP']
    ordinal_categorical_feat = ['Item_Fat_Content', 'Outlet_Establishment_Year', 'Outlet_Size']
    nominal_categorical_feat = ["Item_Type", "Outlet_Identifier", "Outlet_Location_Type", "Outlet_Type"]

    X = sales_df.loc[:, (sales_df.columns != 'Item_Outlet_Sales') & (sales_df.columns != 'Item_Identifier')]
    y = sales_df["Item_Outlet_Sales"]

    # 'Item_Identifier' is excluded from X above, so no very-high-cardinality column is one-hot
    # encoded and the encoder can keep its default behaviour of raising on unseen categories.
    # That default is also what drop="first" needed when this was written: combining `drop` with
    # handle_unknown='ignore' raised
    #   ValueError: `handle_unknown` must be 'error' when the drop parameter is specified,
    #   as both would create categories that are all zero.
    #
    # NOTE(review): OrdinalEncoder() with default categories assigns codes in sorted category
    # order, which is not necessarily the natural order of e.g. 'Outlet_Size' - confirm intended.
    preprocess = ColumnTransformer(transformers=[("std_scaler", StandardScaler(), numeric_feat),
                                                 ("one_hot_encoder", OneHotEncoder(categories='auto', drop="first", sparse=False), nominal_categorical_feat),
                                                 ("ordinal_encoder", OrdinalEncoder(), ordinal_categorical_feat)],
                                   remainder='passthrough')

    # SelectKBest can keep between 1 and <number of preprocessed columns> features. The previous
    # hard-coded range(0, 37) also tried k=0 (nothing left to fit on) and could exceed the number
    # of columns the preprocessor actually emits; depending on the scikit-learn version such a k
    # is either rejected or just repeats the all-features candidate. Measure the width instead.
    # GridSearchCV clones the pipeline for every fit, so fitting `preprocess` here has no effect
    # on the search itself.
    n_encoded_features = preprocess.fit_transform(X).shape[1]
    column_list = list(range(1, n_encoded_features + 1))
    num_neighbors = list(range(1, 40))

    random_forest = RandomForestRegressor(random_state=42, oob_score=True)
    bagged_trees = BaggingRegressor(random_state=42, oob_score=True)

    num_trees = [1, 5, 10, 50, 100, 150, 200]
    max_depth = [None, 7]

    reg_strategy = TransformedTargetRegressor()
    bcox_transformer = PowerTransformer(method='box-cox')
    score_types = {'r2' :'r2', 'MSE' : 'neg_mean_squared_error'}

    model_pipeline = Pipeline([("preprocessing", preprocess),
                               ('feature_selector', SelectKBest(f_regression)),
                               ('regress', reg_strategy)])

    parameter_grid = [{'feature_selector__k' : column_list,                       # Vary the number of "best" features used when building the model.
                       'regress__regressor' : [LinearRegression(), Lasso()],      # Vary the model between Linear and Lasso regression.
                       'regress__transformer' : [None, bcox_transformer]},        # Vary whether or not a box-cox transform is applied to the target vector.
                      {'feature_selector__k': column_list,                        # Vary the number of "best" features used when building the model.
                       'regress__regressor' : [KNeighborsRegressor()],            # Use K Nearest Neighbors as the model.
                       'regress__regressor__n_neighbors' : num_neighbors,         # Vary the number of nearest neighbors the model uses from 1 to 39.
                       'regress__regressor__weights' : ['uniform', 'distance'],   # Vary whether the neighbors are weighted uniformly or by distance.
                       'regress__transformer' : [None, bcox_transformer]},        # Vary whether or not a box-cox transform is applied to the target.
                      {'feature_selector__k': column_list,                        # Vary the number of features the model uses.
                       'regress__regressor' : [random_forest],                    # Use a random forest as the model.
                       'regress__regressor__n_estimators' : num_trees,            # Vary the number of trees in the random forest.
                       'regress__regressor__max_depth' : max_depth,               # Vary the max depth of each tree.
                       'regress__transformer' : [None, bcox_transformer]},        # Vary whether or not the target gets box-cox transformed.
                      {'feature_selector__k': column_list,                        # Vary the number of features used in the model.
                       'regress__regressor' : [bagged_trees],                     # Use bagged trees as the model.
                       'regress__regressor__n_estimators' : num_trees,            # Vary the number of trees used in the model.
                       'regress__regressor__base_estimator' : [DecisionTreeRegressor()],      # Each tree in the bagged trees model is a decision tree.
                       'regress__regressor__base_estimator__max_depth' : max_depth,           # Vary the max depth of each decision tree.
                       'regress__transformer' : [None, bcox_transformer]}]                    # Vary whether or not the target variable gets transformed.

    gs = GridSearchCV(estimator=model_pipeline, param_grid=parameter_grid, scoring=score_types, refit='MSE', cv=cv, n_jobs=n_jobs)
    gs.fit(X, y)
    return gs


PATH = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Sales_Project/Current/'
gridsearch_result_filename = 'gridsearch_models_1.pkl'
full_path = PATH + gridsearch_result_filename

# Cache guard: only pay for the search when no pickled result is available yet.
if not os.path.exists(full_path):
  gs = run_model_grid_search(sales_df)
  with open(full_path, 'wb') as file:
    pickle.dump(gs, file)

'\nnumeric_feat = [\'Item_Weight\', \'Item_Visibility\', \'Item_MRP\']\nordinal_categorical_feat = [\'Item_Fat_Content\', \'Outlet_Establishment_Year\', \'Outlet_Size\']\nnominal_categorical_feat = ["Item_Type", "Outlet_Identifier", "Outlet_Location_Type", "Outlet_Type"]\n\nX = sales_df.loc[:, (sales_df.columns != \'Item_Outlet_Sales\') & (sales_df.columns != \'Item_Identifier\')]\n\ny = sales_df["Item_Outlet_Sales"]\n\n# ------------------------------------------------------------------------------------------------------------------------------------\n# handle_unknown = \'ignore\' is required for the OneHot encoder because the \'Item_Identifier\' column has too many unique values.\n# this means after one hot encoding, there will be some values that show up in the test set that were not seen in the training \n# set. By default this raises an error. Our only option (assuming we want to keep this column in the model, which in fact it may\n# be better just to remove it), is to ignore the

In [None]:
# Reload the fitted grid-search object that was pickled to Drive.
# NOTE: pickle.load can execute arbitrary code - only load pickle files you created yourself.
full_path = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Sales_Project/Current/gridsearch_models_1.pkl'

with open(full_path, 'rb') as file:
  gs_results = pickle.load(file)


# Features: every column except the target and the item identifier. Target: the sales column.
X = sales_df.loc[:, ~sales_df.columns.isin(['Item_Outlet_Sales', 'Item_Identifier'])]
y = sales_df["Item_Outlet_Sales"]

# Pull the pieces of the search that the following cells inspect.
top_parameters = gs_results.best_params_
top_score = gs_results.best_score_
top_estimator = gs_results.best_estimator_
search_results = gs_results.cv_results_

# Predictions for every row of sales_df, used below to compute the mean absolute error.
# NOTE(review): presumably sales_df is also the data the search was fitted on, which would make
# these in-sample predictions - confirm before quoting the resulting MAE as model performance.
Predict_For_MAE = gs_results.predict(X)

In [None]:
# Mean absolute error between the actual sales (y) and the predictions made for the same rows.
mean_absolute_error(y_true=y, y_pred=Predict_For_MAE)

712.2291196699813

In [None]:
# Display the best hyper-parameter combination found by the search (gs_results.best_params_).
top_parameters

{'feature_selector__k': 35,
 'regress__regressor': RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                       max_depth=7, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=True,
                       random_state=42, verbose=0, warm_start=False),
 'regress__regressor__max_depth': 7,
 'regress__regressor__n_estimators': 100,
 'regress__transformer': None}

In [None]:
#------------------------------------------------------------------------------------------------------------------------------------------------
# Perform Data Cleaning on the GridSearchCV output
#------------------------------------------------------------------------------------------------------------------------------------------------

gs_result_df = pd.DataFrame(search_results)

# Friendlier names for the hyper-parameter columns of cv_results_ (the order here is the display order).
readable_column_names = {'param_feature_selector__k' : 'num_features_in_model',
                         'param_regress__regressor' : 'model_type',
                         'param_regress__transformer' : 'target_transformation',
                         'param_regress__regressor__n_neighbors' : "Num_Nearest_Neighbors",
                         'param_regress__regressor__weights' : "Neighbors_Score_Strategy",
                         'param_regress__regressor__max_depth' : 'rand_forest_max_depth',
                         'param_regress__regressor__n_estimators' : 'Number_of_Trees',
                         'param_regress__regressor__base_estimator__max_depth' : 'bagged_tree_max_depth'}

score_columns = ['mean_test_r2', 'rank_test_r2', 'mean_test_MSE', 'rank_test_MSE']

# Best-ranked candidate (by MSE) first; keep only the hyper-parameter and score columns.
# Built as one chain (no inplace=True) so re-running the cell always starts from gs_result_df.
results_simplified_df = (gs_result_df
                         .sort_values(by=['rank_test_MSE'], ignore_index=True)
                         .rename(columns=readable_column_names)
                         .loc[:, list(readable_column_names.values()) + score_columns])


# Only the random-forest and bagged-tree candidates carry an n_estimators value, so a missing
# 'Number_of_Trees' identifies the non-tree models (linear, lasso, KNN). This must be computed
# before 'Number_of_Trees' is filled with 'Not_Applicable' further down.
is_tree_model = results_simplified_df['Number_of_Trees'].notna()

# For easier viewing, combine 'rand_forest_max_depth' and 'bagged_tree_max_depth' into a single column called 'tree_depth'.
# A missing depth means "unlimited" only for tree models. Previously every non-tree model was
# also labelled 'No_Limit', because both depth columns are empty for them; those rows now get
# 'Not_Applicable', matching the other model-specific columns.
tree_depth = (results_simplified_df['rand_forest_max_depth']
              .fillna(value=results_simplified_df['bagged_tree_max_depth'])
              .fillna(value='No_Limit'))
results_simplified_df['tree_depth'] = tree_depth.where(is_tree_model, 'Not_Applicable')
results_simplified_df = results_simplified_df.drop(columns=['rand_forest_max_depth', 'bagged_tree_max_depth'])

# Indicate if the model received a 'Box-Cox' transformation of the target variable, or if not then list 'No_Transformation'.
results_simplified_df['target_transformation'] = results_simplified_df['target_transformation'].fillna(value="No_Transformation")
results_simplified_df.loc[ (results_simplified_df['target_transformation'] != 'No_Transformation') , 'target_transformation'] = "Box-Cox"

# For easier viewing, trim model_type down so it only shows the name of the model used
# (everything before the first '(' of the estimator's repr).
results_simplified_df['model_type'] = (results_simplified_df['model_type']
                                       .astype(str)
                                       .map(lambda model_string : model_string.split('(')[0]))

# Hyper-parameters that do not exist for a given model type are shown as 'Not_Applicable'.
for column in ['Num_Nearest_Neighbors', 'Neighbors_Score_Strategy', 'Number_of_Trees']:
    results_simplified_df[column] = results_simplified_df[column].fillna(value='Not_Applicable')


# Negative MSE is not very interpretable. Take the absolute value and square root to get a more meaningful number.
results_simplified_df['RMSE'] = results_simplified_df['mean_test_MSE'].abs().pow(1./2)

#https://stackoverflow.com/questions/51325601/how-to-stop-my-pandas-data-table-from-being-truncated-when-printed
pd.set_option('display.max_rows', 500)

results_simplified_df.head(500)



Unnamed: 0,num_features_in_model,model_type,target_transformation,Num_Nearest_Neighbors,Neighbors_Score_Strategy,Number_of_Trees,mean_test_r2,rank_test_r2,mean_test_MSE,rank_test_MSE,tree_depth,RMSE
0,35,RandomForestRegressor,No_Transformation,Not_Applicable,Not_Applicable,100,0.593854,1,-1180705.0,1,7,1086.602685
1,35,BaggingRegressor,No_Transformation,Not_Applicable,Not_Applicable,100,0.593836,3,-1180745.0,2,7,1086.621121
2,35,RandomForestRegressor,No_Transformation,Not_Applicable,Not_Applicable,50,0.593805,4,-1180864.0,3,7,1086.675639
3,14,BaggingRegressor,No_Transformation,Not_Applicable,Not_Applicable,50,0.593844,2,-1180939.0,4,7,1086.71016
4,35,BaggingRegressor,No_Transformation,Not_Applicable,Not_Applicable,50,0.593759,5,-1181003.0,5,7,1086.739648
5,32,RandomForestRegressor,No_Transformation,Not_Applicable,Not_Applicable,100,0.593717,6,-1181122.0,6,7,1086.794289
6,32,RandomForestRegressor,No_Transformation,Not_Applicable,Not_Applicable,50,0.593645,7,-1181361.0,7,7,1086.904421
7,34,BaggingRegressor,No_Transformation,Not_Applicable,Not_Applicable,100,0.593617,9,-1181386.0,8,7,1086.915777
8,34,RandomForestRegressor,No_Transformation,Not_Applicable,Not_Applicable,100,0.59362,8,-1181388.0,9,7,1086.916589
9,34,BaggingRegressor,No_Transformation,Not_Applicable,Not_Applicable,50,0.593578,10,-1181532.0,10,7,1086.982791


In [None]:
# Location of the separate test CSV on the mounted Drive (this rebinds `filename`).
filename = (
    '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Week_6/test_AbJTz2l.csv'
)

# No index column is specified here, so pandas assigns its default integer index.
test_df = pd.read_csv(filepath_or_buffer=filename)

In [None]:
# Feature matrix for the test file: every column except the item identifier and, if present, the target.
# NOTE(review): this rebinds X, which earlier held the features taken from sales_df.
X = test_df.loc[:, ~test_df.columns.isin(['Item_Outlet_Sales', 'Item_Identifier'])]
