In [1]:
# Apprentice Chef Case - (Carlos) Andres Restrepo Ayala - Hult MsBA 2021 Class
# Prediction Model
# Assumption: The data delivered by marketing team was collected and stored correctly

"""
    Docstring:
    
    A) Purpose: The purpose of this code is to test different models to predict 
    the customer's revenue in their first year of using Apprentice Chef, Inc 
    services. The analyzed models were: OLS, Lasso, ARD, KNN and standard KNN. 
    The criteria for choosing the best model are: threshold of the test score 
    (R-squared) as 0.75, the highest test score achieved and the threshold gap 
    between the of the train and test subsets as 0, 05.

    In the end, the code scores the models based on the above criteria, first 
    shows a comparative table between the models and then the values of the 
    elements of the chosen model.
    
    Some line of code with "print" sentences were turn into comments after 
    were used to check the correct function of the code. You can turn active 
    those line to see intermediate results.
    
    B) Bugs or malfunction as the code expected.
    No bugs are detected when the code is run.
"""


# imports libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  # also <from matplotlib import pyplot as plt>
import seaborn as sns
import csv
import statsmodels.formula.api as smf# linear regression (statsmodels)
from sklearn.model_selection import train_test_split # train/test split
from sklearn.linear_model import LinearRegression # linear regression (scikit-learn)
import sklearn.linear_model # linear models
from sklearn.linear_model import ARDRegression
from sklearn.neighbors import KNeighborsRegressor # KNN for Regression
from sklearn.preprocessing import StandardScaler # standard scaler

In [2]:
# initial preparing the dataframe

# setting pandas print options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


# reading the dataframe
file ='./datasets/Apprentice_Chef_Dataset.xlsx'
df_meals = pd.read_excel(io = file)

# renaming LARGEST_ORDER_SIZE variable
df_meals= df_meals.rename(columns = {'LARGEST_ORDER_SIZE': 'AVG_MEALSORDER_PER_CUSTOMER'})

# droping categorical variables for linear modeling
df_meals = df_meals.drop(['NAME', 'FIRST_NAME','FAMILY_NAME'],    # Temporary no include 'EMAIL' from list of droped
                         axis = 1)

In [3]:
# feature engineering Email
# This part of the code just can be run once

# splitting emails

# placeholder list
placeholder_lst = []

# looping over each email address
for index, col in df_meals.iterrows():
    
    # splitting email domain at '@'
    split_email = df_meals.loc[index, 'EMAIL'].split(sep = '@')
    
    # appending placeholder_lst with the results
    placeholder_lst.append(split_email)
    

# converting placeholder_lst into a DataFrame 
email_df = pd.DataFrame(placeholder_lst)

# renaming column to concatenate
email_df.columns = [ '0' , 'EMAIL_DOMAIN' ]


# defining Emails Domain Groups
professional_email_domains = ['@mmm.com', '@amex.com', '@apple.com',
                              '@boeing.com', '@caterpillar.com', '@chevron.com',
                             '@cisco.com', '@cocacola.com', '@disney.com',
                             '@dupont.com', '@exxon.com', '@ge.org', 
                              '@goldmansacs.com', '@homedepot.com', '@ibm.com',
                             '@intel.com', '@jnj.com', '@jpmorgan.com', 
                              '@mcdonalds.com', '@merck.com', '@microsoft.com',
                             '@nike.com', '@pfizer.com', '@pg.com', 
                              '@travelers.com', '@unitedtech.com', 
                              '@verizon.com','@visa.com', '@walmart.com']

personal_email_domains     = ['@gmail.com', '@yahoo.com', '@protonmail.com']

junk_email_domains         = ['@me.com', '@aol.com', '@hotmail.com', 
                              '@live.com', '@msn.com', '@passport.com']

# resetting placeholderlist
placeholder_lst = []

# looping to group observations by domain type
for domain in email_df['EMAIL_DOMAIN']:
        if  '@' + domain in professional_email_domains:
            placeholder_lst.append('PROFESSIONAL')
        
        elif '@' + domain in personal_email_domains:
            placeholder_lst.append('PERSONAL')
            
        elif '@' + domain in junk_email_domains:
            placeholder_lst.append('JUNK')
            
        else:
            placeholder_lst.append('NEW_DOMAIN')

# concatenating with original DataFrame
df_meals['DOMAIN_GROUP'] = pd.Series(placeholder_lst)

# checking results
df_meals['DOMAIN_GROUP'].value_counts()

# one hot encoding 
one_hot_email_domain       = pd.get_dummies(df_meals['DOMAIN_GROUP'])

# dropping categorical variables after they've been encoded
df_meals = df_meals.drop('DOMAIN_GROUP', axis = 1)

# joining codings together
df_meals = df_meals.join([one_hot_email_domain])

# removing original EMAIL variable
df_meals = df_meals.drop(['EMAIL'], axis = 1)

In [4]:
# feature engineering

# Log transformation for Revenues
df_meals['LOG_REVENUE'] = np.log10(df_meals['REVENUE'])

# Log transformation for explanatory variables
df_meals['LOG_TOTAL_MEALS_ORDERED'] = np.log10(df_meals['TOTAL_MEALS_ORDERED'])
df_meals['LOG_AVG_TIME_PER_SITE_VISIT'] = np.log10(df_meals['AVG_TIME_PER_SITE_VISIT'])
df_meals['LOG_AVG_PREP_VID_TIME'] = np.log10(df_meals['AVG_PREP_VID_TIME'])
df_meals['LOG_UNIQUE_MEALS_PURCH'] = np.log10(df_meals['UNIQUE_MEALS_PURCH'])
df_meals['LOG_CONTACTS_W_CUSTOMER_SERVICE'] = np.log10(df_meals['CONTACTS_W_CUSTOMER_SERVICE'])
df_meals['LOG_AVG_MEALSORDER_PER_CUSTOMER'] = np.log10(df_meals['AVG_MEALSORDER_PER_CUSTOMER'])

In [5]:
# feature engineering

# Recoding Explanatory variables using dummy variables

# Dummy variables for recoding
df_meals['WEEKLY_SUBSCRIPTION'] = 0
df_meals['SAW_INSTRUCTIONS']    = 0
df_meals['TOOK_MASTERCLASS']    = 0


# Iterating over each original column to
# change values in the new feature columns
for index, value in df_meals.iterrows():
    
    # WEEKLY_PLAN
    if df_meals.loc[index, 'WEEKLY_PLAN'] > 0:
        df_meals.loc[index, 'WEEKLY_SUBSCRIPTION'] = 1
        
    # AVG_PREP_VID_TIME
    if df_meals.loc[index, 'AVG_PREP_VID_TIME'] > 0:
        df_meals.loc[index, 'SAW_INSTRUCTIONS'] = 1
    
    # MASTER_CLASSES_ATTENDED
    if df_meals.loc[index, 'MASTER_CLASSES_ATTENDED'] > 0:
        df_meals.loc[index, 'TOOK_MASTERCLASS'] = 1

In [6]:
# featuring engineering

# recoding TOTAL_PHOTOS_VIEWED - counting the number of zeroes for 
photos_zeroes   = len(df_meals['TOTAL_PHOTOS_VIEWED'][df_meals['TOTAL_PHOTOS_VIEWED'] == 0]) # TOTAL PHOTOS

# printing a table of the results
#print(f"""
#                 No\t\tYes
#               ---------------------
#Photos       | {photos_zeroes}\t\t{len(df_meals) - photos_zeroes}
#""")

# recogind the variable
df_meals['PHOTOS_VIEWED']    = 0


# Iterating over each original column to
# change values in the new feature columns
for index, value in df_meals.iterrows():
    
    # WEEKLY_PLAN
    if df_meals.loc[index, 'TOTAL_PHOTOS_VIEWED'] > 0:
        df_meals.loc[index, 'PHOTOS_VIEWED'] = 1

In [7]:
# Spliting the dataset into train and test subsets

# preparing explanatory variable data
df_meals_data   = df_meals.drop(['REVENUE',
                                 'LOG_REVENUE'],
                                  axis = 1)


# preparing response variables
df_meals_target = df_meals.loc[ : , 'LOG_REVENUE']

# preparing training and testing sets (all letters are lowercase)
x_train, x_test, y_train, y_test = train_test_split(
            df_meals_data,
            df_meals_target,
            test_size = 0.25,
            random_state = 219)


# checking the shapes of the datasets
print(f"""
Training Data
-------------
X-side: {x_train.shape}
y-side: {y_train.shape}


Testing Data
------------
X-side: {x_test.shape}
y-side: {y_test.shape}
""")


Training Data
-------------
X-side: (1459, 37)
y-side: (1459,)


Testing Data
------------
X-side: (487, 37)
y-side: (487,)



In [8]:
# converting model_performance into a DataFrame
df_meals = pd.DataFrame(df_meals)


# sending model results to Excel
df_meals.to_excel('./datasets/regression_df_meals.xlsx',
                           index = False)

In [9]:
df_meals

Unnamed: 0,REVENUE,CROSS_SELL_SUCCESS,TOTAL_MEALS_ORDERED,UNIQUE_MEALS_PURCH,CONTACTS_W_CUSTOMER_SERVICE,PRODUCT_CATEGORIES_VIEWED,AVG_TIME_PER_SITE_VISIT,MOBILE_NUMBER,CANCELLATIONS_BEFORE_NOON,CANCELLATIONS_AFTER_NOON,TASTES_AND_PREFERENCES,PC_LOGINS,MOBILE_LOGINS,WEEKLY_PLAN,EARLY_DELIVERIES,LATE_DELIVERIES,PACKAGE_LOCKER,REFRIGERATED_LOCKER,AVG_PREP_VID_TIME,AVG_MEALSORDER_PER_CUSTOMER,MASTER_CLASSES_ATTENDED,MEDIAN_MEAL_RATING,AVG_CLICKS_PER_VISIT,TOTAL_PHOTOS_VIEWED,JUNK,NEW_DOMAIN,PERSONAL,PROFESSIONAL,LOG_REVENUE,LOG_TOTAL_MEALS_ORDERED,LOG_AVG_TIME_PER_SITE_VISIT,LOG_AVG_PREP_VID_TIME,LOG_UNIQUE_MEALS_PURCH,LOG_CONTACTS_W_CUSTOMER_SERVICE,LOG_AVG_MEALSORDER_PER_CUSTOMER,WEEKLY_SUBSCRIPTION,SAW_INSTRUCTIONS,TOOK_MASTERCLASS,PHOTOS_VIEWED
0,393.0,1,14,6,12,10,48.00,1,3,1,1,5,2,0,0,2,0,0,33.4,1,0,1,17,0,0,1,0,0,2.594393,1.146128,1.681241,1.523746,0.778151,1.079181,0.000000,0,1,0,0
1,1365.0,1,87,3,8,8,40.35,1,0,0,1,5,1,12,0,2,0,0,84.8,1,0,3,13,170,0,0,0,1,3.135133,1.939519,1.605844,1.928396,0.477121,0.903090,0.000000,1,1,0,1
2,800.0,1,15,7,11,5,19.77,1,3,0,1,6,1,1,0,1,0,0,63.0,1,0,2,16,0,0,0,0,1,2.903090,1.176091,1.296007,1.799341,0.845098,1.041393,0.000000,1,1,0,0
3,600.0,1,13,6,11,5,90.00,1,2,0,1,6,1,14,0,3,0,0,43.8,1,0,2,14,0,0,0,0,1,2.778151,1.113943,1.954243,1.641474,0.778151,1.041393,0.000000,1,1,0,0
4,1490.0,1,47,8,6,10,40.38,1,0,0,0,5,1,5,0,8,0,0,84.8,1,1,3,12,205,0,0,0,1,3.173186,1.672098,1.606166,1.928396,0.903090,0.778151,0.000000,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1941,3450.0,0,87,8,8,7,108.90,1,0,0,1,6,1,6,0,3,0,0,212.5,10,2,3,11,0,0,0,1,0,3.537819,1.939519,2.037028,2.327359,0.903090,0.903090,1.000000,1,1,1,0
1942,5829.0,0,244,4,7,2,133.91,1,1,2,1,5,1,13,0,3,0,0,282.2,10,1,4,10,424,0,0,1,0,3.765594,2.387390,2.126813,2.450557,0.602060,0.845098,1.000000,1,1,1,1
1943,1900.0,0,57,2,8,4,102.71,1,2,0,1,6,1,11,3,7,0,0,254.4,10,0,4,12,480,0,0,1,0,3.278754,1.755875,2.011613,2.405517,0.301030,0.903090,1.000000,1,1,0,1
1944,1600.0,0,74,3,10,10,638.87,0,0,0,1,5,2,5,0,3,0,0,564.2,10,3,3,11,796,0,0,1,0,3.204120,1.869232,2.805412,2.751433,0.477121,1.000000,1.000000,1,1,1,1


In [10]:
# preparing data for OLS regression using SciKit-learn 

# declaring X variables that were statistically significant for OSL regression

x_variables = ['CROSS_SELL_SUCCESS', 'TOTAL_MEALS_ORDERED', 
               'UNIQUE_MEALS_PURCH', 'AVG_PREP_VID_TIME', 
               'AVG_MEALSORDER_PER_CUSTOMER', 'MEDIAN_MEAL_RATING',
               'TOTAL_PHOTOS_VIEWED', 'LOG_TOTAL_MEALS_ORDERED',
               'LOG_AVG_PREP_VID_TIME', 'WEEKLY_SUBSCRIPTION',
               'SAW_INSTRUCTIONS', 'TOOK_MASTERCLASS',
               'LOG_UNIQUE_MEALS_PURCH', 'PHOTOS_VIEWED',
               'JUNK', 'NEW_DOMAIN', 'PERSONAL', 'PROFESSIONAL']

# preparing x-variables from the OLS model
ols_data = df_meals[x_variables]

# preparing response variable
df_meals_target = df_meals['LOG_REVENUE']

###############################################
## setting up more than one train-test split ##
###############################################
# FULL X-dataset (normal Y)
x_train_FULL, x_test_FULL, y_train_FULL, y_test_FULL = train_test_split(
            df_meals_data,     # x-variables
            df_meals_target,   # y-variable
            test_size = 0.25,
            random_state = 219)


# OLS p-value x-dataset (normal Y)
x_train_OLS, x_test_OLS, y_train_OLS, y_test_OLS = train_test_split(
            ols_data,         # x-variables
            df_meals_target,   # y-variable
            test_size = 0.25,
            random_state = 219)


In [11]:
# OLS regression using SciKit-Learn
 
# INSTANTIATING a model object
lr = LinearRegression()


# FITTING to the training data  # check _OLS to all regression code lines AND CHECK WITH AND WITHOUT OLS (OR _FULL)
lr_fit = lr.fit(x_train_OLS, y_train_OLS)


# PREDICTING on new data
lr_pred = lr_fit.predict(x_test_OLS)


# SCORING the results
print('OLS Training Score :', lr.score(x_train_OLS, y_train_OLS).round(4))  # using R-square
print('OLS Testing Score  :',  lr.score(x_test_OLS, y_test_OLS).round(4)) # using R-square

lr_train_score = lr.score(x_train_OLS, y_train_OLS).round(4)
lr_test_score = lr.score(x_test_OLS, y_test_OLS).round(4)

# displaying and saving the gap between training and testing
print('OLS Train-Test Gap :', abs(lr_train_score - lr_test_score).round(4))
lr_test_gap = abs(lr_train_score - lr_test_score).round(4)

OLS Training Score : 0.7615
OLS Testing Score  : 0.7479
OLS Train-Test Gap : 0.0136


In [12]:
# OLS regression model
# zipping each feature name to its coefficient
lr_model_values = zip(df_meals_data[x_variables].columns,
                      lr_fit.coef_.round(decimals = 2))


# setting up a placeholder list to store model features
lr_model_lst = [('intercept', lr_fit.intercept_.round(decimals = 2))]

# printing out each feature-coefficient pair one by one
for val in lr_model_values:
    lr_model_lst.append(val)
    

# checking the results
#for pair in lr_model_lst:
#    print(pair)

In [13]:
# LASSO regression using SciKit-Learn

# INSTANTIATING a model object
alpha_value = 0.000003
lasso_model = sklearn.linear_model.Lasso(alpha = alpha_value,
                                         normalize = True) # default magitude


# FITTING to the training data
lasso_fit = lasso_model.fit(x_train_FULL, y_train_FULL)


# PREDICTING on new data
lasso_pred = lasso_fit.predict(x_test_FULL)


# SCORING the results
print('Lasso Training Score :', lasso_model.score(x_train_FULL, y_train_FULL).round(4))
print('Lasso Testing Score  :', lasso_model.score(x_test_FULL, y_test_FULL).round(4))


## the following code has been provided for you ##

# saving scoring data for future use
lasso_train_score = lasso_model.score(x_train_FULL, y_train_FULL).round(4) # using R-square
lasso_test_score  = lasso_model.score(x_test_FULL, y_test_FULL).round(4)   # using R-square


# displaying and saving the gap between training and testing
print('Lasso Train-Test Gap :', abs(lasso_train_score - lasso_test_score).round(4))
lasso_test_gap = abs(lasso_train_score - lasso_test_score).round(4)

Lasso Training Score : 0.7737
Lasso Testing Score  : 0.7642
Lasso Train-Test Gap : 0.0095


In [14]:
# Lasso regression model
# zipping each feature name to its coefficient
lasso_model_values = zip(df_meals_data.columns, lasso_fit.coef_.round(decimals = 2))


# setting up a placeholder list to store model features
lasso_model_lst = [('intercept', lasso_fit.intercept_.round(decimals = 2))]


# printing out each feature-coefficient pair one by one
for val in lasso_model_values:
    lasso_model_lst.append(val)
    

# checking the results
#for pair in lasso_model_lst:
#    print(pair)
    
# dropping coefficients that are equal to zero

# printing out each feature-coefficient pair one by one
for feature, coefficient in lasso_model_lst:
        
        if coefficient == 0:
            lasso_model_lst.remove((feature, coefficient))

            
# checking the results
#for pair in lasso_model_lst:
#    print(pair)

In [15]:
# ARD regression using SciKit-Learn

# INSTANTIATING a model object
ard_model = ARDRegression()


# FITTING the training data
ard_fit = ard_model.fit(x_train_FULL, y_train_FULL)


# PREDICTING on new data
ard_pred = ard_fit.predict(x_test_FULL)


print('Training Score     :', ard_model.score(x_train_FULL, y_train_FULL).round(4))
print('Testing Score      :',  ard_model.score(x_test_FULL, y_test_FULL).round(4))


# saving scoring data for future use
ard_train_score = ard_model.score(x_train_FULL, y_train_FULL).round(4)
ard_test_score  = ard_model.score(x_test_FULL, y_test_FULL).round(4)


# displaying and saving the gap between training and testing
print('ARD Train-Test Gap :', abs(ard_train_score - ard_test_score).round(4))
ard_test_gap = abs(ard_train_score - ard_test_score).round(4)

Training Score     : 0.7625
Testing Score      : 0.7592
ARD Train-Test Gap : 0.0033


In [16]:
# ARD regression model

# zipping each feature name to its coefficient
ard_model_values = zip(df_meals_data.columns, ard_fit.coef_.round(decimals = 5))


# setting up a placeholder list to store model features
ard_model_lst = [('intercept', ard_fit.intercept_.round(decimals = 2))]


# printing out each feature-coefficient pair one by one
for val in ard_model_values:
    ard_model_lst.append(val)
    

# checking the results
#for pair in ard_model_lst:
#    print(pair)

# dropping coefficients that are equal to zero

# printing out each feature-coefficient pair one by one
for feature, coefficient in ard_model_lst:
        
        if coefficient == 0:
            ard_model_lst.remove((feature, coefficient))

            
# checking the results
#for pair in ard_model_lst:
#    print(pair)

In [17]:
# preparing the dataset for KNN model

# preparing explanatory variable data
df_meals_knndata   = df_meals.drop(['REVENUE',
                                   'LOG_REVENUE'],
                                    axis = 1)

# preparing the target variable
df_meals_knntarget = df_meals.loc[ : , 'LOG_REVENUE']

# INSTANTIATING a StandardScaler() object
scaler = StandardScaler()


# FITTING the scaler with housing_data
scaler.fit(df_meals_knndata)


# TRANSFORMING our data after fit
X_scaled = scaler.transform(df_meals_knndata)


# converting scaled data into a DataFrame
X_scaled_df = pd.DataFrame(X_scaled)

# adding labels to the scaled DataFrame
X_scaled_df.columns = df_meals_knndata.columns

# checking the results
#X_scaled_df.describe().round(2)

In [18]:
# building the first knn model

# INSTANTIATING a KNN model object
knn_reg = KNeighborsRegressor(algorithm = 'auto',
                              n_neighbors = 1)


# FITTING to the training data
knn_fit = knn_reg.fit(x_train, y_train)


# PREDICTING on new data
knn_reg_pred = knn_fit.predict(x_test)


# SCORING the results
print('KNN Training Score:', knn_reg.score(x_train, y_train).round(4))
print('KNN Testing Score :',  knn_reg.score(x_test, y_test).round(4))


# saving scoring data for future use
knn_reg_score_train = knn_reg.score(x_train, y_train).round(4)
knn_reg_score_test  = knn_reg.score(x_test, y_test).round(4)


# displaying and saving the gap between training and testing
print('KNN Train-Test Gap:', abs(knn_reg_score_train - knn_reg_score_test).round(4))
knn_reg_test_gap = abs(knn_reg_score_train - knn_reg_score_test).round(4)

KNN Training Score: 1.0
KNN Testing Score : 0.4636
KNN Train-Test Gap: 0.5364


In [19]:
# calculating the optimal number of Neighbors

# creating lists for training set accuracy and test set accuracy
training_accuracy = []
test_accuracy     = []


# building a visualization of 1 to 50 neighbors
neighbors_settings = range(1, 21)


for n_neighbors in neighbors_settings:
    # Building the model
    clf = KNeighborsRegressor(n_neighbors = n_neighbors)
    clf.fit(x_train, y_train)
    
    # Recording the training set accuracy
    training_accuracy.append(clf.score(x_train, y_train))
    
    # Recording the generalization accuracy
    test_accuracy.append(clf.score(x_test, y_test))


# plotting the visualization
#fig, ax = plt.subplots(figsize=(12,8))
#plt.plot(neighbors_settings, training_accuracy, label = "training accuracy")
#plt.plot(neighbors_settings, test_accuracy, label = "test accuracy")
#plt.ylabel("Accuracy")
#plt.xlabel("n_neighbors")
#plt.legend()
#plt.show()

# finding the optimal number of neighbors
opt_neighbors = test_accuracy.index(max(test_accuracy)) + 1
print(f"""The optimal number of neighbors is {opt_neighbors}""")

The optimal number of neighbors is 12


In [20]:
# running the knn model

# INSTANTIATING a KNN model object
knn_reg = KNeighborsRegressor(algorithm = 'auto',
                              n_neighbors = opt_neighbors)


# FITTING to the training data
knn_fit = knn_reg.fit(x_train, y_train)


# PREDICTING on new data
knn_reg_pred = knn_fit.predict(x_test)


# SCORING the results
print('KNN Training Score:', knn_reg.score(x_train, y_train).round(4))
print('KNN Testing Score :',  knn_reg.score(x_test, y_test).round(4))


# saving scoring data for future use
knn_reg_score_train = knn_reg.score(x_train, y_train).round(4)
knn_reg_score_test  = knn_reg.score(x_test, y_test).round(4)


# displaying and saving the gap between training and testing
print('KNN Train-Test Gap:', abs(knn_reg_score_train - knn_reg_score_test).round(4))
knn_reg_test_gap = abs(knn_reg_score_train - knn_reg_score_test).round(4)

KNN Training Score: 0.6752
KNN Testing Score : 0.6643
KNN Train-Test Gap: 0.0109


In [21]:
# preparing the dataframe for knn model with Standardized Data

# this is the exact code we were using before
X_train_STAND, X_test_STAND, y_train_STAND, y_test_STAND = train_test_split(
            X_scaled_df,
            df_meals_knntarget,
            test_size = 0.25,
            random_state = 219)

# creating lists for training set accuracy and test set accuracy
training_accuracy = []
test_accuracy = []


# building a visualization of 1 to 50 neighbors
neighbors_settings = range(1, 21)


for n_neighbors in neighbors_settings:
    # Building the model
    clf = KNeighborsRegressor(n_neighbors = n_neighbors)
    clf.fit(X_train_STAND, y_train_STAND)
    
    # Recording the training set accuracy
    training_accuracy.append(clf.score(X_train_STAND, y_train_STAND))
    
    # Recording the generalization accuracy
    test_accuracy.append(clf.score(X_test_STAND, y_test_STAND))


# plotting the visualization
#fig, ax = plt.subplots(figsize=(12,8))
#plt.plot(neighbors_settings, training_accuracy, label = "training accuracy")
#plt.plot(neighbors_settings, test_accuracy,     label = "test accuracy")
#plt.ylabel("Accuracy")
#plt.xlabel("n_neighbors")
#plt.legend()
#plt.show()


# finding the optimal number of neighbors
opt_neighbors_stand = test_accuracy.index(max(test_accuracy)) + 1
print(f"""The optimal number of neighbors is {opt_neighbors_stand}""")

The optimal number of neighbors is 8


In [22]:
# INSTANTIATING a model with the optimal number of neighbors
knn_stand = KNeighborsRegressor(algorithm = 'auto',
                                n_neighbors = opt_neighbors_stand)



# FITTING the model based on the training data
knn_stand_fit = knn_stand.fit(X_train_STAND, y_train_STAND)



# PREDITCING on new data
knn_stand_pred = knn_stand_fit.predict(X_test_STAND)



# SCORING the results
print('KNN Training Score:', knn_stand.score(X_train_STAND, y_train_STAND).round(4))
print('KNN Testing Score :',  knn_stand.score(X_test_STAND, y_test_STAND).round(4))


# saving scoring data for future use
knn_stand_score_train = knn_stand.score(X_train_STAND, y_train_STAND).round(4)
knn_stand_score_test  = knn_stand.score(X_test_STAND, y_test_STAND).round(4)


# displaying and saving the gap between training and testing
print('KNN Train-Test Gap:', abs(knn_stand_score_train - knn_stand_score_test).round(4))
knn_stand_test_gap = abs(knn_stand_score_train - knn_stand_score_test).round(4)


KNN Training Score: 0.7527
KNN Testing Score : 0.7139
KNN Train-Test Gap: 0.0388


In [31]:
# creating a dictionary for linear models results
model_performance = {
    
    'Model Type'    : ['OLS', 'Lasso', 'ARD'],
           
    'Training' : [lr_train_score, lasso_train_score,
                                ard_train_score],
           
    'Testing'  : [lr_test_score, lasso_test_score,
                                   ard_test_score],
                    
    'Train-Test Gap' : [lr_test_gap, lasso_test_gap,
                                        ard_test_gap],
                    
    'Model Size' : [len(lr_model_lst), len(lasso_model_lst),
                                    len(ard_model_lst)],
                    
    'Model' : [lr_model_lst, lasso_model_lst, ard_model_lst]}

# creating a dictionary for KNN models results
model_performance = {
    
    'Model Type'    : ['OLS', 'LASSO', 'ARD'],
           
    
    'Training' : [lr_train_score, lasso_train_score, ard_train_score],
           
    
    'Testing'  : [lr_test_score, lasso_test_score, ard_test_score],
                    
    
    'Train-Test Gap' : [lr_test_gap, lasso_test_gap, ard_test_gap],
                   
    
    'Model Size' : [len(lr_model_lst), len(lasso_model_lst),
                                    len(ard_model_lst)],
                    
    'Model'      : [lr_model_lst, lasso_model_lst, ard_model_lst] }

# comparing results

print(f"""
Model                Train Score      Test Score      Train-Test Gap      \
Model Size
_____                ___________      __________      ______________      \
__________
OLS                  {lr_train_score}           {lr_test_score}          \
{lr_test_gap}              {len(lr_model_lst)}
Lasso                {lasso_train_score}           {lasso_test_score}          \
{lasso_test_gap}              {len(lasso_model_lst)}
ARD                  {ard_train_score}           {ard_test_score}          \
{ard_test_gap}              {len(ard_model_lst)}
""")

# comparing results

print(f"""
KNN Model            Train Score      Test Score      Train-Test Gap       Neighbors 
________________     ___________      __________      ______________       _________
Non-Standardized     {knn_reg_score_train}          \
 {knn_reg_score_test}          {knn_reg_test_gap}        \
       {opt_neighbors}  
Standardized         \
{knn_stand_score_train}           {knn_stand_score_test}          {knn_stand_test_gap}        \
       {opt_neighbors_stand}
""")

# converting model_performance into a DataFrame
model_performance = pd.DataFrame(model_performance)


# sending model results to Excel
model_performance.to_excel('./model_results/linear_model_performance.xlsx',
                           index = False)


Model                Train Score      Test Score      Train-Test Gap      Model Size
_____                ___________      __________      ______________      __________
OLS                  0.7615           0.7479          0.0136              19
Lasso                0.7737           0.7642          0.0095              27
ARD                  0.7625           0.7592          0.0033              23


KNN Model            Train Score      Test Score      Train-Test Gap       Neighbors 
________________     ___________      __________      ______________       _________
Non-Standardized     0.6752           0.6643          0.0109               12  
Standardized         0.7527           0.7139          0.0388               8



In [24]:
print("""OLS regression model""")
for index in lr_model_lst:
    print(index)

OLS regression model
('intercept', 1.82)
('CROSS_SELL_SUCCESS', -0.02)
('TOTAL_MEALS_ORDERED', -0.0)
('UNIQUE_MEALS_PURCH', 0.05)
('AVG_PREP_VID_TIME', 0.0)
('AVG_MEALSORDER_PER_CUSTOMER', -0.02)
('MEDIAN_MEAL_RATING', 0.06)
('TOTAL_PHOTOS_VIEWED', 0.0)
('LOG_TOTAL_MEALS_ORDERED', 0.3)
('LOG_AVG_PREP_VID_TIME', 0.41)
('WEEKLY_SUBSCRIPTION', -0.01)
('SAW_INSTRUCTIONS', 0.0)
('TOOK_MASTERCLASS', 0.04)
('LOG_UNIQUE_MEALS_PURCH', -0.61)
('PHOTOS_VIEWED', 0.02)
('JUNK', -0.01)
('NEW_DOMAIN', 0.01)
('PERSONAL', -0.0)
('PROFESSIONAL', -0.0)


In [25]:
print("""Lasso regression model""")
for index in lasso_model_lst:
    print(index)

Lasso regression model
('intercept', 1.64)
('CROSS_SELL_SUCCESS', -0.02)
('UNIQUE_MEALS_PURCH', 0.05)
('PRODUCT_CATEGORIES_VIEWED', 0.0)
('MOBILE_NUMBER', 0.0)
('CANCELLATIONS_AFTER_NOON', -0.0)
('TASTES_AND_PREFERENCES', 0.01)
('MOBILE_LOGINS', -0.01)
('EARLY_DELIVERIES', -0.0)
('PACKAGE_LOCKER', -0.01)
('REFRIGERATED_LOCKER', -0.01)
('AVG_MEALSORDER_PER_CUSTOMER', -0.02)
('MASTER_CLASSES_ATTENDED', 0.01)
('MEDIAN_MEAL_RATING', 0.05)
('TOTAL_PHOTOS_VIEWED', 0.0)
('JUNK', -0.01)
('NEW_DOMAIN', 0.01)
('PROFESSIONAL', -0.0)
('LOG_TOTAL_MEALS_ORDERED', 0.38)
('LOG_AVG_TIME_PER_SITE_VISIT', 0.02)
('LOG_AVG_PREP_VID_TIME', 0.4)
('LOG_UNIQUE_MEALS_PURCH', -0.62)
('LOG_CONTACTS_W_CUSTOMER_SERVICE', 0.16)
('LOG_AVG_MEALSORDER_PER_CUSTOMER', 0.03)
('WEEKLY_SUBSCRIPTION', -0.02)
('TOOK_MASTERCLASS', 0.03)
('PHOTOS_VIEWED', 0.02)


In [26]:
print("""
ARD regression model""")
for index in ard_model_lst:
    print(index)


ARD regression model
('intercept', 1.46)
('CROSS_SELL_SUCCESS', -0.01262)
('UNIQUE_MEALS_PURCH', 0.04879)
('PRODUCT_CATEGORIES_VIEWED', 0.0)
('MOBILE_NUMBER', 0.0)
('CANCELLATIONS_AFTER_NOON', 0.0)
('PC_LOGINS', 0.0)
('WEEKLY_PLAN', 0.0)
('LATE_DELIVERIES', 0.0)
('REFRIGERATED_LOCKER', 0.0)
('AVG_MEALSORDER_PER_CUSTOMER', -0.01493)
('MEDIAN_MEAL_RATING', 0.06234)
('TOTAL_PHOTOS_VIEWED', 0.0)
('NEW_DOMAIN', 0.0)
('PROFESSIONAL', 0.0)
('LOG_TOTAL_MEALS_ORDERED', 0.25937)
('LOG_AVG_TIME_PER_SITE_VISIT', 0.01253)
('LOG_AVG_PREP_VID_TIME', 0.57674)
('LOG_UNIQUE_MEALS_PURCH', -0.60116)
('LOG_CONTACTS_W_CUSTOMER_SERVICE', 0.11041)
('WEEKLY_SUBSCRIPTION', -0.01013)
('TOOK_MASTERCLASS', 0.03694)
('PHOTOS_VIEWED', 0.03262)


In [27]:
print("""
For KNN models, doesn't apply model and model size
""")


For KNN models, doesn't apply model and model size



In [28]:
# rating the models

# objects to store the ratinga
rating_ols_regression   = 0
rating_lasso_regression = 0
rating_ard_regression   = 0
rating_knn_model        = 0
rating_knn_stand_moder  = 0

# scoring for t_test_score above the treshold = 0.75 
if lr_test_score > 0.75: rating_ols_regression += 1
if lasso_test_score > 0.75: rating_lasso_regression += 1
if ard_test_score > 0.75: rating_ard_regression += 1
if knn_reg_score_test > 0.75: rating_knn_model += 1
if knn_stand_score_test > 0.75: rating_knn_stand_moder += 1

# scoring tfor rain_test_gap below the treshold = 0.05
if lr_test_gap < 0.05: rating_ols_regression += 1
if lasso_test_gap < 0.05: rating_lasso_regression += 1
if ard_test_gap < 0.05: rating_ard_regression += 1
if knn_reg_test_gap < 0.05: rating_knn_model += 1
if knn_stand_score_test <0.05: rating_knn_stand_moder += 1

# scoring for the model with highest test_score 
if lr_test_score       == max(lr_test_score, lasso_test_score, ard_test_score, ard_test_score,\
                                knn_reg_score_test, knn_stand_score_test):
                                rating_ols_regression += 1
if lasso_test_score    == max(lr_test_score, lasso_test_score, ard_test_score, ard_test_score,\
                                knn_reg_score_test, knn_stand_score_test):
                                rating_lasso_regression += 1
if ard_test_score      == max(lr_test_score, lasso_test_score, ard_test_score, ard_test_score,\
                                knn_reg_score_test, knn_stand_score_test):
                                rating_ard_regression += 1
if knn_reg_score_test  == max(lr_test_score, lasso_test_score, ard_test_score, ard_test_score,\
                                knn_reg_score_test, knn_stand_score_test): 
                                rating_knn_model += 1
if knn_stand_score_test == max(lr_test_score, lasso_test_score, ard_test_score, ard_test_score,\
                                knn_reg_score_test, knn_stand_score_test): 
                                rating_knn_stand_moder += 1

model_scoring = [rating_ols_regression, rating_lasso_regression, rating_ard_regression,
                 rating_knn_model, rating_knn_stand_moder]        

chose_model =[]
# scoring tfor rain_test_gap below the treshold = 0.05
if rating_ols_regression   == max(model_scoring): chosen_model ='OLS'
if rating_lasso_regression == max(model_scoring): chosen_model ='LASSO'
if rating_ard_regression   == max(model_scoring): chosen_model ='ARD'
if rating_knn_model        == max(model_scoring): chosen_model ='KNN'
if rating_knn_stand_moder  == max(model_scoring): chosen_model ='KNN STAND'

    
# creating items of the chosen model
train_score_best    = []
test_score_best     = []
train_test_gap_best = []
model_size_best     = []
neighbors_best      = []
model_best          = []    

# filling the items of the best model items
if chosen_model == 'OLS':
    train_score_best    = lr_train_score
    test_score_best     = lr_test_score
    train_test_gap_best = lr_test_gap
    model_size_best     = len(lr_model_lst)
    neighbors_best      = 'NA'
    model_best          = lr_model_lst
    alpha_best          = 'NA'

if chosen_model == 'LASSO':
    train_score_best    = lasso_train_score
    test_score_best     = lasso_test_score
    train_test_gap_best = lasso_test_gap
    model_size_best     = len(lasso_model_lst)
    neighbors_best      = 'NA'
    model_best          = lasso_model_lst
    alpha_best          = alpha_value
    
if chosen_model == 'ARD':
    train_score_best    = ard_train_score
    test_score_best     = ard_test_score
    train_test_gap_best = ard_test_gap
    model_size_best     = len(ard_model_lst)
    neighbors_best      = 'NA'
    model_best          = ard_model_lst
    alpha_best          = 'NA'
    
if chosen_model == 'KNN':
    train_score_best    = knn_reg_score_train
    test_score_best     = lr_test_score
    train_test_gap_best = lr_test_gap
    model_size_best     = 'NA'
    neighbors_best      = 'opt_neighbors'
    model_best          = 'NA'
    alpha_best          = 'NA'
    
if chosen_model == 'KNN STAND':
    train_score_best    = knn_stand_score_train
    test_score_best     = knn_stand_score_test
    train_test_gap_best = knn_stand_test_gap
    model_size_best     = 'NA'
    neighbors_best      = opt_neighbors_stand
    model_best          = 'NA'
    alpha_best          = 'NA'
    
print(f""" The chosen_model is: {chosen_model}""")
    

 The chosen_model is: LASSO


____________________________________________________________________________________
**Chosen Model**

In [29]:
# generating the final report

print(f""" Elements of the chosen model:

Model:              {chosen_model}

Item                 Value             
________________     ______      
Trin score:          {train_score_best}
Test score:          {test_score_best}
Train-Test gap:      {train_test_gap_best}
Model size:          {model_size_best}
Num of Neihbors:     {neighbors_best}
Aplha value:         {alpha_value}
""")  


 Elements of the chosen model:

Model:              LASSO

Item                 Value             
________________     ______      
Trin score:          0.7737
Test score:          0.7642
Train-Test gap:      0.0095
Model size:          27
Num of Neihbors:     NA
Aplha value:         3e-06



In [30]:
print(f""" 
Details of the chosen model:
____________________________
""")
for index in model_best: 
    print(index)

 
Details of the chosen model:
____________________________

('intercept', 1.64)
('CROSS_SELL_SUCCESS', -0.02)
('UNIQUE_MEALS_PURCH', 0.05)
('PRODUCT_CATEGORIES_VIEWED', 0.0)
('MOBILE_NUMBER', 0.0)
('CANCELLATIONS_AFTER_NOON', -0.0)
('TASTES_AND_PREFERENCES', 0.01)
('MOBILE_LOGINS', -0.01)
('EARLY_DELIVERIES', -0.0)
('PACKAGE_LOCKER', -0.01)
('REFRIGERATED_LOCKER', -0.01)
('AVG_MEALSORDER_PER_CUSTOMER', -0.02)
('MASTER_CLASSES_ATTENDED', 0.01)
('MEDIAN_MEAL_RATING', 0.05)
('TOTAL_PHOTOS_VIEWED', 0.0)
('JUNK', -0.01)
('NEW_DOMAIN', 0.01)
('PROFESSIONAL', -0.0)
('LOG_TOTAL_MEALS_ORDERED', 0.38)
('LOG_AVG_TIME_PER_SITE_VISIT', 0.02)
('LOG_AVG_PREP_VID_TIME', 0.4)
('LOG_UNIQUE_MEALS_PURCH', -0.62)
('LOG_CONTACTS_W_CUSTOMER_SERVICE', 0.16)
('LOG_AVG_MEALSORDER_PER_CUSTOMER', 0.03)
('WEEKLY_SUBSCRIPTION', -0.02)
('TOOK_MASTERCLASS', 0.03)
('PHOTOS_VIEWED', 0.02)
