***
***
***

<br><h1>A1 - Regression based Analysis</h1>
<h3> Machine Learning </h3><br><br>
Submitted by - Brian Dsouza<br>
Hult International Business School<br><br><br>

***
***
***

***
***
<h2>Exploratory Data Analysis</h2>

***
***

In [None]:
#Importing necessary libraries
import pandas                  as pd    
import matplotlib.pyplot       as plt
import seaborn                 as sns
import statsmodels.formula.api as smf 
from sklearn.model_selection   import train_test_split 
from sklearn.neighbors         import KNeighborsRegressor 
from sklearn.preprocessing     import StandardScaler
from sklearn.linear_model      import LogisticRegression         
from sklearn.metrics           import confusion_matrix           
from sklearn.metrics           import roc_auc_score            
from sklearn.neighbors         import KNeighborsClassifier       
from sklearn.neighbors         import KNeighborsRegressor        
from sklearn.preprocessing     import StandardScaler             
from sklearn.tree              import DecisionTreeClassifier     
from sklearn.tree              import export_graphviz           
from sklearn.externals.six     import StringIO                   
from IPython.display           import Image                     
import pydotplus                                                 
from sklearn.model_selection   import GridSearchCV               
from sklearn.metrics           import make_scorer                
from sklearn.ensemble          import RandomForestClassifier     
from sklearn.ensemble          import GradientBoostingClassifier 

# Name of the Excel workbook holding the dataset; a relative path, so it is
# resolved against the current working directory
file = "Apprentice_Chef_Dataset.xlsx"

# Reading the workbook into a DataFrame that the rest of the script works on
original_df = pd.read_excel(file)


***
<h6>User-defined functions used throughout this script</h6>

***

In [None]:
##############################################################
#Creating a function to flag missing values, mv_flagger(df)
##############################################################
def mv_flagger(df):
    """
    Add a 0/1 indicator column for every column that contains missing values.

    For each column holding at least one null, a new column named
    'm_(COLUMN NAME)' is appended with 1 where the value is missing and 0
    elsewhere. Columns without nulls get no flag column.

    PARAMETERS
    ----------
    df : DataFrame to flag missing values (modified in place)

    RETURNS
    -------
    The same DataFrame object, with the missing value flags added.
    """
    # iterating over a snapshot of the names so the flag columns created
    # inside the loop are never visited themselves
    for column_name in list(df.columns):

        null_mask = df[column_name].isnull()

        if null_mask.any():
            df['m_' + column_name] = null_mask.astype(int)

    return df

#########################
# text_split_feature
#########################
def text_split_feature(col, df, sep=' ', new_col_name='number_of_names'):
    """
    Count the items obtained by splitting each string of a column on `sep`
    and store the counts in a new integer column of the same DataFrame.

    PARAMETERS
    ----------
    col          : column to split (every value must be a string; a missing
                   value raises AttributeError because it has no .split)
    df           : DataFrame where column is located (modified in place)
    sep          : string sequence to split by, default ' '
    new_col_name : name of new column after summing split, default
                   'number_of_names'

    RETURNS
    -------
    None; the result is written to df[new_col_name].
    """
    # splitting on the caller's separator; a hard-coded ' ' here would make
    # the `sep` argument have no effect
    item_counts = [len(value.split(sep)) for value in df[col]]

    # building the column against df's own index keeps the assignment
    # positional, and the explicit dtype keeps it integer even when df is empty
    df[new_col_name] = pd.Series(item_counts,
                                 index = df.index,
                                 dtype = 'int64')
        
########################################
# optimal_neighbors
########################################
def optimal_neighbors(X_data,
                      y_data,
                      standardize = True,
                      pct_test=0.25,
                      seed=802,
                      response_type='reg',
                      max_neighbors=20,
                      show_viz=True):
    """
    Exhaustively compute training and testing results for KNN across
    [1, max_neighbors]. Prints and returns the number of neighbors with the
    maximum test score and (by default) shows a visualization of the results.

    PARAMETERS
    ----------
    X_data        : explanatory variable data
    y_data        : response variable
    standardize   : whether or not to standardize the X data, default True
    pct_test      : test size for training and validation from (0,1), default 0.25
    seed          : random seed to be used in algorithm, default 802
    response_type : type of neighbors algorithm to use, default 'reg'
        Use 'reg' for regression (KNeighborsRegressor)
        Use 'class' for classification (KNeighborsClassifier)
    max_neighbors : maximum number of neighbors in exhaustive search, default 20
    show_viz      : display or suppress k-neighbors visualization, default True

    RETURNS
    -------
    int : the number of neighbors (1-based) whose model scored highest on the
          test split; ties resolve to the smallest such number.

    RAISES
    ------
    ValueError : if response_type is not 'reg' or 'class', or if
                 max_neighbors is smaller than 1.
    """

    # validating up front: a bad response_type would otherwise leave the model
    # unbound inside the loop, and an empty search range has no maximum
    if response_type not in ('reg', 'class'):
        raise ValueError("response_type must be 'reg' or 'class'")

    if max_neighbors < 1:
        raise ValueError("max_neighbors must be at least 1")


    if standardize:
        # optionally standardizing X_data (the scaled frame gets a fresh
        # default index and integer column labels)
        scaler = StandardScaler()
        X_data = pd.DataFrame(scaler.fit_transform(X_data))


    # train-test split
    X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                        y_data,
                                                        test_size = pct_test,
                                                        random_state = seed)


    # creating lists for training set accuracy and test set accuracy
    training_accuracy = []
    test_accuracy = []


    # setting neighbor range
    neighbors_settings = range(1, max_neighbors + 1)


    for n_neighbors in neighbors_settings:
        # building the model based on response variable type
        if response_type == 'reg':
            clf = KNeighborsRegressor(n_neighbors = n_neighbors)
        else:
            clf = KNeighborsClassifier(n_neighbors = n_neighbors)

        clf.fit(X_train, y_train)

        # recording the training set score
        training_accuracy.append(clf.score(X_train, y_train))

        # recording the generalization (test set) score
        test_accuracy.append(clf.score(X_test, y_test))


    # optionally displaying visualization
    if show_viz:
        fig, ax = plt.subplots(figsize=(12,8))
        plt.plot(neighbors_settings, training_accuracy, label = "training accuracy")
        plt.plot(neighbors_settings, test_accuracy, label = "test accuracy")
        plt.ylabel("Accuracy")
        plt.xlabel("n_neighbors")
        plt.legend()
        plt.show()


    # list positions are 0-based while neighbor counts start at 1
    optimal_k = test_accuracy.index(max(test_accuracy)) + 1

    print(f"The optimal number of neighbors is: {optimal_k}")
    return optimal_k


########################################
# visual_cm
########################################
def visual_cm(true_y, pred_y, labels = None):
    """
    Creates a visualization of a confusion matrix.

    PARAMETERS
    ----------
    true_y : true values for the response variable
    pred_y : predicted values for the response variable
    labels : list-like of tick labels shown on both axes of the heatmap,
             default None (seaborn picks the tick labels automatically).
             The labels are only used for display; they are not passed to
             confusion_matrix.
    """
    # seaborn documents "auto", a bool, an int or a list-like for tick
    # labels; None is not among them, so the default is mapped to "auto"
    tick_labels = "auto" if labels is None else labels


    # declaring a confusion matrix object
    cm = confusion_matrix(y_true = true_y,
                          y_pred = pred_y)


    # heatmap (fmt = 'g' prints the counts without scientific notation)
    sns.heatmap(cm,
                annot       = True,
                xticklabels = tick_labels,
                yticklabels = tick_labels,
                cmap        = 'Blues',
                fmt         = 'g')


    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix of the Classifier')
    plt.show()
    
########################################
# display_tree
########################################
def display_tree(tree, feature_df, height = 500, width = 800):
    """
    Renders a fitted tree model as a PNG image for display in a notebook.

    PARAMETERS
    ----------
    tree       : fitted tree model object
        fitted CART model to visualized
    feature_df : DataFrame
        DataFrame of explanatory features (used to generate labels)
    height     : int, default 500
        height in pixels to which to constrain image in html
    width      : int, default 800
        width in pixels to which to constrain image in html

    RETURNS
    -------
    IPython.display.Image built from the rendered PNG.
    """
    # the standard-library StringIO is used here so this function does not
    # depend on the `sklearn.externals.six` shim, which newer scikit-learn
    # releases no longer ship
    from io import StringIO

    # in-memory text buffer that receives the graphviz (dot) source
    dot_data = StringIO()


    # exporting tree to graphviz
    export_graphviz(decision_tree      = tree,
                    out_file           = dot_data,
                    filled             = True,
                    rounded            = True,
                    special_characters = True,
                    feature_names      = feature_df.columns)


    # declaring a graph object
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())


    # creating image
    img = Image(graph.create_png(),
                height = height,
                width  = width)

    return img

########################################
# plot_feature_importances
########################################
def plot_feature_importances(model, train, export = False):
    """
    Plots the importance of features from a CART model as a horizontal
    bar chart.

    PARAMETERS
    ----------
    model  : CART model (must expose feature_importances_)
    train  : explanatory variable training data (must expose .shape and
             .columns, e.g. a DataFrame); its columns label the bars
    export : whether or not to export as a .png image, default False
             (saved as 'Tree_Leaf_50_Feature_Importance.png')
    """

    # the number of features comes from the `train` argument, not from a
    # notebook-level X_train, so the bars always match the data passed in
    n_features = train.shape[1]
    bar_positions = list(range(n_features))

    # setting plot window
    fig, ax = plt.subplots(figsize=(12,9))

    plt.barh(bar_positions, model.feature_importances_, align='center')
    plt.yticks(bar_positions, train.columns)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")

    if export:
        plt.savefig('Tree_Leaf_50_Feature_Importance.png')

In [None]:
# Displaying the DataFrame as the cell's output for a first visual inspection
original_df

In [None]:
# Listing the column names as a plain Python list
original_df.columns.tolist()

In [None]:
# Printing the DataFrame.info() summary for every variable
original_df.info()

In [None]:
# Descriptive statistics for every column describe() covers, rounded to 2 decimals
original_df.describe().round(2)

In [None]:
# Counting the missing values in each column (0 means the column is complete)
original_df.isna().sum()

In [None]:
# Variable values at the 20th, 40th, 60th, 80th and 100th percentiles
original_df.quantile(q = [0.20, 0.40, 0.60, 0.80, 1.00])

In [None]:
#original_df.loc[:, 'AVG_TIME_PER_SITE_VISIT'].quantile([0.90,
#                                                        0.95,
#                                                        0.99,
#                                                        1.00])

In [None]:
#original_df[original_df['AVG_TIME_PER_SITE_VISIT'] > 1500]

In [None]:
#original_df[original_df['AVG_TIME_PER_SITE_VISIT'] > 1500].loc[:, 'AVG_TIME_PER_SITE_VISIT']

In [None]:
#original_df.loc[:, 'TOTAL_MEALS_ORDERED'].quantile([0.90,
#                                                    0.95,
#                                                    0.99,
#                                                    1.00])

In [None]:
#original_df[original_df['TOTAL_MEALS_ORDERED'] > 450]

In [None]:
#original_df.loc[:, 'AVG_PREP_VID_TIME'].quantile([0.90,
#                                                  0.95,
#                                                  0.99,
#                                                  1.00])

In [None]:
#original_df[original_df['AVG_PREP_VID_TIME'] > 550]

In [None]:
#original_df.loc[:, 'TOTAL_PHOTOS_VIEWED'].quantile([0.90,
#                                                    0.95,
#                                                    0.99,
#                                                    1.00])

In [None]:
#original_df[original_df['TOTAL_PHOTOS_VIEWED'] > 1500]

***
Each variable is assigned an assumed data type (continuous, discrete, binary, count or categorical)
so that a suitable feature-engineering strategy can be applied to it.
***

In [None]:

"""
Continuous:
REVENUE
PRODUCT_CATEGORIES_VIEWED (also in another place)
AVG_TIME_PER_SITE_VISIT
CANCELLATIONS_BEFORE_NOON (also in another place)
CANCELLATIONS_AFTER_NOON  (also in another place)
MOBILE_LOGINS             (also in another place)
PC_LOGINS                 (also in another place)
WEEKLY_PLAN               (also in another place)
EARLY_DELIVERIES          (also in another place)
LATE_DELIVERIES           (also in another place)
AVG_PREP_VID_TIME
AVG_CLICKS_PER_VISIT
TOTAL_PHOTOS_VIEWED

Binary:
CROSS_SELL_SUCCESS
MOBILE_NUMBER
TASTES_AND_PREFERENCES
PACKAGE_LOCKER
REFRIGERATED_LOCKER


Count:
TOTAL_MEALS_ORDERED
UNIQUE_MEALS_PURCH
CONTACTS_W_CUSTOMER_SERVICE
PRODUCT_CATEGORIES_VIEWED (also in another place)
CANCELLATIONS_BEFORE_NOON (also in another place)
CANCELLATIONS_AFTER_NOON  (also in another place)
MOBILE_LOGINS             (also in another place)
PC_LOGINS                 (also in another place)
WEEKLY_PLAN               (also in another place)
EARLY_DELIVERIES          (also in another place)
LATE_DELIVERIES           (also in another place)
MASTER_CLASSES_ATTENDED

Discrete:
FOLLOWED_RECOMMENDATIONS_PCT
LARGEST_ORDER_SIZE
MEDIAN_MEAL_RATING


"""


***
<h2>Feature treatment and Engineering</h2>

***

In [None]:
#Splitting EMAIL column to find different domains

# Splitting every email address at '@', giving one [local part, domain] list
# per row. Iterating over the column's values directly avoids a label lookup
# per row and does not depend on the index labels being unique.
placeholder_lst = [email.split(sep = '@') for email in original_df['EMAIL']]


# converting placeholder_lst into a DataFrame (one column per split part,
# with a default 0..n-1 index)
email_df = pd.DataFrame(placeholder_lst)


# displaying the results
email_df

In [None]:
#Adding the email domain to original_df

#Renaming the email_df columns: '0' is the part before the '@' (not used
#below), 'EMAIL_DOMAIN' is the part after it
email_df.columns = ['0' , 'EMAIL_DOMAIN']


#Concatenating EMAIL_DOMAIN to original_df column-wise; rows are matched on
#their index labels. NOTE: running this cell twice adds the column twice.
original_df = pd.concat([original_df, email_df['EMAIL_DOMAIN']],
                            axis = 1)

original_df

In [None]:
# Number of customers per email domain, most frequent first
original_df['EMAIL_DOMAIN'].value_counts()

In [None]:
#Creating a new column for email domain groups - personal, professional, junk
personal_email_domains      = ['gmail.com',
                               'protonmail.com',
                               'yahoo.com']
professional_email_domains  = ['amex.com',
                               'merck.com',
                               'cocacola.com',
                               'jnj.com',
                               'mcdonalds.com',
                               'nike.com',
                               'apple.com',
                               'ge.org',
                               'dupont.com',
                               'ibm.com',
                               'chevron.com',
                               'microsoft.com',
                               'travelers.com',
                               'exxon.com',
                               'unitedhealth.com',
                               'boeing.com',
                               'verizon.com',
                               'mmm.com',
                               'caterpillar.com',
                               'pg.com',
                               'disney.com',
                               'walmart.com',
                               'pfizer.com',
                               'visa.com',
                               'jpmorgan.com',
                               'cisco.com',
                               'goldmansacs.com',
                               'unitedtech.com',
                               'homedepot.com',
                               'intel.com']
junk_email_domains          = ['msn.com',
                               'aol.com',
                               'passport.com',
                               'hotmail.com',
                               'live.com',
                               'me.com']

#Building a single domain -> group lookup table
domain_type_lookup = {}

for group_label, group_domains in (('personal',     personal_email_domains),
                                   ('professional', professional_email_domains),
                                   ('junk',         junk_email_domains)):
    for group_domain in group_domains:
        # setdefault keeps the first group a domain appears in, i.e. the
        # precedence personal > professional > junk
        domain_type_lookup.setdefault(group_domain, group_label)


#Grouping observations by domain type. Every row receives exactly one label;
#unrecognised domains are labelled 'unknown' so that the labels can never
#shift out of alignment with the rows they describe.
placeholder_lst = [domain_type_lookup.get(domain, 'unknown')
                   for domain in original_df['EMAIL_DOMAIN']]

unknown_count = placeholder_lst.count('unknown')

if unknown_count > 0:
    print(f"Unknown: {unknown_count} email domain(s) not in any group")


#Creating a new column called DOMAIN_TYPE (built against original_df's own
#index so the labels are assigned row by row)
original_df['DOMAIN_TYPE'] = pd.Series(placeholder_lst,
                                       index = original_df.index)


original_df

In [None]:
# Per domain type: number of customers (count), number of successful
# cross-sells (sum) and cross-sell success rate (mean) of CROSS_SELL_SUCCESS
original_df.groupby('DOMAIN_TYPE').agg({'CROSS_SELL_SUCCESS': ['count','sum', 'mean']})

<h5> INSIGHT: 1 </h5> <br><br>
* The cross-sell success rate (CROSS_SELL_SUCCESS = 1) for the Halfway There promotion is on average very high (~80%) among customers who registered with a professional email address.<br>
* In absolute numbers, customers who registered with a personal email address subscribed to the promotion the most (602 in total), more than either of the other two email domain groups.<br>
* Customers who registered with a junk email domain have the lowest success rate.<br><br>


In [None]:
#Flagging the missing values by calling the user-defined function mv_flagger;
#it adds the m_<column> flag columns to original_df in place and returns the
#same DataFrame, which is what this cell displays
mv_flagger(original_df)

In [None]:
# Showing only the rows whose FAMILY_NAME was flagged as missing
original_df[original_df['m_FAMILY_NAME'] == 1]

***
The family name is recorded as missing (NaN) for these customers because their name contains a parenthesis. The email address, however, does provide the family name.

***

In [None]:
#Splitting each NAME on spaces and storing the number of resulting words in
#the default output column 'number_of_names' (added to original_df in place)
text_split_feature(col = 'NAME',
                   df  = original_df)

original_df

In [None]:
#Finding the largest number of words found in any single name
original_df['number_of_names'].max()

In [None]:
# Showing the names that consist of exactly 6 words
original_df.loc[original_df['number_of_names'] == 6, 'NAME']

In [None]:
# Showing the names with more than 3 words (assuming people generally have
# one first, one middle and one last name)
original_df.loc[original_df['number_of_names'] > 3, 'NAME']

In [None]:
#One-hot encoding the categorical variable 'DOMAIN_TYPE': one indicator
#column per distinct domain type, named after the type itself
one_hot_DOMAIN_TYPE  = pd.get_dummies(original_df['DOMAIN_TYPE'])

#Joining the one-hot encoded columns to original_df (rows are matched on
#their index labels); DOMAIN_TYPE itself is kept
original_df = original_df.join([one_hot_DOMAIN_TYPE])

#Viewing the new dataset
original_df

In [None]:
#Creating histograms through seaborn's distplots to visually detect outliers

# Columns in plotting order: each consecutive group of four shares one figure
# (2 x 2 grid), and the single column left over gets a figure of its own
histogram_columns = ['REVENUE',
                     'CROSS_SELL_SUCCESS',
                     'TOTAL_MEALS_ORDERED',
                     'UNIQUE_MEALS_PURCH',
                     'CONTACTS_W_CUSTOMER_SERVICE',
                     'PRODUCT_CATEGORIES_VIEWED',
                     'AVG_TIME_PER_SITE_VISIT',
                     'MOBILE_NUMBER',
                     'CANCELLATIONS_BEFORE_NOON',
                     'CANCELLATIONS_AFTER_NOON',
                     'TASTES_AND_PREFERENCES',
                     'PC_LOGINS',
                     'MOBILE_LOGINS',
                     'WEEKLY_PLAN',
                     'EARLY_DELIVERIES',
                     'LATE_DELIVERIES',
                     'PACKAGE_LOCKER',
                     'REFRIGERATED_LOCKER',
                     'FOLLOWED_RECOMMENDATIONS_PCT',
                     'AVG_PREP_VID_TIME',
                     'LARGEST_ORDER_SIZE',
                     'MASTER_CLASSES_ATTENDED',
                     'MEDIAN_MEAL_RATING',
                     'AVG_CLICKS_PER_VISIT',
                     'TOTAL_PHOTOS_VIEWED']

# One colour per subplot position (top-left, top-right, bottom-left, bottom-right)
subplot_colors   = ['b', 'g', 'y', 'r']
plots_per_figure = len(subplot_colors)

# ceiling division: 25 columns at 4 per figure -> 7 figures
figure_total = -(-len(histogram_columns) // plots_per_figure)

for figure_number in range(1, figure_total + 1):

    first_column   = (figure_number - 1) * plots_per_figure
    figure_columns = histogram_columns[first_column : first_column + plots_per_figure]

    # a lone column fills the whole figure, otherwise a 2 x 2 grid is used
    n_rows, n_cols = (1, 1) if len(figure_columns) == 1 else (2, 2)

    fig, ax = plt.subplots(figsize = (10, 8))

    for position, (column, color) in enumerate(zip(figure_columns, subplot_colors),
                                               start = 1):
        plt.subplot(n_rows, n_cols, position)
        sns.distplot(original_df[column],
                     bins  = 'fd',
                     color = color)
        plt.xlabel(column)

    plt.tight_layout()
    plt.savefig(f'Apprentice Chef Data Histograms {figure_number} of {figure_total}.png')
    plt.show()



In [None]:
#Setting outlier thresholds
# Naming convention: <COLUMN>_HI is the upper bound and <COLUMN>_LOW the
# lower bound for that column. The outlier-flagging cell marks an observation
# when its value is strictly above _HI or strictly below _LOW.
# Columns listed only as a comment are the binary variables, which get no
# threshold.
REVENUE_HI                       = 5800
#CROSS_SELL_SUCCESS
TOTAL_MEALS_ORDERED_HI           = 220
UNIQUE_MEALS_PURCH_HI            = 9
CONTACTS_W_CUSTOMER_SERVICE_LOW  = 3
CONTACTS_W_CUSTOMER_SERVICE_HI   = 12.5
PRODUCT_CATEGORIES_VIEWED_LOW    = 2
PRODUCT_CATEGORIES_VIEWED_HI     = 10
AVG_TIME_PER_SITE_VISIT_HI       = 200
#MOBILE_NUMBER
CANCELLATIONS_BEFORE_NOON_HI     = 5
CANCELLATIONS_AFTER_NOON_HI      = 2
#TASTES_AND_PREFERENCES
PC_LOGINS_LOW                    = 5
PC_LOGINS_HI                     = 6
MOBILE_LOGINS_LOW                = 1
MOBILE_LOGINS_HI                 = 2
WEEKLY_PLAN_HI                   = 15
EARLY_DELIVERIES_HI              = 4
LATE_DELIVERIES_HI               = 10
#PACKAGE_LOCKER
#REFRIGERATED_LOCKER
FOLLOWED_RECOMMENDATIONS_PCT_LOW = 10
FOLLOWED_RECOMMENDATIONS_PCT_HI  = 70
AVG_PREP_VID_TIME_LOW            = 70
AVG_PREP_VID_TIME_HI             = 280
LARGEST_ORDER_SIZE_LOW           = 2
LARGEST_ORDER_SIZE_HI            = 8
MASTER_CLASSES_ATTENDED_HI       = 1
MEDIAN_MEAL_RATING_LOW           = 2
MEDIAN_MEAL_RATING_HI            = 4
AVG_CLICKS_PER_VISIT_LOW         = 8
AVG_CLICKS_PER_VISIT_HI          = 17.5
TOTAL_PHOTOS_VIEWED_HI           = 500

In [None]:
#Creating new features (columns) for outliers
# Each out_<COLUMN> feature is 1 when the observation lies strictly above the
# column's upper threshold or strictly below its lower threshold (where one
# is defined), and 0 otherwise.
#
# The flags are computed directly from boolean comparisons. Passing a Series
# as `to_replace` together with a scalar `value` to Series.replace is not a
# documented way to update selected rows (newer pandas releases are expected
# to reject it), and it modified a column selection in place.

# (lower threshold or None, upper threshold) for each column, in the order
# the out_ columns are created
outlier_bounds = {
    'REVENUE'                      : (None,                             REVENUE_HI),
    'TOTAL_MEALS_ORDERED'          : (None,                             TOTAL_MEALS_ORDERED_HI),
    'UNIQUE_MEALS_PURCH'           : (None,                             UNIQUE_MEALS_PURCH_HI),
    'CONTACTS_W_CUSTOMER_SERVICE'  : (CONTACTS_W_CUSTOMER_SERVICE_LOW,  CONTACTS_W_CUSTOMER_SERVICE_HI),
    'PRODUCT_CATEGORIES_VIEWED'    : (PRODUCT_CATEGORIES_VIEWED_LOW,    PRODUCT_CATEGORIES_VIEWED_HI),
    'AVG_TIME_PER_SITE_VISIT'      : (None,                             AVG_TIME_PER_SITE_VISIT_HI),
    'CANCELLATIONS_BEFORE_NOON'    : (None,                             CANCELLATIONS_BEFORE_NOON_HI),
    'CANCELLATIONS_AFTER_NOON'     : (None,                             CANCELLATIONS_AFTER_NOON_HI),
    'PC_LOGINS'                    : (PC_LOGINS_LOW,                    PC_LOGINS_HI),
    'MOBILE_LOGINS'                : (MOBILE_LOGINS_LOW,                MOBILE_LOGINS_HI),
    'WEEKLY_PLAN'                  : (None,                             WEEKLY_PLAN_HI),
    'EARLY_DELIVERIES'             : (None,                             EARLY_DELIVERIES_HI),
    'LATE_DELIVERIES'              : (None,                             LATE_DELIVERIES_HI),
    'FOLLOWED_RECOMMENDATIONS_PCT' : (FOLLOWED_RECOMMENDATIONS_PCT_LOW, FOLLOWED_RECOMMENDATIONS_PCT_HI),
    'AVG_PREP_VID_TIME'            : (AVG_PREP_VID_TIME_LOW,            AVG_PREP_VID_TIME_HI),
}

for feature, (low_threshold, high_threshold) in outlier_bounds.items():

    feature_values = original_df[feature]

    # comparisons against missing values evaluate to False, so such rows are
    # left unflagged (0)
    is_outlier = feature_values > high_threshold

    if low_threshold is not None:
        is_outlier = is_outlier | (feature_values < low_threshold)

    original_df['out_' + feature] = is_outlier.astype(int)

#LARGEST_ORDER_SIZE
original_df['out_LARGEST_ORDER_SIZE'] = 0 #initializing the column
condition_hi = original_df.loc[0:,'out_LARGEST_ORDER_SIZE'][original_df['LARGEST_ORDER_SIZE'] > LARGEST_ORDER_SIZE_HI]
condition_lo = original_df.loc[0:,'out_LARGEST_ORDER_SIZE'][original_df['LARGEST_ORDER_SIZE'] < LARGEST_ORDER_SIZE_LOW]

original_df['out_LARGEST_ORDER_SIZE'].replace(to_replace = condition_hi,
                                              value      = 1,
                                              inplace    = True)
original_df['out_LARGEST_ORDER_SIZE'].replace(to_replace = condition_lo,
                                              value      = 1,
                                              inplace    = True)

#MASTER_CLASSES_ATTENDED
original_df['out_MASTER_CLASSES_ATTENDED'] = 0 #initializing the column
condition_hi = original_df.loc[0:,'out_MASTER_CLASSES_ATTENDED'][original_df['MASTER_CLASSES_ATTENDED'] > MASTER_CLASSES_ATTENDED_HI]

original_df['out_MASTER_CLASSES_ATTENDED'].replace(to_replace = condition_hi,
                                                   value      = 1,
                                                   inplace    = True)

#MEDIAN_MEAL_RATING
original_df['out_MEDIAN_MEAL_RATING'] = 0 #initializing the column
condition_hi = original_df.loc[0:,'out_MEDIAN_MEAL_RATING'][original_df['MEDIAN_MEAL_RATING'] > MEDIAN_MEAL_RATING_HI]
condition_lo = original_df.loc[0:,'out_MEDIAN_MEAL_RATING'][original_df['MEDIAN_MEAL_RATING'] < MEDIAN_MEAL_RATING_LOW]

original_df['out_MEDIAN_MEAL_RATING'].replace(to_replace = condition_hi,
                                              value      = 1,
                                              inplace    = True)
original_df['out_MEDIAN_MEAL_RATING'].replace(to_replace = condition_lo,
                                              value      = 1,
                                              inplace    = True)

#AVG_CLICKS_PER_VISIT
original_df['out_AVG_CLICKS_PER_VISIT'] = 0 #initializing the column
condition_hi = original_df.loc[0:,'out_AVG_CLICKS_PER_VISIT'][original_df['AVG_CLICKS_PER_VISIT'] > AVG_CLICKS_PER_VISIT_HI]
condition_lo = original_df.loc[0:,'out_AVG_CLICKS_PER_VISIT'][original_df['AVG_CLICKS_PER_VISIT'] < AVG_CLICKS_PER_VISIT_LOW]

original_df['out_AVG_CLICKS_PER_VISIT'].replace(to_replace = condition_hi,
                                                value      = 1,
                                                inplace    = True)
original_df['out_AVG_CLICKS_PER_VISIT'].replace(to_replace = condition_lo,
                                                value      = 1,
                                                inplace    = True)

#TOTAL_PHOTOS_VIEWED
original_df['out_TOTAL_PHOTOS_VIEWED'] = 0 #initializing the column
condition_hi = original_df.loc[0:,'out_TOTAL_PHOTOS_VIEWED'][original_df['TOTAL_PHOTOS_VIEWED'] > TOTAL_PHOTOS_VIEWED_HI]

original_df['out_TOTAL_PHOTOS_VIEWED'].replace(to_replace = condition_hi,
                                               value      = 1,
                                               inplace    = True)

In [None]:
#Viewing the new dataset with the outlier features
#(the out_* flag columns created above are now part of original_df)
original_df

In [None]:
#Removing the name and email text columns, which have already been encoded
original_df = original_df.drop(columns = ['NAME',
                                          'FIRST_NAME',
                                          'FAMILY_NAME',
                                          'EMAIL',
                                          'EMAIL_DOMAIN',
                                          'DOMAIN_TYPE'])

original_df

In [None]:
#Writing the feature-rich dataset to an Excel file (row index left out)
original_df.to_excel(excel_writer = 'apprentice_chef_feature_rich.xlsx',
                     index        = False)

In [None]:
#Correlation matrix of the dataset, rounded to two decimals
original_df_corr = original_df.corr().round(decimals = 2)

#The 10 variables most positively correlated with the target variable
original_df_corr.loc[:, 'CROSS_SELL_SUCCESS'].sort_values(ascending = False).head(n = 10)

In [None]:
#Checking the last 15 rows of the descending sort, i.e. the explanatory
#variables with the lowest (most negative) correlation with the target variable
original_df_corr.loc[:, 'CROSS_SELL_SUCCESS'].sort_values(ascending = False).tail(15)

***
Building on Insight 1, we see that junk email domains have a strong negative correlation with cross-sell success.
This means that people who register with these email domains are less likely to opt for the cross-selling promotion of wine bottles. These domains have most probably been retired by the companies behind them after a move to a new email domain, so the people still registered with them are likely not very active at checking mail sent to these addresses.

<h6>Insight 2</h6>
People who follow the meal recommendations displayed on the web or mobile platform are very likely to subscribe to the cross-selling promotion, as shown by the high positive correlation.

From Insights 1 and 2: could it be beneficial to promote Halfway There more aggressively to people who register with their professional email addresses and to those who repeatedly follow the meal recommendations? Could the meal recommendations be tailored to professionals so that the recommended meals are the ones that complement the wine better?

***

In [None]:
#Correlation heatmap 1

#fig, ax = plt.subplots(figsize=(15,15))

#original_df_corr1 = original_df.corr().round(2).iloc[0:19, 0:19]

#sns.heatmap(original_df_corr1,
#            cmap       = 'coolwarm',
#            square     = True,
#            annot      = True,
#            linecolor  = 'black',
#            linewidths = 0.5)

#plt.show()

In [None]:
#Correlation heatmap 2
#Plots the rounded correlations for the variables at positions 20 to 38

fig, ax = plt.subplots(figsize = (15, 15))

original_df_corr2 = original_df.corr().round(2).iloc[20:39, 20:39]

sns.heatmap(data       = original_df_corr2,
            annot      = True,
            square     = True,
            cmap       = 'coolwarm',
            linewidths = 0.5,
            linecolor  = 'black',
            ax         = ax)

plt.show()

***
As seen from the correlation plot, median meal rating is strongly negatively correlated with average clicks per visit (-0.86). People who click many times are probably not finding what they are looking for and hence end up giving a poor meal rating. This is something that the design of the recommendation engine should take into account.

***

In [None]:
#Correlation heatmap 3

#fig, ax = plt.subplots(figsize=(15,15))

#original_df_corr3 = original_df.corr().round(2).iloc[40:50, 40:50]

#sns.heatmap(original_df_corr3,
#            cmap       = 'coolwarm',
#            square     = True,
#            annot      = True,
#            linecolor  = 'black',
#            linewidths = 0.5)

#plt.show()

***
<br><h2>Modeling</h2><br>
***

In [None]:
#Explanatory variables: every column except the target
original_df_data = original_df.drop(columns = 'CROSS_SELL_SUCCESS')

#Target variable
original_df_target = original_df['CROSS_SELL_SUCCESS']

In [None]:
#Stratified train-test split (25% of the observations held out for testing)
X_train, X_test, y_train, y_test = train_test_split(
    original_df_data,
    original_df_target,
    stratify     = original_df_target,
    test_size    = 0.25,
    random_state = 222)


#Dimensions of the training set
print(X_train.shape, y_train.shape, sep = '\n')

#Dimensions of the testing set
print(X_test.shape, y_test.shape, sep = '\n')

***
Creating a base model between the target variable and its most highly correlated explanatory variable

***

In [None]:
#Base model: the target explained by FOLLOWED_RECOMMENDATIONS_PCT alone
logistic_small = smf.logit(
    formula = "CROSS_SELL_SUCCESS ~ FOLLOWED_RECOMMENDATIONS_PCT",
    data    = original_df)


#Fitting the model object
results_logistic = logistic_small.fit()


#Displaying the results summary
results_logistic.summary()

***
Creating a logistic regression model using statsmodels, with all the explanatory variables

***

In [None]:
#for val in original_df_data:
#    print(f"{val} +")

In [None]:
#Instantiating a logistic regression model object
#The formula regresses CROSS_SELL_SUCCESS on every explanatory variable:
#the original features, the name/email-derived columns (m_FAMILY_NAME through
#professional) and the out_* outlier flags. The model is specified on the
#complete original_df rather than on the training split.
logistic_full = smf.logit(formula = """CROSS_SELL_SUCCESS ~ REVENUE +
                                                            TOTAL_MEALS_ORDERED +
                                                            UNIQUE_MEALS_PURCH +
                                                            CONTACTS_W_CUSTOMER_SERVICE +
                                                            PRODUCT_CATEGORIES_VIEWED +
                                                            AVG_TIME_PER_SITE_VISIT +
                                                            MOBILE_NUMBER +
                                                            CANCELLATIONS_BEFORE_NOON +
                                                            CANCELLATIONS_AFTER_NOON +
                                                            TASTES_AND_PREFERENCES +
                                                            PC_LOGINS +
                                                            MOBILE_LOGINS +
                                                            WEEKLY_PLAN +
                                                            EARLY_DELIVERIES +
                                                            LATE_DELIVERIES +
                                                            PACKAGE_LOCKER +
                                                            REFRIGERATED_LOCKER +
                                                            FOLLOWED_RECOMMENDATIONS_PCT +
                                                            AVG_PREP_VID_TIME +
                                                            LARGEST_ORDER_SIZE +
                                                            MASTER_CLASSES_ATTENDED +
                                                            MEDIAN_MEAL_RATING +
                                                            AVG_CLICKS_PER_VISIT +
                                                            TOTAL_PHOTOS_VIEWED +
                                                            m_FAMILY_NAME +
                                                            number_of_names +
                                                            junk +
                                                            personal +
                                                            professional +
                                                            out_REVENUE +
                                                            out_TOTAL_MEALS_ORDERED +
                                                            out_UNIQUE_MEALS_PURCH +
                                                            out_CONTACTS_W_CUSTOMER_SERVICE +
                                                            out_PRODUCT_CATEGORIES_VIEWED +
                                                            out_AVG_TIME_PER_SITE_VISIT +
                                                            out_CANCELLATIONS_BEFORE_NOON +
                                                            out_CANCELLATIONS_AFTER_NOON +
                                                            out_PC_LOGINS +
                                                            out_MOBILE_LOGINS +
                                                            out_WEEKLY_PLAN +
                                                            out_EARLY_DELIVERIES +
                                                            out_LATE_DELIVERIES +
                                                            out_FOLLOWED_RECOMMENDATIONS_PCT +
                                                            out_AVG_PREP_VID_TIME +
                                                            out_LARGEST_ORDER_SIZE +
                                                            out_MASTER_CLASSES_ATTENDED +
                                                            out_MEDIAN_MEAL_RATING +
                                                            out_AVG_CLICKS_PER_VISIT +
                                                            out_TOTAL_PHOTOS_VIEWED""",
                                                            data = original_df)

#Fitting the model object
results_full = logistic_full.fit()

#Displaying the results
#(the summary table is where the coefficient p-values are read from)
results_full.summary()

In [None]:
#Creating a model with just the significant variables (p-value less than 0.05)
#Instantiating a logistic regression model object
#As with the full model, the data passed in is the complete original_df.
logistic_sig = smf.logit(formula = """CROSS_SELL_SUCCESS ~ MOBILE_NUMBER +
                                                           CANCELLATIONS_BEFORE_NOON +
                                                           CANCELLATIONS_AFTER_NOON +
                                                           TASTES_AND_PREFERENCES +
                                                           PC_LOGINS +
                                                           MOBILE_LOGINS +
                                                           REFRIGERATED_LOCKER +
                                                           FOLLOWED_RECOMMENDATIONS_PCT +
                                                           number_of_names +
                                                           personal +
                                                           professional +
                                                           out_CONTACTS_W_CUSTOMER_SERVICE +
                                                           out_FOLLOWED_RECOMMENDATIONS_PCT""",
                                                           data = original_df)

#Fitting the model object
#NOTE: this rebinds results_full, so the results of the full model fitted
#earlier are no longer reachable through that name after this cell runs
results_full = logistic_sig.fit()

#Displaying the results
results_full.summary()

In [None]:
#The variables that were discarded in order of their high p-value
#junk
#out_CANCELLATIONS_AFTER_NOON +
#UNIQUE_MEALS_PURCH +
#out_MASTER_CLASSES_ATTENDED +
#MEDIAN_MEAL_RATING +
#out_EARLY_DELIVERIES
#PRODUCT_CATEGORIES_VIEWED +
#out_UNIQUE_MEALS_PURCH +
#PACKAGE_LOCKER +
#MASTER_CLASSES_ATTENDED +
#out_MOBILE_LOGINS +
#CONTACTS_W_CUSTOMER_SERVICE +
#LATE_DELIVERIES +
#LARGEST_ORDER_SIZE +
#WEEKLY_PLAN +
#out_CANCELLATIONS_BEFORE_NOON +
#TOTAL_PHOTOS_VIEWED +
#out_MEDIAN_MEAL_RATING +
#out_REVENUE + +
#out_TOTAL_PHOTOS_VIEWED
#AVG_PREP_VID_TIME ++
#out_AVG_CLICKS_PER_VISIT +
#out_LARGEST_ORDER_SIZE
#out_PRODUCT_CATEGORIES_VIEWED +
#out_PC_LOGINS +
#out_LATE_DELIVERIES +
#out_AVG_TIME_PER_SITE_VISIT +
#out_WEEKLY_PLAN +
#AVG_TIME_PER_SITE_VISIT +
#m_FAMILY_NAME +
##out_TOTAL_MEALS_ORDERED +
#TOTAL_MEALS_ORDERED + +
#out_AVG_PREP_VID_TIME
#REVENUE +
#AVG_CLICKS_PER_VISIT +
#EARLY_DELIVERIES +

In [None]:
#Candidate sets of explanatory variables, keyed by model name
candidate_dict = dict(

    #Full model: original features, name/email features and outlier flags
    logit_full = [
        'REVENUE', 'TOTAL_MEALS_ORDERED', 'UNIQUE_MEALS_PURCH',
        'CONTACTS_W_CUSTOMER_SERVICE', 'PRODUCT_CATEGORIES_VIEWED',
        'AVG_TIME_PER_SITE_VISIT', 'MOBILE_NUMBER',
        'CANCELLATIONS_BEFORE_NOON', 'CANCELLATIONS_AFTER_NOON',
        'TASTES_AND_PREFERENCES', 'PC_LOGINS', 'MOBILE_LOGINS',
        'WEEKLY_PLAN', 'EARLY_DELIVERIES', 'LATE_DELIVERIES',
        'PACKAGE_LOCKER', 'REFRIGERATED_LOCKER',
        'FOLLOWED_RECOMMENDATIONS_PCT', 'AVG_PREP_VID_TIME',
        'LARGEST_ORDER_SIZE', 'MASTER_CLASSES_ATTENDED',
        'MEDIAN_MEAL_RATING', 'AVG_CLICKS_PER_VISIT', 'TOTAL_PHOTOS_VIEWED',
        'm_FAMILY_NAME', 'number_of_names', 'junk', 'personal',
        'professional',
        'out_REVENUE', 'out_TOTAL_MEALS_ORDERED', 'out_UNIQUE_MEALS_PURCH',
        'out_CONTACTS_W_CUSTOMER_SERVICE', 'out_PRODUCT_CATEGORIES_VIEWED',
        'out_AVG_TIME_PER_SITE_VISIT', 'out_CANCELLATIONS_BEFORE_NOON',
        'out_CANCELLATIONS_AFTER_NOON', 'out_PC_LOGINS', 'out_MOBILE_LOGINS',
        'out_WEEKLY_PLAN', 'out_EARLY_DELIVERIES', 'out_LATE_DELIVERIES',
        'out_FOLLOWED_RECOMMENDATIONS_PCT', 'out_AVG_PREP_VID_TIME',
        'out_LARGEST_ORDER_SIZE', 'out_MASTER_CLASSES_ATTENDED',
        'out_MEDIAN_MEAL_RATING', 'out_AVG_CLICKS_PER_VISIT',
        'out_TOTAL_PHOTOS_VIEWED',
    ],

    #Significant variables only
    logit_sig = [
        'MOBILE_NUMBER', 'CANCELLATIONS_BEFORE_NOON',
        'CANCELLATIONS_AFTER_NOON', 'TASTES_AND_PREFERENCES', 'PC_LOGINS',
        'MOBILE_LOGINS', 'REFRIGERATED_LOCKER',
        'FOLLOWED_RECOMMENDATIONS_PCT', 'number_of_names', 'personal',
        'professional', 'out_CONTACTS_W_CUSTOMER_SERVICE',
        'out_FOLLOWED_RECOMMENDATIONS_PCT',
    ],
)

In [None]:
# printing candidate variable sets
# (the doubled backslashes inside the f-string each print as one backslash,
#  drawing the corners of the banner; the two variable lists are interpolated
#  from candidate_dict)
print(f"""
/--------------------------\\
|Explanatory Variable Sets |
\\--------------------------/

Full Model:
-----------
{candidate_dict['logit_full']}


Significant p-value Model:
--------------------------
{candidate_dict['logit_sig']}
""")

<h6>Building a Logistic Regression model</h6>

In [None]:
#Restricting the explanatory data to the significant variables
original_df_data = original_df[candidate_dict['logit_sig']]


#Stratified train-test split on the reduced data
X_train, X_test, y_train, y_test = train_test_split(
    original_df_data,
    original_df_target,
    test_size    = 0.25,
    stratify     = original_df_target,
    random_state = 222)


#Instantiating the logistic regression model
logreg = LogisticRegression(C            = 1,
                            solver       = 'lbfgs',
                            random_state = 222)


#Fitting on the training data
logreg_fit = logreg.fit(X_train, y_train)


#Predicting class labels for the testing set
logreg_pred = logreg_fit.predict(X_test)


#Scoring the results (the AUC is computed from the predicted class labels)
print('Training ACCURACY:',
      logreg_fit.score(X_train, y_train).round(4))
print('Testing  ACCURACY:',
      logreg_fit.score(X_test, y_test).round(4))
print('ROC_AUC_SCORE    :',
      roc_auc_score(y_true = y_test, y_score = logreg_pred).round(4))

In [None]:
#Confusion matrix for the logistic regression predictions, as raw counts
print(confusion_matrix(y_pred = logreg_pred,
                       y_true = y_test))

#Same matrix drawn with the user-defined plotting helper
visual_cm(labels = ['Not subscribed', 'Subscribed'],
          true_y = y_test,
          pred_y = logreg_pred)

***
Saving the model performances for future comparisons

***

In [None]:
#Starting the results table with its header row
model_performance = [['Model',
                      'Training Accuracy',
                      'Testing Accuracy',
                      'AUC Value']]


#Training accuracy of the LogisticRegression model
logreg_train_acc = logreg_fit.score(X_train, y_train).round(4)


#Testing accuracy of the LogisticRegression model
logreg_test_acc = logreg_fit.score(X_test, y_test).round(4)


#ROC AUC value of the LogisticRegression model
logreg_auc = roc_auc_score(y_score = logreg_pred,
                           y_true  = y_test).round(4)


#Adding the logistic regression row to the table
model_performance.append(
    ['Logistic Regression', logreg_train_acc, logreg_test_acc, logreg_auc])

model_performance

<h6>Building a KNN model</h6>

In [None]:
#Finding the optimal number of neighbors with the user-defined helper,
#run in classification mode
opt_neighbors = optimal_neighbors(response_type = 'class',
                                  X_data        = original_df_data,
                                  y_data        = original_df_target)

***
Scaling the explanatory data using Standard Scaler and building a KNN model

***

In [None]:
#Instantiating the scaler
scaler = StandardScaler()


#Fitting the scaler on the explanatory data and transforming it in one step
X_scaled = scaler.fit_transform(original_df_data)


#Wrapping the scaled array in a DataFrame
X_scaled_df = pd.DataFrame(data = X_scaled)


#Stratified train-test split on the scaled data
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
    X_scaled_df,
    original_df_target,
    test_size    = 0.25,
    stratify     = original_df_target,
    random_state = 222)


#Instantiating a KNN classifier with the optimal number of neighbors
knn_opt = KNeighborsClassifier(n_neighbors = opt_neighbors)


#Fitting on the scaled training data
knn_fit = knn_opt.fit(X_train_scaled, y_train)


#Predicting class labels for the scaled testing set
knn_pred = knn_fit.predict(X_test_scaled)


#Scoring the results
print('Training ACCURACY:',
      knn_fit.score(X_train_scaled, y_train).round(4))
print('Testing  ACCURACY:',
      knn_fit.score(X_test_scaled, y_test).round(4))
print('ROC AUC Score    :',
      roc_auc_score(y_true = y_test, y_score = knn_pred).round(4))

In [None]:
#Confusion matrix for the KNN predictions, as raw counts
print(confusion_matrix(y_pred = knn_pred,
                       y_true = y_test))

#Same matrix drawn with the user-defined plotting helper
visual_cm(labels = ['Not subscribed', 'Subscribed'],
          true_y = y_test,
          pred_y = knn_pred)

***
Saving the model performances for future comparisons

***

In [None]:
#Training accuracy of the scaled KNN model
knn_train_acc = knn_fit.score(X_train_scaled, y_train).round(4)


#Testing accuracy of the scaled KNN model
knn_test_acc = knn_fit.score(X_test_scaled, y_test).round(4)


#ROC AUC value of the scaled KNN model
knn_auc = roc_auc_score(y_score = knn_pred,
                        y_true  = y_test).round(4)


#Adding the KNN row to the results table
model_performance.append(
    ['Scaled KNN Classification', knn_train_acc, knn_test_acc, knn_auc])

model_performance

<h6>Building a Pruned Classification Tree model</h6>

In [None]:
#Instantiating a classification tree limited in depth and leaf size
tree_pruned = DecisionTreeClassifier(random_state     = 222,
                                     max_depth        = 4,
                                     min_samples_leaf = 25)


#Fitting on the training data
tree_pruned_fit = tree_pruned.fit(X_train, y_train)


#Predicting class labels for the testing set
tree_pred = tree_pruned_fit.predict(X_test)


#Scoring the model
print('Training ACCURACY:',
      tree_pruned_fit.score(X_train, y_train).round(4))
print('Testing  ACCURACY:',
      tree_pruned_fit.score(X_test, y_test).round(4))
print('AUC Score        :',
      roc_auc_score(y_true = y_test, y_score = tree_pred).round(4))


#Drawing the fitted tree with the user-defined helper
display_tree(feature_df = X_train,
             tree       = tree_pruned_fit)

In [None]:
#Plotting the feature importances of the pruned tree (without exporting)
plot_feature_importances(tree_pruned_fit,
                         export = False,
                         train  = X_train)

***
Saving the model performances for future comparisons

***

In [None]:
#Training accuracy of the pruned tree
pruned_tree_train_acc = tree_pruned_fit.score(X_train, y_train).round(4)


#Testing accuracy of the pruned tree
pruned_tree_test_acc = tree_pruned_fit.score(X_test, y_test).round(4)


#ROC AUC value of the pruned tree
pruned_tree_auc = roc_auc_score(y_score = tree_pred,
                                y_true  = y_test).round(4)


#Adding the pruned tree row to the results table
model_performance.append(
    ['Pruned Tree', pruned_tree_train_acc, pruned_tree_test_acc, pruned_tree_auc])

model_performance

<h6>Hyperparameter tuning on Logistic Regression model using GridSearchCV</h6>

In [None]:
#help(LogisticRegression)

In [None]:
#NumPy is imported directly: the pd.np alias used previously was deprecated
#in pandas 1.0 and removed in pandas 2.0
import numpy as np


#Declaring a hyperparameter space
C_space          = np.arange(0.1, 3.0, 0.1)
warm_start_space = [True, False]


#Creating a hyperparameter grid
param_grid = {'C'          : C_space,
              'warm_start' : warm_start_space}


#INSTANTIATING the model object without hyperparameters
lr_tuned = LogisticRegression(solver = 'lbfgs',
                              random_state = 222)


#GridSearchCV object
#make_scorer scores on the predicted class labels by default, which is what
#needs_threshold = False requested; that argument is omitted because newer
#scikit-learn releases deprecate and then remove it
lr_tuned_cv = GridSearchCV(estimator  = lr_tuned,
                           param_grid = param_grid,
                           cv         = 3,
                           scoring    = make_scorer(roc_auc_score))


#FITTING to the FULL DATASET (due to cross-validation)
lr_tuned_cv.fit(original_df_data, original_df_target)


#Printing the optimal parameters and best score
print("Tuned Parameters  :", lr_tuned_cv.best_params_)
print("Tuned CV AUC      :", lr_tuned_cv.best_score_.round(4))

In [None]:
#Displaying the estimator that the grid search selected as best
lr_tuned_cv.best_estimator_

In [None]:
#Building a model based on hyperparameter tuning results

#Restricting the explanatory data to the significant variables
original_df_data = original_df[candidate_dict['logit_sig']]


#Stratified train-test split on the reduced data
X_train, X_test, y_train, y_test = train_test_split(
    original_df_data,
    original_df_target,
    test_size    = 0.25,
    stratify     = original_df_target,
    random_state = 222)

#Taking the best estimator found by the grid search
#(it was fitted by GridSearchCV, so it is not re-fitted here)
lr_tuned = lr_tuned_cv.best_estimator_

#Predicting class labels for the testing set
lr_tuned_pred = lr_tuned.predict(X_test)


#Scoring the results
print('Training ACCURACY:',
      lr_tuned.score(X_train, y_train).round(4))
print('Testing  ACCURACY:',
      lr_tuned.score(X_test, y_test).round(4))
print('AUC Score        :',
      roc_auc_score(y_true = y_test, y_score = lr_tuned_pred).round(4))

In [None]:
#Accuracy and AUC of the tuned logistic regression
lr_train_acc = lr_tuned.score(X_train, y_train).round(4)
lr_test_acc  = lr_tuned.score(X_test, y_test).round(4)
lr_auc       = roc_auc_score(y_score = lr_tuned_pred,
                             y_true  = y_test).round(4)


#Adding the tuned logistic regression row to the results table
model_performance.append(
    ['Tuned Logistic Regression', lr_train_acc, lr_test_acc, lr_auc])

model_performance

<h6>Hyperparameter tuning on Classification Tree with GridSearchCV</h6>

In [None]:
#help(DecisionTreeClassifier)

In [None]:
#NumPy is imported directly: the pd.np alias used previously was deprecated
#in pandas 1.0 and removed in pandas 2.0
import numpy as np


#Declaring a hyperparameter space
criterion_space = ['gini', 'entropy']
splitter_space  = ['best', 'random']
depth_space     = np.arange(1, 25)
leaf_space      = np.arange(1, 100)


#Creating a hyperparameter grid
param_grid = {'criterion'        : criterion_space,
              'splitter'         : splitter_space,
              'max_depth'        : depth_space,
              'min_samples_leaf' : leaf_space}


#INSTANTIATING the model object without hyperparameters
tuned_tree = DecisionTreeClassifier(random_state = 222)


#GridSearchCV object
#make_scorer scores on the predicted class labels by default, which is what
#needs_threshold = False requested; that argument is omitted because newer
#scikit-learn releases deprecate and then remove it
tuned_tree_cv = GridSearchCV(estimator  = tuned_tree,
                             param_grid = param_grid,
                             cv         = 3,
                             scoring    = make_scorer(roc_auc_score))


#FITTING to the FULL DATASET (due to cross-validation)
tuned_tree_cv.fit(original_df_data, original_df_target)


#Printing the optimal parameters and best score
#(best_score_ is the mean cross-validated score of the best parameter set)
print("Tuned Parameters  :", tuned_tree_cv.best_params_)
print("Tuned Training AUC:", tuned_tree_cv.best_score_.round(4))

#Output recorded from an earlier run:
#Tuned Parameters  : {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 9, 'splitter': 'random'}
#Tuned Training AUC: 0.6707

In [None]:
#Building a model based on hyperparameter tuning results

#Taking the best classification tree found by the grid search
tree_tuned = tuned_tree_cv.best_estimator_


#Predicting class labels for the testing set
tree_tuned_pred = tree_tuned.predict(X_test)


#Scoring the results
print('Training ACCURACY:',
      tree_tuned.score(X_train, y_train).round(4))
print('Testing  ACCURACY:',
      tree_tuned.score(X_test, y_test).round(4))
print('AUC Score        :',
      roc_auc_score(y_true = y_test, y_score = tree_tuned_pred).round(4))

In [None]:
#Accuracy and AUC of the tuned tree
tree_train_acc = tree_tuned.score(X_train, y_train).round(4)
tree_test_acc  = tree_tuned.score(X_test, y_test).round(4)
tree_auc       = roc_auc_score(y_score = tree_tuned_pred,
                               y_true  = y_test).round(4)


#Adding the tuned tree row to the results table
model_performance.append(
    ['Tuned Tree', tree_train_acc, tree_test_acc, tree_auc])


#Checking the results
model_performance

In [None]:
#Drawing the tuned tree with the user-defined helper at 2000 x 2000
display_tree(feature_df = original_df_data,
             tree       = tree_tuned,
             width      = 2000,
             height     = 2000)

<h6>Building a Random forest model on significant explanatory variables</h6>

In [None]:
#help(RandomForestClassifier)

In [None]:
#Instantiating a random forest with explicitly spelled-out settings
rf_default = RandomForestClassifier(random_state     = 222,
                                    n_estimators     = 10,
                                    criterion        = 'gini',
                                    max_depth        = None,
                                    min_samples_leaf = 1,
                                    bootstrap        = True,
                                    warm_start       = False)

#Fitting on the training data
rf_default_fit = rf_default.fit(X_train, y_train)


#Predicting class labels for the testing set
rf_default_fit_pred = rf_default_fit.predict(X_test)


#Scoring the results
print('Training ACCURACY:',
      rf_default_fit.score(X_train, y_train).round(4))
print('Testing  ACCURACY:',
      rf_default_fit.score(X_test, y_test).round(4))
print('AUC Score        :',
      roc_auc_score(y_true = y_test, y_score = rf_default_fit_pred).round(4))

In [None]:
#Plotting feature importances of the fitted baseline random forest;
#export is switched off for this call
plot_feature_importances(rf_default_fit, train=X_train, export=False)

In [None]:
#Declaring model performance objects for the baseline random forest
#(accuracy on the training set first, then on the testing set, 4 decimals)
rf_train_acc, rf_test_acc = (
    rf_default_fit.score(features, labels).round(4)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)
rf_auc = roc_auc_score(y_test, rf_default_fit_pred).round(4)


#Appending one row [name, train accuracy, test accuracy, AUC]
model_performance.append(
    ['Random Forest Default Parameters', rf_train_acc, rf_test_acc, rf_auc]
)


#Checking the results
model_performance

<h6>Building a Random Forest model with tuned parameters using GridSearchCV</h6>

In [None]:
#numpy is imported directly: the pd.np alias used previously is deprecated
#since pandas 1.0 and no longer exists in pandas 2.0
import numpy as np


#Declaring a hyperparameter space
estimator_space  = np.arange(100, 1100, 250)   # 100, 350, 600, 850
leaf_space       = np.arange(1, 31, 10)        # 1, 11, 21
criterion_space  = ['gini', 'entropy']
bootstrap_space  = [True, False]
warm_start_space = [True, False]


#Creating a hyperparameter grid
#(4 x 3 x 2 x 2 x 2 = 96 combinations, each evaluated on 3 folds)
param_grid = {'n_estimators'     : estimator_space,
              'min_samples_leaf' : leaf_space,
              'criterion'        : criterion_space,
              'bootstrap'        : bootstrap_space,
              'warm_start'       : warm_start_space}


#INSTANTIATING the model object without hyperparameters
full_forest_grid = RandomForestClassifier(random_state = 222)


#GridSearchCV object
#make_scorer is left at its default behaviour of scoring the predicted class
#labels - exactly what needs_threshold = False selected - without passing an
#argument that newer scikit-learn releases deprecate
full_forest_cv = GridSearchCV(estimator  = full_forest_grid,
                              param_grid = param_grid,
                              cv         = 3,
                              scoring    = make_scorer(roc_auc_score))


#FITTING to the FULL DATASET (due to cross-validation)
full_forest_cv.fit(original_df_data, original_df_target)


#Printing the optimal parameters and best score
#(best_score_ is the mean cross-validated score of the best combination)
print("Tuned Parameters  :", full_forest_cv.best_params_)
print("Tuned Training AUC:", full_forest_cv.best_score_.round(4))

#Output recorded from a previous run:
#Tuned Parameters  : {'bootstrap': False, 'criterion': 'entropy', 'min_samples_leaf': 1, 'n_estimators': 600, 'warm_start': True}
#Tuned Training AUC: 0.6264

In [None]:
#INSTANTIATING the model object with the tuned hyperparameters
#Values are copied from the recorded GridSearchCV output:
#  {'bootstrap': False, 'criterion': 'entropy', 'min_samples_leaf': 1,
#   'n_estimators': 600, 'warm_start': True}
#(previously bootstrap = True, criterion = 'gini' and n_estimators = 100 were
# used here, so the "tuned" model did not reflect the tuning results)
full_rf_tuned = RandomForestClassifier(bootstrap        = False,
                                       criterion        = 'entropy',
                                       min_samples_leaf = 1,
                                       n_estimators     = 600,
                                       warm_start       = True,
                                       random_state     = 222)


#FIT step is needed as we are not using .best_estimator
full_rf_tuned_fit = full_rf_tuned.fit(X_train, y_train)


#PREDICTING class labels for the testing set
full_rf_tuned_pred = full_rf_tuned_fit.predict(X_test)


#SCORING the results
#(the AUC below is calculated from the predicted labels stored above)
print('Training ACCURACY:', full_rf_tuned_fit.score(X_train, y_train).round(4))
print('Testing  ACCURACY:', full_rf_tuned_fit.score(X_test, y_test).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = full_rf_tuned_pred).round(4))

In [None]:
#Declaring model performance objects for the tuned random forest
#(these reuse the rf_* names, overwriting any earlier values)
rf_train_acc = full_rf_tuned_fit.score(X_train, y_train).round(4)
rf_test_acc = full_rf_tuned_fit.score(X_test, y_test).round(4)
rf_auc = roc_auc_score(y_test, full_rf_tuned_pred).round(4)


#Appending one row [name, train accuracy, test accuracy, AUC]
model_performance.append(['Tuned Random Forest', rf_train_acc, rf_test_acc, rf_auc])


#Checking the results
model_performance

<h6>Gradient Boosted Machines model with default parameters</h6>

In [None]:
#help(GradientBoostingClassifier)

In [None]:
#INSTANTIATING the model object with explicitly listed baseline hyperparameters
#The loss argument is intentionally not passed: the literal 'deviance' is
#rejected by newer scikit-learn releases (renamed to 'log_loss'), while the
#library default selects that same loss on every version
full_gbm_default = GradientBoostingClassifier(learning_rate = 0.1,
                                              n_estimators  = 100,
                                              criterion     = 'friedman_mse',
                                              max_depth     = 3,
                                              warm_start    = False,
                                              random_state  = 222)


#FITTING to the training data
full_gbm_default_fit = full_gbm_default.fit(X_train, y_train)


#PREDICTING class labels for the testing set
full_gbm_default_pred = full_gbm_default_fit.predict(X_test)


#SCORING the results
#(the AUC below is calculated from the predicted labels stored above)
print('Training ACCURACY:', full_gbm_default_fit.score(X_train, y_train).round(4))
print('Testing ACCURACY :', full_gbm_default_fit.score(X_test, y_test).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = full_gbm_default_pred).round(4))

In [None]:
#Declaring model performance objects for the baseline GBM
#(accuracy on the training set first, then on the testing set, 4 decimals)
gbm_train_acc, gbm_test_acc = (
    full_gbm_default_fit.score(features, labels).round(4)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)
gbm_auc = roc_auc_score(y_test, full_gbm_default_pred).round(4)


#Appending one row [name, train accuracy, test accuracy, AUC]
model_performance.append(
    ['GBM default parameters', gbm_train_acc, gbm_test_acc, gbm_auc]
)


#Checking the results
model_performance

<h6>Gradient Boosted Machines model with tuned parameters</h6>

In [None]:
#numpy is imported directly: the pd.np alias used previously is deprecated
#since pandas 1.0 and no longer exists in pandas 2.0
import numpy as np


#Declaring a hyperparameter space
learn_space     = np.arange(0.1, 1.6, 0.3)   # 0.1 up to 1.3 in steps of 0.3
estimator_space = np.arange(50, 250, 50)     # 50, 100, 150, 200
depth_space     = np.arange(1, 10)           # 1 through 9


#Creating a hyperparameter grid
param_grid = {'learning_rate' : learn_space,
              'max_depth'     : depth_space,
              'n_estimators'  : estimator_space}


#INSTANTIATING the model object without hyperparameters
full_gbm_grid = GradientBoostingClassifier(random_state = 222)


#GridSearchCV object
#make_scorer is left at its default behaviour of scoring the predicted class
#labels - exactly what needs_threshold = False selected - without passing an
#argument that newer scikit-learn releases deprecate
full_gbm_cv = GridSearchCV(estimator  = full_gbm_grid,
                           param_grid = param_grid,
                           cv         = 3,
                           scoring    = make_scorer(roc_auc_score))


#FITTING to the FULL DATASET (due to cross-validation)
full_gbm_cv.fit(original_df_data, original_df_target)


#Printing the optimal parameters and best score
#(best_score_ is the mean cross-validated score of the best combination)
print("Tuned Parameters  :", full_gbm_cv.best_params_)
print("Tuned Training AUC:", full_gbm_cv.best_score_.round(4))

#Output recorded from a previous run:
#Tuned Parameters  : {'learning_rate': 0.7000000000000001, 'max_depth': 2, 'n_estimators': 200}
#Tuned Training AUC: 0.6486

In [None]:
#INSTANTIATING the GBM with fixed hyperparameters
#(learning rate 0.7, depth 2, 200 estimators; random_state fixed for
# repeatability)
gbm_tuned = GradientBoostingClassifier(
    n_estimators=200,
    max_depth=2,
    learning_rate=0.7,
    random_state=222,
)


#FIT step is needed as we are not using .best_estimator
gbm_tuned_fit = gbm_tuned.fit(X_train, y_train)


#PREDICTING class labels for the testing set
gbm_tuned_pred = gbm_tuned_fit.predict(X_test)


#SCORING the results
#(the AUC below is calculated from the predicted labels stored above)
print('Training ACCURACY:', gbm_tuned_fit.score(X_train, y_train).round(4))
print('Testing  ACCURACY:', gbm_tuned_fit.score(X_test, y_test).round(4))
print('AUC Score        :', roc_auc_score(y_test, gbm_tuned_pred).round(4))

In [None]:
#Declaring model performance objects for the tuned GBM
#(these reuse the gbm_* names, overwriting any earlier values)
gbm_train_acc = gbm_tuned_fit.score(X_train, y_train).round(4)
gbm_test_acc = gbm_tuned_fit.score(X_test, y_test).round(4)
gbm_auc = roc_auc_score(y_test, gbm_tuned_pred).round(4)


#Appending one row [name, train accuracy, test accuracy, AUC]
model_performance.append(['Tuned GBM', gbm_train_acc, gbm_test_acc, gbm_auc])


#Checking the results
model_performance

In [None]:
#Converting to DataFrame: the first entry of model_performance supplies the
#column names and every remaining entry becomes one row
model_performance_df = pd.DataFrame(data=model_performance[1:],
                                    columns=model_performance[0])

#Checking the results
model_performance_df

In [None]:
#Ranking the models from highest to lowest AUC
#(display only - the sorted result is not assigned back to the DataFrame)
model_performance_df.sort_values('AUC Value', ascending=False)

In [None]:
#Saving the DataFrame to Excel, leaving the row index out of the file
model_performance_df.to_excel(excel_writer='Classification Model Performance.xlsx', index=False)

<h6>The final model selected is the Tuned Tree model, which has an AUC of 0.8065 and a testing accuracy of 0.8337.</h6>