In [70]:
import pandas as pd
import numpy as np

In [71]:
#read the cleaned dataframe from disk so it can be preprocessed

clean_csv_path = 'Peruvian_Bank_Data/clean_df.csv'
df = pd.read_csv(clean_csv_path, header = 0)
df.head()

Unnamed: 0,age,job,marital,education,in_default,avg_yearly_balance,housing_loan,personal_loan,contact_method,day,month,duration,campaign_contacts,prev_days,previous_contacts,prev_outcome,term_deposit
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,5,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,5,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,5,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,5,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,5,198,1,-1,0,unknown,no


I believe the best way to preprocess this is to create dummy variables for the categorical data and to use a RobustScaler for avg_yearly_balance and duration because of their outliers. prev_days will also get robust scaling, but this will be done last because it is subject to change -- I'm concerned that the -1 values could distort the model. But we will see. I will use a MinMaxScaler for campaign_contacts, previous_contacts, and age. Let's begin and see how it goes. I will do the scaling first and then the dummy variables, because the dummy variables make the dataframe a bit unbearable to scroll through.

In [72]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler

In [73]:
#robust-scale avg_yearly_balance only; duration and prev_days are scaled in later cells
balance_scaler = RobustScaler()
X = df[['avg_yearly_balance']]
RobSca = balance_scaler.fit_transform(X)

In [74]:
#build a separate copy (so df stays untouched in case something goes wrong)
#with avg_yearly_balance replaced by its robust-scaled values
df2 = df.assign(avg_yearly_balance = RobSca)

In [75]:
#avg_yearly_balance now shows the scaled values, so the replacement looks successful.
#As practice, the remaining preprocessing is wrapped in a function in a later cell.
df2.head()

Unnamed: 0,age,job,marital,education,in_default,avg_yearly_balance,housing_loan,personal_loan,contact_method,day,month,duration,campaign_contacts,prev_days,previous_contacts,prev_outcome,term_deposit
0,58,management,married,tertiary,no,1.247241,yes,no,unknown,5,5,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,-0.308315,yes,no,unknown,5,5,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,-0.328182,yes,yes,unknown,5,5,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,0.778514,yes,no,unknown,5,5,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,-0.328918,no,no,unknown,5,5,198,1,-1,0,unknown,no


In [76]:
#creating a function to make the rest of the preprocessing simpler
def preproc(dataframe, column, scalertype):
    """Scale a single column of ``dataframe`` in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that is modified in place; ``column`` is overwritten with the
        scaled values.
    column : str
        Name of the column to scale.
    scalertype : object
        A scaler *instance* exposing ``fit_transform`` (the notebook passes
        ``RobustScaler()`` or ``MinMaxScaler()``).

    Returns
    -------
    object
        The ``scalertype`` instance after fitting. Returning it (instead of
        None) keeps the fitted parameters available so the same scaling can be
        re-applied to other data or inverted; callers that ignore the return
        value are unaffected.
    """
    #the scaler is handed a one-column frame rather than a Series
    features = pd.DataFrame(dataframe[column])
    scaled_values = scalertype.fit_transform(features)
    dataframe[column] = scaled_values
    return scalertype
    

In [77]:
#robust scaling for duration because of its outliers
preproc(dataframe = df2, column = 'duration', scalertype = RobustScaler())

In [78]:
#robust scaling for prev_days as well
preproc(dataframe = df2, column = 'prev_days', scalertype = RobustScaler())

In [79]:
#min-max scale campaign_contacts, previous_contacts and age, each with its own scaler
for minmax_col in ['campaign_contacts', 'previous_contacts', 'age']:
    preproc(df2, minmax_col, MinMaxScaler())
df2.head()

Unnamed: 0,age,job,marital,education,in_default,avg_yearly_balance,housing_loan,personal_loan,contact_method,day,month,duration,campaign_contacts,prev_days,previous_contacts,prev_outcome,term_deposit
0,0.519481,management,married,tertiary,no,1.247241,yes,no,unknown,5,5,0.373272,0.0,0.0,0.0,unknown,no
1,0.337662,technician,single,secondary,no,-0.308315,yes,no,unknown,5,5,-0.133641,0.0,0.0,0.0,unknown,no
2,0.194805,entrepreneur,married,secondary,no,-0.328182,yes,yes,unknown,5,5,-0.479263,0.0,0.0,0.0,unknown,no
3,0.376623,blue-collar,married,unknown,no,0.778514,yes,no,unknown,5,5,-0.40553,0.0,0.0,0.0,unknown,no
4,0.194805,unknown,single,unknown,no,-0.328918,no,no,unknown,5,5,0.082949,0.0,0.0,0.0,unknown,no


In [80]:
#campaign_contacts was already min-max scaled in the previous cell; fitting another
#MinMaxScaler on values that already span [0, 1] leaves them unchanged, so the
#duplicate call is dropped and this cell only inspects the frame
df2.head()

Unnamed: 0,age,job,marital,education,in_default,avg_yearly_balance,housing_loan,personal_loan,contact_method,day,month,duration,campaign_contacts,prev_days,previous_contacts,prev_outcome,term_deposit
0,0.519481,management,married,tertiary,no,1.247241,yes,no,unknown,5,5,0.373272,0.0,0.0,0.0,unknown,no
1,0.337662,technician,single,secondary,no,-0.308315,yes,no,unknown,5,5,-0.133641,0.0,0.0,0.0,unknown,no
2,0.194805,entrepreneur,married,secondary,no,-0.328182,yes,yes,unknown,5,5,-0.479263,0.0,0.0,0.0,unknown,no
3,0.376623,blue-collar,married,unknown,no,0.778514,yes,no,unknown,5,5,-0.40553,0.0,0.0,0.0,unknown,no
4,0.194805,unknown,single,unknown,no,-0.328918,no,no,unknown,5,5,0.082949,0.0,0.0,0.0,unknown,no


In [81]:
#one-hot encode the categorical columns
#df_ready = this dataframe is ready for modeling
categorical_cols = [
    'job',
    'marital',
    'education',
    'in_default',
    'housing_loan',
    'personal_loan',
    'contact_method',
    'prev_outcome',
]
df_ready = pd.get_dummies(df2, columns = categorical_cols)


In [82]:
#list the column names after one-hot encoding to check the dummy columns that were created
df_ready.columns

Index(['age', 'avg_yearly_balance', 'day', 'month', 'duration',
       'campaign_contacts', 'prev_days', 'previous_contacts', 'term_deposit',
       'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'education_primary', 'education_secondary', 'education_tertiary',
       'education_unknown', 'in_default_no', 'in_default_yes',
       'housing_loan_no', 'housing_loan_yes', 'personal_loan_no',
       'personal_loan_yes', 'contact_method_cellular',
       'contact_method_telephone', 'contact_method_unknown',
       'prev_outcome_failure', 'prev_outcome_other', 'prev_outcome_success',
       'prev_outcome_unknown'],
      dtype='object')

### Baseline Models

I will be running three baseline models with default parameters to test their predictive capability, and will then tune as necessary.

In [83]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report

#term_deposit is the target; every other column is a feature
y = df_ready['term_deposit']
X = df_ready.drop(columns = 'term_deposit')

#hold out 30% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 44)

In [84]:
#Logistic Regression CV
#Cs = i asks LogisticRegressionCV for a grid of i candidate C values, so every pass of
#this loop is itself a 3-fold cross-validated search. n_jobs=-1 runs that search on all
#available cores; the fitted models are unchanged, the sweep just finishes sooner
#(a previous run of this cell had to be interrupted).
for i in range(10,21,2):
    clf = LogisticRegressionCV(cv=3, Cs = i, random_state=44, max_iter = 1500, n_jobs = -1).fit(X_train, y_train)
    clf_predictions = clf.predict(X_test)
    clf_train_score = clf.score(X_train, y_train)
    clf_test_score = clf.score(X_test, y_test)
    print("Train Score: {}, Test Score: {} \n".format(clf_train_score, clf_test_score))
    print("Cs: {}. \n{}".format(i, classification_report(y_test, clf_predictions)))

KeyboardInterrupt: 

In [None]:
#predict_proba columns are ordered no, yes -- keep only the 'yes' column
yes_col = 1
clfproba = clf.predict_proba(X_test)[:, yes_col]
clfprobatrain = clf.predict_proba(X_train)[:, yes_col]
print(clfproba)
print(clfprobatrain)

In [None]:
#one row per feature: the fitted coefficient and its magnitude
df_coef = pd.DataFrame({'Coefficient': clf.coef_[0]}, index = X.columns)
#Here the odds that campaign contacts and previous successes have an impact on 'yes' are high
df_coef['Abs_Coefficient'] = df_coef['Coefficient'].abs()
df_coef

In [None]:
#accuracy of the current clf on the training and test splits
clf_train_score = clf.score(X_train, y_train)
clf_test_score = clf.score(X_test, y_test)
clf_predictions = clf.predict(X_test)
print(clf_train_score, clf_test_score)

Random Forests Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier

#Try several max_depth values now so that later parameter tuning is easier.

for depth in range(5,40,5):
    rfc = RandomForestClassifier(max_depth=depth, random_state=44)
    rfc.fit(X_train, y_train)
    rfc_predictions = rfc.predict(X_test)
    rfc_train_score = rfc.score(X_train, y_train)
    rfc_test_score = rfc.score(X_test, y_test)
    rfc_report = classification_report(y_test, rfc_predictions)
    print("Train Score: {}, Test Score: {} \n".format(rfc_train_score, rfc_test_score))
    print("Depth: {}. \n{}".format(depth, rfc_report))

An important factor here is recall: we want recall to be as high as possible so that we correctly identify as many of the 'yes' customers as we can. 0.45 is higher than before. If our precision falters but our recall is high, this is acceptable, as a few misplaced man-hours are less impactful than missed opportunities.

After running more baselines, we will revisit this using the probabilities from `predict_proba` to decide what to tweak.

KNN baseline

In [127]:
from sklearn.neighbors import KNeighborsClassifier
#initial testing with a few different neighbor counts
for n_neighbors in range(2,12,2):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    knn_predictions = knn.predict(X_test)
    knn_train_score = knn.score(X_train, y_train)
    knn_test_score = knn.score(X_test, y_test)
    knn_report = classification_report(y_test, knn_predictions)
    print("Train Score: {}, Test_Score: {} \n".format(knn_train_score, knn_test_score))
    print("Neighbors: {}. \n{}".format(n_neighbors, knn_report))

Train Score: 0.9418304033092038, Test_Score: 0.892225201072386 

Depth: 2. 
              precision    recall  f1-score   support

          no       0.90      0.98      0.94     13107
         yes       0.64      0.25      0.36      1813

    accuracy                           0.89     14920
   macro avg       0.77      0.62      0.65     14920
weighted avg       0.87      0.89      0.87     14920

Train Score: 0.9235321153625187, Test_Score: 0.8928954423592493 

Depth: 4. 
              precision    recall  f1-score   support

          no       0.91      0.98      0.94     13107
         yes       0.64      0.27      0.38      1813

    accuracy                           0.89     14920
   macro avg       0.77      0.62      0.66     14920
weighted avg       0.87      0.89      0.87     14920

Train Score: 0.9159485234976444, Test_Score: 0.8931635388739947 

Depth: 6. 
              precision    recall  f1-score   support

          no       0.91      0.98      0.94     13107
       

In [None]:
#simple naive bayes just to see what a very simple model handles.
from sklearn.naive_bayes import GaussianNB

#fit Gaussian naive Bayes with default settings and report accuracy on both splits
nb = GaussianNB().fit(X_train, y_train)
nb_train_score = nb.score(X_train, y_train)
nb_test_score = nb.score(X_test, y_test)
nb_predictions = nb.predict(X_test)
nb_report = classification_report(y_test, nb_predictions)
print("Train Score: {}, Test_Score: {} \n".format(nb_train_score, nb_test_score))
print("\n{}".format(nb_report))


Let's go ahead and tweak the best of each. Again, we're going for high recall.

In [85]:
#Logistic Regression CV, using predict_proba
#testing different thresholds of probability
proba_thresh_list = [0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45]

#the fitted model, its scores and its probabilities do not depend on the threshold,
#so they are computed once instead of re-running the cross-validated fit per threshold
clf = LogisticRegressionCV(cv=3, Cs = 14, random_state=44, max_iter = 1500).fit(X_train, y_train)
clf_predictions = clf.predict(X_test)
clf_train_score = clf.score(X_train, y_train)
clf_test_score = clf.score(X_test, y_test)
clf_proba = clf.predict_proba(X_test)

for i in proba_thresh_list:
    #predict 'yes' whenever the probability in column 1 exceeds the threshold
    clf_lower_thresh = (clf_proba[:,1] > i)
    print("Train Score: {}, Test Score: {} \n".format(clf_train_score, clf_test_score))
    print("Probability Threshold: {} \n{}".format(i, classification_report((y_test == 'yes'), clf_lower_thresh)))

Train Score: 0.8998046650580259, Test Score: 0.9015415549597855 

Probability Threshold: 0.1 
              precision    recall  f1-score   support

       False       0.98      0.79      0.87     13107
        True       0.36      0.86      0.51      1813

    accuracy                           0.80     14920
   macro avg       0.67      0.83      0.69     14920
weighted avg       0.90      0.80      0.83     14920

Train Score: 0.8998046650580259, Test Score: 0.9015415549597855 

Probability Threshold: 0.15 
              precision    recall  f1-score   support

       False       0.96      0.88      0.92     13107
        True       0.45      0.73      0.55      1813

    accuracy                           0.86     14920
   macro avg       0.70      0.80      0.74     14920
weighted avg       0.90      0.86      0.87     14920

Train Score: 0.8998046650580259, Test Score: 0.9015415549597855 

Probability Threshold: 0.2 
              precision    recall  f1-score   support

       F

As predicted, a lower threshold gives higher recall but much lower precision. Let's continue on and see how the other models fare.

In [126]:
#RandomForests, using predict_proba
#testing different thresholds of probability
proba_thresh_list = [0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45]

#the forest, its scores and its probabilities do not depend on the threshold,
#so they are computed once instead of refitting the same model for every threshold
rfc = RandomForestClassifier(max_depth=14, random_state=44).fit(X_train, y_train)
rfc_predictions = rfc.predict(X_test)
rfc_train_score = rfc.score(X_train, y_train)
rfc_test_score = rfc.score(X_test, y_test)
rfc_proba = rfc.predict_proba(X_test)

for i in proba_thresh_list:
    #predict 1 ('yes') whenever the probability in column 1 exceeds the threshold
    rfc_lower_thresh = (rfc_proba[:,1] > i).astype('int')
    print("Train Score: {}, Test Score: {} \n".format(rfc_train_score, rfc_test_score))
    print("Probability Threshold: {}. \n{}".format(i, classification_report((y_test == 'yes').astype('int'), rfc_lower_thresh)))

Train Score: 0.9554463977938642, Test Score: 0.910053619302949 

Probability Threshold: 0.1. 
              precision    recall  f1-score   support

           0       0.99      0.79      0.88     13107
           1       0.39      0.93      0.54      1813

    accuracy                           0.81     14920
   macro avg       0.69      0.86      0.71     14920
weighted avg       0.91      0.81      0.84     14920

Train Score: 0.9554463977938642, Test Score: 0.910053619302949 

Probability Threshold: 0.15. 
              precision    recall  f1-score   support

           0       0.98      0.85      0.91     13107
           1       0.45      0.87      0.59      1813

    accuracy                           0.85     14920
   macro avg       0.71      0.86      0.75     14920
weighted avg       0.92      0.85      0.87     14920

Train Score: 0.9554463977938642, Test Score: 0.910053619302949 

Probability Threshold: 0.2. 
              precision    recall  f1-score   support

        

In [128]:
#KNN probability testing

#the fitted model, its scores and its probabilities do not depend on the threshold,
#so they are computed once instead of refitting and re-predicting for every threshold
knn = KNeighborsClassifier(n_neighbors=8).fit(X_train, y_train)
knn_predictions = knn.predict(X_test)
knn_train_score = knn.score(X_train, y_train)
knn_test_score = knn.score(X_test, y_test)
knn_proba = knn.predict_proba(X_test)

#NOTE(review): with 8 neighbours and the default uniform weights the probabilities should
#be multiples of 1/8, so neighbouring thresholds can give identical reports -- confirm
for i in proba_thresh_list:
    #predict 'yes' whenever the probability in column 1 exceeds the threshold
    knn_lower_thresh = (knn_proba[:,1] > i)
    print("Train Score: {}, Test_Score: {} \n".format(knn_train_score, knn_test_score))
    print("Probability Threshold: {}. \n{}".format(i, classification_report((y_test == 'yes'), knn_lower_thresh)))

Train Score: 0.9113236814891417, Test_Score: 0.8942359249329759 

Probability Threshold: 0.1. 
              precision    recall  f1-score   support

       False       0.98      0.78      0.87     13107
        True       0.35      0.88      0.50      1813

    accuracy                           0.79     14920
   macro avg       0.67      0.83      0.68     14920
weighted avg       0.90      0.79      0.82     14920

Train Score: 0.9113236814891417, Test_Score: 0.8942359249329759 

Probability Threshold: 0.15. 
              precision    recall  f1-score   support

       False       0.96      0.88      0.92     13107
        True       0.47      0.73      0.57      1813

    accuracy                           0.87     14920
   macro avg       0.71      0.81      0.74     14920
weighted avg       0.90      0.87      0.88     14920

Train Score: 0.9113236814891417, Test_Score: 0.8942359249329759 

Probability Threshold: 0.2. 
              precision    recall  f1-score   support

     

It looks as though, with this adjustment, RandomForests is doing the best here. With a probability threshold of 0.2 we are getting:
Probability Threshold: 0.2. 
              precision    recall  f1-score   support

           0       0.97      0.89      0.93     13107
           1       0.51      0.83      0.63      1813
           

This shows us that just under half of our picks will be incorrect (precision 0.51), but we are securing 0.83 of the correct choices (recall).

In sales terms, let's say each call takes average 5 minutes (Edit this later with numbers from DF). Each term_deposit is profitting 100 dollars. Of 100 sales we saw that 