In [1]:
import pandas as pd
import numpy as np
import catboost

In [2]:
#loading the cleaned dataframe for preprocessing

df = pd.read_csv('Peruvian_Bank_Data/clean_df.csv', header = 0)
df.head()

Unnamed: 0,age,job,marital,education,in_default,avg_yearly_balance,housing_loan,personal_loan,contact_method,day,month,duration,campaign_contacts,prev_days,previous_contacts,prev_outcome,term_deposit
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,5,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,5,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,5,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,5,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,5,198,1,-1,0,unknown,no


I believe the best way to preprocess this is to provide dummy labels to the categorical data and then use a RobustScaler for avg_yearly_balance, and duration because of the outliers. Prev_days will also get a robustscaling, but this will be done last because it is subject to change -- I'm concerned about the negative one being abrasive to the model. But we will see.  MinMax scaler for campaign_contacts and previous_contacts and age. Let's begin and see how it goes. I will do the scaling first and then the dummy variables -- the dummy variables make the dataframe a bit unbearable to scroll through. 

In [3]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler

In [4]:
#applying robustscaler to avg_yearly_balance, duration, prev_days
X = pd.DataFrame(df['avg_yearly_balance'])
RobSca = RobustScaler().fit_transform(X)

In [5]:
df2 = df.copy() #creating a copy just in case something goes wrong
#going off to replace the avg_yearly_balance values
df2['avg_yearly_balance'] = RobSca

In [6]:
df2.head() #looks successful, let's continue. As practice let's create a function for the remaining required preprocessing

Unnamed: 0,age,job,marital,education,in_default,avg_yearly_balance,housing_loan,personal_loan,contact_method,day,month,duration,campaign_contacts,prev_days,previous_contacts,prev_outcome,term_deposit
0,58,management,married,tertiary,no,1.247241,yes,no,unknown,5,5,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,-0.308315,yes,no,unknown,5,5,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,-0.328182,yes,yes,unknown,5,5,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,0.778514,yes,no,unknown,5,5,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,-0.328918,no,no,unknown,5,5,198,1,-1,0,unknown,no


In [7]:
#creating a function to make the rest of the preprocessing simpler
def preproc(dataframe, column, scalertype):
    X = pd.DataFrame(dataframe[column])
    scaler = scalertype.fit_transform(X)
    dataframe[column] = scaler
    

In [8]:
preproc(df2, 'duration', RobustScaler())

In [9]:
preproc(df2, 'prev_days', RobustScaler())

In [10]:
#minmax for campaign_contacts and previous_contacts and age
preproc(df2, 'campaign_contacts', MinMaxScaler())
preproc(df2, 'previous_contacts', MinMaxScaler())
preproc(df2, 'age', MinMaxScaler())
df2.head()

Unnamed: 0,age,job,marital,education,in_default,avg_yearly_balance,housing_loan,personal_loan,contact_method,day,month,duration,campaign_contacts,prev_days,previous_contacts,prev_outcome,term_deposit
0,0.519481,management,married,tertiary,no,1.247241,yes,no,unknown,5,5,0.373272,0.0,0.0,0.0,unknown,no
1,0.337662,technician,single,secondary,no,-0.308315,yes,no,unknown,5,5,-0.133641,0.0,0.0,0.0,unknown,no
2,0.194805,entrepreneur,married,secondary,no,-0.328182,yes,yes,unknown,5,5,-0.479263,0.0,0.0,0.0,unknown,no
3,0.376623,blue-collar,married,unknown,no,0.778514,yes,no,unknown,5,5,-0.40553,0.0,0.0,0.0,unknown,no
4,0.194805,unknown,single,unknown,no,-0.328918,no,no,unknown,5,5,0.082949,0.0,0.0,0.0,unknown,no


In [11]:
#decided also here that I should do campaign contacts
preproc(df2, 'campaign_contacts', MinMaxScaler())
df2.head()

Unnamed: 0,age,job,marital,education,in_default,avg_yearly_balance,housing_loan,personal_loan,contact_method,day,month,duration,campaign_contacts,prev_days,previous_contacts,prev_outcome,term_deposit
0,0.519481,management,married,tertiary,no,1.247241,yes,no,unknown,5,5,0.373272,0.0,0.0,0.0,unknown,no
1,0.337662,technician,single,secondary,no,-0.308315,yes,no,unknown,5,5,-0.133641,0.0,0.0,0.0,unknown,no
2,0.194805,entrepreneur,married,secondary,no,-0.328182,yes,yes,unknown,5,5,-0.479263,0.0,0.0,0.0,unknown,no
3,0.376623,blue-collar,married,unknown,no,0.778514,yes,no,unknown,5,5,-0.40553,0.0,0.0,0.0,unknown,no
4,0.194805,unknown,single,unknown,no,-0.328918,no,no,unknown,5,5,0.082949,0.0,0.0,0.0,unknown,no


In [12]:
#Getting dummies now for the categorical data
#df_ready = this dataframe is ready for modeling
df_ready = pd.get_dummies(df2, columns = ['job','marital','education','in_default','housing_loan','personal_loan','contact_method','prev_outcome'])


In [13]:
df_ready.columns

Index(['age', 'avg_yearly_balance', 'day', 'month', 'duration',
       'campaign_contacts', 'prev_days', 'previous_contacts', 'term_deposit',
       'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'education_primary', 'education_secondary', 'education_tertiary',
       'education_unknown', 'in_default_no', 'in_default_yes',
       'housing_loan_no', 'housing_loan_yes', 'personal_loan_no',
       'personal_loan_yes', 'contact_method_cellular',
       'contact_method_telephone', 'contact_method_unknown',
       'prev_outcome_failure', 'prev_outcome_other', 'prev_outcome_success',
       'prev_outcome_unknown'],
      dtype='object')

### Baseline Models

I will be running three baseline models with default parameters to test the predictive capability. Tune as necessary

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV

X = df_ready.drop(columns = ['term_deposit'])
y = df_ready['term_deposit']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 44)

In [15]:
clf = LogisticRegressionCV(cv = 3, max_iter = 1000, random_state = 44).fit(X_train, y_train)

In [16]:
from sklearn.metrics import classification_report

predictX = clf.predict(X_test)
print(classification_report(y_test, predictX))

              precision    recall  f1-score   support

          no       0.91      0.98      0.95     13107
         yes       0.70      0.33      0.45      1813

    accuracy                           0.90     14920
   macro avg       0.81      0.65      0.70     14920
weighted avg       0.89      0.90      0.89     14920



In [17]:
#the order is no, yes, let's focus on the yes
clfproba = clf.predict_proba(X_test)[:,1]
print(clfproba)
clfprobatrain = clf.predict_proba(X_train)[:,1]
print(clfprobatrain)

[0.0749084  0.70219189 0.02766863 ... 0.06233517 0.67588651 0.02076343]
[0.04351439 0.16854613 0.05346438 ... 0.03322651 0.02058422 0.06911573]


In [45]:
df_coef = pd.DataFrame(index = X.columns, data = clf.coef_[0, :], columns = ['Coefficient'])
#Here the odds that campaign contacts and previous succeses have an impact on 'yes' are high
df_coef['Abs_Coefficient'] = abs(clf.coef_[0])
df_coef

Unnamed: 0,Coefficient,Abs_Coefficient
age,0.032454,0.032454
avg_yearly_balance,0.02968,0.02968
day,-0.004655,0.004655
month,-0.017072,0.017072
duration,0.86102,0.86102
campaign_contacts,-6.302164,6.302164
prev_days,-6.7e-05,6.7e-05
previous_contacts,0.063207,0.063207
job_admin.,0.188167,0.188167
job_blue-collar,-0.154641,0.154641


In [71]:
clf_predictions = clf.predict(X_test)
clf_train_score = clf.score(X_train, y_train)
clf_test_score = clf.score(X_test, y_test)
print(clf_train_score, clf_test_score)

0.8998621165115478 0.9012064343163538


Random Forests Classification

In [84]:
from sklearn.ensemble import RandomForestClassifier

#To make later parameter tuning easier, let's make sure we check multiple depths. 

for i in range(5,40,5):
    rfc = RandomForestClassifier(max_depth=i, random_state=44).fit(X_train, y_train)
    rfc_predictions = rfc.predict(X_test)
    rfc_train_score = rfc.score(X_train, y_train)
    rfc_test_score = rfc.score(X_test, y_test)
    print("Train Score: {}, Test Score: {} \n".format(rfc_train_score, rfc_test_score))
    print("Depth: {}. \n{}".format(i, classification_report(y_test, rfc_predictions)))

Train Score: 0.8949212915086752, Test Score: 0.8901474530831099 

Depth: 5. 
              precision    recall  f1-score   support

          no       0.89      1.00      0.94     13107
         yes       0.82      0.12      0.21      1813

    accuracy                           0.89     14920
   macro avg       0.86      0.56      0.58     14920
weighted avg       0.88      0.89      0.85     14920

Train Score: 0.9149718487877744, Test Score: 0.8977882037533512 

Depth: 10. 
              precision    recall  f1-score   support

          no       0.90      0.99      0.94     13107
         yes       0.79      0.22      0.34      1813

    accuracy                           0.90     14920
   macro avg       0.84      0.60      0.64     14920
weighted avg       0.89      0.90      0.87     14920

Train Score: 0.963144892565782, Test Score: 0.9109249329758713 

Depth: 15. 
              precision    recall  f1-score   support

          no       0.92      0.98      0.95     13107
     

An important factor here is recall, we want to try to get recall as high as possible so that we are able to correctly identify customers. 0.45 is higher than before, if our precision falters, but our recall is high, this is acceptable as a few misplaced man-hours is less impactful than missed opportunities.

KNN baseline

In [83]:
from sklearn.neighbors import KNeighborsClassifier
#let's do some initial testing with different neighbors
for i in range(2,20,2):
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    knn_predictions = knn.predict(X_test)
    knn_train_score = knn.score(X_train, y_train)
    knn_test_score = knn.score(X_test, y_test)
    print("Train Score: {}, Test_Score: {} \n".format(knn_train_score, knn_test_score))
    print("Depth: {}. \n{}".format(i, classification_report(y_test, rfc_predictions)))

Train Score: 0.9418304033092038, Test_Score: 0.892225201072386 

Depth: 2. 
              precision    recall  f1-score   support

          no       0.93      0.98      0.95     13107
         yes       0.75      0.46      0.57      1813

    accuracy                           0.92     14920
   macro avg       0.84      0.72      0.76     14920
weighted avg       0.91      0.92      0.91     14920

Train Score: 0.9235321153625187, Test_Score: 0.8928954423592493 

Depth: 4. 
              precision    recall  f1-score   support

          no       0.93      0.98      0.95     13107
         yes       0.75      0.46      0.57      1813

    accuracy                           0.92     14920
   macro avg       0.84      0.72      0.76     14920
weighted avg       0.91      0.92      0.91     14920

Train Score: 0.9159485234976444, Test_Score: 0.8931635388739947 

Depth: 6. 
              precision    recall  f1-score   support

          no       0.93      0.98      0.95     13107
       

After n_neighbors 12 it begins to dip a bit in the test score, but no significant changes at all across the multitude of tests. There is something missing here and I believe it can be done by looking at the data and establishing semi-blind tiers for all of the customers as another point for the algorithm to go off of. We can do this by looking at their probability. This will be done in the next step. 