In [28]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split


In [52]:
df_ori = pd.DataFrame(pd.read_csv('Peruvian_Bank_Data/clean_df.csv'))
df_ori.head()

Unnamed: 0,age,job,marital,education,in_default,avg_yearly_balance,housing_loan,personal_loan,contact_method,day,month,duration,campaign_contacts,prev_days,previous_contacts,prev_outcome,term_deposit
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,5,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,5,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,5,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,5,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,5,198,1,-1,0,unknown,no


In [71]:
#looking to get a coefficient reading so we are going to process the data with normalization and throw into logreg
df = df_ori.copy()
df = pd.get_dummies(data = df, columns = ['in_default', 'job', 'marital', 'education', 'contact_method', 'prev_outcome', 'housing_loan', 'personal_loan']) 
df_X = df.drop(columns = ['term_deposit'])
df_y = df['term_deposit']
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size = 0.3, random_state = 44)

In [109]:
#normalizing the X sets
X_train = normalize(X_train, axis = 0)
X_test = normalize(X_test, axis = 0)

In [110]:
X_train = pd.DataFrame(data = X_train, columns = df_X.columns)
X_test = pd.DataFrame(data = X_test, columns = df_X.columns)

In [111]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logregcoefs = abs(logreg.coef_)

In [112]:
df_coefs = pd.DataFrame(data = logregcoefs, columns = df_X.columns)
#df_coefs = normalize(df_coefs)
#df_coefs = pd.DataFrame(data = df_coefs, columns = df_X.columns)
df_coefs.head()

Unnamed: 0,age,avg_yearly_balance,day,month,duration,campaign_contacts,prev_days,previous_contacts,in_default_no,in_default_yes,...,contact_method_telephone,contact_method_unknown,prev_outcome_failure,prev_outcome_other,prev_outcome_success,prev_outcome_unknown,housing_loan_no,housing_loan_yes,personal_loan_no,personal_loan_yes
0,4.82362,0.641157,4.376595,4.332708,5.835529,2.955073,1.162786,1.577131,4.658281,1.11858,...,0.988941,4.892867,1.342457,0.401059,9.987566,5.526496,1.388485,5.386779,3.805039,3.194765


In [125]:
#preprocessing the data more appropriately for Random Forest
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report

def preproc(dataframe, column, scalertype):
    X = pd.DataFrame(dataframe[column])
    scaler = scalertype.fit_transform(X)
    dataframe[column] = scaler
    
df_X = df.drop(columns = ['term_deposit'])
df_y = df['term_deposit']
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size = 0.3, random_state = 44)
    
preproc(X_train, 'avg_yearly_balance', RobustScaler())
preproc(X_train, 'duration', RobustScaler())
preproc(X_train, 'prev_days', RobustScaler())
preproc(X_train, 'campaign_contacts', MinMaxScaler())
preproc(X_train, 'previous_contacts', MinMaxScaler())
preproc(X_train, 'age', MinMaxScaler())
preproc(X_train, 'campaign_contacts', MinMaxScaler())

preproc(X_test, 'avg_yearly_balance', RobustScaler())
preproc(X_test, 'duration', RobustScaler())
preproc(X_test, 'prev_days', RobustScaler())
preproc(X_test, 'campaign_contacts', MinMaxScaler())
preproc(X_test, 'previous_contacts', MinMaxScaler())
preproc(X_test, 'age', MinMaxScaler())
preproc(X_test, 'campaign_contacts', MinMaxScaler())

print(X_train.head())
print(X_test.head())

            age  avg_yearly_balance  day  month  duration  campaign_contacts  \
13976  0.363636           -0.127183   10      7  0.023148           0.000000   
42022  0.467532            0.208785   28     10  0.555556           0.000000   
17593  0.428571           -0.177908   29      7 -0.314815           0.016129   
24489  0.350649            1.153464   17     11 -0.546296           0.000000   
44180  0.155844           -0.168351   14      7  0.754630           0.129032   

       prev_days  previous_contacts  in_default_no  in_default_yes  ...  \
13976        0.0           0.000000              1               0  ...   
42022        0.0           0.000000              1               0  ...   
17593        0.0           0.000000              1               0  ...   
24489      132.0           0.003636              1               0  ...   
44180        0.0           0.000000              1               0  ...   

       contact_method_telephone  contact_method_unknown  prev_outcom

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice 

In [127]:
#Grid Search
parameter_grid = {'n_estimators' : [100,1000,5000], 'max_depth' : [10, 20, 30, 'None'], 'max_features' : ['auto', 'log2']}
rfc = RandomForestClassifier()
grid = GridSearchCV(estimator = rfc, param_grid = parameter_grid, scoring = 'accuracy').fit(X_train, y_train)
grid.best_params_
grid.cv_results_


TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances 

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'



{'mean_fit_time': array([2.29280210e+00, 2.21992842e+01, 1.11637063e+02, 2.05373411e+00,
        1.97851687e+01, 9.92668643e+01, 3.26532054e+00, 3.26523731e+01,
        1.70563412e+02, 2.93354092e+00, 2.95108131e+01, 1.76399547e+02,
        3.66220646e+00, 3.77456754e+01, 2.06702899e+02, 3.29977460e+00,
        3.12933166e+01, 1.89527430e+02, 1.05808115e-01, 5.56944036e-01,
        2.40286708e+00, 9.04064178e-02, 4.54905653e-01, 2.23412180e+00]),
 'std_fit_time': array([3.49387176e-02, 1.58803210e-01, 8.43251154e-01, 5.33834325e-02,
        1.94614827e-01, 8.43974227e-01, 3.70868293e-02, 1.07618951e-01,
        1.93966697e+00, 4.50822193e-02, 2.17414544e-01, 1.23209753e+01,
        2.48357392e-01, 1.60015870e+00, 5.11908034e+00, 7.87045592e-02,
        1.58064381e+00, 1.71407026e+01, 1.86714305e-02, 5.75463268e-02,
        3.99556039e-01, 5.08327719e-03, 9.07982773e-03, 2.90850295e-01]),
 'mean_score_time': array([ 0.0910058 ,  0.8281908 ,  4.5134872 ,  0.10421367,  0.84359283,
       

In [128]:
grid.best_params_

{'max_depth': 30, 'max_features': 'auto', 'n_estimators': 5000}

In [129]:
rfc_best = RandomForestClassifier(max_depth = 30, max_features = 'auto', n_estimators = 5000)
rfc_best.fit(X_train, y_train)
rfc_predictions = rfc_best.predict(X_test)
rfc_train_score = rfc_best.score(X_train, y_train)
rfc_test_score = rfc_best.score(X_test, y_test)
print("Train Score: {}, Test Score: {} \n".format(rfc_train_score, rfc_test_score))
print("\n{}".format(classification_report(y_test, rfc_predictions)))

Train Score: 1.0, Test Score: 0.9154155495978552 


              precision    recall  f1-score   support

          no       0.93      0.98      0.95     13107
         yes       0.76      0.45      0.56      1813

    accuracy                           0.92     14920
   macro avg       0.84      0.71      0.76     14920
weighted avg       0.91      0.92      0.91     14920



So this model is based on the choice to achieve accuracy. We are going to run the grid search again to get recall this time.

In [131]:
grid_re = GridSearchCV(estimator = rfc, param_grid = parameter_grid, scoring = 'recall').fit(X_train, y_train == 'yes')
grid_re.best_params_
grid_re.cv_results_

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances 

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'

TypeError: '<=' not supported between instances of 'str' and 'int'



{'mean_fit_time': array([4.14125676e+00, 5.13492275e+01, 2.38762646e+02, 6.44557505e+00,
        5.58033402e+01, 2.16808331e+02, 7.38903651e+00, 6.01195673e+01,
        3.20777254e+02, 5.61252470e+00, 5.82573845e+01, 3.09918411e+02,
        7.72016196e+00, 6.33442125e+01, 3.46368264e+02, 5.78050666e+00,
        6.02209462e+01, 3.28284658e+02, 1.29809809e-01, 4.62342882e-01,
        2.05601168e+00, 1.00510979e-01, 5.79252529e-01, 2.03242617e+00]),
 'std_fit_time': array([4.59233312e-01, 3.96030132e+00, 9.84496837e+00, 1.60162490e+00,
        1.27916885e+01, 9.77332525e+00, 1.74162798e+00, 1.24604803e+00,
        9.15850527e+00, 8.92500636e-02, 1.43563466e+00, 8.85700449e+00,
        1.57831049e+00, 1.36232510e+00, 5.36978140e+00, 3.98901209e-02,
        2.68621709e+00, 4.91308847e+00, 3.49675970e-02, 7.71641552e-03,
        4.23545497e-02, 1.06208813e-02, 2.56012455e-01, 4.34086486e-02]),
 'mean_score_time': array([ 0.25541883,  2.70506988, 19.49194713,  0.34648719,  2.77940307,
       

In [132]:
grid_re.best_params_

{'max_depth': 30, 'max_features': 'auto', 'n_estimators': 5000}

Well it's reassuring to see that the original model went ahead and ended on the same best parameters. it's time to tweak the data a bit, like before. I want to be able to have the highest recall possible, but this time, seeing if we can't increase our accuracy while we do it.

In [138]:
proba_thresh_list = [0.01, 0.025, 0.05,0.075,0.1,0.15,0.2,0.25,0.3]
rfc_best_proba = rfc_best.predict_proba(X_test)
for i in proba_thresh_list:
    rfc_best_lower_thresh = (rfc_best_proba[:,1] >i).astype('int')
    print("Probability Threshold: {}. \n{}".format(i, classification_report((y_test == 'yes').astype('int'), rfc_best_lower_thresh)))

Probability Threshold: 0.01. 
              precision    recall  f1-score   support

           0       1.00      0.40      0.57     13107
           1       0.19      1.00      0.31      1813

    accuracy                           0.47     14920
   macro avg       0.59      0.70      0.44     14920
weighted avg       0.90      0.47      0.54     14920

Probability Threshold: 0.025. 
              precision    recall  f1-score   support

           0       1.00      0.57      0.72     13107
           1       0.24      0.99      0.39      1813

    accuracy                           0.62     14920
   macro avg       0.62      0.78      0.56     14920
weighted avg       0.91      0.62      0.68     14920

Probability Threshold: 0.05. 
              precision    recall  f1-score   support

           0       1.00      0.69      0.81     13107
           1       0.30      0.98      0.46      1813

    accuracy                           0.72     14920
   macro avg       0.65      0.83    

In [139]:
#This model does perform noticeably better than the previous rfc model. Let's go ahead and get to the nitty details
for i in np.arange(0.01, 0.025, 0.001): 
    rfc_best_lower_thresh = (rfc_best_proba[:,1] >i).astype('int')
    print("Probability Threshold: {}. \n{}".format(i, classification_report((y_test == 'yes').astype('int'), rfc_best_lower_thresh)))

Probability Threshold: 0.01. 
              precision    recall  f1-score   support

           0       1.00      0.40      0.57     13107
           1       0.19      1.00      0.31      1813

    accuracy                           0.47     14920
   macro avg       0.59      0.70      0.44     14920
weighted avg       0.90      0.47      0.54     14920

Probability Threshold: 0.011. 
              precision    recall  f1-score   support

           0       1.00      0.42      0.59     13107
           1       0.19      1.00      0.32      1813

    accuracy                           0.49     14920
   macro avg       0.60      0.71      0.46     14920
weighted avg       0.90      0.49      0.56     14920

Probability Threshold: 0.011999999999999999. 
              precision    recall  f1-score   support

           0       1.00      0.43      0.60     13107
           1       0.20      1.00      0.33      1813

    accuracy                           0.50     14920
   macro avg       0.

In [142]:
#Still have the same breaking point at 1.8%, but here we have a higher precision of .22 vs .19, which is going to save a many calls
#bringing in the tradeoff report
def trade_off_report(rf_tuple, cust = 50000, conv_perc = 0.1168, avg_dur = 4.3):
    nm_conv = round((cust * conv_perc),2)
    nm_dur_total = round((cust * avg_dur),2)
    nm_conv_interval = round((nm_dur_total / nm_conv),2)
    m_conv = round((nm_conv * rf_tuple[2]),2)
    m_dur_total = round(((m_conv / rf_tuple[1]) * avg_dur),2) #getting all customers that would be called for 'yes', then multiplying by duration
    m_conv_interval = round((m_dur_total / m_conv),2)
    print("Threshold: {}\nNo Model Conversions: {}\nModel Conversions: {}\nNo Model Minutes used: {}\nModel Minutes Used: {}\n\
    No Model Conversion Interval: {}\nModel Conversion Interval: {}\n".format(rf_tuple[0], nm_conv, m_conv, nm_dur_total, m_dur_total, nm_conv_interval, m_conv_interval))

rf_018 = ("1.8% revised", 0.22, 1.0)
rf_018_old = ('1.8% old', 0.19, 1.0)

trade_off_report(rf_018)
trade_off_report(rf_018_old)

Threshold: 1.8% revised
No Model Conversions: 5840.0
Model Conversions: 5840.0
No Model Minutes used: 215000.0
Model Minutes Used: 114145.45
    No Model Conversion Interval: 36.82
Model Conversion Interval: 19.55

Threshold: 1.8% old
No Model Conversions: 5840.0
Model Conversions: 5840.0
No Model Minutes used: 215000.0
Model Minutes Used: 132168.42
    No Model Conversion Interval: 36.82
Model Conversion Interval: 22.63



In [144]:
print("Revised additional minute savings = " + str(132168.42 - 114145.45))

Revised additional minute savings = 18022.970000000016


So this is at our best recall. But, the company may be interested in more focused iterations. So let's go ahead and run a few more trade_off_reports


In [145]:
rf_018 = ("1.8%", 0.22, 1.0)
rf_05 = ('5%', 0.3, 0.98)
rf_10 = ('10%', 0.39, 0.95)
rf_15 = ('15%', 0.45, 0.9)
rf_20 = ('20%', 0.51, 0.86)
rf_25 = ('25%', 0.55, 0.81)
rf_30 = ('30%', 0.6, 0.75)

tuple_list = [rf_018, rf_05,rf_10, rf_15, rf_20, rf_25, rf_30]
for i in tuple_list:
    trade_off_report(i)

Threshold: 1.8%
No Model Conversions: 5840.0
Model Conversions: 5840.0
No Model Minutes used: 215000.0
Model Minutes Used: 114145.45
    No Model Conversion Interval: 36.82
Model Conversion Interval: 19.55

Threshold: 5%
No Model Conversions: 5840.0
Model Conversions: 5723.2
No Model Minutes used: 215000.0
Model Minutes Used: 82032.53
    No Model Conversion Interval: 36.82
Model Conversion Interval: 14.33

Threshold: 10%
No Model Conversions: 5840.0
Model Conversions: 5548.0
No Model Minutes used: 215000.0
Model Minutes Used: 61170.26
    No Model Conversion Interval: 36.82
Model Conversion Interval: 11.03

Threshold: 15%
No Model Conversions: 5840.0
Model Conversions: 5256.0
No Model Minutes used: 215000.0
Model Minutes Used: 50224.0
    No Model Conversion Interval: 36.82
Model Conversion Interval: 9.56

Threshold: 20%
No Model Conversions: 5840.0
Model Conversions: 5022.4
No Model Minutes used: 215000.0
Model Minutes Used: 42345.73
    No Model Conversion Interval: 36.82
Model Conv

While this view is approachable. It still lacks priority. Here we can see that if we were to simply run everybody through a specific threshold we can do a single split of customers to reduce our time. But we can actively seek prioritization through tiering the customers.

In [155]:
X_whole = pd.concat([X_train, X_test])
print(X_whole.shape)
y_whole = pd.concat([y_train, y_test])
print(y_whole.shape)

(49732, 40)
(49732,)


In [156]:
rfc_whole_proba = rfc_best.predict_proba(X_whole)
X_whole['probability'] = rfc_whole_proba[:,1]
X_whole['probability'] = X_whole['probability'].round(3)
X_whole.head()

Unnamed: 0,age,avg_yearly_balance,day,month,duration,campaign_contacts,prev_days,previous_contacts,in_default_no,in_default_yes,...,contact_method_unknown,prev_outcome_failure,prev_outcome_other,prev_outcome_success,prev_outcome_unknown,housing_loan_no,housing_loan_yes,personal_loan_no,personal_loan_yes,probability
13976,0.363636,-0.127183,10,7,0.023148,0.0,0.0,0.0,1,0,...,0,0,0,0,1,0,1,1,0,0.001
42022,0.467532,0.208785,28,10,0.555556,0.0,0.0,0.0,1,0,...,0,0,0,0,1,1,0,1,0,0.14
17593,0.428571,-0.177908,29,7,-0.314815,0.016129,0.0,0.0,1,0,...,0,0,0,0,1,1,0,1,0,0.003
24489,0.350649,1.153464,17,11,-0.546296,0.0,132.0,0.003636,1,0,...,0,1,0,0,0,0,1,1,0,0.001
44180,0.155844,-0.168351,14,7,0.75463,0.129032,0.0,0.0,1,0,...,0,0,0,0,1,1,0,1,0,0.04


In [172]:
X_whole['tier'] = pd.cut(X_whole['probability'],[0, 0.018, 0.4, 0.6, 1.0],labels=['tier_4', 'tier_3', 'tier_2', 'tier_1'])
df_tiered = X_whole.copy()
df_tiered['term_deposit'] = y_whole
df_tiered.head()

Unnamed: 0,age,avg_yearly_balance,day,month,duration,campaign_contacts,prev_days,previous_contacts,in_default_no,in_default_yes,...,prev_outcome_other,prev_outcome_success,prev_outcome_unknown,housing_loan_no,housing_loan_yes,personal_loan_no,personal_loan_yes,probability,tier,term_deposit
13976,0.363636,-0.127183,10,7,0.023148,0.0,0.0,0.0,1,0,...,0,0,1,0,1,1,0,0.001,tier_4,no
42022,0.467532,0.208785,28,10,0.555556,0.0,0.0,0.0,1,0,...,0,0,1,1,0,1,0,0.14,tier_3,no
17593,0.428571,-0.177908,29,7,-0.314815,0.016129,0.0,0.0,1,0,...,0,0,1,1,0,1,0,0.003,tier_4,no
24489,0.350649,1.153464,17,11,-0.546296,0.0,132.0,0.003636,1,0,...,0,0,0,0,1,1,0,0.001,tier_4,no
44180,0.155844,-0.168351,14,7,0.75463,0.129032,0.0,0.0,1,0,...,0,0,1,1,0,1,0,0.04,tier_3,no


In [173]:
df_tiered.tier.loc[df_tiered['term_deposit'] == 'yes'].value_counts().sort_index()

tier_4       9
tier_3     681
tier_2     584
tier_1    4536
Name: tier, dtype: int64

In [174]:
df_tiered.tier.loc[df_tiered['term_deposit'] == 'no'].value_counts().sort_index()

tier_4    24819
tier_3    14903
tier_2      417
tier_1      107
Name: tier, dtype: int64

In [175]:
print("Tier 4: " + str(round((9/24828),4)))
print("Tier 3: " + str(round((681/15584),4)))
print("Tier 2: " + str(round((584/1001),4)))
print("Tier 1: " + str(round((4536/4643),4)))

Tier 4: 0.0004
Tier 3: 0.0437
Tier 2: 0.5834
Tier 1: 0.977


Compared to the previous model, this has higher probabiltiy accuracy and therfore a stronger tier 1, with less customers in the tier 2 and 3 sections and the vast majority in tier 4. Here we actually see a weird change where the tier 3 customers have more actual conversions than tier 2, but the proportion is much smaller compared to the tier 2.

With this new campaign set up the company will be able to prioritize customers using this model as well as immensely cut down on the hours used if chosen to move forward with it. I believe this is better than near blind outgoing calls.