In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split


In [2]:
df_ori = pd.DataFrame(pd.read_csv('Peruvian_Bank_Data/clean_df.csv'))
df_ori.head()

Unnamed: 0,age,job,marital,education,in_default,avg_yearly_balance,housing_loan,personal_loan,contact_method,day,month,duration,campaign_contacts,prev_days,previous_contacts,prev_outcome,term_deposit
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,5,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,5,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,5,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,5,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,5,198,1,-1,0,unknown,no


In [3]:
#looking to get a coefficient reading so we are going to process the data with normalization and throw into logreg
df = df_ori.copy()
df = pd.get_dummies(data = df, columns = ['in_default', 'job', 'marital', 'education', 'contact_method', 'prev_outcome', 'housing_loan', 'personal_loan']) 
df['term_deposit'] = df['term_deposit'].map( 
                   {'yes': 1 ,'no': 0}) 
df_X = df.drop(columns = ['term_deposit'])
df_y = df['term_deposit']
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size = 0.3, random_state = 44)

In [4]:
#normalizing the X sets
X_train = normalize(X_train, axis = 0)
X_test = normalize(X_test, axis = 0)

In [5]:
X_train = pd.DataFrame(data = X_train, columns = df_X.columns)
X_test = pd.DataFrame(data = X_test, columns = df_X.columns)

In [6]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logregcoefs = abs(logreg.coef_)

In [7]:
df_coefs = pd.DataFrame(data = logregcoefs, columns = df_X.columns)
#df_coefs = normalize(df_coefs)
#df_coefs = pd.DataFrame(data = df_coefs, columns = df_X.columns)
df_coefs.head()

Unnamed: 0,age,avg_yearly_balance,day,month,duration,campaign_contacts,prev_days,previous_contacts,in_default_no,in_default_yes,...,contact_method_telephone,contact_method_unknown,prev_outcome_failure,prev_outcome_other,prev_outcome_success,prev_outcome_unknown,housing_loan_no,housing_loan_yes,personal_loan_no,personal_loan_yes
0,0.251788,2.392378,0.651134,0.315657,15.494386,2.840783,4.220843,3.459741,0.12717,0.956256,...,0.811715,6.266501,0.114717,1.464113,14.607603,3.237911,5.09811,4.541883,1.277424,2.939503


In [8]:
#preprocessing the data more appropriately for Random Forest
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report

def preproc(dataframe, column, scalertype):
    X = pd.DataFrame(dataframe[column])
    scaler = scalertype.fit_transform(X)
    dataframe[column] = scaler
    
df_X = df.drop(columns = ['term_deposit'])
df_y = df['term_deposit']
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size = 0.3, random_state = 44)
    
preproc(X_train, 'avg_yearly_balance', RobustScaler())
preproc(X_train, 'duration', RobustScaler())
preproc(X_train, 'prev_days', RobustScaler())
preproc(X_train, 'campaign_contacts', MinMaxScaler())
preproc(X_train, 'previous_contacts', MinMaxScaler())
preproc(X_train, 'age', MinMaxScaler())
preproc(X_train, 'campaign_contacts', MinMaxScaler())

preproc(X_test, 'avg_yearly_balance', RobustScaler())
preproc(X_test, 'duration', RobustScaler())
preproc(X_test, 'prev_days', RobustScaler())
preproc(X_test, 'campaign_contacts', MinMaxScaler())
preproc(X_test, 'previous_contacts', MinMaxScaler())
preproc(X_test, 'age', MinMaxScaler())
preproc(X_test, 'campaign_contacts', MinMaxScaler())

print(X_train.head())
print(X_test.head())

            age  avg_yearly_balance  day  month  duration  campaign_contacts  \
13976  0.363636           -0.127183   10      7  0.023148           0.000000   
42022  0.467532            0.208785   28     10  0.555556           0.000000   
17593  0.428571           -0.177908   29      7 -0.314815           0.016129   
24489  0.350649            1.153464   17     11 -0.546296           0.000000   
44180  0.155844           -0.168351   14      7  0.754630           0.129032   

       prev_days  previous_contacts  in_default_no  in_default_yes  ...  \
13976        0.0           0.000000              1               0  ...   
42022        0.0           0.000000              1               0  ...   
17593        0.0           0.000000              1               0  ...   
24489      132.0           0.003636              1               0  ...   
44180        0.0           0.000000              1               0  ...   

       contact_method_telephone  contact_method_unknown  prev_outcom

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[column] = scaler
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[column] = scaler
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[column] = scaler
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead



In [9]:
y_train

13976    0
42022    0
17593    0
24489    0
44180    0
        ..
4180     0
49723    0
25773    0
3491     0
14100    0
Name: term_deposit, Length: 34812, dtype: int64

In [11]:
#Grid Search
parameter_grid = {'n_estimators' : [100,300, 500], 'max_depth' : [10, 20, 30], 'max_features' : ['auto']}
rfc = RandomForestClassifier()
grid = GridSearchCV(estimator = rfc, param_grid = parameter_grid, scoring = 'accuracy').fit(X_train, y_train)
grid.best_params_
grid.cv_results_


{'mean_fit_time': array([ 1.46308861,  4.32523694,  7.26437893,  2.1360888 ,  6.37157197,
        11.39557257,  2.36228423,  6.96554766, 11.43781905]),
 'std_fit_time': array([0.04729071, 0.0508443 , 0.11147033, 0.05591604, 0.06730301,
        0.13819222, 0.08973281, 0.24165591, 0.15522231]),
 'mean_score_time': array([0.06761937, 0.19767179, 0.33909383, 0.10033751, 0.29400859,
        0.53995628, 0.11389594, 0.32473121, 0.5431478 ]),
 'std_score_time': array([0.0022217 , 0.00212935, 0.01255193, 0.00256678, 0.00353981,
        0.05299072, 0.01027601, 0.02298196, 0.0103184 ]),
 'param_max_depth': masked_array(data=[10, 10, 10, 20, 20, 20, 30, 30, 30],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_max_features': masked_array(data=['auto', 'auto', 'auto', 'auto', 'auto', 'auto', 'auto',
                    'auto', 'auto'],
              mask=[False, False, False, False, Fa

In [12]:
grid.best_params_

{'max_depth': 30, 'max_features': 'auto', 'n_estimators': 500}

In [13]:
rfc_best = RandomForestClassifier(max_depth = 30, max_features = 'auto', n_estimators = 500)
rfc_best.fit(X_train, y_train)
rfc_predictions = rfc_best.predict(X_test)
rfc_train_score = rfc_best.score(X_train, y_train)
rfc_test_score = rfc_best.score(X_test, y_test)
print("Train Score: {}, Test Score: {} \n".format(rfc_train_score, rfc_test_score))
print("\n{}".format(classification_report(y_test, rfc_predictions)))

Train Score: 1.0, Test Score: 0.9166890080428954 


              precision    recall  f1-score   support

           0       0.93      0.98      0.95     13107
           1       0.77      0.45      0.57      1813

    accuracy                           0.92     14920
   macro avg       0.85      0.72      0.76     14920
weighted avg       0.91      0.92      0.91     14920



So this model is based on the choice to achieve accuracy. We are going to run the grid search again to get recall this time.

In [16]:
grid_re = GridSearchCV(estimator = rfc, param_grid = parameter_grid, scoring = 'recall').fit(X_train, y_train)
grid_re.best_params_
grid_re.cv_results_

{'mean_fit_time': array([ 1.53469234,  4.66652684,  7.69447403,  2.24199538,  6.61830096,
        10.95869727,  2.32279663,  6.78896213, 11.62751532]),
 'std_fit_time': array([0.05223219, 0.0512734 , 0.12506252, 0.04114195, 0.05859671,
        0.11949019, 0.04980688, 0.2204991 , 0.46605863]),
 'mean_score_time': array([0.07300954, 0.22499309, 0.36043577, 0.10752401, 0.32393408,
        0.51422567, 0.11169515, 0.32892251, 0.55153069]),
 'std_score_time': array([0.00317435, 0.01362751, 0.00573242, 0.00146155, 0.02696389,
        0.01388545, 0.00274463, 0.00494165, 0.02564732]),
 'param_max_depth': masked_array(data=[10, 10, 10, 20, 20, 20, 30, 30, 30],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_max_features': masked_array(data=['auto', 'auto', 'auto', 'auto', 'auto', 'auto', 'auto',
                    'auto', 'auto'],
              mask=[False, False, False, False, Fa

In [17]:
grid_re.best_params_

{'max_depth': 30, 'max_features': 'auto', 'n_estimators': 300}

In [18]:
rfc_recall = RandomForestClassifier(max_depth = 30, max_features = 'auto', n_estimators = 300)
rfc_recall.fit(X_train, y_train)
rfc_recall_predictions = rfc_recall.predict(X_test)
rfc_recall_train_score = rfc_recall.score(X_train, y_train)
rfc_recall_test_score = rfc_recall.score(X_test, y_test)
print("Train Score: {}, Test Score: {} \n".format(rfc_recall_train_score, rfc_recall_test_score))
print("\n{}".format(classification_report(y_test, rfc_recall_predictions)))

Train Score: 1.0, Test Score: 0.9168230563002681 


              precision    recall  f1-score   support

           0       0.93      0.98      0.95     13107
           1       0.76      0.46      0.57      1813

    accuracy                           0.92     14920
   macro avg       0.85      0.72      0.76     14920
weighted avg       0.91      0.92      0.91     14920



It seems like the n_estimators moving down to 300, did not improve the model as much as would be important. So we will stick with the original best parameters for precision. From here though, as done before, we will lower the probability threshold in order to find the new threshold where we can get all customers into the correct tiers based on their likelihood to subscribe to the term deposit.

In [19]:
proba_thresh_list = [0.01, 0.025, 0.05,0.075,0.1,0.15,0.2,0.25,0.3]
rfc_best_proba = rfc_best.predict_proba(X_test)
for i in proba_thresh_list:
    rfc_best_lower_thresh = (rfc_best_proba[:,1] >i).astype('int')
    print("Probability Threshold: {}. \n{}".format(i, classification_report((y_test), rfc_best_lower_thresh)))

Probability Threshold: 0.01. 
              precision    recall  f1-score   support

           0       1.00      0.41      0.58     13107
           1       0.19      1.00      0.32      1813

    accuracy                           0.48     14920
   macro avg       0.59      0.70      0.45     14920
weighted avg       0.90      0.48      0.55     14920

Probability Threshold: 0.025. 
              precision    recall  f1-score   support

           0       1.00      0.57      0.72     13107
           1       0.24      0.99      0.39      1813

    accuracy                           0.62     14920
   macro avg       0.62      0.78      0.55     14920
weighted avg       0.91      0.62      0.68     14920

Probability Threshold: 0.05. 
              precision    recall  f1-score   support

           0       1.00      0.69      0.81     13107
           1       0.30      0.98      0.46      1813

    accuracy                           0.72     14920
   macro avg       0.65      0.83    

In [20]:
#This model does perform noticeably better than the previous rfc model. Let's go ahead and get to the nitty details
for i in np.arange(0.01, 0.025, 0.001): 
    rfc_best_lower_thresh = (rfc_best_proba[:,1] >i).astype('int')
    print("Probability Threshold: {}. \n{}".format(i, classification_report((y_test), rfc_best_lower_thresh)))

Probability Threshold: 0.01. 
              precision    recall  f1-score   support

           0       1.00      0.41      0.58     13107
           1       0.19      1.00      0.32      1813

    accuracy                           0.48     14920
   macro avg       0.59      0.70      0.45     14920
weighted avg       0.90      0.48      0.55     14920

Probability Threshold: 0.011. 
              precision    recall  f1-score   support

           0       1.00      0.42      0.59     13107
           1       0.19      1.00      0.32      1813

    accuracy                           0.49     14920
   macro avg       0.60      0.71      0.46     14920
weighted avg       0.90      0.49      0.56     14920

Probability Threshold: 0.011999999999999999. 
              precision    recall  f1-score   support

           0       1.00      0.42      0.59     13107
           1       0.19      1.00      0.32      1813

    accuracy                           0.49     14920
   macro avg       0.

In [22]:
#bringing in the tradeoff report
def trade_off_report(rf_tuple, cust = 50000, conv_perc = 0.1168, avg_dur = 4.3):
    nm_conv = round((cust * conv_perc),2)
    nm_dur_total = round((cust * avg_dur),2)
    nm_conv_interval = round((nm_dur_total / nm_conv),2)
    m_conv = round((nm_conv * rf_tuple[2]),2)
    m_dur_total = round(((m_conv / rf_tuple[1]) * avg_dur),2) #getting all customers that would be called for 'yes', then multiplying by duration
    m_conv_interval = round((m_dur_total / m_conv),2)
    print("Threshold: {}\nNo Model Conversions: {}\nModel Conversions: {}\nNo Model Minutes used: {}\nModel Minutes Used: {}\n\
    No Model Conversion Interval: {}\nModel Conversion Interval: {}\n".format(rf_tuple[0], nm_conv, m_conv, nm_dur_total, m_dur_total, nm_conv_interval, m_conv_interval))

rf_013 = ("1.3% revised", 0.20, 1.0)
rf_018 = ('1.8% old', 0.19, 1.0)

trade_off_report(rf_013)
trade_off_report(rf_018)

Threshold: 1.3% revised
No Model Conversions: 5840.0
Model Conversions: 5840.0
No Model Minutes used: 215000.0
Model Minutes Used: 125560.0
    No Model Conversion Interval: 36.82
Model Conversion Interval: 21.5

Threshold: 1.8% old
No Model Conversions: 5840.0
Model Conversions: 5840.0
No Model Minutes used: 215000.0
Model Minutes Used: 132168.42
    No Model Conversion Interval: 36.82
Model Conversion Interval: 22.63



In [23]:
print("Revised additional minute savings = " + str(132168.42 - 125560.0))

Revised additional minute savings = 6608.420000000013


So this is at our best recall. But, the company may be interested in more focused iterations. So let's go ahead and run a few more trade_off_reports


In [24]:
rf_013 = ("1.3%", 0.2, 1.0)
rf_05 = ('5%', 0.3, 0.98)
rf_10 = ('10%', 0.39, 0.94)
rf_15 = ('15%', 0.45, 0.9)
rf_20 = ('20%', 0.51, 0.85)
rf_25 = ('25%', 0.56, 0.80)
rf_30 = ('30%', 0.6, 0.74)

tuple_list = [rf_013, rf_05,rf_10, rf_15, rf_20, rf_25, rf_30]
for i in tuple_list:
    trade_off_report(i)

Threshold: 1.3%
No Model Conversions: 5840.0
Model Conversions: 5840.0
No Model Minutes used: 215000.0
Model Minutes Used: 125560.0
    No Model Conversion Interval: 36.82
Model Conversion Interval: 21.5

Threshold: 5%
No Model Conversions: 5840.0
Model Conversions: 5723.2
No Model Minutes used: 215000.0
Model Minutes Used: 82032.53
    No Model Conversion Interval: 36.82
Model Conversion Interval: 14.33

Threshold: 10%
No Model Conversions: 5840.0
Model Conversions: 5489.6
No Model Minutes used: 215000.0
Model Minutes Used: 60526.36
    No Model Conversion Interval: 36.82
Model Conversion Interval: 11.03

Threshold: 15%
No Model Conversions: 5840.0
Model Conversions: 5256.0
No Model Minutes used: 215000.0
Model Minutes Used: 50224.0
    No Model Conversion Interval: 36.82
Model Conversion Interval: 9.56

Threshold: 20%
No Model Conversions: 5840.0
Model Conversions: 4964.0
No Model Minutes used: 215000.0
Model Minutes Used: 41853.33
    No Model Conversion Interval: 36.82
Model Conver

Here we see the direct exchange between using the model to focus down on likely conversions, and then using the model to implement speed and reduce conversion intervals. My personal focus was to get the same amount of conversions while reducing time, which is why I have been focusing on the threshold and recall, however, if speed is necessary and time is limited it is possible to settle for less conversions, less minutes used, and less time between conversions at the cost of missing a lot of conversions. But who's to say the company couldn't just loop back and get the others when the time is abundant again. 

In [25]:
X_whole = pd.concat([X_train, X_test])
print(X_whole.shape)
y_whole = pd.concat([y_train, y_test])
print(y_whole.shape)

(49732, 40)
(49732,)


In [26]:
rfc_whole_proba = rfc_best.predict_proba(X_whole)
X_whole['probability'] = rfc_whole_proba[:,1]
X_whole['probability'] = X_whole['probability'].round(3)
X_whole.head()

Unnamed: 0,age,avg_yearly_balance,day,month,duration,campaign_contacts,prev_days,previous_contacts,in_default_no,in_default_yes,...,contact_method_unknown,prev_outcome_failure,prev_outcome_other,prev_outcome_success,prev_outcome_unknown,housing_loan_no,housing_loan_yes,personal_loan_no,personal_loan_yes,probability
13976,0.363636,-0.127183,10,7,0.023148,0.0,0.0,0.0,1,0,...,0,0,0,0,1,0,1,1,0,0.0
42022,0.467532,0.208785,28,10,0.555556,0.0,0.0,0.0,1,0,...,0,0,0,0,1,1,0,1,0,0.134
17593,0.428571,-0.177908,29,7,-0.314815,0.016129,0.0,0.0,1,0,...,0,0,0,0,1,1,0,1,0,0.0
24489,0.350649,1.153464,17,11,-0.546296,0.0,132.0,0.003636,1,0,...,0,1,0,0,0,0,1,1,0,0.002
44180,0.155844,-0.168351,14,7,0.75463,0.129032,0.0,0.0,1,0,...,0,0,0,0,1,1,0,1,0,0.038


In [30]:
X_whole['tier'] = pd.cut(X_whole['probability'],[0, 0.013, 0.4, 0.6, 1.0],labels=['tier_4', 'tier_3', 'tier_2', 'tier_1'])
df_tiered = X_whole.copy()
df_tiered['term_deposit'] = y_whole
df_tiered.head()

Unnamed: 0,age,avg_yearly_balance,day,month,duration,campaign_contacts,prev_days,previous_contacts,in_default_no,in_default_yes,...,prev_outcome_other,prev_outcome_success,prev_outcome_unknown,housing_loan_no,housing_loan_yes,personal_loan_no,personal_loan_yes,probability,tier,term_deposit
13976,0.363636,-0.127183,10,7,0.023148,0.0,0.0,0.0,1,0,...,0,0,1,0,1,1,0,0.0,,0
42022,0.467532,0.208785,28,10,0.555556,0.0,0.0,0.0,1,0,...,0,0,1,1,0,1,0,0.134,tier_3,0
17593,0.428571,-0.177908,29,7,-0.314815,0.016129,0.0,0.0,1,0,...,0,0,1,1,0,1,0,0.0,,0
24489,0.350649,1.153464,17,11,-0.546296,0.0,132.0,0.003636,1,0,...,0,0,0,0,1,1,0,0.002,tier_4,0
44180,0.155844,-0.168351,14,7,0.75463,0.129032,0.0,0.0,1,0,...,0,0,1,1,0,1,0,0.038,tier_3,0


In [31]:
df_tiered.tier.loc[df_tiered['term_deposit'] == 1].value_counts().sort_index()

tier_4       9
tier_3     701
tier_2     570
tier_1    4530
Name: tier, dtype: int64

In [32]:
df_tiered.tier.loc[df_tiered['term_deposit'] == 0].value_counts().sort_index()

tier_4    17133
tier_3    17565
tier_2      421
tier_1      109
Name: tier, dtype: int64

In [34]:
print("Tier 4: " + str(round((9/17142),4)))
print("Tier 3: " + str(round((701/18266),4)))
print("Tier 2: " + str(round((570/991),4)))
print("Tier 1: " + str(round((4530/4639),4)))

Tier 4: 0.0005
Tier 3: 0.0384
Tier 2: 0.5752
Tier 1: 0.9765


In [35]:
#Just to make sure
df_tiered.loc[df_tiered['tier'] == 'tier_1']
#we see here that from our quick pull, all of the listed tier_1 customers in sight have submitted to term deposits

Unnamed: 0,age,avg_yearly_balance,day,month,duration,campaign_contacts,prev_days,previous_contacts,in_default_no,in_default_yes,...,prev_outcome_other,prev_outcome_success,prev_outcome_unknown,housing_loan_no,housing_loan_yes,personal_loan_no,personal_loan_yes,probability,tier,term_deposit
40566,0.207792,-0.273479,10,7,0.722222,0.048387,0.0,0.000000,1,0,...,0,0,1,1,0,1,0,0.636,tier_1,1
38527,0.311688,-0.154383,15,5,3.851852,0.032258,0.0,0.000000,1,0,...,0,0,1,0,1,1,0,0.870,tier_1,1
48450,0.441558,1.202720,31,8,0.402778,0.000000,475.0,0.003636,1,0,...,0,0,0,0,1,1,0,0.940,tier_1,1
41072,0.584416,0.327146,14,8,0.444444,0.016129,0.0,0.000000,1,0,...,0,0,1,0,1,1,0,0.730,tier_1,1
44465,0.246753,0.083808,6,8,-0.027778,0.000000,94.0,0.010909,1,0,...,0,0,0,1,0,1,0,0.757,tier_1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28751,0.311688,0.364578,30,1,3.420091,0.000000,0.0,0.000000,1,0,...,0,0,1,1,0,1,0,0.765,tier_1,1
46947,0.285714,-0.253651,11,8,0.178082,0.000000,462.0,0.017241,1,0,...,0,1,0,0,1,1,0,0.656,tier_1,1
47139,0.051948,2.239970,20,10,0.684932,0.000000,184.0,0.051724,1,0,...,0,0,0,1,0,1,0,0.702,tier_1,1
41147,0.480519,1.105565,18,8,0.050228,0.000000,387.0,0.051724,1,0,...,0,1,0,1,0,0,1,0.616,tier_1,1


Compared to the previous model, this has higher probabiltiy accuracy and therfore a stronger tier 1, with less customers in the tier 2 and 3 sections and the vast majority in tier 4. In fact, if just the tier_1 customers were picked, we would have approximately 80% of all term deposits potential, with almost no wasted reachouts. 

With this new campaign set up the company will be able to prioritize customers using this model as well as immensely cut down on the hours used if chosen to move forward with it. I believe this is better than near blind outgoing calls.