In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split


In [2]:
df_ori = pd.DataFrame(pd.read_csv('Peruvian_Bank_Data/clean_df.csv'))
df_ori.head()


Unnamed: 0,age,job,marital,education,in_default,avg_yearly_balance,housing_loan,personal_loan,contact_method,day,month,duration,campaign_contacts,prev_days,previous_contacts,prev_outcome,term_deposit
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,5,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,5,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,5,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,5,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,5,198,1,-1,0,unknown,no


In [3]:
#looking to get a coefficient reading so we are going to process the data with normalization and throw into logreg
df = df_ori.copy()
df = pd.get_dummies(data = df, columns = ['in_default', 'job', 'marital', 'education', 'contact_method', 'prev_outcome', 'housing_loan', 'personal_loan']) 
df.term_deposit.replace(('yes', 'no'), (1, 0), inplace=True)
df_X = df.drop(columns = ['term_deposit'])
df_y = df['term_deposit']
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size = 0.3, random_state = 44)

In [4]:
#normalizing the X sets
X_train = normalize(X_train, axis = 0)
X_test = normalize(X_test, axis = 0)

In [5]:
X_train = pd.DataFrame(data = X_train, columns = df_X.columns)
X_test = pd.DataFrame(data = X_test, columns = df_X.columns)

In [6]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logregcoefs = abs(logreg.coef_)

In [7]:
df_coefs = pd.DataFrame(data = logregcoefs, columns = df_X.columns)
#df_coefs = normalize(df_coefs)
#df_coefs = pd.DataFrame(data = df_coefs, columns = df_X.columns)
df_coefs.head()

Unnamed: 0,age,avg_yearly_balance,day,month,duration,campaign_contacts,prev_days,previous_contacts,in_default_no,in_default_yes,...,contact_method_telephone,contact_method_unknown,prev_outcome_failure,prev_outcome_other,prev_outcome_success,prev_outcome_unknown,housing_loan_no,housing_loan_yes,personal_loan_no,personal_loan_yes
0,0.251788,2.392378,0.651134,0.315657,15.494386,2.840783,4.220843,3.459741,0.12717,0.956256,...,0.811715,6.266501,0.114717,1.464113,14.607603,3.237911,5.09811,4.541883,1.277424,2.939503


In [8]:
#preprocessing the data more appropriately for Random Forest
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report

def preproc(dataframe, column, scalertype):
    X = pd.DataFrame(dataframe[column])
    scaler = scalertype.fit_transform(X)
    dataframe[column] = scaler
    
df_X = df.drop(columns = ['term_deposit'])
df_y = df['term_deposit']
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size = 0.3, random_state = 44)
    
preproc(X_train, 'avg_yearly_balance', RobustScaler())
preproc(X_train, 'duration', RobustScaler())
preproc(X_train, 'prev_days', RobustScaler())
preproc(X_train, 'campaign_contacts', MinMaxScaler())
preproc(X_train, 'previous_contacts', MinMaxScaler())
preproc(X_train, 'age', MinMaxScaler())
preproc(X_train, 'campaign_contacts', MinMaxScaler())

preproc(X_test, 'avg_yearly_balance', RobustScaler())
preproc(X_test, 'duration', RobustScaler())
preproc(X_test, 'prev_days', RobustScaler())
preproc(X_test, 'campaign_contacts', MinMaxScaler())
preproc(X_test, 'previous_contacts', MinMaxScaler())
preproc(X_test, 'age', MinMaxScaler())
preproc(X_test, 'campaign_contacts', MinMaxScaler())

print(X_train.head())
print(X_test.head())

            age  avg_yearly_balance  day  month  duration  campaign_contacts  \
13976  0.363636           -0.127183   10      7  0.023148           0.000000   
42022  0.467532            0.208785   28     10  0.555556           0.000000   
17593  0.428571           -0.177908   29      7 -0.314815           0.016129   
24489  0.350649            1.153464   17     11 -0.546296           0.000000   
44180  0.155844           -0.168351   14      7  0.754630           0.129032   

       prev_days  previous_contacts  in_default_no  in_default_yes  ...  \
13976        0.0           0.000000              1               0  ...   
42022        0.0           0.000000              1               0  ...   
17593        0.0           0.000000              1               0  ...   
24489      132.0           0.003636              1               0  ...   
44180        0.0           0.000000              1               0  ...   

       contact_method_telephone  contact_method_unknown  prev_outcom

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice 

In [9]:
y_train.head()

13976    0
42022    0
17593    0
24489    0
44180    0
Name: term_deposit, dtype: int64

In [10]:
#Grid Search
parameter_grid = {'n_estimators' : [100,1000,5000], 'max_depth' : [10, 20, 30, 'None'], 'max_features' : ['auto', 'log2']}
rfc = RandomForestClassifier()
grid = GridSearchCV(estimator = rfc, param_grid = parameter_grid, scoring = 'accuracy').fit(X_train, y_train)
grid.best_params_
grid.cv_results_


KeyboardInterrupt: 

In [None]:
grid.best_params_

In [None]:
rfc_best = RandomForestClassifier(max_depth = 30, max_features = 'auto', n_estimators = 5000)
rfc_best.fit(X_train, y_train)
rfc_predictions = rfc_best.predict(X_test)
rfc_train_score = rfc_best.score(X_train, y_train)
rfc_test_score = rfc_best.score(X_test, y_test)
print("Train Score: {}, Test Score: {} \n".format(rfc_train_score, rfc_test_score))
print("\n{}".format(classification_report(y_test, rfc_predictions)))

So this model is based on the choice to achieve accuracy. We are going to run the grid search again to get recall this time.

In [None]:
grid_re = GridSearchCV(estimator = rfc, param_grid = parameter_grid, scoring = 'recall').fit(X_train, y_train == 'yes')
grid_re.best_params_
grid_re.cv_results_

In [None]:
grid_re.best_params_

Well it's reassuring to see that the original model went ahead and ended on the same best parameters. it's time to tweak the data a bit, like before. I want to be able to have the highest recall possible, but this time, seeing if we can't increase our accuracy while we do it.

In [None]:
proba_thresh_list = [0.01, 0.025, 0.05,0.075,0.1,0.15,0.2,0.25,0.3]
rfc_best_proba = rfc_best.predict_proba(X_test)
for i in proba_thresh_list:
    rfc_best_lower_thresh = (rfc_best_proba[:,1] >i).astype('int')
    print("Probability Threshold: {}. \n{}".format(i, classification_report((y_test == 'yes').astype('int'), rfc_best_lower_thresh)))

In [None]:
#This model does perform noticeably better than the previous rfc model. Let's go ahead and get to the nitty details
for i in np.arange(0.01, 0.025, 0.001): 
    rfc_best_lower_thresh = (rfc_best_proba[:,1] >i).astype('int')
    print("Probability Threshold: {}. \n{}".format(i, classification_report((y_test == 'yes').astype('int'), rfc_best_lower_thresh)))

In [None]:
#Still have the same breaking point at 1.8%, but here we have a higher precision of .22 vs .19, which is going to save a many calls
#bringing in the tradeoff report
def trade_off_report(rf_tuple, cust = 50000, conv_perc = 0.1168, avg_dur = 4.3):
    nm_conv = round((cust * conv_perc),2)
    nm_dur_total = round((cust * avg_dur),2)
    nm_conv_interval = round((nm_dur_total / nm_conv),2)
    m_conv = round((nm_conv * rf_tuple[2]),2)
    m_dur_total = round(((m_conv / rf_tuple[1]) * avg_dur),2) #getting all customers that would be called for 'yes', then multiplying by duration
    m_conv_interval = round((m_dur_total / m_conv),2)
    print("Threshold: {}\nNo Model Conversions: {}\nModel Conversions: {}\nNo Model Minutes used: {}\nModel Minutes Used: {}\n\
    No Model Conversion Interval: {}\nModel Conversion Interval: {}\n".format(rf_tuple[0], nm_conv, m_conv, nm_dur_total, m_dur_total, nm_conv_interval, m_conv_interval))

rf_018 = ("1.8% revised", 0.22, 1.0)
rf_018_old = ('1.8% old', 0.19, 1.0)

trade_off_report(rf_018)
trade_off_report(rf_018_old)

In [None]:
print("Revised additional minute savings = " + str(132168.42 - 114145.45))

So this is at our best recall. But, the company may be interested in more focused iterations. So let's go ahead and run a few more trade_off_reports


In [None]:
rf_018 = ("1.8%", 0.22, 1.0)
rf_05 = ('5%', 0.3, 0.98)
rf_10 = ('10%', 0.39, 0.95)
rf_15 = ('15%', 0.45, 0.9)
rf_20 = ('20%', 0.51, 0.86)
rf_25 = ('25%', 0.55, 0.81)
rf_30 = ('30%', 0.6, 0.75)

tuple_list = [rf_018, rf_05,rf_10, rf_15, rf_20, rf_25, rf_30]
for i in tuple_list:
    trade_off_report(i)

While this view is approachable. It still lacks priority. Here we can see that if we were to simply run everybody through a specific threshold we can do a single split of customers to reduce our time. But we can actively seek prioritization through tiering the customers.

In [None]:
X_whole = pd.concat([X_train, X_test])
print(X_whole.shape)
y_whole = pd.concat([y_train, y_test])
print(y_whole.shape)

In [None]:
rfc_whole_proba = rfc_best.predict_proba(X_whole)
X_whole['probability'] = rfc_whole_proba[:,1]
X_whole['probability'] = X_whole['probability'].round(3)
X_whole.head()

In [None]:
X_whole['tier'] = pd.cut(X_whole['probability'],[0, 0.018, 0.4, 0.6, 1.0],labels=['tier_4', 'tier_3', 'tier_2', 'tier_1'])
df_tiered = X_whole.copy()
df_tiered['term_deposit'] = y_whole
df_tiered.head()

In [None]:
df_tiered.tier.loc[df_tiered['term_deposit'] == 'yes'].value_counts().sort_index()

In [None]:
df_tiered.tier.loc[df_tiered['term_deposit'] == 'no'].value_counts().sort_index()

In [None]:
print("Tier 4: " + str(round((9/24828),4)))
print("Tier 3: " + str(round((681/15584),4)))
print("Tier 2: " + str(round((584/1001),4)))
print("Tier 1: " + str(round((4536/4643),4)))

Compared to the previous model, this has higher probabiltiy accuracy and therfore a stronger tier 1, with less customers in the tier 2 and 3 sections and the vast majority in tier 4. Here we actually see a weird change where the tier 3 customers have more actual conversions than tier 2, but the proportion is much smaller compared to the tier 2.

With this new campaign set up the company will be able to prioritize customers using this model as well as immensely cut down on the hours used if chosen to move forward with it. I believe this is better than near blind outgoing calls.