In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
#from sklearn.metrics import roc_auc_score

In [2]:
ca = pd.read_csv('banking.csv', header=0)
ca = ca.dropna()
print(ca.shape)
print(list(ca.columns))

(41188, 21)
['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'emp_var_rate', 'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed', 'y']


In [3]:
base = ca[['y']]

In [4]:
## identify categorical variables
cat = ca.select_dtypes(exclude=['int', 'float']).columns
print(len(cat))
print('Number of categoriacal features:', len(cat))
CatVar=ca[cat]

Nl=[]
for column in CatVar.columns:
    if CatVar[column].isnull().values.any() == True:
        Nl.append(column)
    
print('columns with missing values:', Nl)

10
Number of categoriacal features: 10
columns with missing values: []


In [5]:
Cat_dummies = pd.get_dummies(CatVar)
print(CatVar.shape)
print(Cat_dummies.shape)

(41188, 10)
(41188, 53)


In [6]:
## Identify continuous variables
cont1 = ca.select_dtypes(include=['int', 'float']).columns
print('initial number of continuous features: ',len(cont1))


## Exclude those previously classified as dummies
cont4=set(cont1).difference(cat)
cont=set(cont4).difference(base)
print('initial number of continuous features after exclusions: ',len(cont))
print()

ContVar= ca[cont]

Nl3=[]
for column in ContVar.columns:
    if ContVar[column].isnull().values.any() == True:
        Nl3.append(column)
    
print('features with missing values: ',Nl3)

initial number of continuous features:  11
initial number of continuous features after exclusions:  10

features with missing values:  []


In [7]:
print(base.shape)
print(ContVar.shape)
print(Cat_dummies.shape)

(41188, 1)
(41188, 10)
(41188, 53)


In [8]:
df = pd.concat([base,ContVar,Cat_dummies,],axis=1)
df.shape
df.to_csv('banking_campaign.csv')

In [9]:
X = df.iloc[:,1:]
y = df.iloc[:,0]

In [10]:
from imblearn.over_sampling import SMOTE
os = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns
os_data_X,os_data_y=os.fit_sample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
os_data_y= pd.DataFrame(data=os_data_y,columns=['y'])

print("length of oversampled data is ",len(os_data_X))
print("Number of no subscription in oversampled data",len(os_data_y[os_data_y['y']==0]))
print("Number of subscription",len(os_data_y[os_data_y['y']==1]))
print("Proportion of no subscription data in oversampled data is ",len(os_data_y[os_data_y['y']==0])/len(os_data_X))
print("Proportion of subscription data in oversampled data is ",len(os_data_y[os_data_y['y']==1])/len(os_data_X))

Using TensorFlow backend.


length of oversampled data is  51134
Number of no subscription in oversampled data 25567
Number of subscription 25567
Proportion of no subscription data in oversampled data is  0.5
Proportion of subscription data in oversampled data is  0.5


In [11]:
GBM = GradientBoostingClassifier(learning_rate=0.01, 
                                      n_estimators=1000,
                                      max_depth=6, 
                                      min_samples_split=50, 
                                      min_samples_leaf=25, 
                                      subsample=0.8,
                                      max_features=10, 
                                      random_state=10)
GBM.fit(X_train,y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.01, loss='deviance', max_depth=6,
                           max_features=10, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=25, min_samples_split=50,
                           min_weight_fraction_leaf=0.0, n_estimators=1000,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=10, subsample=0.8, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [12]:
print('Accuracy of the GBM on test set: {:.3f}'.format(GBM.score(X_test, y_test)))
pred=GBM.predict(X_test)
print(classification_report(y_test, pred))
from sklearn.metrics import roc_auc_score
print (roc_auc_score(y_test, pred))

Accuracy of the GBM on test set: 0.919
              precision    recall  f1-score   support

           0       0.94      0.97      0.95     10981
           1       0.66      0.54      0.60      1376

    accuracy                           0.92     12357
   macro avg       0.80      0.75      0.78     12357
weighted avg       0.91      0.92      0.92     12357

0.7549541504564968


In [13]:
y_pred2=GBM.predict_proba(X_test)

In [14]:
def lift (test, pred, cardinaility):

    res = pd.DataFrame(np.column_stack((test, pred)),
                       columns=['Target','PR_0', 'PR_1'])

    res['scr_grp'] = pd.qcut(res['PR_0'], cardinaility, labels=False)+1

    crt = pd.crosstab(res.scr_grp, res.Target).reset_index()
    crt = crt.rename(columns= {'Target':'Np',0.0: 'Negatives', 1.0: 'Positives'})

    G = crt['Positives'].sum()
    B = crt['Negatives'].sum()
   
    avg_resp_rate = G/(G+B)

    crt['resp_rate'] = round(crt['Positives']/(crt['Positives']+crt['Negatives']),2)
    crt['lift'] = round((crt['resp_rate']/avg_resp_rate),2)
    crt['rand_resp'] = 1/cardinaility
    crt['cmltv_p'] = round((crt['Positives']).cumsum(),2)
    crt['cmltv_p_perc'] = round(((crt['Positives']/G).cumsum())*100,1)
    crt['cmltv_n'] = round((crt['Negatives']).cumsum(),2)  
    crt['cmltv_n_perc'] = round(((crt['Negatives']/B).cumsum())*100,1)   
    crt['cmltv_rand_p_perc'] = (crt.rand_resp.cumsum())*100
    crt['cmltv_resp_rate'] = round(crt['cmltv_p']/(crt['cmltv_p']+crt['cmltv_n']),2)   
    crt['cmltv_lift'] = round(crt['cmltv_resp_rate']/avg_resp_rate,2)
    crt['KS']=round(crt['cmltv_p_perc']-crt['cmltv_rand_p_perc'],2)
    crt = crt.drop(['rand_resp','cmltv_p','cmltv_n',], axis=1)
    
    print('average response rate: ' , avg_resp_rate)
    return crt

In [15]:
ModelLift = lift(y_test,y_pred2,10)

average response rate:  0.11135388848425994


In [16]:
ModelLift

Target,scr_grp,Negatives,Positives,resp_rate,lift,cmltv_p_perc,cmltv_n_perc,cmltv_rand_p_perc,cmltv_resp_rate,cmltv_lift,KS
0,1,433,803,0.65,5.84,58.4,3.9,10.0,0.65,5.84,48.4
1,2,831,405,0.33,2.96,87.8,11.5,20.0,0.49,4.4,67.8
2,3,1100,135,0.11,0.99,97.6,21.5,30.0,0.36,3.23,67.6
3,4,1215,21,0.02,0.18,99.1,32.6,40.0,0.28,2.51,59.1
4,5,1229,7,0.01,0.09,99.6,43.8,50.0,0.22,1.98,49.6
5,6,1232,3,0.0,0.0,99.9,55.0,60.0,0.19,1.71,39.9
6,7,1235,1,0.0,0.0,99.9,66.3,70.0,0.16,1.44,29.9
7,8,1235,0,0.0,0.0,99.9,77.5,80.0,0.14,1.26,19.9
8,9,1236,0,0.0,0.0,99.9,88.8,90.0,0.12,1.08,9.9
9,10,1235,1,0.0,0.0,100.0,100.0,100.0,0.11,0.99,0.0


In [17]:
dec = ['Decile 1','Decile 2','Decile 3','Decile 4','Decile 5','Decile 6','Decile 7','Decile 8','Decile 9','Decile 10',]
MLift=ModelLift[['Positives','Negatives','cmltv_lift','KS']].copy()
MLift.index = (dec)

In [18]:
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
MLift[['Positives','Negatives']].iplot(kind='bar',yTitle='Volume',xTitle='Model decile', title='Positives & Negatives by model decile')

In [19]:
MLift[['cmltv_lift']].iplot(kind='bar',color='LightSkyBlue',yTitle='Lift',xTitle='Model decile', title='Cumulative Lift', yrange=[1.11, 6])

In [20]:
MLift[['KS']].iplot(kind='bar',color='DarkSlateGrey', yTitle='Separation',xTitle='Model decile', title='Target separation')