# Model lift calculation

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

## data import and cleansing

In [None]:
# Load the raw campaign data (first line is the header) and discard every row
# that contains a missing value, then report what is left.
ca = pd.read_csv('banking.csv', header=0).dropna()
print(ca.shape)
print(ca.columns.tolist())

In [None]:
# Target column 'y', kept as a single-column DataFrame.
base = ca.loc[:, ['y']]

In [None]:
## identify categorical variables
# Every column that is not int/float is treated as categorical.
cat = ca.select_dtypes(exclude=['int', 'float']).columns
print('Number of categorical features:', len(cat))
CatVar = ca[cat]

# Names of the categorical columns that still contain missing values.
Nl = [column for column in CatVar.columns if CatVar[column].isnull().any()]

print('columns with missing values:', Nl)

In [None]:
# One-hot encode the categorical columns and compare the shape before/after.
Cat_dummies = pd.get_dummies(data=CatVar)
for frame in (CatVar, Cat_dummies):
    print(frame.shape)

In [None]:
## Identify continuous variables
cont1 = ca.select_dtypes(include=['int', 'float']).columns
print('initial number of continuous features: ',len(cont1))


## Exclude those previously classified as dummies
# Ordered lists are used instead of sets: pandas no longer accepts a set as a
# column indexer, and a list keeps the column order identical on every run.
categorical_names = set(cat)
cont4 = [column for column in cont1 if column not in categorical_names]
## Exclude the target column(s) held in `base`
target_names = set(base.columns)
cont = [column for column in cont4 if column not in target_names]
print('initial number of continuous features after exclusions: ',len(cont))
print()

ContVar = ca[cont]

# Names of the continuous columns that still contain missing values.
Nl3 = [column for column in ContVar.columns if ContVar[column].isnull().any()]

print('features with missing values: ',Nl3)

In [None]:
# Shapes of the three pieces that are concatenated in the next step.
for part in (base, ContVar, Cat_dummies):
    print(part.shape)

In [None]:
# Assemble the modelling table: target first, then the continuous features,
# then the one-hot encoded categorical features.
df = pd.concat([base, ContVar, Cat_dummies], axis=1)
# A bare `df.shape` in the middle of a cell is discarded, so print it.
print(df.shape)
df.to_csv('banking_campaign.csv')

In [None]:
# The first column of df is the target; every remaining column is a feature.
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

In [None]:
from imblearn.over_sampling import SMOTE

# 70/30 train/test split; only the training part is oversampled so the test
# set keeps its original class balance.
os = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns
# `fit_resample` is the current imbalanced-learn API; the old `fit_sample`
# alias has been removed.
os_data_X, os_data_y = os.fit_resample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
# np.asarray makes this work whether the sampler returns an ndarray or a Series.
os_data_y = pd.DataFrame(data=np.asarray(os_data_y), columns=['y'])

# Class counts in the oversampled training data, computed once and reused.
n_total = len(os_data_X)
n_negative = len(os_data_y[os_data_y['y'] == 0])
n_positive = len(os_data_y[os_data_y['y'] == 1])

print("length of oversampled data is ", n_total)
print("Number of no subscription in oversampled data", n_negative)
print("Number of subscription", n_positive)
print("Proportion of no subscription data in oversampled data is ", n_negative / n_total)
print("Proportion of subscription data in oversampled data is ", n_positive / n_total)

##  gradient boosting model

In [None]:
# Configure the gradient boosting classifier and fit it on the training split.
# NOTE(review): the model is fitted on X_train / y_train; the SMOTE-oversampled
# os_data_X / os_data_y built earlier in this file are not used here - confirm
# that this is intended.
GBM = GradientBoostingClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    subsample=0.8,
    max_depth=6,
    max_features=10,
    min_samples_split=50,
    min_samples_leaf=25,
    random_state=10,
)
GBM.fit(X_train, y_train)

In [None]:
from sklearn.metrics import roc_auc_score

# Accuracy and the per-class precision/recall report use the hard class labels.
print('Accuracy of the GBM on test set: {:.3f}'.format(GBM.score(X_test, y_test)))
pred = GBM.predict(X_test)
print(classification_report(y_test, pred))
# ROC AUC is a ranking metric: it must be fed the predicted probability of the
# positive class (second predict_proba column), not the 0/1 labels, otherwise
# the curve collapses to a single operating point.
print(roc_auc_score(y_test, GBM.predict_proba(X_test)[:, 1]))

In [None]:
# Predicted class probabilities for the test set; passed to lift() below,
# which reads the two columns as PR_0 and PR_1.
y_pred2=GBM.predict_proba(X_test)

## model lift calculation and visualisation

In [None]:
def lift(test, pred, cardinaility):
    """Build a lift / gains table for a binary classifier.

    Observations are ordered by the predicted probability of class 0
    (ascending) and cut into ``cardinaility`` equally sized score groups, so
    score group 1 holds the observations the model rates most likely to be
    positive. The overall response rate is printed as a side effect.

    Args:
        test: 1-D array-like of true labels coded 0 (negative) / 1 (positive).
        pred: 2-D array-like with exactly two columns, the predicted
            probability of class 0 and of class 1 (e.g. ``predict_proba``).
        cardinaility: Number of score groups (10 gives deciles).

    Returns:
        pandas.DataFrame with one row per score group and the columns
        ``scr_grp``, ``Negatives``, ``Positives``, ``resp_rate``, ``lift``,
        ``cmltv_p_perc``, ``cmltv_n_perc``, ``cmltv_rand_p_perc``,
        ``cmltv_resp_rate``, ``cmltv_lift`` and ``KS``.

    Note:
        ``KS`` is the cumulative % of positives captured minus the cumulative
        % of the population contacted (gain over random selection). It is not
        the classical Kolmogorov-Smirnov statistic, which would subtract
        ``cmltv_n_perc`` instead.
    """
    res = pd.DataFrame(np.column_stack((test, pred)),
                       columns=['Target', 'PR_0', 'PR_1'])

    # Rank before binning: ranks are unique even when scores are tied, so
    # qcut never fails on duplicate bin edges and the groups stay equal-sized.
    score_rank = res['PR_0'].rank(method='first')
    res['scr_grp'] = pd.qcut(score_rank, cardinaility, labels=False) + 1

    # Force both class columns to exist even if one class is absent.
    crt = pd.crosstab(res.scr_grp, res.Target).reindex(columns=[0.0, 1.0],
                                                       fill_value=0)
    crt = crt.reset_index().rename(columns={0.0: 'Negatives', 1.0: 'Positives'})

    positives = crt['Positives']
    negatives = crt['Negatives']
    G = positives.sum()
    B = negatives.sum()

    avg_resp_rate = G / (G + B)

    # All ratios are derived from unrounded values; rounding happens only when
    # a column is stored, so rounding errors do not compound.
    resp_rate = positives / (positives + negatives)
    cum_positives = positives.cumsum()
    cum_negatives = negatives.cumsum()
    cum_resp_rate = cum_positives / (cum_positives + cum_negatives)
    cum_positives_perc = cum_positives / G * 100
    # Share of the population covered up to and including each score group.
    cum_random_perc = crt['scr_grp'] * 100 / cardinaility

    crt['resp_rate'] = resp_rate.round(2)
    crt['lift'] = (resp_rate / avg_resp_rate).round(2)
    crt['cmltv_p_perc'] = cum_positives_perc.round(1)
    crt['cmltv_n_perc'] = (cum_negatives / B * 100).round(1)
    crt['cmltv_rand_p_perc'] = cum_random_perc
    crt['cmltv_resp_rate'] = cum_resp_rate.round(2)
    crt['cmltv_lift'] = (cum_resp_rate / avg_resp_rate).round(2)
    crt['KS'] = (cum_positives_perc - cum_random_perc).round(2)

    print('average response rate: ' , avg_resp_rate)
    return crt

In [None]:
# Decile-level lift table for the test-set predictions.
ModelLift = lift(test=y_test, pred=y_pred2, cardinaility=10)

In [None]:
# Display the lift table as the cell output.
ModelLift

In [None]:
# Human-readable row labels 'Decile 1' .. 'Decile 10' for the charts.
dec = ['Decile {}'.format(number) for number in range(1, 11)]
# Independent copy holding only the columns that get plotted.
MLift = ModelLift.loc[:, ['Positives', 'Negatives', 'cmltv_lift', 'KS']].copy()
MLift.index = dec

In [None]:
# cufflinks supplies the .iplot() method used on the DataFrames below.
import cufflinks as cf
cf.go_offline()
# NOTE(review): offline=False / world_readable=True is set right after
# go_offline(); presumably this changes the stored cufflinks configuration -
# confirm that this combination is intended.
cf.set_config_file(offline=False, world_readable=True)

# Bar chart of the positive and negative counts in each model decile.
MLift[['Positives','Negatives']].iplot(kind='bar',yTitle='Volume',xTitle='Model decile', title='Positives & Negatives by model decile')

In [None]:
# Bar chart of the cumulative lift per decile; the y-axis range is fixed to
# [1.11, 6] by the yrange argument.
MLift[['cmltv_lift']].iplot(
    kind='bar',
    color='LightSkyBlue',
    title='Cumulative Lift',
    xTitle='Model decile',
    yTitle='Lift',
    yrange=[1.11, 6],
)

In [None]:
# Bar chart of the 'KS' column of the lift table per decile.
MLift[['KS']].iplot(
    kind='bar',
    color='DarkSlateGrey',
    title='Target separation',
    xTitle='Model decile',
    yTitle='Separation',
)