In [5]:
#!pip install lime
#!pip install shap
#!pip install imblearn

In [303]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as s
import matplotlib.pyplot as plt
sns.set(style = 'whitegrid')
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import classification_report

from sklearn.feature_selection import RFE
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler

import statsmodels.api as sm

from sklearn.metrics import confusion_matrix,precision_score,f1_score,cohen_kappa_score

from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.metrics import plot_confusion_matrix,plot_roc_curve,roc_auc_score,roc_curve

from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,StackingClassifier

from xgboost import XGBClassifier

import pydotplus

from IPython.display import Image

import os
os.environ["PATH"] += os.pathsep + 'C:/ProgramData/Anaconda3/pkgs/Graphviz'

from sklearn.model_selection import GridSearchCV,RandomizedSearchCV, cross_val_score

import lime
from lime import lime_tabular

import shap

from statsmodels.stats.outliers_influence import variance_inflation_factor

1. **[Decision Tree](#dtree)**

### Reading the data

In [None]:
# Read the post-EDA store data and discard the two leftover index columns
# ('Unnamed: 0' and 'index') in a single expression.
data = pd.read_excel('Store_Data_after_EDA.xlsx').drop(columns=['Unnamed: 0', 'index'])
data.head()

In [None]:
# Bar chart of how many rows fall into each `retained` class.
data['retained'].value_counts().plot(kind='bar')
plt.show()

In [None]:
# The data is imbalanced.

### Changing the columns into datetime

In [None]:
# Parse the two order-date columns as timestamps.
# pd.to_datetime is used rather than astype('datetime64'): the unit-less
# 'datetime64' dtype string is rejected by pandas >= 2.0, whereas
# pd.to_datetime behaves the same on old and new pandas releases.
for date_col in ('firstorder', 'lastorder'):
    data[date_col] = pd.to_datetime(data[date_col])

### Data information

In [None]:
data.describe()

In [None]:
data.info()

In [None]:
data.hist(bins=20,figsize=(15,10))
plt.show()

In [None]:
# Derive two duration features, expressed in (fractional) days:
#   create_first : firstorder - created
#   first_last   : lastorder  - firstorder
# NOTE(review): assumes `created` is already a datetime column - confirm.
one_day = np.timedelta64(1, 'D')
data['create_first'] = (data['firstorder'] - data['created']) / one_day
data['first_last'] = (data['lastorder'] - data['firstorder']) / one_day

In [None]:
data.describe()

In [None]:
# Remove the order-frequency and customer-id columns from the working frame.
data = data.drop(columns=['ordfreq', 'custid'])

In [None]:
# The raw date columns are no longer needed once the duration features exist.
data = data.drop(columns=['firstorder', 'lastorder', 'created'])

In [None]:
data.head()

### Test-Train Split

In [None]:
# Separate the target (`retained`) from the predictors and hold out 30% of the
# rows for testing; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split

out = data['retained']
inp = data.drop(columns='retained')
xtrain, xtest, ytrain, ytest = train_test_split(
    inp, out, test_size=0.3, random_state=12
)

### Outlier Treatment

In [None]:
# Density of each numeric feature BEFORE outlier treatment.
# All four panels read from `data`, matching the boxplot cell that follows;
# previously the first panel alone was drawn from `xtrain`.
plt.figure(figsize=(15,10))
panels = [('esent', 'red'), ('eclickrate', 'green'), ('avgorder', 'black'), ('eopenrate', 'blue')]
for position, (col, colour) in enumerate(panels, start=1):
    plt.subplot(2, 3, position)
    sns.kdeplot(data[col], color=colour)
plt.show()

In [None]:
# Boxplots of the numeric features BEFORE outlier treatment.
# Every panel passes the series through `x=` so all four are drawn with the
# same orientation; the last panel previously passed it positionally, which
# newer seaborn releases interpret as `data=` instead of `x=`.
plt.figure(figsize=(15,10))
panels = [('esent', 'red'), ('eclickrate', 'green'), ('avgorder', 'pink'), ('eopenrate', 'blue')]
for position, (col, colour) in enumerate(panels, start=1):
    plt.subplot(2, 3, position)
    sns.boxplot(x=data[col], color=colour)
plt.show()

In [None]:
def impute(x):
    """Replace IQR outliers in every column of ``x`` and floor negatives at zero.

    For each column the fences are ``Q1 - 1.5*IQR`` and ``Q3 + 1.5*IQR``:

    * values outside the fences are replaced by the column median
      (``np.median`` of the untouched column);
    * remaining values below zero are replaced by ``0``;
    * everything else is kept as is.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame whose columns are all numeric.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``x``. The input frame
        is left unmodified, so passing a slice of another frame is safe.
        Treated columns are returned as floats.
    """
    treated = x.copy()
    for col in x.columns:
        values = x[col]
        q1, q3 = values.quantile(0.25), values.quantile(0.75)
        whisker = 1.5 * (q3 - q1)
        upper, lower = q3 + whisker, q1 - whisker
        # Computed once per column, from the original (untreated) values.
        median = np.median(values)
        is_outlier = (values > upper) | (values < lower)
        # np.select takes the first matching condition, which mirrors the
        # original if / elif / else order: outlier -> median, negative -> 0.
        treated[col] = np.select([is_outlier, values < 0], [median, 0], default=values)
    return treated

In [None]:
# Outlier-treat the four numeric columns of each split.
# Each split is passed to impute() on its own, so its fences and median come
# from that split's own values.
outlier_cols = ['esent', 'eclickrate', 'avgorder', 'eopenrate']
xtrain[outlier_cols] = impute(xtrain[outlier_cols])
xtest[outlier_cols] = impute(xtest[outlier_cols])

In [None]:
# Boxplots of the training features AFTER outlier treatment.
# eclickrate and avgorder keep their wider whiskers (whis=6 and whis=3).
# The eopenrate panel now passes its series through `x=` like the other three;
# a positional series is read as `data=` by newer seaborn releases.
plt.figure(figsize=(15,10))
plt.subplot(2,3,1)
sns.boxplot(x=xtrain.esent, color='red');
plt.subplot(2,3,2)
sns.boxplot(x=xtrain.eclickrate, color='green', whis=6)
plt.subplot(2,3,3)
sns.boxplot(x=xtrain.avgorder, color='pink', whis=3)
plt.subplot(2,3,4)
sns.boxplot(x=xtrain.eopenrate, color='blue')
plt.show()

In [None]:
# Density of the training features after outlier treatment, one panel each.
plt.figure(figsize=(15,10))
panels = [('esent', 'red'), ('eclickrate', 'green'), ('avgorder', 'black'), ('eopenrate', 'blue')]
for position, (col, colour) in enumerate(panels, start=1):
    plt.subplot(2, 3, position)
    sns.kdeplot(xtrain[col], color=colour)
plt.show()

In [None]:
xtrain.describe()

## Data Visualization

In [None]:
sns.scatterplot(x = data.create_first, y = data.retained)

In [None]:
sns.scatterplot(x = data.first_last, y = data.retained)

In [None]:
plt.figure(figsize=(15,7))
sns.countplot(x = data.favday)
plt.show()

In [None]:
pd.crosstab(index=data.city,columns=data.retained).plot(kind='bar',figsize=(15,8))

In [None]:
plt.figure(figsize=(15,7))
sns.countplot(x = data.city)
plt.show()

In [None]:
plt.figure(figsize=(15,7))
sns.scatterplot(x= data.esent, y = data.eopenrate, hue= data.retained)
plt.show()

In [None]:
plt.figure(figsize=(15,7))
sns.scatterplot(x= data.esent, y = data.eclickrate, hue= data.retained)
plt.show()

In [None]:
plt.figure(figsize=(15,7))
sns.scatterplot(x= data.eclickrate, y = data.eopenrate, hue= data.retained)
plt.show()

In [None]:
# Correlation heatmap of the numeric columns.
# `data` still holds the `city` and `favday` columns used by the plots above;
# they are filtered out explicitly because DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 (older releases dropped them silently).
plt.figure(figsize=(15,10))
numeric_data = data.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_data.corr(), annot=True, annot_kws={'size':20}, cmap='gist_earth');

In [None]:
xtrain.head()

In [None]:
xtest.head()

## Transformation

In [None]:
xtrain['avgorder'] = s.boxcox(xtrain.avgorder+1)[0]
xtrain['esent'] = s.boxcox(xtrain.esent+1)[0]
xtrain['eopenrate'] = s.boxcox(xtrain.eopenrate+1)[0]
xtrain['eclickrate'] = s.boxcox(xtrain.eclickrate+1)[0]
xtrain['create_first'] = s.boxcox(abs(xtrain.create_first)+1)[0]
xtrain['first_last'] = s.boxcox(xtrain.first_last+5)[0]

In [None]:
xtrain.head()

In [None]:
xtest['avgorder'] = s.boxcox(xtest.avgorder+1)[0]
xtest['esent'] = s.boxcox(xtest.esent+1)[0]
xtest['eopenrate'] = s.boxcox(xtest.eopenrate+1)[0]
xtest['eclickrate'] = s.boxcox(xtest.eclickrate+1)[0]
xtest['create_first'] = s.boxcox(abs(xtest.create_first)+1)[0]
xtest['first_last'] = s.boxcox(xtest.first_last+5)[0]

In [None]:
xtest.head()

In [None]:
xtest.describe()

In [None]:
xtrain.describe()

## Scaling

In [None]:
SS = StandardScaler()

In [None]:
a = SS.fit_transform(xtrain[['eclickrate','eopenrate','eclickrate','avgorder','esent','create_first','first_last']])

In [None]:
xtrain['eclickrate'] = a[:,0]
xtrain['eopenrate'] = a[:,1]
xtrain['eclickrate'] = a[:,2]
xtrain['avgorder'] = a[:,3]
xtrain['esent'] = a[:,4]
xtrain['create_first'] = a[:,5]
xtrain['first_last'] = a[:,6]

In [None]:
xtrain.head()

In [None]:
b = SS.transform(xtest[['eclickrate','eopenrate','eclickrate','avgorder','esent','create_first','first_last']])
b

In [None]:
xtest['eclickrate'] = b[:,0]
xtest['eopenrate'] = b[:,1]
xtest['eclickrate'] = b[:,2]
xtest['avgorder'] = b[:,3]
xtest['esent'] = b[:,4]
xtest['create_first'] = b[:,5]
xtest['first_last'] = b[:,6]

In [None]:
xtest.head()

## One Hot Encoding

In [None]:
# One-hot encode `favday` and `city` on the training split, dropping the first
# level of each as the reference category.
# dtype=int keeps the dummies numeric: pandas >= 2.0 returns bool dummies by
# default, which turns the design matrix into object dtype for statsmodels.
xtrain = pd.concat(
    (xtrain.drop(columns=['city', 'favday']),
     pd.get_dummies(xtrain[['favday', 'city']], drop_first=True, dtype=int)),
    axis=1)

In [None]:
xtrain.head()

In [None]:
# One-hot encode the test split, then force it onto the training layout.
# All levels are encoded first (no drop_first) and the frame is re-indexed to
# xtrain's columns: this removes the training reference level and any level
# unseen in training, and adds an all-zero column for levels that are missing
# from the test split. Train and test therefore always share the same columns
# in the same order.
xtest = pd.concat(
    (xtest.drop(columns=['city', 'favday']),
     pd.get_dummies(xtest[['favday', 'city']], dtype=int)),
    axis=1)
xtest = xtest.reindex(columns=xtrain.columns, fill_value=0)

In [None]:
xtest.head()

In [None]:
xtrain.to_csv('train.csv')

In [None]:
xtest.to_csv('test.csv')

## Machine Learning Modelling

#### 1.  LOGISTIC REGRESSION

In [None]:
# Sanity check: shapes of ytest, xtest, xtrain and ytrain, in that order.
for part in (ytest, xtest, xtrain, ytrain):
    print(part.shape)

#### Full Model

In [None]:
# Fit a statsmodels logistic regression on every training column.
# xtrain is passed as is; no constant column is added in this cell.
full_logit = sm.Logit(endog=ytrain, exog=xtrain)
log_reg = full_logit.fit()
print(log_reg.summary())

In [None]:
# Exponentiate the fitted Logit coefficients and tabulate them in a single
# 'ODDS' column, one row per column of xtrain.
odds = pd.DataFrame((np.exp(log_reg.params)),index = xtrain.columns,columns=['ODDS'])
odds

In [None]:
# Class labels for the training rows: 1 when the predicted probability
# exceeds 0.5, otherwise 0.
ypred = [int(prob > 0.5) for prob in log_reg.predict(xtrain)]

In [None]:
# Train and test metrics for the full statsmodels logistic model.
# The training accuracy is computed from training predictions against ytrain;
# it was previously derived from the TEST confusion matrix divided by the
# training size, which is not an accuracy of anything.
print('Logistic Regression')
threshold = 0.8  # probability cut-off, applied to both splits

ypred_train = [1 if p > threshold else 0 for p in log_reg.predict(xtrain)]
ypred_test = log_reg.predict(xtest)
ypred_test = [1 if i > threshold else 0 for i in ypred_test]

cr = classification_report(ytest,ypred_test)
print(cr)

cm_train = confusion_matrix(ytrain,ypred_train)
tn = cm_train[0][0]
tp = cm_train[1][1]
accuracy = (tn+tp)/len(xtrain)
print('Accuracy_train:',accuracy)

cm_test = confusion_matrix(ytest,ypred_test)
cm = cm_test  # test confusion matrix, also kept under its original name
#print('Confusion Matrix:',cm_test)

tn = cm_test[0][0]
tp = cm_test[1][1]
accuracy = (tn+tp)/len(xtest)
print('Accuracy_test:',accuracy)

print('Precision Score_test:',precision_score(ytest,ypred_test))

print('f1_Score_test:',f1_score(ytest,ypred_test))

print('Kappa Score test:',cohen_kappa_score(ytest,ypred_test))

In [None]:
logis = LogisticRegression().fit(xtrain,ytrain)

In [None]:
# Show the fitted scikit-learn coefficients, labelled by feature.
# The cell previously held the incomplete expression `logis.`, a SyntaxError
# that stops a Restart & Run All at this point.
# NOTE(review): the attribute originally intended here is unknown - confirm.
pd.Series(logis.coef_[0], index=xtrain.columns, name='coefficient')

In [None]:
# 10-fold cross-validated accuracy of a plain LogisticRegression.
# Cross-validation is run on the TRAINING split so the held-out test rows are
# never used for fitting; it was previously run on xtest / ytest.
cv = cross_val_score(estimator=LogisticRegression(), X=xtrain, y=ytrain, scoring='accuracy', cv=10)
cv

In [None]:
# LIME explanation of the full scikit-learn logistic model for the second
# test row (positional index 1).
lime_explainer = lime_tabular.LimeTabularExplainer(
    xtrain.values,
    mode='classification',
    feature_names=xtrain.columns,
)

explained_row = xtest.iloc[1]
lime_exp = lime_explainer.explain_instance(explained_row, logis.predict_proba)
lime_exp.show_in_notebook(show_table=True)

In [None]:
# Clearly there is overfitting in the full model.

## VIF

In [None]:
# Variance inflation factor of every training column, in column order.
design = xtrain.values
vif = [variance_inflation_factor(design, col_idx) for col_idx in range(design.shape[1])]

In [None]:
vif_df = pd.DataFrame({'Features':xtrain.columns,'VIF':vif})
vif_df.sort_values('VIF',ascending=False)

### Feature Selection

In [None]:
LR = LogisticRegression()

In [None]:
# RFE selection: fit recursive feature elimination around the LogisticRegression
# estimator `LR`, using RFE's default number of features to keep.
rfe = RFE(estimator=LR).fit(xtrain,ytrain)

# One row per training column with its RFE rank; rank 1 marks a selected feature.
feature_ranking = pd.DataFrame([xtrain.columns,rfe.ranking_],index=['Feature','Ranking']).T
rfe_feat=feature_ranking[feature_ranking.Ranking==1]
rfe_feat.Feature

In [None]:
feature_ranking.sort_values('Ranking').reset_index()

In [None]:
# Refit the statsmodels logistic model on the RFE-selected columns only.
y = ytrain
x = xtrain[rfe_feat.Feature]

rfe_reg = sm.Logit(endog=y, exog=x).fit()
print(rfe_reg.summary())

In [None]:
# Test predictions of the RFE model at a 0.5 cut-off, with confusion matrix.
rfe_ypred_test = rfe_reg.predict(xtest[rfe_feat.Feature])
rfe_ypred_test = [1 if i > 0.5 else 0 for i in rfe_ypred_test]
cm_1 = confusion_matrix(ytest,rfe_ypred_test)
print('Confusion Matrix:',cm_1)
# Plot the matrix just computed; the heatmap previously drew `cm`, the
# confusion matrix of the earlier full model.
sns.heatmap(cm_1,annot=True,cbar=False,fmt='d');

In [None]:
# Classification metrics for the RFE model on the test split.
print('RFE Feature selected Logistic Regression:')
print(classification_report(ytest,rfe_ypred_test))
print('Cohen Kappa Score:',round(cohen_kappa_score(ytest,rfe_ypred_test),2))
# ROC AUC is a ranking metric and needs the predicted probabilities; feeding
# it the thresholded 0/1 labels collapses the curve to a single point.
rfe_test_prob = rfe_reg.predict(xtest[rfe_feat.Feature])
print('ROC AUC Score:',round(roc_auc_score(ytest,rfe_test_prob),3))

In [None]:
# The model using RFE feature selection gave a better result than the forward and backward selection techniques.

In [None]:
# ROC curve and AUC of the RFE model, both from the same predicted
# probabilities (the AUC was previously computed from thresholded labels and
# so did not describe the curve drawn below).
rfe_prob_test = rfe_reg.predict(xtest[rfe_feat.Feature])
fpr,tpr,th = roc_curve(ytest,rfe_prob_test)
print('ROC AUC Score:',round(roc_auc_score(ytest,rfe_prob_test),3))
plt.plot(fpr,tpr)
plt.plot([0,1],[0,1],'r--')  # chance line
plt.show()

In [None]:
log_rfe = LogisticRegression().fit(xtrain[rfe_feat.Feature],ytrain)

In [None]:
# LIME explanation of the RFE-feature logistic model for the second test row
# (positional index 1), restricted to the selected columns.
selected = rfe_feat.Feature
lime_explainer = lime_tabular.LimeTabularExplainer(
    xtrain[selected].values,
    mode='classification',
    feature_names=selected,
)

explained_row = xtest[selected].iloc[1]
lime_exp = lime_explainer.explain_instance(explained_row, log_rfe.predict_proba)
lime_exp.show_in_notebook(show_table=True)

<a id="dtree"> </a>
## Decision Tree

In [None]:
dt = DecisionTreeClassifier()

###### Hyperparameter Tuning

In [None]:
# Exhaustive grid search over the decision-tree hyper-parameters, scored by
# accuracy with GridSearchCV's default cross-validation.
# max_leaf_nodes starts at 2: scikit-learn rejects max_leaf_nodes=1 (it must be
# None or greater than 1), so every candidate with that value failed to fit
# and only wasted a sixth of the search.
params = {'criterion':['gini','entropy'],
    'max_depth':[1,2,3,4,5],
    'min_samples_split':[2,3,4,5],
    'min_samples_leaf':[1,2,3,4,5,6,7],
    'max_leaf_nodes':[2,3,4,5,6],
    'ccp_alpha':[0.0,0.1,0.3,0.5,0.7,1]}

dt_grid = GridSearchCV(dt,param_grid=params,scoring='accuracy',n_jobs=-1)
dt_grid.fit(xtrain,ytrain)

In [None]:
dt_grid.best_params_

In [None]:
# Decision tree with fixed hyper-parameters.
# NOTE(review): presumably copied from dt_grid.best_params_ - confirm they
# still match after re-running the search.
tuned_params = {
    'ccp_alpha': 0.0,
    'criterion': 'gini',
    'max_depth': 3,
    'max_leaf_nodes': 4,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
}
tuned_mod = DecisionTreeClassifier(**tuned_params)

In [None]:
tuned_mod.fit(xtrain,ytrain)

In [None]:
ypred_tuned = tuned_mod.predict(xtest)

In [None]:
pd.DataFrame({'Features':xtest.columns,'Importance':tuned_mod.feature_importances_}).sort_values('Importance')

In [None]:
# Test-set metrics of the tuned tree; AUC uses the positive-class probability.
dt_test_prob = tuned_mod.predict_proba(xtest)[:, 1]
dt_kappa = cohen_kappa_score(ytest, ypred_tuned)
dt_auc = roc_auc_score(ytest, dt_test_prob)

print('Decision Tree:')
print(classification_report(ytest, ypred_tuned))
print('Cohen Kappa Score:', round(dt_kappa, 2))
print('ROC AUC Score:', round(dt_auc, 2))

In [None]:
# Export the tuned tree to Graphviz DOT text, labelling the two classes
# 'No' / 'Yes', then render it to PNG with pydotplus and show it inline.
dot = export_graphviz(tuned_mod,feature_names=xtrain.columns,class_names=['No','Yes'])
graph = pydotplus.graph_from_dot_data(dot)
Image(graph.create_png())

In [None]:
print('Cohen Kappa Score:',round(cohen_kappa_score(ytest,ypred_tuned),2))

In [None]:
plot_roc_curve(estimator=tuned_mod,X= xtest,y= ytest)

In [None]:
# LIME explanation of the tuned decision tree for the second test row
# (positional index 1).
lime_explainer = lime_tabular.LimeTabularExplainer(
    xtrain.values,
    mode='classification',
    feature_names=xtrain.columns,
)

explained_row = xtest.iloc[1]
lime_exp = lime_explainer.explain_instance(explained_row, tuned_mod.predict_proba)
lime_exp.show_in_notebook(show_table=True)

### Random Forest (Bagging)

In [None]:
rf = RandomForestClassifier()

In [None]:
params = { 'n_estimators':[50,80,100],
    'max_depth':[3,4,5]}

In [None]:
rf_grid = GridSearchCV(estimator=rf,param_grid=params,cv=5,n_jobs=-1)
rf_grid.fit(xtrain,ytrain)

In [None]:
rf_grid.best_params_

In [None]:
# Test-set metrics of the grid-searched random forest.
print('Random Forest:')
ypred = rf_grid.predict(xtest)
print(classification_report(ytest,ypred))
print("Cohen Kappa Score:",round(cohen_kappa_score(ypred,ytest),3))
# Computed from the positive-class probabilities rather than printed as a
# hard-coded number, so it always reflects the model fitted in this run.
print('ROC AUC Score:',round(roc_auc_score(ytest,rf_grid.predict_proba(xtest)[:,1]),2))

In [None]:
rf.fit(xtrain,ytrain)

In [None]:
ypred = rf.predict(xtest)

In [None]:
print(classification_report(ytest,ypred))

In [None]:
cohen_kappa_score(ypred,ytest)

In [None]:
plot_roc_curve(estimator=rf, X = xtest, y = ytest)
plt.plot([0,1],[0,1],'r--')

## Boosting 

### 1. AdaBoost

In [None]:
# AdaBoost over default decision trees, 50 boosting rounds.
# The base learner is passed positionally: it is the first parameter in every
# scikit-learn release, whereas its keyword was renamed from `base_estimator`
# to `estimator` in later versions.
adaboost = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=50)

In [None]:
adaboost.fit(xtrain,ytrain)

In [None]:
ypred_prob = adaboost.predict_proba(xtest)[:,1]

In [None]:
ypred = adaboost.predict(xtest)


In [None]:
# Test-set metrics of the AdaBoost model.
print('Adaboost:')
print(classification_report(ytest,ypred))
print('Cohen Kappa Score:',round(cohen_kappa_score(ytest,ypred),2))
# AUC is computed from the model's positive-class probabilities instead of
# being printed as a hard-coded number.
print('ROC AUC Score:',round(roc_auc_score(ytest,adaboost.predict_proba(xtest)[:,1]),2))

In [None]:
fpr,tpr,th = roc_curve(ytest,ypred_prob)

In [None]:
plot_roc_curve(estimator=adaboost,X=xtest,y=ytest)
plt.plot([0,1],[0,1],'r--')
plt.show()

## XGBoost

In [None]:
xgb = XGBClassifier(gamma=1)

In [None]:
xgb.fit(xtrain,ytrain)

In [None]:
ypred =xgb.predict(xtest)
ypred_prob = xgb.predict_proba(xtest)[:,1]

In [None]:
# Test-set metrics of the XGBoost model.
print('XG Boost')
print(classification_report(ytest,ypred))
print('COhen Kappa Score:',round(cohen_kappa_score(ytest,ypred),2))
# AUC is computed from the model's positive-class probabilities instead of
# being printed as a hard-coded number.
print('ROC AUC Score:',round(roc_auc_score(ytest,xgb.predict_proba(xtest)[:,1]),2))

In [None]:
fpr,tpr,th = roc_curve(ytest,ypred_prob)

In [None]:
plot_roc_curve(estimator=xgb,X=xtest,y=ytest)
plt.plot([0,1],[0,1],'r--')
plt.show()

In [None]:
print('Cohen Kappa Score',round(cohen_kappa_score(ytest,ypred),2))

## Stacking

In [None]:
base = [('dt',DecisionTreeClassifier()),('rf',RandomForestClassifier())]
stack_mod = StackingClassifier(estimators=base,final_estimator=AdaBoostClassifier())
stack_mod.fit(xtrain,ytrain)

In [None]:
ypred = stack_mod.predict(xtest)

In [None]:
# Test-set metrics of the stacked model.
print('Stacking:')
print(classification_report(ytest,ypred))
print('Cohen Kappa Score',round(cohen_kappa_score(ytest,ypred),2))
# AUC is computed from the model's positive-class probabilities instead of
# being printed as a hard-coded number.
print('ROC AUC Score:',round(roc_auc_score(ytest,stack_mod.predict_proba(xtest)[:,1]),2))

In [None]:
plot_roc_curve(estimator=stack_mod,X=xtest,y=ytest)
plt.plot([0,1],[0,1],'r--')

In [None]:
sns.heatmap(confusion_matrix(ytest,ypred),annot=True,fmt='d',cbar=False,cmap='Blues')

In [None]:
plot_roc_curve(estimator=stack_mod,X=xtest,y=ytest)
plt.plot([0,1],[0,1],'r--')

In [None]:
print('Cohen Kappa Score',round(cohen_kappa_score(ytest,ypred),2))

## KNN Model

In [None]:
from sklearn.neighbors import RadiusNeighborsClassifier, KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(xtrain,ytrain)

In [None]:
knn_pred = knn.predict(xtest)

In [None]:
plot_roc_curve(estimator=knn,X=xtest,y=ytest)
plt.plot([0,1],[0,1],'r--')
plt.show()

In [None]:
rnn = RadiusNeighborsClassifier(radius=3)
rnn.fit(xtrain,ytrain)

In [None]:
rnn_pred = rnn.predict(xtest)

In [None]:
# Test-set metrics of the radius-neighbours model (`rnn`).
# The heading names the model whose predictions are actually reported: this
# cell scores rnn_pred, not the k-nearest-neighbours predictions in knn_pred.
print('Radius Neighbors Classifier:')
print(classification_report(ytest,rnn_pred))
print('Cohen Kappa Score',round(cohen_kappa_score(ytest,rnn_pred),3))
# AUC is computed from the model's positive-class probabilities instead of
# being printed as a hard-coded number.
print('ROC AUC Score:',round(roc_auc_score(ytest,rnn.predict_proba(xtest)[:,1]),2))

In [None]:
plot_roc_curve(estimator=rnn,X=xtest,y=ytest)
plt.plot([0,1],[0,1],'r--')
plt.show()

In [None]:
print('Cohen Kappa Score',round(cohen_kappa_score(ytest,rnn_pred),3))

## Naive Bayes 

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
gnb = GaussianNB()
gnb.fit(xtrain,ytrain)
gnb_pred = gnb.predict(xtest)

In [None]:
plot_roc_curve(estimator=gnb,X=xtest,y=ytest)
plt.plot([0,1],[0,1],'r--')
plt.show()

In [None]:
# Test-set metrics of the Gaussian naive Bayes model.
print('Naive Bayes Classifier:')
print(classification_report(ytest,gnb_pred))
print('Cohen Kappa Score',round(cohen_kappa_score(ytest,gnb_pred),2))
# AUC is computed from the model's positive-class probabilities instead of
# being printed as a hard-coded number.
print('ROC AUC Score:',round(roc_auc_score(ytest,gnb.predict_proba(xtest)[:,1]),2))

* As we can see, the AUC-ROC score of Random Forest is higher than that of the other models. Hence, we can use Random Forest as our final model.

Lime