# PREDICTING CHURN RATE IN TELECOM INDUSTRY

# IMPORTING THE REQUIRED LIBRARIES

In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

import warnings
warnings.filterwarnings("ignore")

# READING THE DATA

In [34]:
tc=pd.read_csv('Telco-Customer-Churn.csv',na_values=[' ','/','?','$'])

In [35]:
pd.options.display.max_columns=None
#  option to view all the columns without missing any, in the outputs

In [36]:
tc.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [37]:
tc['gender'].unique()

array(['Female', 'Male'], dtype=object)

In [38]:
X= tc['TotalCharges'].groupby(tc['gender']).mean()
X

gender
Female    2283.190985
Male      2283.407861
Name: TotalCharges, dtype: float64

In [39]:
tc['TotalCharges'].isnull().sum()

11

In [40]:
tc[tc['InternetService']=='No']

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
11,7469-LKBCI,Male,0,No,No,16,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),18.95,326.80,No
16,8191-XWSZG,Female,0,No,No,52,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,No,Mailed check,20.65,1022.95,No
21,1680-VDCWW,Male,0,Yes,No,12,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,No,Bank transfer (automatic),19.80,202.25,No
22,1066-JKSGK,Male,0,No,No,1,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Mailed check,20.15,20.15,Yes
33,7310-EGVHZ,Male,0,No,No,1,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Bank transfer (automatic),20.20,20.20,No
42,9867-JCZSP,Female,0,Yes,Yes,17,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,No,Mailed check,20.75,418.25,No
58,3957-SQXML,Female,0,Yes,Yes,34,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),24.95,894.30,No
68,3170-NMYVV,Female,0,Yes,Yes,50,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Bank transfer (automatic),20.15,930.90,No
71,0731-EBJQB,Female,0,Yes,Yes,52,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,Yes,Electronic check,20.40,1090.65,No
73,8028-PNXHQ,Male,0,Yes,Yes,62,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Bank transfer (automatic),24.25,1424.60,No


In [41]:
#replace 'No internet service' to No for the following columns
replace_cols = [ 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                'TechSupport','StreamingTV', 'StreamingMovies']
for i in replace_cols : 
    tc[i]  = tc[i].replace({'No internet service' : 'No'})

In [42]:
# SeniorCitizen only holds 0/1 but is a categorical flag, so recode it to
# 'No'/'Yes' strings (object dtype).
tc['SeniorCitizen'] = tc['SeniorCitizen'].replace({0: 'No', 1: 'Yes'})

In [43]:
tc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
customerID          7043 non-null object
gender              7043 non-null object
SeniorCitizen       7043 non-null object
Partner             7043 non-null object
Dependents          7043 non-null object
tenure              7043 non-null int64
PhoneService        7043 non-null object
MultipleLines       7043 non-null object
InternetService     7043 non-null object
OnlineSecurity      7043 non-null object
OnlineBackup        7043 non-null object
DeviceProtection    7043 non-null object
TechSupport         7043 non-null object
StreamingTV         7043 non-null object
StreamingMovies     7043 non-null object
Contract            7043 non-null object
PaperlessBilling    7043 non-null object
PaymentMethod       7043 non-null object
MonthlyCharges      7043 non-null float64
TotalCharges        7032 non-null float64
Churn               7043 non-null object
dtypes: float64(2), int64(1), o

# EXPLORATORY DATA ANALYSIS

In [44]:
# customerID is a unique identifier per row and carries no insight, so it is removed.
tc = tc.drop(columns=['customerID'])

In [45]:
tc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
gender              7043 non-null object
SeniorCitizen       7043 non-null object
Partner             7043 non-null object
Dependents          7043 non-null object
tenure              7043 non-null int64
PhoneService        7043 non-null object
MultipleLines       7043 non-null object
InternetService     7043 non-null object
OnlineSecurity      7043 non-null object
OnlineBackup        7043 non-null object
DeviceProtection    7043 non-null object
TechSupport         7043 non-null object
StreamingTV         7043 non-null object
StreamingMovies     7043 non-null object
Contract            7043 non-null object
PaperlessBilling    7043 non-null object
PaymentMethod       7043 non-null object
MonthlyCharges      7043 non-null float64
TotalCharges        7032 non-null float64
Churn               7043 non-null object
dtypes: float64(2), int64(1), object(17)
memory usage: 1.1+ MB


In [46]:
# Print the number of distinct (non-null) values of every column.
for col, n_levels in tc.nunique().items():
    print(col, ':', n_levels)

gender : 2
SeniorCitizen : 2
Partner : 2
Dependents : 2
tenure : 73
PhoneService : 2
MultipleLines : 3
InternetService : 3
OnlineSecurity : 2
OnlineBackup : 2
DeviceProtection : 2
TechSupport : 2
StreamingTV : 2
StreamingMovies : 2
Contract : 3
PaperlessBilling : 2
PaymentMethod : 4
MonthlyCharges : 1585
TotalCharges : 6530
Churn : 2


In [47]:
tc.describe()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
count,7043.0,7043.0,7032.0
mean,32.371149,64.761692,2283.300441
std,24.559481,30.090047,2266.771362
min,0.0,18.25,18.8
25%,9.0,35.5,401.45
50%,29.0,70.35,1397.475
75%,55.0,89.85,3794.7375
max,72.0,118.75,8684.8


In [48]:
df=tc[tc['TotalCharges'].isnull()]
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
488,Female,No,Yes,Yes,0,No,No phone service,DSL,Yes,No,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,No
753,Male,No,No,Yes,0,Yes,No,No,No,No,No,No,No,No,Two year,No,Mailed check,20.25,,No
936,Female,No,Yes,Yes,0,Yes,No,DSL,Yes,Yes,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,No
1082,Male,No,Yes,Yes,0,Yes,Yes,No,No,No,No,No,No,No,Two year,No,Mailed check,25.75,,No
1340,Female,No,Yes,Yes,0,No,No phone service,DSL,Yes,Yes,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,,No
3331,Male,No,Yes,Yes,0,Yes,No,No,No,No,No,No,No,No,Two year,No,Mailed check,19.85,,No
3826,Male,No,Yes,Yes,0,Yes,Yes,No,No,No,No,No,No,No,Two year,No,Mailed check,25.35,,No
4380,Female,No,Yes,Yes,0,Yes,No,No,No,No,No,No,No,No,Two year,No,Mailed check,20.0,,No
5218,Male,No,Yes,Yes,0,Yes,No,No,No,No,No,No,No,No,One year,Yes,Mailed check,19.7,,No
6670,Female,No,Yes,Yes,0,Yes,Yes,DSL,No,Yes,Yes,Yes,Yes,No,Two year,No,Mailed check,73.35,,No


In [49]:
tc['TotalCharges'].groupby(tc['gender']).mean()

gender
Female    2283.190985
Male      2283.407861
Name: TotalCharges, dtype: float64

In [61]:
g=tc.groupby('gender')
g1=g.get_group('Female')
g2=g.get_group('Male')

In [64]:
# Impute the missing TotalCharges of each gender group with that group's mean.
# TotalCharges is the only column with nulls (see tc.info() above), so only that
# column is filled; a string placeholder would turn it into an object column.
g1 = g1.fillna({'TotalCharges': g1['TotalCharges'].mean()})
g2 = g2.fillna({'TotalCharges': g2['TotalCharges'].mean()})

In [65]:
full_data=pd.concat([g1,g2])

In [69]:
full_data['gender'].value_counts()

Male      3555
Female    3488
Name: gender, dtype: int64

In [66]:
full_data.isnull().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [57]:
# NOTE: this statement has no effect on tc. tc[mask] builds a new DataFrame, so the
# in-place fillna only touches that temporary object; the lookup for the sentinel
# value two cells below comes back empty. Writing into tc requires
# tc.loc[mask, 'TotalCharges'] = value.
tc[tc['gender']=='Female']['TotalCharges'].fillna(9090909090909,inplace=True)


In [59]:
tc['gender'].isnull().sum()

0

In [60]:
tc[tc['TotalCharges']==9090909090909]

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn


In [None]:
tc['TotalCharges'].groupby(tc['gender']).mean()

In [None]:
# Fill the missing TotalCharges of male customers with zero tenure using the
# mean TotalCharges of all male customers (mean() skips the missing values).
male=tc.loc[tc['gender']=='Male','TotalCharges'].mean()
con1=(tc['gender']=='Male')& (tc['tenure']==0)
con2=(tc['TotalCharges'].isnull())
con3=con1&con2
# .loc writes straight into tc and touches only the TotalCharges column
tc.loc[con3,'TotalCharges']=male

In [None]:
# Fill the missing TotalCharges of female customers with zero tenure using the
# mean TotalCharges of all female customers (mean() skips the missing values).
female=tc.loc[tc['gender']=='Female','TotalCharges'].mean()
con1=(tc['gender']=='Female')& (tc['tenure']==0)
con2=(tc['TotalCharges'].isnull())
con3=con1&con2
# .loc writes straight into tc and touches only the TotalCharges column
tc.loc[con3,'TotalCharges']=female

In [None]:
tc.info()

In [None]:
tc.describe()

In [None]:
catcols=tc.select_dtypes('object')
catcols

In [None]:
# Collect the categorical columns that have more than two categories, to check
# whether they differ significantly with respect to Churn.
l = [col for col in catcols.columns if catcols[col].nunique() > 2]
print(l)

In [None]:
# These columns have more than two categories, so each one is tested against
# Churn with chi2_contingency; columns with a p-value of at most 0.05 are kept.
from scipy.stats import chi2_contingency
f = []
for col in l:
    table = pd.crosstab(catcols['Churn'], catcols[col])
    p_value = chi2_contingency(table)[1]  # element 1 of the result is the p-value
    if p_value <= 0.05:
        f.append(col)
print(f)

In [None]:
# Collect the categorical columns with exactly two categories, to check whether
# they differ significantly with respect to Churn.
# 'Churn' is the target variable, so it is excluded by name instead of relying
# on it being the last column.
k = [col for col in catcols.columns
     if col != 'Churn' and catcols[col].nunique() == 2]
print(k)

# PROPORTION Z_TEST

In [None]:
# These columns have two categories, so the churn proportions of the two
# categories are compared with proportions_ztest; columns with a p-value of at
# most 0.05 are kept.
from statsmodels.stats.proportion import proportions_ztest
g = []
for col in k:
    table = pd.crosstab(tc['Churn'], tc[col])
    # second crosstab row: counts per category (second category first)
    counts = [table.iloc[1, 1], table.iloc[1, 0]]
    # column totals: number of observations per category, in the same order
    nobs = [table.iloc[:, 1].sum(), table.iloc[:, 0].sum()]
    p_value = proportions_ztest(counts, nobs)[1]
    if p_value <= 0.05:
        g.append(col)
print(g)

In [None]:
# List of categorical whose p<0.05 after proportion_ztest and chi2_contingency test
f_list=f+g
print(f_list,'\n')
print('Total number of required columns:',len(f_list))

In [None]:
# Class balance of the target (no trailing comma, so the Series itself is displayed)
tc['Churn'].value_counts()

In [None]:
f_list

In [None]:
# Share of churned vs. retained customers. The labels are taken from the
# value_counts index so they always match the wedge they describe.
churn_counts = tc['Churn'].value_counts()
plt.pie(churn_counts,explode=(0,0.09),autopct='%1.1f%%',labels=churn_counts.index,shadow=True)
plt.savefig('Churn pct')
plt.show()

In [None]:
# One count plot per selected categorical column, split by Churn.
# The grid grows with the number of columns (4 plots per row).
n_cols = 4
n_rows = max(1, -(-len(f_list) // n_cols))  # ceiling division
fig,axes=plt.subplots(n_rows,n_cols,figsize=(20,5*n_rows))
axes=axes.flatten()
for ax, col in zip(axes, f_list):
    # keyword arguments: current seaborn no longer accepts the data vector positionally
    sns.countplot(x=col,hue='Churn',data=tc,ax=ax)
# hide the grid cells that did not receive a plot
for ax in axes[len(f_list):]:
    ax.set_visible(False)
plt.tight_layout()
plt.savefig('Attribute wise comparison with Churn')
plt.show()

In [None]:
# creating one hot encoding for the selected columns
# The dtype is pinned so the dummies are numeric 0/1 columns on every pandas
# version (newer versions default to bool).
tc_dum=pd.get_dummies(tc[f_list],dtype='uint8')
print(tc_dum.shape)

In [None]:
# Feature matrix = one-hot encoded categoricals + numeric columns; target = Churn
num = tc.select_dtypes(['int64', 'float'])
X = pd.concat([tc_dum, num], axis=1)
y = tc['Churn']

# SPLITTING THE DATA INTO TRAIN AND TEST

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test set; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=7,
)

# LOGISTIC REGRESSION (CLASSIFICATION)

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline logistic regression on all features
logreg = LogisticRegression()
logreg_result = logreg.fit(X_train, y_train)

# mean accuracy on the training and the test split
train_score = logreg_result.score(X_train, y_train)
test_score = logreg_result.score(X_test, y_test)
print("Training set score: {:.3f}".format(train_score))
print("Test score: {:.3f}".format(test_score))

In [None]:
logreg_result

In [None]:
y_pred  = logreg.predict(X_test)
y_proba = logreg.predict_proba(X_test)
y_proba
# y_proba has one column per class, in the order of logreg.classes_ (sorted labels):
# column 0 is P(Churn == 'No') and column 1 is P(Churn == 'Yes').
# Column 1, the churn probability, is the score used for the ROC curve.

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from sklearn.metrics import accuracy_score,classification_report

# Boolean targets for the ROC curve: True marks a churned customer ('Yes').
# A direct comparison always yields a bool Series, whereas replace() depends on
# pandas downcasting the object column.
y_test1 = y_test == 'Yes'
y_train1 = y_train == 'Yes'
# column 1 of predict_proba is the probability of the positive class 'Yes'
fpr, tpr, thresholds = roc_curve(y_test1, y_proba[:,1])

In [None]:
# ROC curve of the logistic regression model and the area under it
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.xlabel('False positive rate (1-Specificity)')
plt.ylabel('True positive rate (Sensitivity)')
plt.title('ROC curve for Logistic Regression')
plt.grid(True)

print("Area under the curve for LOGISTIC REGRESSION: " , roc_auc)

In [None]:
import statsmodels.api as sm
# X_train holds every dummy level of each categorical column; together with the
# intercept added below that design matrix is rank-deficient. For the statsmodels
# fit, one reference level per categorical is dropped and everything is cast to
# float so the model receives a plain numeric matrix.
# NOTE(review): columns that duplicate each other across features can still be
# collinear -- check the summary for unstable coefficients.
X_sm = pd.concat([pd.get_dummies(tc[f_list], drop_first=True), num], axis=1)
X_sm = X_sm.loc[X_train.index].astype(float)  # same rows as the training split
X_train1 = sm.add_constant(X_sm)
logit_model = sm.Logit(y_train1.astype(int), X_train1)
result = logit_model.fit()
print(result.summary2())

# RECURSIVE FEATURE ELIMINATION

In [None]:
from sklearn.feature_selection import RFE

logit = LogisticRegression()

# Keep the 10 best features. n_features_to_select is passed by keyword:
# current scikit-learn no longer accepts it as a positional argument.
rfe = RFE(logit, n_features_to_select=10)
rfe = rfe.fit(X_train,y_train)

print(rfe.support_)   # boolean mask of the selected columns
print(rfe.ranking_)   # selected columns have rank 1
#identified columns Recursive Feature Elimination

In [None]:
# Table of every feature with its RFE support flag and ranking
idc_rfe = pd.DataFrame({
    "rfe_support": rfe.support_,
    "columns": X_train.columns,
    "ranking": rfe.ranking_,
})
# names of the features RFE decided to keep
selected = idc_rfe["rfe_support"] == True
cols = idc_rfe.loc[selected, "columns"].tolist()
cols

In [None]:
# From the RFE we filter the best columns where support is True
X_train_rfe=X_train[cols]
X_test_rfe=X_test[cols]

# RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestClassifier
RF_model=RandomForestClassifier(random_state=10)
RF_results=RF_model.fit(X_train_rfe,y_train)

In [None]:
y_pred_test = RF_model.predict(X_test_rfe)
y_pred_train = RF_model.predict(X_train_rfe)

In [None]:
print(classification_report(y_train,y_pred_train))
print('Train Accuracy Score for Random Forest 1st attempt',accuracy_score(y_train,y_pred_train))

In [None]:
print(classification_report(y_test,y_pred_test))
print('Test Accuracy Score for Random Forest 1st attempt',accuracy_score(y_test,y_pred_test))

# GRID SEARCH

In [None]:
from sklearn.model_selection import GridSearchCV
RF_model=RandomForestClassifier(n_estimators=10,random_state=10)
# 'auto' was only an alias of 'sqrt' for classifiers and is rejected by current
# scikit-learn, so the grid compares 'sqrt' with None (all features) instead.
param_grid_rf={'n_estimators':[11,12,13,9],'max_features':['sqrt',None]}
clf=GridSearchCV(RF_model,param_grid_rf,cv=2)
clf.fit(X_train_rfe,y_train)

In [None]:
clf.best_params_

In [None]:
# Rebuild the forest from whatever the grid search selected instead of a
# hand-copied n_estimators value, so this cell stays correct when the grid changes.
RF_best_model=RandomForestClassifier(random_state=10,**clf.best_params_)
RF_best_model.fit(X_train_rfe,y_train)

In [None]:
y_train_pred=clf.predict(X_train_rfe)
y_test_pred=clf.predict(X_test_rfe)
(y_train_pred,y_test_pred)

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
print(classification_report(y_train,y_train_pred))
print('Train Accuracy Random Forest after Grid Search CV :',accuracy_score(y_train,y_train_pred))

In [None]:
print(classification_report(y_test,y_test_pred))
print('Test Accuracy Random Forest after Grid Search CV :',accuracy_score(y_test,y_test_pred))

In [None]:
y_test_pred=pd.DataFrame(RF_best_model.predict(X_test_rfe))
y_proba_rf = RF_best_model.predict_proba(X_test_rfe)
# y_proba_rf

In [None]:
# Boolean targets for the ROC curve: True marks a churned customer ('Yes').
# A direct comparison always yields a bool Series, whereas replace() depends on
# pandas downcasting the object column.
y_test1 = y_test == 'Yes'
y_train1 = y_train == 'Yes'
# column 1 of predict_proba is the probability of the positive class 'Yes'
fpr1, tpr1, thresholds = roc_curve(y_test1, y_proba_rf[:,1])
(fpr1, tpr1)

In [None]:
plt.plot(fpr1,tpr1)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC curve for Random Forest')
plt.xlabel('False positive rate (1-Specificity)')
plt.ylabel('True positive rate (Sensitivity)')
plt.grid(True)

roc_auc1 = auc(fpr1,tpr1)
print("Area under the curve for RANDOM FOREST: " , roc_auc1)

# BAGGING

In [None]:
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

In [None]:
# Bagged decision trees. random_state on the ensemble fixes the resampling, so
# the scores below are reproducible across runs (the tree seed alone does not
# control which samples each tree sees).
bag_model = BaggingClassifier(DecisionTreeClassifier(random_state=1), random_state=1)
bag_model.fit(X_train_rfe,y_train)

In [None]:
bag_pred = bag_model.predict(X_test_rfe)
accuracy_score(y_test, bag_pred)

In [None]:
bag_cm = confusion_matrix(y_test, bag_pred)
sns.heatmap(bag_cm,annot=True,fmt='d',cmap='Blues_r')
plt.show()

In [None]:
print(classification_report(y_test,bag_pred))

In [None]:
y_pred_bag=bag_model.predict(X_test_rfe)
y_pred_proba_bag=bag_model.predict_proba(X_test_rfe)
fpr2, tpr2, thresholds = roc_curve(y_test1, y_pred_proba_bag[:,1])
(fpr2, tpr2)

In [None]:
plt.plot(fpr2,tpr2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC curve for Bagging')
plt.xlabel('False positive rate (1-Specificity)')
plt.ylabel('True positive rate (Sensitivity)')
plt.grid(True)

roc_auc2 = auc(fpr2,tpr2)
print("Area under the curve for BAGGING: " , roc_auc2)

# K-NEAREST NEIGHBORS

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn=KNeighborsClassifier()
knn.fit(X_train_rfe,y_train)
y_pred_knn=knn.predict(X_test_rfe)

In [None]:
accuracy_score(y_pred_knn,y_test)

In [None]:
y_pred_proba_knn=knn.predict_proba(X_test_rfe)
fpr3, tpr3, thresholds = roc_curve(y_test1, y_pred_proba_knn[:,1])

In [None]:
plt.plot(fpr3,tpr3)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC curve for K-Nearest Neighbors')
plt.xlabel('False positive rate (1-Specificity)')
plt.ylabel('True positive rate (Sensitivity)')
plt.grid(True)

roc_auc3 = auc(fpr3,tpr3)
print("Area under the curve for K-NEAREST NEIGHBORS: " , roc_auc3)

# GRADIENT BOOSTING

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gb_model = GradientBoostingClassifier(learning_rate=0.01,random_state=1)
gb_model.fit(X_train_rfe,y_train)

In [None]:
gb_pred = gb_model.predict(X_test_rfe)
accuracy_score(y_test, gb_pred)

In [None]:
y_pred_gb=gb_model.predict(X_test_rfe)
y_pred_proba_gb=gb_model.predict_proba(X_test_rfe)
fpr4, tpr4, thresholds = roc_curve(y_test1, y_pred_proba_gb[:,1])

In [None]:
# ROC curve of the gradient boosting model and the area under it
plt.plot(fpr4,tpr4)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC curve for Gradient Boosting')
plt.xlabel('False positive rate (1-Specificity)')
plt.ylabel('True positive rate (Sensitivity)')
plt.grid(True)

roc_auc4 = auc(fpr4,tpr4)
print("Area under the curve for GRADIENT BOOSTING: " , roc_auc4)

# COMPARING THE MODELS

In [None]:
# ROC curves of all five models on one set of axes, with their AUC values
plt.figure(figsize=(10,10))

# Every curve carries its own label, so the legend cannot drift out of order:
# fpr2/tpr2 come from the bagging model and fpr3/tpr3 from KNN.
plt.plot(fpr,tpr,label='LOGISTIC REGRESSION')
roc_auc = auc(fpr,tpr)

plt.plot(fpr1,tpr1,label='RANDOM FOREST')
roc_auc1 = auc(fpr1,tpr1)

plt.plot(fpr2,tpr2,label='BAGGING')
roc_auc2 = auc(fpr2,tpr2)

plt.plot(fpr3,tpr3,label='KNN')
roc_auc3 = auc(fpr3,tpr3)

plt.plot(fpr4,tpr4,label='GRADIENT BOOSTING')
roc_auc4 = auc(fpr4,tpr4)

print("Area under the curve for LOGISTIC REGRESSION: " , roc_auc)
print("Area under the curve for RANDOM FOREST:       " , roc_auc1)
print("Area under the curve for BAGGING:             " , roc_auc2)
print("Area under the curve for K-NEAREST NEIGHBORS: " , roc_auc3)
print("Area under the curve for GRADIENT BOOSTING:   " , roc_auc4)


plt.grid()
plt.title('Algorithm Wise ROC Comparison')
plt.legend()
plt.savefig('ROC Curves')

In [None]:
# Models compared by cross-validation. The randomized estimators get the same
# seeds used for them earlier in the notebook so the comparison is reproducible.
models = [('LOGISTIC REGRESSION', LogisticRegression()),
          ('RANDOM FOREST', RandomForestClassifier(random_state=10)),
          ('KNN', KNeighborsClassifier()),
          ('BAGGING',BaggingClassifier(random_state=1)),
          ('GRADIENT BOOSTING', GradientBoostingClassifier(random_state=1))]

In [None]:
seed = 7
results = []
names = []
from sklearn.model_selection import KFold, cross_val_score
# random_state only has an effect together with shuffle=True; current
# scikit-learn raises a ValueError when it is set without shuffling.
# The splitter does not depend on the model, so it is built once.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
# NOTE(review): this cross-validates inside the held-out test split -- confirm
# that is the intended "test accuracy" comparison.
for name, model in models:
    cv_results = cross_val_score(model, X_test_rfe, y_test, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Box plot of the per-fold accuracies of every model (test-split cross-validation)
fig, ax = plt.subplots(figsize=(12,6))
ax.boxplot(results)
ax.set_title('Algorithm Wise Test Accuracy Comparison')
ax.set_xticklabels(names, rotation=45)
ax.set_ylabel('')
fig.savefig('Algorithm Wise Test Accuracy Comparison')
plt.show()

In [None]:
seed_train = 7
results_train = []
names_train = []
from sklearn.model_selection import KFold, cross_val_score
# random_state only has an effect together with shuffle=True; current
# scikit-learn raises a ValueError when it is set without shuffling.
# The splitter does not depend on the model, so it is built once.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed_train)
for name, model in models:
    cv_results = cross_val_score(model, X_train_rfe, y_train, cv=kfold, scoring='accuracy')
    results_train.append(cv_results)
    names_train.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Box plot of the per-fold accuracies of every model (train-split cross-validation).
# plt.subplots already creates the figure; a separate plt.figure call would
# leave an extra empty figure in the output.
fig, ax = plt.subplots(figsize=(12,6))
plt.boxplot(results_train)
plt.title('Algorithm Wise Train Accuracy Comparison')
ax.set_xticklabels(names_train,rotation=45)
plt.ylabel('')
plt.savefig('Algorithm Wise Train Accuracy Comparison')
plt.show()

# CONCLUSION