In [3]:


import sys
!{sys.executable} -m pip install xgboost


#Importing libraries
import numpy as np #Linear Algebra
import pandas as pd #Data processing
from sklearn.preprocessing import LabelEncoder #Data Preprocessing
from sklearn.preprocessing import StandardScaler #Data Preprocessing

#Importing Modeling libraries
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV

#Importing Evaluation libraries
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score,f1_score, roc_auc_score,log_loss
from sklearn import metrics 



# Reading Data
telcom = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

#Data Overview
print ("Rows     : " ,telcom.shape[0])
print ("Columns  : " ,telcom.shape[1])
print ("\nFeatures : \n" ,telcom.columns.tolist())
print ("\nMissing values :  ", telcom.isnull().sum().values.sum())
print ("\nUnique values :  \n",telcom.nunique())

#first few rows
telcom.head()



Rows     :  7043
Columns  :  21

Features : 
 ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']

Missing values :   0

Unique values :  
 customerID          7043
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6531
Churn                  2
dtype: int64


You are using pip version 19.0.3, however version 19.2.2 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
#Data Cleaning
#Replacing spaces with null values in total charges column
telcom['TotalCharges'] = telcom["TotalCharges"].replace(" ",np.nan)

#Dropping null values from total charges column
telcom = telcom[telcom["TotalCharges"].notnull()]
telcom = telcom.reset_index()[telcom.columns]

#Data Preprocessing (Before Visualization)  
#convert string to float type
telcom["TotalCharges"] = telcom["TotalCharges"].astype(dtype=np.float64)

#replace 0 and 1 values to Yes and No
telcom["SeniorCitizen"] = telcom["SeniorCitizen"].replace({1:"Yes",0:"No"}) 
    

#replace 'No internet service' to No for the following columns
replace_cols = [ 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                'TechSupport','StreamingTV', 'StreamingMovies']
for i in replace_cols : 
    telcom[i]  = telcom[i].replace({'No internet service' : 'No'}) 
        

#Tenure to categorical column
def tenure_lab(telcom) :
    
    if telcom["tenure"] <= 12 :
        return "Tenure_0-12"
    elif (telcom["tenure"] > 12) & (telcom["tenure"] <= 24 ):
        return "Tenure_12-24"
    elif (telcom["tenure"] > 24) & (telcom["tenure"] <= 48) :
        return "Tenure_24-48"
    elif (telcom["tenure"] > 48) & (telcom["tenure"] <= 60) :
        return "Tenure_48-60"
    elif telcom["tenure"] > 60 :
        return "Tenure_gt_60"
telcom["tenure_group"] = telcom.apply(lambda telcom:tenure_lab(telcom),axis = 1)


In [47]:
#Data Preprocessing 

#Separating churn and non churn customers
#Separating catagorical and numerical columns
#Separating Binary columns with 2 values and more than 2 values
churn     = telcom[telcom["Churn"] == "Yes"]
not_churn = telcom[telcom["Churn"] == "No"]
#customer id col
Id_col     = ['customerID']
#Target columns
target_col = ["Churn"]# target col
#categorical columns
cat_cols   = telcom.nunique()[telcom.nunique() < 6].keys().tolist()
cat_cols   = [x for x in cat_cols if x not in target_col]
#numerical columns
num_cols   = [x for x in telcom.columns if x not in cat_cols + target_col + Id_col]
#Binary columns with 2 values
bin_cols   = telcom.nunique()[telcom.nunique() == 2].keys().tolist()
#Columns more than 2 values
multi_cols = [i for i in cat_cols if i not in bin_cols]

#Label encoding Binary columns
le = LabelEncoder()
for i in bin_cols :
    telcom[i] = le.fit_transform(telcom[i])
    
#Duplicating columns for multi value columns
telcom = pd.get_dummies(data = telcom,columns = multi_cols )

In [48]:
#Scaling Numerical columns
std = StandardScaler()
scaled = std.fit_transform(telcom[num_cols])
scaled = pd.DataFrame(scaled,columns=num_cols)

#dropping original values and merging scaled values for numerical columns
df_telcom_og = telcom.copy()
telcom = telcom.drop(columns = num_cols,axis = 1)
telcom = telcom.merge(scaled,left_index=True,right_index=True,how = "left")
#print (scaled)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [49]:
##Dropping unnecessary descriptive features
telcom = telcom.drop(columns=['tenure','customerID','gender', 'PhoneService', 'OnlineBackup', 'DeviceProtection', 'StreamingTV', 'StreamingMovies', 'MultipleLines_No', 'MultipleLines_No phone service', 'MultipleLines_Yes', 'InternetService_DSL', 'PaymentMethod_Bank transfer (automatic)', 'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Mailed check', 'tenure_group_Tenure_12-24', 'tenure_group_Tenure_24-48', 'tenure_group_Tenure_48-60'])
telcom_tf = telcom.iloc[:,6].values
telcom_df = telcom.drop(columns=['Churn'])

telcom = telcom.drop(columns=['tenure','customerID','gender', 'PhoneService', 'OnlineBackup', 'DeviceProtection', 'StreamingTV', 'StreamingMovies', 'MultipleLines_No', 'MultipleLines_No phone service', 'MultipleLines_Yes', 'InternetService_DSL', 'PaymentMethod_Bank transfer (automatic)', 'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Mailed check', 'tenure_group_Tenure_12-24', 'tenure_group_Tenure_24-48', 'tenure_group_Tenure_48-60'])
#print ("\nUnique values :  \n",telcom.nunique())
telcom = telcom.drop(columns=['Partner','Dependents','OnlineSecurity','TechSupport', 'InternetService_No','Contract_Two year','tenure_group_Tenure_gt_60','MonthlyCharges','TotalCharges'])
#print ("\nUnique values :  \n",telcom.nunique())
telcom_tf = telcom.iloc[:,2]
telcom_df = telcom.drop(columns=['Churn'])
#print(telcom_df)
#print(telcom_tf)

In [50]:
############################# Modeling #################################

In [51]:
# Splitting the dataset into the Training set and Test set  (VotingClassifier)


X = telcom_df
y = telcom_tf

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
v1 = DecisionTreeClassifier(max_depth = 3, splitter  = 'best', criterion = 'entropy', random_state = 0)
v2 = KNeighborsClassifier(algorithm ='auto', n_neighbors = 50)
v3 = RandomForestClassifier (n_estimators = 100 ,max_depth = 3 ,criterion = 'entropy', random_state = 0)
v4 = SVC (C = 10, kernel = 'poly' ,random_state = 0)

voting = VotingClassifier(estimators = [('tree',v1), ('knn',v2), ('rfc',v3), ('SVM',v4)], voting = 'hard')
voting.fit(X_train, y_train)
predictions = voting.predict(X_test)
print(accuracy_score(y_test,predictions))

print('confusion_matrix of KNN: ', confusion_matrix(y_test,predictions))
print('precision_score of KNN: ', precision_score(y_test,predictions))
print('recall_score of KNN: ', recall_score(y_test,predictions))
print('roc_auc_score of KNN: ',roc_auc_score(y_test,predictions))
print('f1_score of KNN: ',f1_score(y_test,predictions))



0.7946530147895335
confusion_matrix of KNN:  [[1239   72]
 [ 289  158]]
precision_score of KNN:  0.6869565217391305
recall_score of KNN:  0.3534675615212528
roc_auc_score of KNN:  0.6492738265272168
f1_score of KNN:  0.4667651403249631


In [54]:
################################################ cross_validation (RandomForestClassifier)
X = telcom_df
y = telcom_tf

rf_classifier = RandomForestClassifier (n_estimators = 1000 ,max_depth = 5 ,criterion = 'gini', bootstrap = 'true', random_state = 0)

scores = cross_val_score(rf_classifier, X, y, cv=10)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("scores_mean",scores.mean())

scores = cross_val_score(rf_classifier, X, y, cv=10, scoring = 'f1')
print("f1",scores.mean())

scores = cross_val_score(rf_classifier, X, y, cv=10, scoring = 'balanced_accuracy')
print("balanced_accuracy",scores.mean())

scores = cross_val_score(rf_classifier, X, y, cv=10, scoring = 'roc_auc')
print("roc_auc",scores.mean())

scores = cross_val_score(rf_classifier, X, y, cv=10, scoring = 'recall')
print("recall",scores.mean())


scores = cross_val_score(rf_classifier, X, y, cv=10, scoring = 'precision')
print("precision",scores.mean())


############################################### GridSearch (RandomForestClassifier)
grid_param = {  
    'n_estimators': [100, 300, 500, 800, 1000],
    'max_depth': [3,4,5],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

gd_sr = GridSearchCV(estimator= rf_classifier,  
                     param_grid=grid_param,
                     scoring='accuracy',
                     cv=5,
                     n_jobs=-1)

gd_sr.fit(X, y)  

best_parameters = gd_sr.best_params_  
print(best_parameters)  

best_result = gd_sr.best_score_  
print(best_result) 

[0.80681818 0.80681818 0.78551136 0.80369844 0.77667141 0.76671408
 0.80654339 0.79800853 0.78378378 0.80911681]
Accuracy: 0.79 (+/- 0.03)
scores_mean 0.7943684166546009
f1 0.5219858252677492
balanced_accuracy 0.6759738429174366
roc_auc 0.8428696719005065
recall 0.42322465643148754
precision 0.6827797382830687
{'bootstrap': True, 'criterion': 'gini', 'max_depth': 5, 'n_estimators': 1000}
0.7973549488054608


In [55]:
################################################ cross_validation (SVM)
X = telcom_df
y = telcom_tf

svm_classifier = SVC (C = 10, kernel = 'poly' ,random_state = 0)
scores = cross_val_score(svm_classifier, X, y, cv=10)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("scores_mean",scores.mean())

scores = cross_val_score(svm_classifier, X, y, cv=10, scoring = 'f1')
print("f1",scores.mean())

scores = cross_val_score(svm_classifier, X, y, cv=10, scoring = 'balanced_accuracy')
print("balanced_accuracy",scores.mean())

scores = cross_val_score(svm_classifier, X, y, cv=10, scoring = 'roc_auc')
print("roc_auc",scores.mean())

scores = cross_val_score(svm_classifier, X, y, cv=10, scoring = 'recall')
print("recall",scores.mean())


scores = cross_val_score(svm_classifier, X, y, cv=10, scoring = 'precision')
print("precision",scores.mean())

############################################### GridSearch (SVM)
grid_param = {  
    'kernel': ['rbf', 'linear', 'poly'],
         'C': [0.001, 0.01, 0.1, 1, 10]  
}

gd_sr = GridSearchCV(estimator= svm_classifier,  
                     param_grid=grid_param,
                     scoring='accuracy',
                     cv=5,
                     n_jobs=-1)


gd_sr.fit(X, y)  

best_parameters = gd_sr.best_params_  
print(best_parameters)  

best_result = gd_sr.best_score_  
print(best_result)




[0.80539773 0.8125     0.78267045 0.81507824 0.78662873 0.77667141
 0.80796586 0.80227596 0.79374111 0.80769231]
Accuracy: 0.80 (+/- 0.03)
scores_mean 0.7990621798187588




f1 0.5480875522204876




balanced_accuracy 0.6906013321369937




roc_auc 0.7976271515346747




recall 0.4590650336380886




precision 0.6811190426214447




{'C': 10, 'kernel': 'poly'}
0.7986348122866894


In [56]:
# ################################################ cross_validation (XGBoost)

X = telcom_df
y = telcom_tf


b_classifier = XGBClassifier(booster= 'gbtree', max_depth = 4)
scores = cross_val_score(b_classifier, X, y, cv=10)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("scores_mean",scores.mean())

scores = cross_val_score(b_classifier, X, y, cv=10, scoring = 'f1')
print("f1",scores.mean())

scores = cross_val_score(b_classifier, X, y, cv=10, scoring = 'balanced_accuracy')
print("balanced_accuracy",scores.mean())

scores = cross_val_score(b_classifier, X, y, cv=10, scoring = 'roc_auc')
print("roc_auc",scores.mean())

scores = cross_val_score(b_classifier, X, y, cv=10, scoring = 'recall')
print("recall",scores.mean())


scores = cross_val_score(b_classifier, X, y, cv=10, scoring = 'precision')
print("precision",scores.mean())

############################################### GridSearch (XGBoost)
grid_param = {  
           'booster' : ['gbtree', 'gblinear'],
         'max_depth' : [4,6,8,10],
}

gd_sr = GridSearchCV(estimator= b_classifier,  
                     param_grid=grid_param,
                     scoring='accuracy',
                     cv=5,
                     n_jobs=-1)

gd_sr.fit(X, y)  

best_parameters = gd_sr.best_params_  
print(best_parameters)  

best_result = gd_sr.best_score_  
print(best_result)


[0.80965909 0.81960227 0.796875   0.81792319 0.79800853 0.77524893
 0.81792319 0.79516358 0.78520626 0.8048433 ]
Accuracy: 0.80 (+/- 0.03)
scores_mean 0.8020453352690196
f1 0.5876084127558684
balanced_accuracy 0.7154992827880428
roc_auc 0.8433689920271729
recall 0.5307515381519176
precision 0.659161145502215
{'booster': 'gbtree', 'max_depth': 4}
0.7989192263936291


In [57]:
################################################ cross_validation (GaussianNB)
X = telcom_df
y = telcom_tf

model = GaussianNB()
scores = cross_val_score(model, X, y, cv=10)
#print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("scores_mean",scores.mean())

scores = cross_val_score(model, X, y, cv=10, scoring = 'f1')
print("f1",scores.mean())

scores = cross_val_score(model, X, y, cv=10, scoring = 'balanced_accuracy')
print("balanced_accuracy",scores.mean())

scores = cross_val_score(model, X, y, cv=10, scoring = 'roc_auc')
print("roc_auc",scores.mean())

scores = cross_val_score(model, X, y, cv=10, scoring = 'recall')
print("recall",scores.mean())


scores = cross_val_score(model, X, y, cv=10, scoring = 'precision')
print("precision",scores.mean())

Accuracy: 0.73 (+/- 0.04)
scores_mean 0.7275324632232527
f1 0.6108187361556892
balanced_accuracy 0.7519777024413117
roc_auc 0.8308168588656747
recall 0.8041630728537749
precision 0.49264960119006895


In [58]:
################################################ cross_validation (KNeighborsClassifier)
X = telcom_df
y = telcom_tf

knn = KNeighborsClassifier(algorithm ='auto', n_neighbors = 50)
scores = cross_val_score(knn, X, y, cv=10)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("scores_mean",scores.mean())

scores = cross_val_score(knn, X, y, cv=10, scoring = 'f1')
print("f1",scores.mean())

scores = cross_val_score(knn, X, y, cv=10, scoring = 'balanced_accuracy')
print("balanced_accuracy",scores.mean())

scores = cross_val_score(knn, X, y, cv=10, scoring = 'roc_auc')
print("roc_auc",scores.mean())

scores = cross_val_score(knn, X, y, cv=10, scoring = 'recall')
print("recall",scores.mean())


scores = cross_val_score(knn, X, y, cv=10, scoring = 'precision')
print("precision",scores.mean())

############################################### GridSearch (KNeighborsClassifier)
grid_param = {  
    'n_neighbors': [3,17, 30, 20, 50],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

gd_sr = GridSearchCV(estimator= knn,  
                     param_grid=grid_param,
                     scoring='accuracy',
                    cv=5,
                     n_jobs=-1 )


gd_sr.fit(X, y)  

best_parameters = gd_sr.best_params_  
print(best_parameters)  

best_result = gd_sr.best_score_  
print(best_result) 


[0.80823864 0.79403409 0.79545455 0.80796586 0.77951636 0.77382646
 0.82503556 0.80369844 0.79089616 0.7962963 ]
Accuracy: 0.80 (+/- 0.03)
scores_mean 0.7974962402593981
f1 0.5879420804079836
balanced_accuracy 0.7164947241475291
roc_auc 0.8356587503658842
recall 0.5435857627508481
precision 0.6408436512261458
{'algorithm': 'auto', 'n_neighbors': 50}
0.7963594994311718


In [59]:
################################################ cross_validation (LogisticRegression)

logistic_classifier = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

scores = cross_val_score(logistic_classifier, X, y, cv=10)

print(scores)

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("scores_mean",scores.mean())

scores = cross_val_score(logistic_classifier, X, y, cv=10, scoring = 'f1')
print("f1",scores.mean())

scores = cross_val_score(logistic_classifier, X, y, cv=10, scoring = 'balanced_accuracy')
print("balanced_accuracy",scores.mean())

scores = cross_val_score(logistic_classifier, X, y, cv=10, scoring = 'roc_auc')
print("roc_auc",scores.mean())

scores = cross_val_score(logistic_classifier, X, y, cv=10, scoring = 'recall')
print("recall",scores.mean())


scores = cross_val_score(logistic_classifier, X, y, cv=10, scoring = 'precision')
print("precision",scores.mean())

[0.80539773 0.80681818 0.78551136 0.80796586 0.78662873 0.77667141
 0.81223329 0.80227596 0.79800853 0.7962963 ]
Accuracy: 0.80 (+/- 0.02)
scores_mean 0.7977807352807353
f1 0.5655183622430324
balanced_accuracy 0.7013319709065964
roc_auc 0.8404721456505619
recall 0.4954401702029786
precision 0.6594176623997514


In [60]:
################################################ cross_validation (DecisionTreeClassifier)
X = telcom_df
y = telcom_tf

dt_classifier = DecisionTreeClassifier(max_depth = 8,  criterion = 'gini', random_state = 0)

scores = cross_val_score(dt_classifier, X, y, cv=10)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("scores_mean",scores.mean())

scores = cross_val_score(dt_classifier, X, y, cv=10, scoring = 'f1')
print("f1",scores.mean())

scores = cross_val_score(dt_classifier, X, y, cv=10, scoring = 'balanced_accuracy')
print("balanced_accuracy",scores.mean())

scores = cross_val_score(dt_classifier, X, y, cv=10, scoring = 'roc_auc')
print("roc_auc",scores.mean())

scores = cross_val_score(dt_classifier, X, y, cv=10, scoring = 'recall')
print("recall",scores.mean())


scores = cross_val_score(dt_classifier, X, y, cv=10, scoring = 'precision')
print("precision",scores.mean())

############################################### GridSearch (DecisionTreeClassifier)
grid_param = {  
    'max_depth': [5,2,16,8,9],
    'criterion': ['gini', 'entropy']
}

gd_sr = GridSearchCV(estimator= rf_classifier,  
                     param_grid=grid_param,
                     scoring='accuracy',
                     cv=5,
                     n_jobs=-1)

gd_sr.fit(X, y)  

best_parameters = gd_sr.best_params_  
print(best_parameters)  

best_result = gd_sr.best_score_  
print(best_result) 


[0.77840909 0.78977273 0.77556818 0.78947368 0.76955903 0.75533428
 0.78378378 0.76529161 0.77240398 0.78205128]
Accuracy: 0.78 (+/- 0.02)
scores_mean 0.7761647654739761
f1 0.5414920290915634
balanced_accuracy 0.6877924613278703
roc_auc 0.805992875127159
recall 0.4991604853084929
precision 0.5948239990936541
{'criterion': 'entropy', 'max_depth': 8}
0.8013367463026166
