Importing necessary Libraries

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

Reading data

In [5]:
df = pd.read_csv("Telco-Customer-Churn (1).csv")
df = df.drop('customerID', axis = 1)
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


Information of Data

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


Necessary Edge-Case Conversion

In [7]:
df = df[df["TotalCharges"] != " "]
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['TotalCharges'] = pd.to_numeric(df['TotalCharges'])


Pre-Processing Functions

In [8]:
def target_transformation(df,target,targetidx):
    for i in range(0,len(df)):
        if df.iloc[i,targetidx] == "No":
            df.iloc[i,targetidx] = "0"
        else:
            df.iloc[i,targetidx] = "1"
    df[target] = pd.to_numeric(df[target])
    return df

def MinMaxScaling(df,target):
    columns = df.columns
    idx = -1
    for col in columns:
        idx += 1
        if col == target:
            continue
        typeCol = str(df[col].dtype)
        if(typeCol != 'object'):
            df[col] = pd.to_numeric(df[col])
            MIN = df[col].min()
            MAX = df[col].max()
            for i in range(0,len(df)):
                df.iloc[i,idx] = (df.iloc[i,idx] - MIN) / (MAX - MIN)
    return df

def OneHotEncoding_objects(df):
    columns = df.columns
    for col in columns:
        typeCol = str(df[col].dtype)
        if typeCol == 'object':
            enc = pd.get_dummies(df[col])
            encCol = enc.columns
            newColumns = {}
            for i in range(0,len(encCol)):
                newColumns[encCol[i]] = col + encCol[i]
            enc.rename(columns = newColumns, inplace = True)
            df = df.join(enc)
            df = df.drop([col],axis=1)
    return df

Processing the Data

In [9]:
data = OneHotEncoding_objects(MinMaxScaling(target_transformation(df,'Churn',len(df.columns.to_list())-1),'Churn'))
data.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,genderFemale,genderMale,PartnerNo,PartnerYes,DependentsNo,...,StreamingMoviesYes,ContractMonth-to-month,ContractOne year,ContractTwo year,PaperlessBillingNo,PaperlessBillingYes,PaymentMethodBank transfer (automatic),PaymentMethodCredit card (automatic),PaymentMethodElectronic check,PaymentMethodMailed check
0,0,0.0,0.115423,0.001275,0,1,0,0,1,1,...,0,1,0,0,0,1,0,0,1,0
1,0,0.464789,0.385075,0.215867,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,1
2,0,0.014085,0.354229,0.01031,1,0,1,1,0,1,...,0,1,0,0,0,1,0,0,0,1
3,0,0.619718,0.239303,0.210241,0,0,1,1,0,1,...,0,0,1,0,1,0,1,0,0,0
4,0,0.014085,0.521891,0.01533,1,1,0,1,0,1,...,0,1,0,0,0,1,0,0,1,0


Report Generation for a Model

In [10]:
def Generate_Report(y_pred,y_test):
    print("Accuracy Score is: " + str(accuracy_score(y_test,y_pred)))
    cmat = confusion_matrix(y_test,y_pred)
    print("Confusion Matrix: ")
    print(cmat)
    crep = classification_report(y_test,y_pred)
    print("Classification Report: ")
    print(crep)

Test-Train Splitting the data for the Models

In [12]:
y = data['Churn']
x_columns = data.columns.to_list()
x_columns.remove('Churn')
x = data[x_columns]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 1324, shuffle = True)

**(Without GridSearch & RandomizedSearch) Training the Models**

SVM Classifier

In [13]:
svcModel = SVC()
svcModel.fit(x_train,y_train)
Generate_Report(svcModel.predict(x_test),y_test)

Accuracy Score is: 0.7919431279620853
Confusion Matrix: 
[[1399  150]
 [ 289  272]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1549
           1       0.64      0.48      0.55       561

    accuracy                           0.79      2110
   macro avg       0.74      0.69      0.71      2110
weighted avg       0.78      0.79      0.78      2110



Decision Tree Classifier

In [14]:
dtModel = DecisionTreeClassifier()
dtModel.fit(x_train,y_train)
Generate_Report(dtModel.predict(x_test),y_test)

Accuracy Score is: 0.7322274881516587
Confusion Matrix: 
[[1270  279]
 [ 286  275]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.82      0.82      0.82      1549
           1       0.50      0.49      0.49       561

    accuracy                           0.73      2110
   macro avg       0.66      0.66      0.66      2110
weighted avg       0.73      0.73      0.73      2110



Random Forest Classifier

In [15]:
rfModel = RandomForestClassifier()
rfModel.fit(x_train,y_train)
Generate_Report(rfModel.predict(x_test),y_test)

Accuracy Score is: 0.7753554502369668
Confusion Matrix: 
[[1375  174]
 [ 300  261]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.82      0.89      0.85      1549
           1       0.60      0.47      0.52       561

    accuracy                           0.78      2110
   macro avg       0.71      0.68      0.69      2110
weighted avg       0.76      0.78      0.77      2110



AdaBoost Classifier

In [16]:
abModel = AdaBoostClassifier()
abModel.fit(x_train,y_train)
Generate_Report(abModel.predict(x_test),y_test)

Accuracy Score is: 0.79478672985782
Confusion Matrix: 
[[1394  155]
 [ 278  283]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.83      0.90      0.87      1549
           1       0.65      0.50      0.57       561

    accuracy                           0.79      2110
   macro avg       0.74      0.70      0.72      2110
weighted avg       0.78      0.79      0.79      2110



**Hyper-Parameter Tuning with GridSearchCV and RandomizedSearchCV**

SVM Classifier

In [21]:
svmHModel = SVC()
svcparams = [{'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
              'probability' : [True, False]}]
GSsvm = GridSearchCV(svmHModel, svcparams, scoring='accuracy')
RSsvm = RandomizedSearchCV(svmHModel, svcparams, scoring='accuracy')
GSsvm.fit(x_train, y_train)
RSsvm.fit(x_train, y_train)
print('Grid Search Report')
Generate_Report(GSsvm.predict(x_test),y_test)
print('Randomized Search Report')
Generate_Report(RSsvm.predict(x_test),y_test)



Grid Search Report
Accuracy Score is: 0.8023696682464455
Confusion Matrix: 
[[1387  162]
 [ 255  306]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1549
           1       0.65      0.55      0.59       561

    accuracy                           0.80      2110
   macro avg       0.75      0.72      0.73      2110
weighted avg       0.79      0.80      0.80      2110

Randomized Search Report
Accuracy Score is: 0.8023696682464455
Confusion Matrix: 
[[1387  162]
 [ 255  306]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1549
           1       0.65      0.55      0.59       561

    accuracy                           0.80      2110
   macro avg       0.75      0.72      0.73      2110
weighted avg       0.79      0.80      0.80      2110



Decision Tree Classifier

In [22]:
dtHModel = DecisionTreeClassifier()
dtparams = [{'criterion' : ['gini','entropy','log_loss'],
             'max_features' : [None,"sqrt","log2"],
             'max_depth' : [None,10,20,30]}]
GSdt = GridSearchCV(dtHModel, dtparams, scoring='accuracy')
RSdt = RandomizedSearchCV(dtHModel, dtparams, scoring='accuracy')
GSdt.fit(x_train, y_train)
RSdt.fit(x_train, y_train)
print('Grid Search Report')
Generate_Report(GSdt.predict(x_test),y_test)
print('Randomized Search Report')
Generate_Report(RSdt.predict(x_test),y_test)

Grid Search Report
Accuracy Score is: 0.752132701421801
Confusion Matrix: 
[[1327  222]
 [ 301  260]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.82      0.86      0.84      1549
           1       0.54      0.46      0.50       561

    accuracy                           0.75      2110
   macro avg       0.68      0.66      0.67      2110
weighted avg       0.74      0.75      0.75      2110

Randomized Search Report
Accuracy Score is: 0.7663507109004739
Confusion Matrix: 
[[1335  214]
 [ 279  282]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.83      0.86      0.84      1549
           1       0.57      0.50      0.53       561

    accuracy                           0.77      2110
   macro avg       0.70      0.68      0.69      2110
weighted avg       0.76      0.77      0.76      2110



Random Forest Classifier

In [27]:
rfHModel = RandomForestClassifier()
rfparams = [{'criterion' : ['gini','entropy','log_loss'],
             'max_depth' : [None,10,20,30],
             'n_estimators' : [80,100,120]}]
GSrf = GridSearchCV(rfHModel, rfparams, scoring='accuracy')
RSrf = RandomizedSearchCV(rfHModel, rfparams, scoring='accuracy')
GSrf.fit(x_train, y_train)
RSrf.fit(x_train, y_train)
print('Grid Search Report')
Generate_Report(GSrf.predict(x_test),y_test)
print('Randomized Search Report')
Generate_Report(RSrf.predict(x_test),y_test)

Grid Search Report
Accuracy Score is: 0.7981042654028436
Confusion Matrix: 
[[1392  157]
 [ 269  292]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1549
           1       0.65      0.52      0.58       561

    accuracy                           0.80      2110
   macro avg       0.74      0.71      0.72      2110
weighted avg       0.79      0.80      0.79      2110

Randomized Search Report
Accuracy Score is: 0.7966824644549763
Confusion Matrix: 
[[1397  152]
 [ 277  284]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.83      0.90      0.87      1549
           1       0.65      0.51      0.57       561

    accuracy                           0.80      2110
   macro avg       0.74      0.70      0.72      2110
weighted avg       0.79      0.80      0.79      2110



AdaBoost Classifier

In [26]:
abHModel = AdaBoostClassifier()
abparams = [{'algorithm' : ['SAMME','SAMME.R'],
             'n_estimators' : [30, 50, 70, 90]}]
GSab = GridSearchCV(abHModel, abparams, scoring='accuracy')
RSab = RandomizedSearchCV(abHModel, abparams, scoring='accuracy')
GSab.fit(x_train, y_train)
RSab.fit(x_train, y_train)
print('Grid Search Report')
Generate_Report(GSab.predict(x_test),y_test)
print('Randomized Search Report')
Generate_Report(RSab.predict(x_test),y_test)



Grid Search Report
Accuracy Score is: 0.79478672985782
Confusion Matrix: 
[[1394  155]
 [ 278  283]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.83      0.90      0.87      1549
           1       0.65      0.50      0.57       561

    accuracy                           0.79      2110
   macro avg       0.74      0.70      0.72      2110
weighted avg       0.78      0.79      0.79      2110

Randomized Search Report
Accuracy Score is: 0.79478672985782
Confusion Matrix: 
[[1394  155]
 [ 278  283]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.83      0.90      0.87      1549
           1       0.65      0.50      0.57       561

    accuracy                           0.79      2110
   macro avg       0.74      0.70      0.72      2110
weighted avg       0.78      0.79      0.79      2110



**Conclusion**

We got SVC as the highest accuracy after Hyper Parameter Tuning with an accuracy of 0.8024, and all the algorithms got their accuracy boosted by 2% after Grid and Random SearchCV.
