# **Model Training**

In [87]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, mutual_info_classif
import warnings
warnings.filterwarnings('ignore')

In [88]:
df = pd.read_csv('../data/cleaned/final.csv')
df.head()

Unnamed: 0,gender,SeniorCitizen,Marital_Status,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_group
0,Female,0,Married,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,0-19
1,Male,0,Single,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No,20-39
2,Male,0,Single,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,0-19
3,Male,0,Single,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,40-59
4,Female,0,Single,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,0-19


In [89]:
X = df.drop('Churn', axis=1)
y = df['Churn']

In [90]:
le = LabelEncoder()

X_encoded1 = pd.get_dummies(X)
y_encoded = le.fit_transform(y)

In [91]:
X_encoded2 = pd.get_dummies(X, drop_first=True)

In [92]:
X_encoded1.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Marital_Status_Married,Marital_Status_Single,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_0-19,tenure_group_20-39,tenure_group_40-59,tenure_group_60-72
0,0,1,29.85,29.85,True,False,True,False,True,False,True,False,False,True,False,True,False,False,True,False,False,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,True,False,True,False,False,False
1,0,34,56.95,1889.5,False,True,False,True,True,False,False,True,True,False,False,True,False,False,False,False,True,True,False,False,False,False,True,True,False,False,True,False,False,True,False,False,False,True,False,True,False,False,False,False,True,False,True,False,False
2,0,2,53.85,108.15,False,True,False,True,True,False,False,True,True,False,False,True,False,False,False,False,True,False,False,True,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,False,True,True,False,False,False
3,0,45,42.3,1840.75,False,True,False,True,True,False,True,False,False,True,False,True,False,False,False,False,True,True,False,False,False,False,True,False,False,True,True,False,False,True,False,False,False,True,False,True,False,True,False,False,False,False,False,True,False
4,0,2,70.7,151.65,True,False,False,True,True,False,False,True,True,False,False,False,True,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,True,False,True,False,False,False


In [93]:
selector = SelectKBest(score_func=mutual_info_classif, k=10)
X_selected = selector.fit_transform(X_encoded1, y_encoded)

sel_col1 = X_encoded1.columns[selector.get_support()]
X_selected1 = X_encoded1[sel_col1]
sel_col1

Index(['tenure', 'MonthlyCharges', 'InternetService_Fiber optic',
       'OnlineSecurity_No', 'OnlineBackup_No', 'TechSupport_No',
       'Contract_Month-to-month', 'Contract_Two year',
       'PaymentMethod_Electronic check', 'tenure_group_0-19'],
      dtype='object')

In [94]:
selector = SelectKBest(score_func=mutual_info_classif, k=10)
X_selected = selector.fit_transform(X_encoded2, y_encoded)
sel_col2 = X_encoded2.columns[selector.get_support()]
X_selected2 = X_encoded2[sel_col2]
sel_col2

Index(['tenure', 'MonthlyCharges', 'TotalCharges',
       'InternetService_Fiber optic', 'OnlineSecurity_No internet service',
       'OnlineBackup_No internet service', 'StreamingTV_No internet service',
       'StreamingMovies_No internet service', 'Contract_Two year',
       'PaymentMethod_Electronic check'],
      dtype='object')

In [95]:
X_selected2.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,InternetService_Fiber optic,OnlineSecurity_No internet service,OnlineBackup_No internet service,StreamingTV_No internet service,StreamingMovies_No internet service,Contract_Two year,PaymentMethod_Electronic check
0,1,29.85,29.85,False,False,False,False,False,False,True
1,34,56.95,1889.5,False,False,False,False,False,False,False
2,2,53.85,108.15,False,False,False,False,False,False,False
3,45,42.3,1840.75,False,False,False,False,False,False,False
4,2,70.7,151.65,True,False,False,False,False,False,True


In [96]:
X_trainf1, X_testf1, y_trainf1 , y_testf1 = train_test_split(X_selected1, y_encoded, test_size=0.2, random_state=42)

In [97]:
sc = StandardScaler()
X_train_scf1 = sc.fit_transform(X_trainf1)
X_test_scf1 = sc.transform(X_testf1)

### **Model Training (Using Selected Features 1)**

#### KNN

In [98]:
K = []
scores = {}

for k in range(2, 31):
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_train_scf1, y_trainf1)

    train_score = clf.score(X_train_scf1, y_trainf1)
    test_score = clf.score(X_test_scf1, y_testf1)
    K.append(k)

    scores[k] = [train_score, test_score]

In [99]:
for keys, values in scores.items():
    print(keys, ':', values)

2 : [0.8626961483594865, 0.7988587731811697]
3 : [0.8664407988587732, 0.7774607703281027]
4 : [0.8421897289586305, 0.7945791726105563]
5 : [0.838623395149786, 0.7831669044222539]
6 : [0.8261412268188303, 0.8038516405135521]
7 : [0.8293509272467903, 0.7960057061340942]
8 : [0.8232881597717546, 0.8002853067047075]
9 : [0.8202567760342369, 0.7902995720399429]
10 : [0.8150855920114123, 0.8017118402282454]
11 : [0.817225392296719, 0.7960057061340942]
12 : [0.8124108416547788, 0.8052781740370899]
13 : [0.8115192582025678, 0.8024251069900142]
14 : [0.8109843081312411, 0.8074179743223966]
15 : [0.8118758915834522, 0.8045649072753209]
16 : [0.8079529243937232, 0.8081312410841655]
17 : [0.8093794579172611, 0.8109843081312411]
18 : [0.8088445078459344, 0.8131241084165478]
19 : [0.8100927246790299, 0.8152639087018545]
20 : [0.8097360912981455, 0.8088445078459344]
21 : [0.8075962910128388, 0.8138373751783167]
22 : [0.8093794579172611, 0.8138373751783167]
23 : [0.8092011412268189, 0.8152639087018545

In [100]:
model_knn = KNeighborsClassifier(n_neighbors=29)
model_knn.fit(X_train_scf1, y_trainf1)


0,1,2
,n_neighbors,29
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [101]:
y_pred_knn = model_knn.predict(X_test_scf1)

In [102]:
def evaluate(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    cr = classification_report(y_true, y_pred)
    print(cm)
    print(cr)


In [103]:
evaluate(y_testf1, y_pred_knn)

[[967 114]
 [149 172]]
              precision    recall  f1-score   support

           0       0.87      0.89      0.88      1081
           1       0.60      0.54      0.57       321

    accuracy                           0.81      1402
   macro avg       0.73      0.72      0.72      1402
weighted avg       0.81      0.81      0.81      1402



In [104]:
pd.Series(y_trainf1).value_counts()/len(y_trainf1)*100

0    72.610556
1    27.389444
Name: count, dtype: float64

NOTE: The data is imbalanced so will check for other models and conclude if need of oversampling.

#### RandomForest

In [105]:
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X_train_scf1, y_trainf1)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [106]:
y_pred_rf = model_rf.predict(X_test_scf1)

In [107]:
evaluate(y_testf1, y_pred_rf)

[[941 140]
 [160 161]]
              precision    recall  f1-score   support

           0       0.85      0.87      0.86      1081
           1       0.53      0.50      0.52       321

    accuracy                           0.79      1402
   macro avg       0.69      0.69      0.69      1402
weighted avg       0.78      0.79      0.78      1402



#### Naive Bayes

In [108]:
model_nb = GaussianNB()
model_nb.fit(X_train_scf1, y_trainf1)

0,1,2
,priors,
,var_smoothing,1e-09


In [109]:
y_pred_nb = model_nb.predict(X_test_scf1)

In [110]:
evaluate(y_testf1, y_pred_nb)

[[771 310]
 [ 61 260]]
              precision    recall  f1-score   support

           0       0.93      0.71      0.81      1081
           1       0.46      0.81      0.58       321

    accuracy                           0.74      1402
   macro avg       0.69      0.76      0.69      1402
weighted avg       0.82      0.74      0.76      1402



#### SVM

In [111]:
model_svm = SVC(kernel='linear')
model_svm.fit(X_train_scf1, y_trainf1)

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [112]:
y_pred_svm  = model_svm.predict(X_test_scf1)

In [113]:
evaluate(y_testf1, y_pred_svm)

[[1011   70]
 [ 169  152]]
              precision    recall  f1-score   support

           0       0.86      0.94      0.89      1081
           1       0.68      0.47      0.56       321

    accuracy                           0.83      1402
   macro avg       0.77      0.70      0.73      1402
weighted avg       0.82      0.83      0.82      1402



#### **Model Selected from feature set 1 for HPO is -> KNN (Balanced Performance)**

### **Model Training (Using Selected Features 2)**

In [114]:
X_trainf2, X_testf2, y_trainf2, y_testf2 = train_test_split(X_selected2, y_encoded, test_size=0.2, random_state=42)

In [115]:
X_train_scf2 = sc.fit_transform(X_trainf2)
X_test_scf2 = sc.transform(X_testf2)

#### KNN

In [116]:
K = []
scores = {}

for k in range(2, 31):
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_train_scf2, y_trainf2)

    train_score = clf.score(X_train_scf2, y_trainf2)
    test_score = clf.score(X_test_scf2, y_testf2)
    K.append(k)

    scores[k] = [train_score, test_score]

In [117]:
for keys, values in scores.items():
    print(keys, ':', values)

2 : [0.8562767475035663, 0.8017118402282454]
3 : [0.8605563480741797, 0.7781740370898717]
4 : [0.8334522111269614, 0.8095577746077033]
5 : [0.8334522111269614, 0.8009985734664765]
6 : [0.8264978601997147, 0.8088445078459344]
7 : [0.8277460770328102, 0.8074179743223966]
8 : [0.8163338088445078, 0.8102710413694721]
9 : [0.8145506419400856, 0.8031383737517832]
10 : [0.8106276747503567, 0.8109843081312411]
11 : [0.8145506419400856, 0.8031383737517832]
12 : [0.8104493580599144, 0.81169757489301]
13 : [0.8100927246790299, 0.8009985734664765]
14 : [0.8059914407988588, 0.8138373751783167]
15 : [0.8074179743223966, 0.8088445078459344]
16 : [0.8059914407988588, 0.8124108416547788]
17 : [0.8058131241084165, 0.8067047075606276]
18 : [0.8050998573466477, 0.8109843081312411]
19 : [0.8054564907275321, 0.8038516405135521]
20 : [0.8052781740370899, 0.8131241084165478]
21 : [0.8038516405135521, 0.8081312410841655]
22 : [0.8031383737517832, 0.8131241084165478]
23 : [0.806169757489301, 0.8138373751783167]

In [118]:
model_knn = KNeighborsClassifier(n_neighbors=28)
model_knn.fit(X_train_scf2, y_trainf2)


0,1,2
,n_neighbors,28
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [119]:
y_pred_knn = model_knn.predict(X_test_scf2)

In [120]:
evaluate(y_testf2, y_pred_knn)

[[984  97]
 [157 164]]
              precision    recall  f1-score   support

           0       0.86      0.91      0.89      1081
           1       0.63      0.51      0.56       321

    accuracy                           0.82      1402
   macro avg       0.75      0.71      0.72      1402
weighted avg       0.81      0.82      0.81      1402



#### Random Forest

In [121]:
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X_train_scf2, y_trainf2)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [122]:
y_pred_rf = model_rf.predict(X_test_scf2)

In [123]:
evaluate(y_testf2, y_pred_rf)

[[946 135]
 [158 163]]
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      1081
           1       0.55      0.51      0.53       321

    accuracy                           0.79      1402
   macro avg       0.70      0.69      0.70      1402
weighted avg       0.79      0.79      0.79      1402



#### Naive Bayes

In [124]:
model_nb = GaussianNB()
model_nb.fit(X_train_scf2, y_trainf2)

0,1,2
,priors,
,var_smoothing,1e-09


In [125]:
y_pred_nb = model_nb.predict(X_test_scf2)

In [126]:
evaluate(y_testf2, y_pred_rf)

[[946 135]
 [158 163]]
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      1081
           1       0.55      0.51      0.53       321

    accuracy                           0.79      1402
   macro avg       0.70      0.69      0.70      1402
weighted avg       0.79      0.79      0.79      1402



#### SVM

In [127]:
model_svm = SVC(kernel='linear')
model_svm.fit(X_train_scf2, y_trainf2)

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [128]:
y_pred_svm = model_svm.predict(X_test_scf2)

In [130]:
evaluate(y_test,y_pred_svm)

[[975 106]
 [149 172]]
              precision    recall  f1-score   support

           0       0.87      0.90      0.88      1081
           1       0.62      0.54      0.57       321

    accuracy                           0.82      1402
   macro avg       0.74      0.72      0.73      1402
weighted avg       0.81      0.82      0.81      1402



Model with Fesature set 1 is selected , and now we will do oversampling , then we will do hyperparameter tuning.