In [27]:
#importing libraries

import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN
import numpy as np

In [7]:
df=pd.read_csv("df_preprocessed.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56.95,1889.5,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53.85,108.15,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0
3,3,0,42.3,1840.75,0,0,1,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,4,0,70.7,151.65,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,0


In [8]:
df=df.drop('Unnamed: 0',axis=1)

In [9]:
x=df.drop('Churn',axis=1)

In [10]:
y=df['Churn']

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

## Decision Tree

In [12]:
model_dt = DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=6, min_samples_leaf=8)
model_dt.fit(x_train,y_train)
y_pred=model_dt.predict(x_test)
model_dt.score(x_test,y_test)
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1023
           1       0.65      0.49      0.56       384

    accuracy                           0.79      1407
   macro avg       0.74      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407



### Inference:

The accuracy is low, and the *precision* and *recall* of class 1 is low as the dataset is *skewed*.


## Upsampling with SMOTEENN
**Edited nearest Neighbour (ENN):**
It removes the samples whose class label differs from atleast half of its K nearest neighbours


**Synthetic Minority Oversampling Technique (SMOTE):**
It creates synthetic samples by, randomly selecting one of the sample in minority class, calculating the distance between its K nearest neighbours, and multiplying it by a random number between 0 and 1, and then adding it to the data.
<br><br>




In [35]:
from imblearn.combine import SMOTEENN
from collections import Counter

In [36]:
smot = SMOTEENN()
x_upSampled, y_upSampled = smot.fit_resample(x,y)

In [38]:
print("The number of classes before fit {}".format(Counter(y)))
print("The number of classes after fit {}".format(Counter(y_upSampled)))

The number of classes before fit Counter({0: 5163, 1: 1869})
The number of classes after fit Counter({1: 3238, 0: 2661})


In [39]:
xup_train, xup_test, yup_train, yup_test = train_test_split(x_upSampled, y_upSampled, test_size=0.2)

In [40]:
# Training the resampled data with decsion tree

model_dt = DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=6, min_samples_leaf=8)

model_dt.fit(xup_train, yup_train)

yup_pred = model_dt.predict(xup_test)

model_dt.score(xup_test,yup_test)

print(classification_report(yup_test, yup_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.96      0.88      0.92       534
           1       0.91      0.97      0.94       646

    accuracy                           0.93      1180
   macro avg       0.94      0.93      0.93      1180
weighted avg       0.93      0.93      0.93      1180



## Training Random Forests

In [41]:
from sklearn.ensemble import RandomForestClassifier



In [49]:
clf = RandomForestClassifier(n_estimators=150)

clf.fit(xup_train, yup_train)

y_pred = clf.predict(xup_test)

clf.score(xup_test,yup_test)

print(classification_report(yup_test, yup_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.96      0.88      0.92       534
           1       0.91      0.97      0.94       646

    accuracy                           0.93      1180
   macro avg       0.94      0.93      0.93      1180
weighted avg       0.93      0.93      0.93      1180



## Training XGBoost Classifier

In [52]:
from xgboost import XGBClassifier


xgclf = XGBClassifier()

xgclf.fit(xup_train, yup_train)

y_pred = xgclf.predict(xup_test)

xgclf.score(xup_test,yup_test)

print(classification_report(yup_test, yup_pred, labels=[0,1]))



              precision    recall  f1-score   support

           0       0.96      0.88      0.92       534
           1       0.91      0.97      0.94       646

    accuracy                           0.93      1180
   macro avg       0.94      0.93      0.93      1180
weighted avg       0.93      0.93      0.93      1180



## Using PCA

In [50]:
from sklearn.decomposition import PCA

pca = PCA(0.9)

xr_train_pca = pca.fit_transform(xup_train)

xr_test_pca = pca.transform(xup_test)

explained_variance = pca.explained_variance_ratio_

In [51]:
clf = RandomForestClassifier(n_estimators=150)

clf.fit(xup_train, yup_train)

y_pred = clf.predict(xup_test)

clf.score(xup_test,yup_test)

print(classification_report(yup_test, yup_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.96      0.88      0.92       534
           1       0.91      0.97      0.94       646

    accuracy                           0.93      1180
   macro avg       0.94      0.93      0.93      1180
weighted avg       0.93      0.93      0.93      1180



## Conclusion:

* There was a significant improvement in *accuracy*, *precision*, and *recall* after resampling the data with **SMOTEENN**.

* **Decision Trees**, **Random Forests** and **XGBoost** gave similar results so it is fine to use any one of them. 

* **PCA** did not have any impact on the performance

### Saving the model:

In [53]:
import pickle

filename = 'trained_model.sav'

pickle.dump(xgclf, open(filename, 'wb'))