In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.metrics import recall_score 
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

In [7]:
# Load the churn dataset from a CSV file in the working directory.
DATA_FILE = 'churn_predictionML_data.csv'
data = pd.read_csv(DATA_FILE)

In [8]:
# Preview the first two rows to check the column layout.
data.head(n=2)

Unnamed: 0.1,Unnamed: 0,Tenure,MonthlyCharges,TotalCharges,Gender_Female,Gender_Male,SeniorCitizen_No,SeniorCitizen_Yes,Partner_No,Partner_Yes,...,PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_No,Churn_Yes,tenure_grouped_less than 1 year,tenure_grouped_1-2 years,tenure_grouped_2-3 years,tenure_grouped_3-4 years,tenure_grouped_4-5 years,tenure_grouped_5+ years
0,0,1,29.85,29.85,1,0,1,0,0,1,...,1,0,1,0,1,0,0,0,0,0
1,1,34,56.95,1889.5,0,1,1,0,1,0,...,0,1,1,0,0,0,1,0,0,0


In [10]:
# Drop the 'Unnamed: 0' column (presumably a saved index -- confirm against the
# CSV) and the "No"-style dummy columns, keeping the positive indicator of each
# one-hot pair. 'Churn_No' used to appear twice in this list; it is listed once now.
# NOTE: this cell reassigns `data`, so re-running it without reloading the CSV
# raises KeyError because the columns are already gone.
redundant_columns = [
    'Unnamed: 0',
    'Churn_No',
    'Dependents_No',
    'Gender_Female',
    'Partner_No',
    'PhoneService_No',
    'MultipleLines_No',
    'MultipleLines_No phone service',
    'OnlineSecurity_No',
    'OnlineSecurity_No internet service',
    'OnlineBackup_No',
    'OnlineBackup_No internet service',
    'DeviceProtection_No',
    'DeviceProtection_No internet service',
    'TechSupport_No',
    'TechSupport_No internet service',
    'StreamingTV_No',
    'StreamingTV_No internet service',
    'StreamingMovies_No',
    'StreamingMovies_No internet service',
    'PaperlessBilling_No',
]
data = data.drop(columns=redundant_columns)

In [11]:
# Shorten the surviving one-hot column names to the plain feature name.
short_names = {
    'Gender_Male': 'Gender',
    'Partner_Yes': 'Partner',
    'Dependents_Yes': 'Dependents',
    'PhoneService_Yes': 'PhoneService',
    'MultipleLines_Yes': 'MultipleLines',
    'OnlineBackup_Yes': 'OnlineBackup',
    'OnlineSecurity_Yes': 'OnlineSecurity',
    'DeviceProtection_Yes': 'DeviceProtection',
    'TechSupport_Yes': 'TechSupport',
    'StreamingTV_Yes': 'StreamingTV',
    'StreamingMovies_Yes': 'StreamingMovies',
    'Churn_Yes': 'Churn',
}
data = data.rename(columns=short_names)

In [12]:
# List the distinct column labels left after the drop and rename steps.
data.columns.unique()

Index(['Tenure', 'MonthlyCharges', 'TotalCharges', 'Gender',
       'SeniorCitizen_No', 'SeniorCitizen_Yes', 'Partner', 'Dependents',
       'PhoneService', 'MultipleLines', 'InternetService_DSL',
       'InternetService_Fiber optic', 'InternetService_No', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract_Month-to-month', 'Contract_One year',
       'Contract_Two year', 'PaperlessBilling_Yes',
       'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check', 'Churn',
       'tenure_grouped_less than 1 year', 'tenure_grouped_1-2 years',
       'tenure_grouped_2-3 years', 'tenure_grouped_3-4 years',
       'tenure_grouped_4-5 years', 'tenure_grouped_5+ years'],
      dtype='object')

### Creating x (independent) and y (target) variables

In [13]:
# Separate the target label from the feature matrix.
target_column = 'Churn'
y = data[target_column]
x = data.drop(columns=[target_column])

In [14]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=100,
    test_size=0.2,
)

## Decision Tree Classifier

In [15]:
# Shallow decision tree: depth and leaf-size limits restrain overfitting.
DTC_model = DecisionTreeClassifier(
    max_depth=6,
    min_samples_leaf=8,
    random_state=100,
)

In [17]:
# Train the tree on the original training split.
DTC_model.fit(X=x_train, y=y_train)

### Create prediction variables

In [18]:
# Predicted churn labels for the held-out rows.
y_pred = DTC_model.predict(X=x_test)

In [19]:
# Display the predicted labels (0 = no churn, 1 = churn).
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [20]:
# Mean accuracy of the tree on the held-out split.
DTC_model.score(X=x_test, y=y_test)

0.7711442786069652

In [21]:
from sklearn.metrics import precision_score, recall_score, f1_score


In [22]:
# Per-class precision / recall / F1 for the tree trained on the original data.
dtc_report = classification_report(y_true=y_test, y_pred=y_pred, labels=[0, 1])
print(dtc_report)

              precision    recall  f1-score   support

           0       0.84      0.84      0.84      1013
           1       0.59      0.60      0.60       394

    accuracy                           0.77      1407
   macro avg       0.72      0.72      0.72      1407
weighted avg       0.77      0.77      0.77      1407



#### Looking at the minority class (churners), the precision, recall and F1-score are noticeably lower than for the majority class

This is because the data is imbalanced

In [23]:
# Rows are the true classes, columns the predicted classes.
dtc_confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(dtc_confusion)

[[848 165]
 [157 237]]


### The prediction has many false positives (165) and false negatives (157) because the data is imbalanced

In [24]:
# SMOTEENN combines SMOTE over-sampling with Edited Nearest Neighbours cleaning
# to balance the classes. Both steps are stochastic, so the sampler is seeded
# like every other step in this notebook; without a seed each run yields a
# different resampled dataset and different scores.
snt = SMOTEENN(random_state=100)

In [25]:
# create x and y variables
x_balanced, y_balanced = snt.fit_resample(x,y)

In [26]:
xbalanced_train, xbalanced_test, ybalanced_train, ybalanced_test = train_test_split(x_balanced,y_balanced, test_size = 0.2, random_state = 100)

In [27]:
# Fresh tree with the same limits as before; it replaces the earlier DTC_model.
DTC_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=6,
    min_samples_leaf=8,
    random_state=100,
)

In [28]:
# Train the tree on the balanced training data.
DTC_model.fit(X=xbalanced_train, y=ybalanced_train)

In [30]:
# Predictions of the tree trained on balanced data.
ybalanced_pred = DTC_model.predict(X=xbalanced_test)

In [31]:
# Display the predicted labels from the tree trained on balanced data.
ybalanced_pred

array([1, 0, 1, ..., 0, 1, 1], dtype=int64)

In [33]:
# Mean accuracy of the tree trained on balanced data.
DTC_model.score(X=xbalanced_test, y=ybalanced_test)

0.9283246977547496

In [34]:
# Per-class metrics for the DecisionTreeClassifier trained on balanced data.
dtc_balanced_report = classification_report(
    y_true=ybalanced_test,
    y_pred=ybalanced_pred,
    labels=[0, 1],
)
print(dtc_balanced_report)

              precision    recall  f1-score   support

           0       0.94      0.91      0.92       549
           1       0.92      0.94      0.93       609

    accuracy                           0.93      1158
   macro avg       0.93      0.93      0.93      1158
weighted avg       0.93      0.93      0.93      1158



#### With a balanced dataset the precision, recall and F1-score have increased
Accuracy has increased as well

In [35]:
# Confusion matrix of the tree trained on balanced data (rows = true class).
dtc_balanced_confusion = confusion_matrix(y_true=ybalanced_test, y_pred=ybalanced_pred)
print(dtc_balanced_confusion)

[[500  49]
 [ 34 575]]


### Now that the data is balanced, there are fewer errors in the confusion matrix: fewer false positives and fewer false negatives

## Using a different classifier to compare model accuracy

## Random Forest Classifier

In [36]:
from sklearn.ensemble import RandomForestClassifier

In [37]:
# Random forest with the same depth and leaf-size limits as the decision tree.
RFC_model = RandomForestClassifier(
    max_depth=6,
    min_samples_leaf=8,
    random_state=100,
)

In [38]:
# Train the forest on the original training split.
RFC_model.fit(X=x_train, y=y_train)

### Creating prediction variables

In [39]:
# Predicted churn labels from the forest for the held-out rows.
y2_pred = RFC_model.predict(X=x_test)

In [40]:
# Display the forest's predicted labels.
y2_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [43]:
# Mean accuracy of the forest on the held-out split.
RFC_model.score(X=x_test, y=y_test)

0.7917555081734187

In [44]:
# Per-class precision / recall / F1 for the forest trained on the original data.
rfc_report = classification_report(y_true=y_test, y_pred=y2_pred, labels=[0, 1])
print(rfc_report)

              precision    recall  f1-score   support

           0       0.82      0.91      0.86      1013
           1       0.68      0.49      0.57       394

    accuracy                           0.79      1407
   macro avg       0.75      0.70      0.72      1407
weighted avg       0.78      0.79      0.78      1407



In [45]:
# Confusion matrix of the forest trained on the original data (rows = true class).
rfc_confusion = confusion_matrix(y_true=y_test, y_pred=y2_pred)
print(rfc_confusion)

[[921  92]
 [201 193]]


### Create new model with balanced data

In [46]:
# Second forest with the same settings, to be trained on the balanced dataset.
RFC_model_balanced = RandomForestClassifier(
    max_depth=6,
    min_samples_leaf=8,
    random_state=100,
)

In [47]:
# Train the second forest on the balanced training data.
RFC_model_balanced.fit(X=xbalanced_train, y=ybalanced_train)

In [48]:
# Predictions of the forest trained on balanced data.
y2balanced_pred = RFC_model_balanced.predict(X=xbalanced_test)

In [49]:
# Display the predicted labels from the forest trained on balanced data.
y2balanced_pred

array([1, 0, 1, ..., 0, 1, 1], dtype=int64)

In [50]:
# Mean accuracy of the forest trained on balanced data.
RFC_model_balanced.score(X=xbalanced_test, y=ybalanced_test)

0.9360967184801382

In [51]:
# Per-class metrics for the RandomForestClassifier trained on balanced data.
rfc_balanced_report = classification_report(
    y_true=ybalanced_test,
    y_pred=y2balanced_pred,
    labels=[0, 1],
)
print(rfc_balanced_report)

              precision    recall  f1-score   support

           0       0.96      0.91      0.93       549
           1       0.92      0.96      0.94       609

    accuracy                           0.94      1158
   macro avg       0.94      0.93      0.94      1158
weighted avg       0.94      0.94      0.94      1158



In [52]:
# Confusion matrix of the forest trained on balanced data (rows = true class).
rfc_balanced_confusion = confusion_matrix(y_true=ybalanced_test, y_pred=y2balanced_pred)
print(rfc_balanced_confusion)

[[497  52]
 [ 22 587]]


### Compare confusion matrix and classification score for both models

#### Confusion matrix

In [53]:
# Side-by-side comparison, part 1: DecisionTreeClassifier trained on balanced data.
dtc_matrix = confusion_matrix(y_true=ybalanced_test, y_pred=ybalanced_pred)
print(dtc_matrix)

[[500  49]
 [ 34 575]]


In [54]:
# Side-by-side comparison, part 2: RandomForestClassifier trained on balanced data.
rfc_matrix = confusion_matrix(y_true=ybalanced_test, y_pred=y2balanced_pred)
print(rfc_matrix)

[[497  52]
 [ 22 587]]


#### Classification report

In [55]:
# Classification report of the DecisionTreeClassifier model with balanced data.
dtc_summary = classification_report(
    y_true=ybalanced_test,
    y_pred=ybalanced_pred,
    labels=[0, 1],
)
print(dtc_summary)

              precision    recall  f1-score   support

           0       0.94      0.91      0.92       549
           1       0.92      0.94      0.93       609

    accuracy                           0.93      1158
   macro avg       0.93      0.93      0.93      1158
weighted avg       0.93      0.93      0.93      1158



In [56]:
# Classification report of the RandomForestClassifier model with balanced data.
rfc_summary = classification_report(
    y_true=ybalanced_test,
    y_pred=y2balanced_pred,
    labels=[0, 1],
)
print(rfc_summary)

              precision    recall  f1-score   support

           0       0.96      0.91      0.93       549
           1       0.92      0.96      0.94       609

    accuracy                           0.94      1158
   macro avg       0.94      0.93      0.94      1158
weighted avg       0.94      0.94      0.94      1158



Both models produce very similar results.

The RandomForestClassifier has higher precision and a higher F1-score for non-churners,

and higher recall and a higher F1-score for churners.

### Save model

In [57]:
import pickle

In [58]:
# File name (relative to the working directory) used to persist the trained forest.
model_name = "RFchurnprediction_model.sav"

In [60]:
# Serialize the forest trained on balanced data. The context manager closes
# (and flushes) the file handle even if pickling fails; the previous bare
# open() call left the handle to be closed whenever garbage collection ran.
with open(model_name, 'wb') as model_file:
    pickle.dump(RFC_model_balanced, model_file)

### Load model

In [61]:
# Read the persisted model back to confirm the round trip works.
# NOTE: pickle.load can execute arbitrary code -- only unpickle files that
# this notebook (or another trusted source) created.
with open(model_name, 'rb') as model_file:
    load_model = pickle.load(model_file)

In [62]:
# The reloaded model should reproduce the accuracy of RFC_model_balanced.
load_model.score(X=xbalanced_test, y=ybalanced_test)

0.9360967184801382