In [48]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks

In [8]:
# Read the customer churn CSV into a DataFrame and display it.
churn_csv_path = 'files_for_lab/Customer-Churn.csv'
churnData = pd.read_csv(churn_csv_path)
churnData

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [9]:
# Print each column's dtype and non-null count to spot columns that need cleaning.
churnData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   object 
 15  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(13)
memory

In [22]:
# Print the frequency of every distinct value, one column at a time.
for col_name, col_values in churnData.items():
    print(f'Value counts for {col_name}:')
    print(col_values.value_counts())
    print('------')

Value counts for gender:
gender
Male      3555
Female    3488
Name: count, dtype: int64
------
Value counts for SeniorCitizen:
SeniorCitizen
0    5901
1    1142
Name: count, dtype: int64
------
Value counts for Partner:
Partner
No     3641
Yes    3402
Name: count, dtype: int64
------
Value counts for Dependents:
Dependents
No     4933
Yes    2110
Name: count, dtype: int64
------
Value counts for tenure:
tenure
1     613
72    362
2     238
3     200
4     176
     ... 
28     57
39     56
44     51
36     50
0      11
Name: count, Length: 73, dtype: int64
------
Value counts for PhoneService:
PhoneService
Yes    6361
No      682
Name: count, dtype: int64
------
Value counts for OnlineSecurity:
OnlineSecurity
No                     3498
Yes                    2019
No internet service    1526
Name: count, dtype: int64
------
Value counts for OnlineBackup:
OnlineBackup
No                     3088
Yes                    2429
No internet service    1526
Name: count, dtype: int64
------
Valu

In [11]:
# TotalCharges is loaded as an object column; parse it as numbers and turn
# anything that cannot be parsed into NaN instead of raising.
total_charges_numeric = pd.to_numeric(churnData['TotalCharges'], errors='coerce')
churnData['TotalCharges'] = total_charges_numeric

In [12]:
# Count missing values per column after the numeric conversion.
missing_per_column = churnData.isna().sum()
missing_per_column

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [14]:
# Replace the missing TotalCharges values with the mean of the column.
mean_total_charges = churnData['TotalCharges'].mean()
churnData['TotalCharges'] = churnData['TotalCharges'].fillna(mean_total_charges)

In [15]:
# Re-check the per-column missing counts now that TotalCharges has been filled.
remaining_missing = churnData.isna().sum()
remaining_missing

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

### Scaling the features and training the model

In [35]:
# Baseline: logistic regression on the four numeric features, trained on the
# original (imbalanced) class distribution.
X = churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]
y = churnData['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.20, random_state = 42)

# Fit the scaler on the training split only, then apply the same transform to
# the test split.
X_transformer = StandardScaler()
X_train = X_transformer.fit_transform(X_train)
X_test = X_transformer.transform(X_test)

lr = LogisticRegression()
lr.fit(X_train, y_train)
# The predictions must be stored under the same name the report reads;
# otherwise the cell fails (or silently uses stale data) on a fresh kernel.
y_pred = lr.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          No       0.83      0.93      0.88      1036
         Yes       0.70      0.48      0.57       373

    accuracy                           0.81      1409
   macro avg       0.77      0.70      0.72      1409
weighted avg       0.80      0.81      0.79      1409



In [38]:
# Count how many customers fall into each Churn class to gauge the imbalance.
churn_class_counts = churnData['Churn'].value_counts()
churn_class_counts

Churn
No     5174
Yes    1869
Name: count, dtype: int64

In [41]:
# I will use RandomUnderSampler and RandomOverSampler separately to balance the
# data and refit the model, then compare the results.
# This cell covers the oversampling case. Both samplers are imported in the
# notebook's import cell rather than here, so a fresh "Run All" does not depend
# on this cell having been executed.

X = churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]
y = churnData['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.20, random_state = 42)

# Fit the scaler on the training split only, then reuse it for the test split.
X_transformer = StandardScaler()
X_train = X_transformer.fit_transform(X_train)
X_test = X_transformer.transform(X_test)

# Resample the training split only; the test split is left untouched so the
# report is computed on the original class distribution.
over = RandomOverSampler(random_state = 42)
X_train_over, y_train_over = over.fit_resample(X_train, y_train)

lr = LogisticRegression()
lr.fit(X_train_over, y_train_over)
x_pred = lr.predict(X_test)

print(classification_report(y_test, x_pred))

              precision    recall  f1-score   support

          No       0.90      0.73      0.81      1036
         Yes       0.51      0.76      0.61       373

    accuracy                           0.74      1409
   macro avg       0.70      0.75      0.71      1409
weighted avg       0.79      0.74      0.75      1409



- With oversampling, precision went up for the No class (0.83 → 0.90) and down for the Yes class (0.70 → 0.51) compared to the imbalanced data.
- Recall shows the biggest improvement: both classes are now in the mid 70s (0.73 and 0.76), compared with a large gap on the imbalanced data (0.93 vs 0.48).
- The f1-scores are slightly more balanced between the two classes (0.81 / 0.61 versus 0.88 / 0.57).

In [42]:
# Same pipeline as the oversampling experiment, but the training split is
# balanced with RandomUnderSampler instead.
X = churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]
y = churnData['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.20, random_state = 42)

# Fit the scaler on the training split only, then reuse it for the test split.
X_transformer = StandardScaler()
X_train = X_transformer.fit_transform(X_train)
X_test = X_transformer.transform(X_test)

# Resample the training split only; the test split is left untouched.
under = RandomUnderSampler(random_state = 42)

X_train_under, y_train_under = under.fit_resample(X_train, y_train)


lr = LogisticRegression()
lr.fit(X_train_under, y_train_under)
y_pred = lr.predict(X_test)

# Report on this cell's own predictions. Printing `x_pred` here would show the
# predictions left over from a previously executed cell instead of the
# undersampled model's.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          No       0.90      0.74      0.81      1036
         Yes       0.51      0.76      0.61       373

    accuracy                           0.74      1409
   macro avg       0.70      0.75      0.71      1409
weighted avg       0.79      0.74      0.76      1409



- Similar results to the oversampler, but we see a 1-point gain in recall for the No class (0.74 vs 0.73).

# Lab cross validation 

In [47]:
# First I will use SMOTE to upsample the training data and then fit a
# Logistic Regression model.
# NOTE(review): unlike the earlier cells, the features are not scaled here -
# confirm this is intentional.

feature_cols = ['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']
X = churnData[feature_cols]
y = churnData['Churn']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Only the training split is resampled; the test split is left as is.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# fit() returns the estimator itself, so construction and fitting can be chained.
lr = LogisticRegression().fit(X_train_smote, y_train_smote)
x_pred = lr.predict(X_test)

print(classification_report(y_test, x_pred))



              precision    recall  f1-score   support

          No       0.91      0.67      0.78      1036
         Yes       0.48      0.83      0.60       373

    accuracy                           0.71      1409
   macro avg       0.70      0.75      0.69      1409
weighted avg       0.80      0.71      0.73      1409



In [53]:
# And now the same SMOTE-upsampled training data with a decision tree classifier.
X = churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]
y = churnData['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.20, random_state = 42)

# Only the training split is resampled; the test split is left as is.
smote = SMOTE(random_state = 42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Seed the tree like every other stochastic step in this notebook so the
# report below is reproducible from run to run.
clf = DecisionTreeClassifier(random_state = 42)
clf.fit(X_train_smote, y_train_smote)
x_pred = clf.predict(X_test)

print(classification_report(y_test, x_pred))




              precision    recall  f1-score   support

          No       0.84      0.74      0.79      1036
         Yes       0.46      0.60      0.52       373

    accuracy                           0.71      1409
   macro avg       0.65      0.67      0.65      1409
weighted avg       0.74      0.71      0.72      1409



- SMOTE with Logistic Regression is better at catching the Yes class than the decision tree classifier (recall 0.83 vs 0.60), but it has a lower recall for No (0.67 vs 0.74).
- SMOTE with the decision tree classifier is more balanced between the two classes, but it is less precise when predicting Yes (0.46 vs 0.48).

#### Using TomekLinks

In [54]:
# TomekLinks is used here to downsample the training data.
# The same two models are compared again, starting with Logistic Regression.
feature_cols = ['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']
X = churnData[feature_cols]
y = churnData['Churn']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Only the training split is resampled; the test split is left as is.
tomek = TomekLinks()
X_train_down, y_train_down = tomek.fit_resample(X_train, y_train)

# fit() returns the estimator itself, so construction and fitting can be chained.
lr = LogisticRegression().fit(X_train_down, y_train_down)
x_pred = lr.predict(X_test)

print(classification_report(y_test, x_pred))




              precision    recall  f1-score   support

          No       0.85      0.88      0.86      1036
         Yes       0.62      0.56      0.59       373

    accuracy                           0.79      1409
   macro avg       0.73      0.72      0.72      1409
weighted avg       0.79      0.79      0.79      1409



In [55]:
# TomekLinks-downsampled training data with a decision tree classifier.
X = churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]
y = churnData['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.20, random_state = 42)

# Only the training split is resampled; the test split is left as is.
tomek = TomekLinks()

X_train_down, y_train_down = tomek.fit_resample(X_train, y_train)

# Seed the tree like every other stochastic step in this notebook so the
# report below is reproducible from run to run.
clf = DecisionTreeClassifier(random_state = 42)
clf.fit(X_train_down, y_train_down)
x_pred = clf.predict(X_test)

print(classification_report(y_test, x_pred))

              precision    recall  f1-score   support

          No       0.83      0.79      0.81      1036
         Yes       0.48      0.54      0.51       373

    accuracy                           0.72      1409
   macro avg       0.65      0.67      0.66      1409
weighted avg       0.74      0.72      0.73      1409



- Using Logistic Regression with TomekLinks we found our best results in both classes, with higher precision and recall than the decision tree for both Yes and No.