In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [3]:
data=pd.read_csv('files_for_lab/customer_churn.csv')
#data.Churn=data.Churn.replace(('Yes', 'No'), (1, 0), inplace=True)

data.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
data.isna().sum()
#no nan cleaning is needed

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

Task say only use tenure, SeniorCitizen and MonthlyCharges for the logistic regression, to get Churn.
all X_data are numerical so no num cat split 

In [5]:
X=data[['tenure', 'SeniorCitizen', 'MonthlyCharges']]
y=data.Churn
#test-split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

transformer = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(transformer.transform(X_train),columns=X.columns)
y_train = y_train.reset_index(drop=True) 

In [6]:
LR_normal = LogisticRegression(random_state=0, solver='lbfgs').fit(X_train_scaled, y_train)
X_test_scaled=pd.DataFrame(transformer.transform(X_test),columns=X.columns)
pred_normal=LR_normal.predict(X_test_scaled)


All prediction scores for the normal model will be printed hear

In [9]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
print('Score:', LR_normal.score(X_test_scaled,y_test))
print('precision:', precision_score(y_test,pred_normal,pos_label='Yes')) #poslabel defines the variable on which the test should run on
print('recall:', recall_score(y_test,pred_normal,pos_label='Yes'))
print('f1:', f1_score(y_test,pred_normal,pos_label='Yes'))
print('confusion matrix:', confusion_matrix(y_test,pred_normal))


Score: 0.7819420783645656
precision: 0.6323529411764706
recall: 0.45358649789029537
f1: 0.5282555282555284
confusion matrix: [[1162  125]
 [ 259  215]]


Because the most customers does not churn a very simple model could be that it predicts, that no one churn. In this case the system would be over 70% of the cases right.

In [12]:
print(y_test.value_counts(),1287/(1287+474))

No     1287
Yes     474
Name: Churn, dtype: int64 0.7308347529812607


Prework before sampling

In [16]:
train=pd.concat([X_train_scaled,y_train],axis=1)
yes_train=train[train.Churn == 'Yes']
no_train=train[train.Churn == 'No']
display(yes_train.head())
display(no_train.shape)

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,Churn
2,-0.272336,-0.442532,0.333768,Yes
3,0.258288,-0.442532,-1.157408,Yes
4,0.21747,-0.442532,0.988756,Yes
5,-1.047863,-0.442532,-0.179914,Yes
12,-0.966229,-0.442532,0.310495,Yes


(3887, 4)

Oversampling

In [19]:
from sklearn.utils import resample
yes_oversample=resample(yes_train,replace=True,n_samples=len(no_train))
train_oversample=pd.concat([yes_oversample,no_train],axis=0)
y_oversample=train_oversample.Churn
X_oversample=train_oversample.drop('Churn',axis=1)

Create model and test

In [20]:
LR_os = LogisticRegression(random_state=0, solver='lbfgs').fit(X_oversample, y_oversample)
pred_os=LR_os.predict(X_test_scaled)
print('Score:', LR_os.score(X_test_scaled,y_test))
print('precision:', precision_score(y_test,pred_os,pos_label='Yes'))
print('recall:', recall_score(y_test,pred_os,pos_label='Yes'))
print('f1:', f1_score(y_test,pred_os,pos_label='Yes'))
print('confusion matrix:', confusion_matrix(y_test,pred_os))


Score: 0.7234525837592277
precision: 0.4908579465541491
recall: 0.7362869198312236
f1: 0.5890295358649789
confusion matrix: [[925 362]
 [125 349]]


worse precision, but better recall score

Undersample

In [21]:
no_us=resample(no_train,replace=False,n_samples=len(yes_train))
train_us=pd.concat([yes_train,no_us],axis=0)
y_us=train_us.Churn
X_us=train_us.drop('Churn',axis=1)

In [22]:
LR_us = LogisticRegression(random_state=0, solver='lbfgs').fit(X_us, y_us)
pred_us=LR_us.predict(X_test_scaled)
print('Score:', LR_us.score(X_test_scaled,y_test))
print('precision:', precision_score(y_test,pred_us,pos_label='Yes'))
print('recall:', recall_score(y_test,pred_us,pos_label='Yes'))
print('f1:', f1_score(y_test,pred_us,pos_label='Yes'))
print('confusion matrix:', confusion_matrix(y_test,pred_us))


Score: 0.727427597955707
precision: 0.4956521739130435
recall: 0.7215189873417721
f1: 0.5876288659793815
confusion matrix: [[939 348]
 [132 342]]


Similar  results compare to oversampling

Smote

In [26]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=100,k_neighbors=5)
X_sm,y_sm = sm.fit_resample(X_train_scaled,y_train)

LR_sm = LogisticRegression(random_state=0, solver='lbfgs').fit(X_sm, y_sm)
pred_sm=LR_sm.predict(X_test_scaled)
print('Score:', LR_sm.score(X_test_scaled,y_test))
print('precision:', precision_score(y_test,pred_sm,pos_label='Yes'))
print('recall:', recall_score(y_test,pred_sm,pos_label='Yes'))
print('f1:', f1_score(y_test,pred_sm,pos_label='Yes'))
print('confusion matrix:', confusion_matrix(y_test,pred_sm))

Score: 0.7245883021010789
precision: 0.49213161659513593
recall: 0.7257383966244726
f1: 0.586530264279625
confusion matrix: [[932 355]
 [130 344]]


Similar  results compare to oversampling