In [388]:
import pandas as pd
import numpy as np

In [389]:
# Read the customer-churn table from disk, then show the dtype pandas
# inferred for each column.
churnData = pd.read_csv(filepath_or_buffer='files_for_lab/Customer-Churn.csv')
churnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

Transform `TotalCharges` into a numeric column

In [390]:
# Replace single-space strings with NaN in every column so that TotalCharges
# can be parsed as a float column.
# np.nan is used rather than the np.NaN alias, which NumPy 2.0 removed.
churnData = churnData.apply(lambda col: col.replace(' ', np.nan))
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'])

Fill the missing values in `TotalCharges` with the column mean

In [391]:
# Show how many missing values each column holds before imputing.
display(churnData.isna().sum())

# Impute the missing TotalCharges entries with the column mean
# (the mean is taken over the whole table).
filnum = churnData['TotalCharges'].mean()
churnData['TotalCharges'] = churnData['TotalCharges'].fillna(value=filnum)

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [392]:
# Re-inspect the column dtypes after TotalCharges was converted with pd.to_numeric.
churnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

The features `X` are all numeric columns of the data set; the target `y` is the `Churn` column.

In [393]:
# Target: the Churn label. Features: every numeric column of the table.
y = churnData['Churn']
X = churnData.select_dtypes(include=np.number)

# Preview the first rows of the features, then of the target.
display(X.head(), y.head())

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,0,1,29.85,29.85
1,0,34,56.95,1889.5
2,0,2,53.85,108.15
3,0,45,42.3,1840.75
4,0,2,70.7,151.65


0     No
1     No
2    Yes
3     No
4    Yes
Name: Churn, dtype: object

Train test split and scaling

In [394]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Hold out 20% of the rows for testing. The fixed random_state makes the
# split identical on every run, so the notebook is reproducible after a
# kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply the same scaling to
# the test rows.
scaler = MinMaxScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)

print(X_train_scale.shape)
print(y_train.shape)

(5634, 4)
(5634,)


Fit a logistic regression model and evaluate it on the test set

In [395]:
from sklearn.linear_model import LogisticRegression

# Fit on the scaled training features and show the score on the scaled test
# features (for a classifier, `score` is the mean accuracy).
LR = LogisticRegression().fit(X_train_scale, y_train)
display(LR.score(X_test_scale, y_test))

0.7970191625266146

<function IPython.core.display.display(*objs, include=None, exclude=None, metadata=None, transient=None, display_id=None, **kwargs)>

Other metrics

In [396]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

# Predicted labels for the test rows.
pred = LR.predict(X_test_scale)

print('confusion matrix:', confusion_matrix(y_test, pred))

# One-row summary table for this model, with 'Yes' as the positive class.
# NOTE: LR.score returns the mean accuracy, not an R2 value. The 'R2-Score'
# key is kept as-is so this table lines up column-for-column with the other
# summary tables it is concatenated with later.
lr = pd.DataFrame([{
    'name': 'Not sampled model',
    'R2-Score': LR.score(X_test_scale, y_test),
    'precision': precision_score(y_test, pred, pos_label='Yes'),
    'recall': recall_score(y_test, pred, pos_label='Yes'),
    'f1': f1_score(y_test, pred, pos_label='Yes'),
}])
lr

confusion matrix: [[952  93]
 [193 171]]


Unnamed: 0,name,R2-Score,precision,recall,f1
0,Not sampled model,0.797019,0.647727,0.46978,0.544586


## Upsampling

Check for class imbalance and split the training data by class

In [397]:
from sklearn.utils import resample

# Class counts in the training labels.
display(y_train.value_counts())

# Re-attach the labels to the unscaled training features, then separate the
# rows of each class.
train = pd.concat([X_train, y_train], axis=1)
churn_yes = train.loc[train['Churn'] == 'Yes']
churn_no = train.loc[train['Churn'] == 'No']
print('Yes:', churn_yes.shape, ', No:', churn_no.shape)

No     4129
Yes    1505
Name: Churn, dtype: int64

Yes: (1505, 5) , No: (4129, 5)


Upsample the minority class, refit the model and evaluate it on the test set

In [398]:
# Draw 'Yes' rows with replacement until there are as many as 'No' rows.
# The fixed random_state makes the draw identical on every run.
churn_yes_up = resample(churn_yes,
                        replace=True,
                        n_samples=len(churn_no),
                        random_state=42)

# Balanced training set: upsampled 'Yes' rows plus all 'No' rows.
train_up = pd.concat([churn_yes_up, churn_no])
y_up = train_up.Churn
X_up = train_up.drop(['Churn'], axis=1)

# Reuse the already-fitted scaler so the features are scaled the same way
# as the test set.
X_up_scale = scaler.transform(X_up)
LR_up = LogisticRegression().fit(X_up_scale, y_up)
pred_up = LR_up.predict(X_test_scale)

print('confusion matrix:', confusion_matrix(y_test, pred_up))

# One-row summary table. NOTE: `score` returns the mean accuracy; the
# 'R2-Score' key is kept so the columns match the other summary tables.
lr_up = {'name': 'Oversampled model',
         'R2-Score': LR_up.score(X_test_scale, y_test),
         'precision': precision_score(y_test, pred_up, pos_label='Yes'),
         'recall': recall_score(y_test, pred_up, pos_label='Yes'),
         'f1': f1_score(y_test, pred_up, pos_label='Yes')}
lr_up = pd.DataFrame([lr_up])
lr_up

confusion matrix: [[783 262]
 [106 258]]


Unnamed: 0,name,R2-Score,precision,recall,f1
0,Oversampled model,0.738822,0.496154,0.708791,0.58371


## Downsampling

In [399]:
# Draw, without replacement, as many 'No' rows as there are 'Yes' rows.
# The fixed random_state makes the draw identical on every run.
churn_no_down = resample(churn_no,
                         replace=False,
                         n_samples=len(churn_yes),
                         random_state=42)

# Balanced training set: all 'Yes' rows plus the downsampled 'No' rows.
train_down = pd.concat([churn_yes, churn_no_down])
y_down = train_down.Churn
X_down = train_down.drop(['Churn'], axis=1)

# Reuse the already-fitted scaler so the features are scaled the same way
# as the test set. Its output is passed to fit directly, without an extra
# np.array wrapper.
X_down_scale = scaler.transform(X_down)
LR_down = LogisticRegression().fit(X_down_scale, y_down)
pred_down = LR_down.predict(X_test_scale)

print('confusion matrix:', confusion_matrix(y_test, pred_down))

# One-row summary table. NOTE: `score` returns the mean accuracy; the
# 'R2-Score' key is kept so the columns match the other summary tables.
lr_down = {'name': 'Undersampled model',
           'R2-Score': LR_down.score(X_test_scale, y_test),
           'precision': precision_score(y_test, pred_down, pos_label='Yes'),
           'recall': recall_score(y_test, pred_down, pos_label='Yes'),
           'f1': f1_score(y_test, pred_down, pos_label='Yes')}
lr_down = pd.DataFrame([lr_down])
lr_down

confusion matrix: [[775 270]
 [106 258]]


Unnamed: 0,name,R2-Score,precision,recall,f1
0,Undersampled model,0.733144,0.488636,0.708791,0.578475


Compare results

In [400]:
# Stack the three one-row summaries into one table. ignore_index=True gives
# each row its own label instead of three rows that are all labelled 0.
lr_sol = pd.concat([lr, lr_up, lr_down], ignore_index=True)
lr_sol

Unnamed: 0,name,R2-Score,precision,recall,f1
0,Not sampled model,0.797019,0.647727,0.46978,0.544586
0,Oversampled model,0.738822,0.496154,0.708791,0.58371
0,Undersampled model,0.733144,0.488636,0.708791,0.578475


With the resampled training sets, the `Logistic Regression` model detects more of the customers who will churn (higher recall). In exchange, it also falsely labels more customers as churners (lower precision). This is why the overall score becomes smaller; note that the column labelled `R2-Score` is actually the classification accuracy returned by `score`, not an R² value. The small differences in the scores between the oversampled and the undersampled model can be explained by the randomness of the resampling.