In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_text
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.linear_model import LogisticRegression

In [2]:
churndata = pd.read_csv('./files_for_lab/Customer-Churn.csv')
churndata.columns = [i.lower().replace(' ', '_') for i in churndata.columns]

In [3]:
churndata

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,monthlycharges,totalcharges,churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [4]:
churndata.dtypes

gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [5]:
churndata['totalcharges'] = churndata['totalcharges'].apply(pd.to_numeric, errors='coerce')

In [6]:
churndata.isna().sum()

gender               0
seniorcitizen        0
partner              0
dependents           0
tenure               0
phoneservice         0
onlinesecurity       0
onlinebackup         0
deviceprotection     0
techsupport          0
streamingtv          0
streamingmovies      0
contract             0
monthlycharges       0
totalcharges        11
churn                0
dtype: int64

In [7]:
churndata[churndata['totalcharges'].isna()]

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,monthlycharges,totalcharges,churn
488,Female,0,Yes,Yes,0,No,Yes,No,Yes,Yes,Yes,No,Two year,52.55,,No
753,Male,0,No,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,20.25,,No
936,Female,0,Yes,Yes,0,Yes,Yes,Yes,Yes,No,Yes,Yes,Two year,80.85,,No
1082,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,25.75,,No
1340,Female,0,Yes,Yes,0,No,Yes,Yes,Yes,Yes,Yes,No,Two year,56.05,,No
3331,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,19.85,,No
3826,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,25.35,,No
4380,Female,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,20.0,,No
5218,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,19.7,,No
6670,Female,0,Yes,Yes,0,Yes,No,Yes,Yes,Yes,Yes,No,Two year,73.35,,No


In [8]:
churndata.totalcharges.fillna(churndata.monthlycharges, inplace=True) # filling the NaNs with at the value of a monthly charge, cause this would be the case for a fresh contract, filling with 0 seems to make no sense

In [9]:
churndata.tenure.value_counts()

1     613
72    362
2     238
3     200
4     176
     ... 
28     57
39     56
44     51
36     50
0      11
Name: tenure, Length: 73, dtype: int64

In [10]:
churndata.seniorcitizen.value_counts() #result will be the same if using MinMaxScaler (0.0/1.0)

0    5901
1    1142
Name: seniorcitizen, dtype: int64

In [11]:
churndataf = churndata[['seniorcitizen','tenure','monthlycharges','totalcharges','churn']]

In [12]:
churndataf

Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges,churn
0,0,1,29.85,29.85,No
1,0,34,56.95,1889.50,No
2,0,2,53.85,108.15,Yes
3,0,45,42.30,1840.75,No
4,0,2,70.70,151.65,Yes
...,...,...,...,...,...
7038,0,24,84.80,1990.50,No
7039,0,72,103.20,7362.90,No
7040,0,11,29.60,346.45,No
7041,1,4,74.40,306.60,Yes


In [13]:
y = churndataf['churn']
X = churndataf.drop(['churn'], axis=1)

In [14]:
MinMaxtransformer = MinMaxScaler().fit(X)
x_normalized = MinMaxtransformer.transform(X)
print(x_normalized.shape)
x_normalized = pd.DataFrame(x_normalized,columns=X.columns)
x_normalized.head()

(7043, 4)


Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges
0,0.0,0.013889,0.115423,0.001275
1,0.0,0.472222,0.385075,0.215867
2,0.0,0.027778,0.354229,0.01031
3,0.0,0.625,0.239303,0.210241
4,0.0,0.027778,0.521891,0.01533


In [15]:
y.value_counts()

No     5174
Yes    1869
Name: churn, dtype: int64

In [16]:
churndataf_scaled =pd.concat([x_normalized, y],axis=1)

In [17]:
churndataf_scaled

Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges,churn
0,0.0,0.013889,0.115423,0.001275,No
1,0.0,0.472222,0.385075,0.215867,No
2,0.0,0.027778,0.354229,0.010310,Yes
3,0.0,0.625000,0.239303,0.210241,No
4,0.0,0.027778,0.521891,0.015330,Yes
...,...,...,...,...,...
7038,0.0,0.333333,0.662189,0.227521,No
7039,0.0,1.000000,0.845274,0.847461,No
7040,0.0,0.152778,0.112935,0.037809,No
7041,1.0,0.055556,0.558706,0.033210,Yes


In [18]:
X = x_normalized

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1337)

In [20]:
classification = LogisticRegression(random_state=0, solver='saga',
                  multi_class='multinomial').fit(X_train, y_train)

In [21]:
predictions = classification.predict(X_train)
classification.score(X_train, y_train)

0.7912590722625434

In [22]:
predictions = classification.predict(X_test)
classification.score(X_test, y_test)

0.798581560283688

# downsampling

In [23]:
category0 = churndataf_scaled[churndataf_scaled['churn'] == 'No']
category1 = churndataf_scaled[churndataf_scaled['churn'] == 'Yes']

In [24]:
category0_undersampled = resample(category0, 
                                   replace=False, 
                                   n_samples = len(category1))

In [25]:
print(category0_undersampled.shape)
print(category1.shape)

(1869, 5)
(1869, 5)


In [26]:
data_downsampled = pd.concat([category0_undersampled, category1], axis=0)

In [27]:
data_downsampled

Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges,churn
2380,0.0,0.833333,0.850746,0.686724,No
3607,0.0,0.180556,0.367164,0.083556,No
359,0.0,0.583333,0.784577,0.461337,No
4750,0.0,0.125000,0.211940,0.040872,No
5136,1.0,0.027778,0.540796,0.015636,No
...,...,...,...,...,...
7021,0.0,0.166667,0.413433,0.081814,Yes
7026,0.0,0.125000,0.258209,0.044375,Yes
7032,1.0,0.013889,0.572139,0.006572,Yes
7034,0.0,0.930556,0.842786,0.792459,Yes


In [28]:
y = data_downsampled['churn']
X = data_downsampled.drop(['churn'], axis=1)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1337)

In [30]:
classification = LogisticRegression(random_state=0, solver='saga',
                  multi_class='multinomial').fit(X_train, y_train)

In [31]:
predictions = classification.predict(X_train)
classification.score(X_train, y_train)

0.7327586206896551

In [32]:
predictions = classification.predict(X_test)
classification.score(X_test, y_test)

0.6978609625668449

# upsampling

In [33]:
category0 = churndataf_scaled[churndataf_scaled['churn'] == 'No']
category1 = churndataf_scaled[churndataf_scaled['churn'] == 'Yes']

In [34]:
category1_oversampled = resample(category1, 
                                  replace=True, 
                                  n_samples = len(category0))

In [35]:
print(category0.shape)
print(category1_oversampled.shape)

(5174, 5)
(5174, 5)


In [36]:
data_upsampled = pd.concat([category0, category1_oversampled], axis=0)

In [37]:
data_upsampled

Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges,churn
0,0.0,0.013889,0.115423,0.001275,No
1,0.0,0.472222,0.385075,0.215867,No
3,0.0,0.625000,0.239303,0.210241,No
6,0.0,0.305556,0.704975,0.222779,No
7,0.0,0.138889,0.114428,0.032668,No
...,...,...,...,...,...
1211,0.0,0.152778,0.478607,0.083314,Yes
2294,0.0,0.166667,0.880100,0.142523,Yes
5541,0.0,0.847222,0.879104,0.739626,Yes
223,0.0,0.055556,0.316418,0.018526,Yes


In [38]:
y = data_upsampled['churn']
X = data_upsampled.drop(['churn'], axis=1)

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1337)

In [40]:
classification = LogisticRegression(random_state=0, solver='saga',
                  multi_class='multinomial').fit(X_train, y_train)

In [41]:
predictions = classification.predict(X_train)
classification.score(X_train, y_train)

0.7314506603672286

In [42]:
predictions = classification.predict(X_test)
classification.score(X_test, y_test)

0.7478260869565218

# SMOTE

In [43]:
from imblearn.over_sampling import SMOTE

In [44]:
y = churndataf_scaled['churn']
X = churndataf_scaled.drop(['churn'], axis=1)

In [51]:
X

Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges
0,0.0,0.013889,0.115423,0.001275
1,0.0,0.472222,0.385075,0.215867
2,0.0,0.027778,0.354229,0.010310
3,0.0,0.625000,0.239303,0.210241
4,0.0,0.027778,0.521891,0.015330
...,...,...,...,...
7038,0.0,0.333333,0.662189,0.227521
7039,0.0,1.000000,0.845274,0.847461
7040,0.0,0.152778,0.112935,0.037809
7041,1.0,0.055556,0.558706,0.033210


In [50]:
y

0        No
1        No
2       Yes
3        No
4       Yes
       ... 
7038     No
7039     No
7040     No
7041    Yes
7042     No
Name: churn, Length: 7043, dtype: object

In [53]:
y.replace(['No','Yes'],[0,1],inplace=True)

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1337)

In [55]:
sm = SMOTE(random_state=100,k_neighbors=2)
X_train_SMOTE,y_train_SMOTE = sm.fit_resample(X_train,y_train)

In [56]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [57]:
LR = LogisticRegression(max_iter=1000)
LR.fit(X_train_SMOTE, y_train_SMOTE)
pred = LR.predict(X_test)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

precision:  0.5435540069686411
recall:  0.7878787878787878
f1:  0.643298969072165
