In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE

In [2]:
churn = pd.read_csv('customer_churn.csv')

In [3]:
churn

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [4]:
churn.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [5]:
churn['Churn'].replace({'Yes': 1, "No": 0},inplace=True)

In [6]:
X = churn.drop('Churn',axis = 1)
y = churn['Churn']

In [7]:
X = X[['tenure', 'SeniorCitizen', 'MonthlyCharges']]

In [8]:
X.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,1,0,29.85
1,34,0,56.95
2,2,0,53.85
3,45,0,42.3
4,2,0,70.7


In [9]:
y.head()

0    0
1    0
2    1
3    0
4    1
Name: Churn, dtype: int64

In [10]:
X['SeniorCitizen'] = X['SeniorCitizen'].astype(object)

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [12]:
X_train_num = X_train.select_dtypes('number')
X_train_cat = X_train.select_dtypes('object')

display(X_train_num.head())
X_train_cat.head()

Unnamed: 0,tenure,MonthlyCharges
3296,24,49.3
6397,54,65.25
6043,3,40.15
5309,61,20.25
3000,12,84.45


Unnamed: 0,SeniorCitizen
3296,0
6397,1
6043,0
5309,0
3000,0


In [13]:
transformer = StandardScaler().fit(X_train_num)
X_train_num_scaled = pd.DataFrame(transformer.transform(X_train_num),columns=X_train_num.columns)
X_train_num_scaled.head()

Unnamed: 0,tenure,MonthlyCharges
0,-0.340191,-0.514314
1,0.88321,0.01784
2,-1.196572,-0.819594
3,1.16867,-1.483535
4,-0.829552,0.658427


In [14]:
X_test_num = X_test.select_dtypes('number')
X_test_cat = X_test.select_dtypes('object')

display(X_test_num.head())
X_test_cat.head()

Unnamed: 0,tenure,MonthlyCharges
2200,19,58.2
4627,60,116.6
3225,13,71.95
2828,1,20.45
3768,55,77.75


Unnamed: 0,SeniorCitizen
2200,0
4627,0
3225,0
2828,0
3768,0


In [15]:
transformer = StandardScaler().fit(X_test_num)
X_test_num_scaled = pd.DataFrame(transformer.transform(X_test_num),columns=X_test_num.columns)

X_test_num_scaled.head()

Unnamed: 0,tenure,MonthlyCharges
0,-0.545642,-0.220199
1,1.116633,1.698898
2,-0.788902,0.231643
3,-1.275422,-1.460712
4,0.913916,0.422238


In [16]:
X_train_cat = X_train_cat.reset_index(drop=True)
X_test_cat = X_test_cat.reset_index(drop=True)

In [17]:
X_train_transformed = pd.concat([X_train_num_scaled, X_train_cat], axis=1)
X_test_transformed = pd.concat([X_test_num_scaled, X_test_cat], axis=1)

In [18]:
LR = LogisticRegression(random_state=0, solver='lbfgs')
LR.fit(X_train_transformed, y_train)
LR.score(X_test_transformed, y_test)

0.7825099375354913

In [19]:
pred = LR.predict(X_test_transformed)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))


precision:  0.6169590643274854
recall:  0.4557235421166307
f1:  0.524223602484472


In [20]:
cm = confusion_matrix(y_test, pred)

In [21]:
cm

array([[1167,  131],
       [ 252,  211]], dtype=int64)

In [22]:
sm = SMOTE(random_state=100, k_neighbors=3)
X_train_SMOTE,y_train_SMOTE = sm.fit_resample(X_train_transformed,y_train)

In [23]:
X_train_SMOTE.shape

(7752, 3)

In [24]:
LR = LogisticRegression(random_state=0, solver='lbfgs')
LR.fit(X_train_SMOTE, y_train_SMOTE)
pred = LR.predict(X_test_transformed)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

precision:  0.4804630969609262
recall:  0.7170626349892009
f1:  0.5753899480069324


In [25]:
cm = confusion_matrix(y_test, pred)
cm

array([[939, 359],
       [131, 332]], dtype=int64)

Imbalanced data:
precision:  0.6169590643274854
recall:  0.4557235421166307
f1:  0.524223602484472

SMOTE:
precision:  0.4804630969609262
recall:  0.7170626349892009
f1:  0.5753899480069324

As we can see with SMOTE we got much higher recall and a bit higher f1, but precision got lower

In [26]:
print("Happy Friday")

Happy Friday
