In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [2]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'

In [3]:
!wget $data -O data-week3.csv

--2023-10-04 11:01:42--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 977501 (955K) [text/plain]
Saving to: ‘data-week3.csv’


2023-10-04 11:01:42 (52.4 MB/s) - ‘data-week3.csv’ saved [977501/977501]



In [4]:
df = pd.read_csv('data-week3.csv')

## Data preprocessing

In [5]:
df.columns = df.columns.str.lower().str.replace(' ','_')

categorial_cols = list(df.dtypes[df.dtypes == 'object'].index)

for col in categorial_cols:
    df[col] = df[col].str.lower().str.replace(' ', '_')
# Convert total_charges to numerical values, ignore errors as NA
total_charges = pd.to_numeric(df.totalcharges, errors='coerce')
df.totalcharges = total_charges
df.totalcharges = df.totalcharges.fillna(0)
df.head().T
# Filter dataframe by specific column, pick specific columns
df[df['totalcharges'].isnull()][['customerid', 'totalcharges']]
df.churn = (df.churn == 'yes').astype('int')
df.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,0
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,0
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,1
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,...,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,0
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.7,151.65,1


## Setting up the validation framework

In [6]:
from sklearn.model_selection import train_test_split

In [61]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [62]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [63]:
len(df_train),len(df_val),len(df_test)

(4225, 1409, 1409)

- Reset schuffled indcies

In [64]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [65]:
y_train = df_train.churn.values
y_val = df_val.churn.values
y_test = df_test.churn.values

In [66]:
del df_train['churn']
del df_val['churn']
del df_test['churn']

## EDA
- look at the target variable
- look at the numerical and categorical variables

In [67]:
df_full_train = df_full_train.reset_index(drop = True)

In [68]:
df_full_train.churn.value_counts()

0    4113
1    1521
Name: churn, dtype: int64

- Churn rate

In [69]:
global_churn_rate = round(df_full_train.churn.mean(), 2)

In [70]:
numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [72]:
df_full_train.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [73]:
categorial = ['gender', 'seniorcitizen', 'partner', 'dependents',
     'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod']

In [74]:
df_full_train[categorial].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

In [75]:
female_churn = df_full_train[df_full_train.gender == 'female'].churn.mean()
male_churn = df_full_train[df_full_train.gender == 'male'].churn.mean()

In [76]:
partner_churn = df_full_train[df_full_train.partner == 'yes'].churn.mean()
partner_no_churn = df_full_train[df_full_train.partner == 'no'].churn.mean()



## Feature importance
global - group
- '<' 0 more likely to churn
- '>' 0 less likely to churn

In [77]:
female_churn
partner_no_churn

global_churn_rate - partner_no_churn

-0.0598090040927694

- Risk rate
    - '>' 1 more likely to churn
    - '<' 1 less likely to churn

In [78]:
risk_rate = partner_no_churn / global_churn_rate
risk_rate

1.22151482997322

In [79]:
partner_risk_rate = partner_churn / global_churn_rate
partner_risk_rate

0.7593826246675986

In [80]:
from IPython.display import display

In [81]:
for cat in categorial:
    df_group = df_full_train.groupby(cat).churn.agg(['mean', 'count'])
    df_group['difference'] = df_group['mean'] - global_churn_rate
    df_group['risk'] = df_group['mean'] / global_churn_rate
    display(df_group)
    print()

Unnamed: 0_level_0,mean,count,difference,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.276824,2796,0.006824,1.025274
male,0.263214,2838,-0.006786,0.974865





Unnamed: 0_level_0,mean,count,difference,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.24227,4722,-0.02773,0.897297
1,0.413377,912,0.143377,1.531027





Unnamed: 0_level_0,mean,count,difference,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.329809,2932,0.059809,1.221515
yes,0.205033,2702,-0.064967,0.759383





Unnamed: 0_level_0,mean,count,difference,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.31376,3968,0.04376,1.162074
yes,0.165666,1666,-0.104334,0.613579





Unnamed: 0_level_0,mean,count,difference,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.241316,547,-0.028684,0.893764
yes,0.273049,5087,0.003049,1.011292





Unnamed: 0_level_0,mean,count,difference,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.257407,2700,-0.012593,0.953361
no_phone_service,0.241316,547,-0.028684,0.893764
yes,0.290742,2387,0.020742,1.07682





Unnamed: 0_level_0,mean,count,difference,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dsl,0.192347,1934,-0.077653,0.712398
fiber_optic,0.425171,2479,0.155171,1.574709
no,0.077805,1221,-0.192195,0.288167





Unnamed: 0_level_0,mean,count,difference,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.420921,2801,0.150921,1.558967
no_internet_service,0.077805,1221,-0.192195,0.288167
yes,0.153226,1612,-0.116774,0.567503





Unnamed: 0_level_0,mean,count,difference,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.404323,2498,0.134323,1.497494
no_internet_service,0.077805,1221,-0.192195,0.288167
yes,0.217232,1915,-0.052768,0.804564





Unnamed: 0_level_0,mean,count,difference,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.395875,2473,0.125875,1.466205
no_internet_service,0.077805,1221,-0.192195,0.288167
yes,0.230412,1940,-0.039588,0.853379





Unnamed: 0_level_0,mean,count,difference,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.418914,2781,0.148914,1.551534
no_internet_service,0.077805,1221,-0.192195,0.288167
yes,0.159926,1632,-0.110074,0.59232





Unnamed: 0_level_0,mean,count,difference,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.342832,2246,0.072832,1.269747
no_internet_service,0.077805,1221,-0.192195,0.288167
yes,0.302723,2167,0.032723,1.121195





Unnamed: 0_level_0,mean,count,difference,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.338906,2213,0.068906,1.255209
no_internet_service,0.077805,1221,-0.192195,0.288167
yes,0.307273,2200,0.037273,1.138047





Unnamed: 0_level_0,mean,count,difference,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month-to-month,0.431701,3104,0.161701,1.598893
one_year,0.120573,1186,-0.149427,0.446568
two_year,0.028274,1344,-0.241726,0.104718





Unnamed: 0_level_0,mean,count,difference,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.172071,2313,-0.097929,0.6373
yes,0.338151,3321,0.068151,1.252412





Unnamed: 0_level_0,mean,count,difference,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank_transfer_(automatic),0.168171,1219,-0.101829,0.622854
credit_card_(automatic),0.164339,1217,-0.105661,0.608661
electronic_check,0.45589,1893,0.18589,1.688482
mailed_check,0.19387,1305,-0.07613,0.718036





## Mutual information

In [82]:
from sklearn.metrics import mutual_info_score

In [83]:
mutual_info_score(df_full_train.churn, df_full_train.contract)

0.09832038740415643

In [84]:
mutual_info_score(df_full_train.churn, df_full_train.paymentmethod)

0.04321002753158276

In [85]:
def mutual_info_churn_score(series):
    return mutual_info_score(series, df_full_train.churn)

In [86]:
#forEach column apply mutual info score to check what are the most columns impact on the target variable churn 
df_full_train[categorial].apply(mutual_info_churn_score).sort_values(ascending = False)

contract            0.098320
onlinesecurity      0.063085
techsupport         0.061032
internetservice     0.055868
onlinebackup        0.046923
deviceprotection    0.043453
paymentmethod       0.043210
streamingtv         0.031853
streamingmovies     0.031581
paperlessbilling    0.017589
dependents          0.012346
partner             0.009968
seniorcitizen       0.009410
multiplelines       0.000857
phoneservice        0.000229
gender              0.000117
dtype: float64

## Correlation
corrleation coefficient

In [87]:
df_full_train[numerical].corrwith(df_full_train.churn)

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

In [88]:
df_full_train[df_full_train['tenure'] <= 2 ].churn.mean()

0.5953420669577875

In [89]:
 df_full_train[(df_full_train['tenure'] > 2) & (df_full_train['tenure'] <= 12)].churn.mean()

0.3994413407821229

In [90]:
df_full_train[df_full_train['tenure'] > 12 ].churn.mean()

0.17634908339788277

## One Hot Encoding

In [91]:
from sklearn.feature_extraction import DictVectorizer

In [92]:
dicts = df_train[['gender', 'contract']].iloc[:100].to_dict(orient='records')
train_dicts = df_train[categorial + numerical].to_dict(orient='records')

In [93]:
dv = DictVectorizer(sparse=False)

In [94]:
X_train = dv.fit_transform(train_dicts)

In [95]:
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'dependents=no', 'dependents=yes',
       'deviceprotection=no', 'deviceprotection=no_internet_service',
       'deviceprotection=yes', 'gender=female', 'gender=male',
       'internetservice=dsl', 'internetservice=fiber_optic',
       'internetservice=no', 'monthlycharges', 'multiplelines=no',
       'multiplelines=no_phone_service', 'multiplelines=yes',
       'onlinebackup=no', 'onlinebackup=no_internet_service',
       'onlinebackup=yes', 'onlinesecurity=no',
       'onlinesecurity=no_internet_service', 'onlinesecurity=yes',
       'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no',
       'partner=yes', 'paymentmethod=bank_transfer_(automatic)',
       'paymentmethod=credit_card_(automatic)',
       'paymentmethod=electronic_check', 'paymentmethod=mailed_check',
       'phoneservice=no', 'phoneservice=yes', 'seniorcitizen',
       'streamingmovies=no', 'streamingmovies=no_internet_service',

In [96]:
X_train.shape

(4225, 45)

In [97]:
val_dicts = df_val[categorial + numerical].to_dict(orient='records')
# only apply transform, already fit on training data
X_val = dv.transform(val_dicts)

## Logistic Regression

In [98]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [99]:
z = np.linspace(-5, 5, 51)
sigmoid(z)

array([0.00669285, 0.00816257, 0.0099518 , 0.01212843, 0.01477403,
       0.01798621, 0.02188127, 0.02659699, 0.03229546, 0.03916572,
       0.04742587, 0.05732418, 0.06913842, 0.0831727 , 0.09975049,
       0.11920292, 0.14185106, 0.16798161, 0.19781611, 0.23147522,
       0.26894142, 0.31002552, 0.35434369, 0.40131234, 0.450166  ,
       0.5       , 0.549834  , 0.59868766, 0.64565631, 0.68997448,
       0.73105858, 0.76852478, 0.80218389, 0.83201839, 0.85814894,
       0.88079708, 0.90024951, 0.9168273 , 0.93086158, 0.94267582,
       0.95257413, 0.96083428, 0.96770454, 0.97340301, 0.97811873,
       0.98201379, 0.98522597, 0.98787157, 0.9900482 , 0.99183743,
       0.99330715])

In [100]:
def linear_regression(xi):    
    return np.dot(xi, w) + w0

def logistic_regression(xi):    
    scores  = np.dot(xi, w) + w0
    predictions = sigmoid(scores)
    
    return predictions

  ## Training logistic regression using sklearn

In [101]:
from sklearn.linear_model import LogisticRegression

In [102]:
lr = LogisticRegression()
y_train

array([0, 0, 1, ..., 1, 0, 1])

In [108]:
lr.fit(X_train, y_train )
#Hard predictions
lr.predict(X_train)
# Soft predictions
y_pred = lr.predict_proba(X_train)[:,1]

In [112]:
y_val_pred = lr.predict_proba(X_val)[:,1]
y_val_pred
churn_decision = (y_pred >= 0.5)
churn_decision_val = (y_val_pred >= 0.5)

In [113]:
(y_val == churn_decision_val).mean()

0.8034066713981547

In [115]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = churn_decision.astype(int)
df_pred['actual'] = y_train
df_pred['correct'] = df_pred.actual == df_pred.prediction
df_pred

Unnamed: 0,probability,prediction,actual,correct
0,0.095477,0,0,True
1,0.679322,1,0,False
2,0.633674,1,1,True
3,0.717521,1,0,False
4,0.047681,0,0,True
...,...,...,...,...
4220,0.132772,0,0,True
4221,0.304563,0,0,True
4222,0.531617,1,1,True
4223,0.042544,0,0,True


- Accuracy

In [116]:
df_pred.correct.mean()

0.805207100591716

In [None]:
from sklearn.linear_model import LinearRegression