In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('Telco-Customer-Churn.csv')

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Normalise the column names: lower-case, with spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Collect the names of all columns stored with the object dtype (the string columns).
categorical_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']

In [5]:
# Apply the same normalisation used for the column names to the values of
# every object-dtype column: lower-case, spaces replaced by underscores.
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [6]:
df.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,no
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,no
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,yes
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,...,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,no
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.7,151.65,yes


In [7]:
df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [8]:
df.totalcharges

0         29.85
1        1889.5
2        108.15
3       1840.75
4        151.65
         ...   
7038     1990.5
7039     7362.9
7040     346.45
7041      306.6
7042     6844.5
Name: totalcharges, Length: 7043, dtype: object

In [9]:
tc = pd.to_numeric( df.totalcharges, errors='coerce')

In [10]:
df.loc[tc.isnull(), ['totalcharges']]

Unnamed: 0,totalcharges
488,_
753,_
936,_
1082,_
1340,_
3331,_
3826,_
4380,_
5218,_
6670,_


In [11]:
# totalcharges was read as strings; errors='coerce' converts entries that are
# not valid numbers (such as the '_' placeholders) to NaN instead of raising.
df['totalcharges'] = pd.to_numeric( df.totalcharges, errors='coerce')

In [12]:
df['totalcharges'] = df['totalcharges'].fillna(0)

In [13]:
df['totalcharges'].isnull().sum()

0

In [14]:
# Encode the churn target as 0/1 (no -> 0, yes -> 1)
df.churn.head()

0     no
1     no
2    yes
3     no
4    yes
Name: churn, dtype: object

In [15]:
(df.churn == 'yes').head()

0    False
1    False
2     True
3    False
4     True
Name: churn, dtype: bool

In [16]:
(df.churn == 'yes').astype(int).head()

0    0
1    0
2    1
3    0
4    1
Name: churn, dtype: int64

In [17]:
df.churn.unique()

array(['no', 'yes'], dtype=object)

In [18]:
# Binary target: 1 where churn is 'yes', 0 otherwise.
df['churn'] = df['churn'].eq('yes').astype(int)

In [19]:
df.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,0
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,0
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,1
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,...,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,0
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.7,151.65,1


In [20]:
#Validation framework

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
# Hold out 20% of the rows as the final test set; random_state fixes the
# shuffle so the split is repeatable.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [23]:
df_full_train.shape, df_test.shape

((5634, 21), (1409, 21))

In [24]:
# 25% of the remaining 80% gives a validation set that is 20% of the original
# data, i.e. the same size as the test set.
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [25]:
df_train.shape, df_val.shape

((4225, 21), (1409, 21))

In [26]:
# Resetting index
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [27]:
y_train = df_train.churn.values
y_val = df_val.churn.values
y_test = df_test.churn.values

In [28]:
# Remove the target column from the feature frames; the labels were already
# copied into y_train / y_val / y_test.
del df_train['churn']
del df_val['churn']
del df_test['churn']

In [29]:
df_train.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges
0,8015-ihcgw,female,0,yes,yes,72,yes,yes,fiber_optic,yes,yes,yes,yes,yes,yes,two_year,yes,electronic_check,115.5,8425.15
1,1960-uycnn,male,0,no,no,10,yes,yes,fiber_optic,no,yes,yes,no,no,yes,month-to-month,yes,electronic_check,95.25,1021.55
2,9250-wypll,female,0,no,no,5,yes,yes,fiber_optic,no,no,no,no,no,no,month-to-month,no,electronic_check,75.55,413.65
3,6786-obwqr,female,0,yes,yes,5,yes,no,fiber_optic,no,no,no,no,yes,no,month-to-month,yes,electronic_check,80.85,356.1
4,1328-euzhc,female,0,yes,no,18,yes,no,no,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,20.1,370.5


In [30]:
#EDA

In [31]:
df_full_train = df_full_train.reset_index(drop=True)

In [32]:
df_full_train.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,5442-pptjy,male,0,yes,yes,12,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.7,258.35,0
1,6261-rcvns,female,0,no,no,42,yes,no,dsl,yes,...,yes,yes,no,yes,one_year,no,credit_card_(automatic),73.9,3160.55,1
2,2176-osjuv,male,0,yes,no,71,yes,yes,dsl,yes,...,no,yes,no,no,two_year,no,bank_transfer_(automatic),65.15,4681.75,0
3,6161-erdgd,male,0,yes,yes,71,yes,yes,dsl,yes,...,yes,yes,yes,yes,one_year,no,electronic_check,85.45,6300.85,0
4,2364-ufrom,male,0,no,no,30,yes,no,dsl,yes,...,no,yes,yes,no,one_year,no,electronic_check,70.4,2044.75,0


In [33]:
df_full_train.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [34]:
df_full_train.churn

0       0
1       1
2       0
3       0
4       0
       ..
5629    1
5630    0
5631    1
5632    1
5633    0
Name: churn, Length: 5634, dtype: int64

In [35]:
df_full_train.churn.value_counts()

churn
0    4113
1    1521
Name: count, dtype: int64

In [36]:
df_full_train.churn.value_counts(normalize=True)

churn
0    0.730032
1    0.269968
Name: proportion, dtype: float64

In [37]:
df_full_train.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                 int64
dtype: object

In [38]:
numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [39]:
df_full_train.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [40]:
# Categorical feature columns (customerid, the numerical columns and the
# churn target are deliberately not listed).
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

In [43]:
df_full_train[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

In [45]:
churn_female = df_full_train[df_full_train['gender'] == 'female'].churn.mean()
churn_female

0.27682403433476394

In [46]:
churn_male = df_full_train[df_full_train['gender'] == 'male'].churn.mean()
churn_male

0.2632135306553911

In [47]:
global_churn = df_full_train.churn.mean()
global_churn

0.26996805111821087

In [48]:
churn_partner = df_full_train[df_full_train['partner'] == 'yes'].churn.mean()
churn_partner

0.20503330866025166

In [49]:
churn_no_partner = df_full_train[df_full_train['partner'] == 'no'].churn.mean()
churn_no_partner

0.3298090040927694

In [50]:
global_churn - churn_partner

0.06493474245795922

In [51]:
global_churn-churn_no_partner

-0.05984095297455855

In [52]:
df_full_train.groupby('gender').churn.mean()

gender
female    0.276824
male      0.263214
Name: churn, dtype: float64

In [62]:
from IPython.display import display

In [63]:
# For every categorical feature, show the churn rate and row count per
# category, together with the difference from and ratio to the global rate.
for c in categorical:
    print(c)
    df_group = (
        df_full_train
        .groupby(c)['churn']
        .agg(['mean', 'count'])
        .reset_index()
        .assign(
            diff=lambda g: g['mean'] - global_churn,
            risk=lambda g: g['mean'] / global_churn,
        )
    )
    display(df_group)
    print()

gender


Unnamed: 0,gender,mean,count,diff,risk
0,female,0.276824,2796,0.006856,1.025396
1,male,0.263214,2838,-0.006755,0.97498



seniorcitizen


Unnamed: 0,seniorcitizen,mean,count,diff,risk
0,0,0.24227,4722,-0.027698,0.897403
1,1,0.413377,912,0.143409,1.531208



partner


Unnamed: 0,partner,mean,count,diff,risk
0,no,0.329809,2932,0.059841,1.221659
1,yes,0.205033,2702,-0.064935,0.759472



dependents


Unnamed: 0,dependents,mean,count,diff,risk
0,no,0.31376,3968,0.043792,1.162212
1,yes,0.165666,1666,-0.104302,0.613651



phoneservice


Unnamed: 0,phoneservice,mean,count,diff,risk
0,no,0.241316,547,-0.028652,0.89387
1,yes,0.273049,5087,0.003081,1.011412



multiplelines


Unnamed: 0,multiplelines,mean,count,diff,risk
0,no,0.257407,2700,-0.012561,0.953474
1,no_phone_service,0.241316,547,-0.028652,0.89387
2,yes,0.290742,2387,0.020773,1.076948



internetservice


Unnamed: 0,internetservice,mean,count,diff,risk
0,dsl,0.192347,1934,-0.077621,0.712482
1,fiber_optic,0.425171,2479,0.155203,1.574895
2,no,0.077805,1221,-0.192163,0.288201



onlinesecurity


Unnamed: 0,onlinesecurity,mean,count,diff,risk
0,no,0.420921,2801,0.150953,1.559152
1,no_internet_service,0.077805,1221,-0.192163,0.288201
2,yes,0.153226,1612,-0.116742,0.56757



onlinebackup


Unnamed: 0,onlinebackup,mean,count,diff,risk
0,no,0.404323,2498,0.134355,1.497672
1,no_internet_service,0.077805,1221,-0.192163,0.288201
2,yes,0.217232,1915,-0.052736,0.80466



deviceprotection


Unnamed: 0,deviceprotection,mean,count,diff,risk
0,no,0.395875,2473,0.125907,1.466379
1,no_internet_service,0.077805,1221,-0.192163,0.288201
2,yes,0.230412,1940,-0.039556,0.85348



techsupport


Unnamed: 0,techsupport,mean,count,diff,risk
0,no,0.418914,2781,0.148946,1.551717
1,no_internet_service,0.077805,1221,-0.192163,0.288201
2,yes,0.159926,1632,-0.110042,0.59239



streamingtv


Unnamed: 0,streamingtv,mean,count,diff,risk
0,no,0.342832,2246,0.072864,1.269897
1,no_internet_service,0.077805,1221,-0.192163,0.288201
2,yes,0.302723,2167,0.032755,1.121328



streamingmovies


Unnamed: 0,streamingmovies,mean,count,diff,risk
0,no,0.338906,2213,0.068938,1.255358
1,no_internet_service,0.077805,1221,-0.192163,0.288201
2,yes,0.307273,2200,0.037305,1.138182



contract


Unnamed: 0,contract,mean,count,diff,risk
0,month-to-month,0.431701,3104,0.161733,1.599082
1,one_year,0.120573,1186,-0.149395,0.446621
2,two_year,0.028274,1344,-0.241694,0.10473



paperlessbilling


Unnamed: 0,paperlessbilling,mean,count,diff,risk
0,no,0.172071,2313,-0.097897,0.637375
1,yes,0.338151,3321,0.068183,1.25256



paymentmethod


Unnamed: 0,paymentmethod,mean,count,diff,risk
0,bank_transfer_(automatic),0.168171,1219,-0.101797,0.622928
1,credit_card_(automatic),0.164339,1217,-0.10563,0.608733
2,electronic_check,0.45589,1893,0.185922,1.688682
3,mailed_check,0.19387,1305,-0.076098,0.718121





In [61]:
df_group

Unnamed: 0,gender,mean,count,diff,risk
0,female,0.276824,2796,0.006856,1.025396
1,male,0.263214,2838,-0.006755,0.97498


In [64]:
# Mutual information

In [65]:
from sklearn.metrics import mutual_info_score

In [66]:
mutual_info_score(df_full_train.churn, df_full_train.contract)

0.0983203874041556

In [67]:
mutual_info_score(df_full_train.churn, df_full_train.gender)

0.0001174846211139946

In [69]:
def mutual_info_churn_score(series):
    """Return the mutual information between ``series`` and the churn column.

    The churn labels are read from the global ``df_full_train`` frame, so
    ``series`` is expected to be a column of that same frame.
    """
    churn_labels = df_full_train.churn
    return mutual_info_score(series, churn_labels)

In [74]:
df_full_train[categorical].apply(mutual_info_churn_score, ).sort_values(ascending = False)

contract            0.098320
onlinesecurity      0.063085
techsupport         0.061032
internetservice     0.055868
onlinebackup        0.046923
deviceprotection    0.043453
paymentmethod       0.043210
streamingtv         0.031853
streamingmovies     0.031581
paperlessbilling    0.017589
dependents          0.012346
partner             0.009968
seniorcitizen       0.009410
multiplelines       0.000857
phoneservice        0.000229
gender              0.000117
dtype: float64

In [75]:
#Correlation

In [77]:
df_full_train[numerical].corrwith(df_full_train.churn)

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

In [80]:
df_full_train[['tenure']].corrwith(df_full_train.totalcharges)

tenure    0.828268
dtype: float64

In [81]:
# One hot Encoding

In [84]:
df_full_train.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [85]:
df_full_train[['contract', 'gender']]

Unnamed: 0,contract,gender
0,two_year,male
1,one_year,female
2,two_year,male
3,one_year,male
4,one_year,male
...,...,...
5629,month-to-month,male
5630,two_year,male
5631,month-to-month,male
5632,month-to-month,male


In [86]:
from sklearn.feature_extraction import DictVectorizer

In [94]:
df_train[['gender', 'contract']].iloc[:100].to_dict(orient = 'records')

[{'gender': 'female', 'contract': 'two_year'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'two_year'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'two_year'},
 {'gender': 'female', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'two_year'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'two_year'},
 {'gender': 'female', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'month-to-month'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'two_year'},
 {'gender': 'female', 'contract': 'month-to-month'},
 {'gender': 'male', 'contract': 'one_year'},
 {'gender': 'male', 'contract': 'two_year'},
 {

In [95]:
dicts = df_train[['gender', 'contract']].iloc[:100].to_dict(orient = 'records')

In [97]:
dv = DictVectorizer(sparse=False)

In [102]:
dv.fit(dicts)

In [99]:
dv.transform(dicts)

array([[0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 0.],
       [0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [0., 0., 1., 1., 0.],
       [1., 0., 0., 1., 0.],
       [0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 1.],
       [0., 0., 1., 1., 0.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 0., 1.],
       [0., 0., 1., 1., 0.],
       [1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 1.],
       [1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0.],
       [1., 0., 0., 1., 0.],
       [0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 1.],
       [0., 0., 1., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0.],
       [0., 1., 0., 1., 0.],
       [1., 0.

In [101]:
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'gender=female', 'gender=male'], dtype=object)

In [113]:
train_dicts = df_train[categorical + numerical].to_dict(orient = 'records')

In [114]:
train_dicts[0]

{'gender': 'female',
 'seniorcitizen': 0,
 'partner': 'yes',
 'dependents': 'yes',
 'phoneservice': 'yes',
 'multiplelines': 'yes',
 'internetservice': 'fiber_optic',
 'onlinesecurity': 'yes',
 'onlinebackup': 'yes',
 'deviceprotection': 'yes',
 'techsupport': 'yes',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'two_year',
 'paperlessbilling': 'yes',
 'paymentmethod': 'electronic_check',
 'tenure': 72,
 'monthlycharges': 115.5,
 'totalcharges': 8425.15}

In [115]:
dv = DictVectorizer(sparse=False)

In [116]:
dv.fit(train_dicts)

In [117]:
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'dependents=no', 'dependents=yes',
       'deviceprotection=no', 'deviceprotection=no_internet_service',
       'deviceprotection=yes', 'gender=female', 'gender=male',
       'internetservice=dsl', 'internetservice=fiber_optic',
       'internetservice=no', 'monthlycharges', 'multiplelines=no',
       'multiplelines=no_phone_service', 'multiplelines=yes',
       'onlinebackup=no', 'onlinebackup=no_internet_service',
       'onlinebackup=yes', 'onlinesecurity=no',
       'onlinesecurity=no_internet_service', 'onlinesecurity=yes',
       'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no',
       'partner=yes', 'paymentmethod=bank_transfer_(automatic)',
       'paymentmethod=credit_card_(automatic)',
       'paymentmethod=electronic_check', 'paymentmethod=mailed_check',
       'phoneservice=no', 'phoneservice=yes', 'seniorcitizen',
       'streamingmovies=no', 'streamingmovies=no_internet_service',

In [118]:
X_train = dv.fit_transform(train_dicts)

In [121]:
X_train.shape

(4225, 45)

In [122]:
val_dicts = df_val[categorical + numerical].to_dict(orient = 'records')

In [123]:
X_val = dv.transform(val_dicts)

In [124]:
X_val.shape

(1409, 45)

In [125]:
# Logistic Regression

In [126]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), applied element-wise via NumPy."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [127]:
def linear_regression(xi):
    """Return the linear score for one feature vector ``xi``.

    Starts from the bias ``w0`` and adds ``xi[j] * w[j]`` for every weight
    in ``w``. Both ``w0`` and ``w`` are read from the global namespace.
    """
    total = w0
    for j, weight in enumerate(w):
        total = total + xi[j] * weight
    return total

In [128]:
def logistic_regression(xi):
    """Return the sigmoid of the linear score for one feature vector ``xi``.

    The score is computed by ``linear_regression`` (bias ``w0`` plus the sum
    of ``xi[j] * w[j]`` over the global weights ``w``) and then passed
    through ``sigmoid``.
    """
    return sigmoid(linear_regression(xi))


# Backward-compatible alias for the original, misspelled function name.
lodistic_regression = logistic_regression

In [129]:
from sklearn.linear_model import LogisticRegression

In [130]:
model = LogisticRegression()

In [133]:
model.fit(X_train, y_train)

In [135]:
model.coef_[0].round(3)

array([ 0.475, -0.175, -0.408, -0.03 , -0.078,  0.063, -0.089, -0.081,
       -0.034, -0.073, -0.335,  0.316, -0.089,  0.004, -0.258,  0.141,
        0.009,  0.063, -0.089, -0.081,  0.266, -0.089, -0.284, -0.231,
        0.124, -0.166,  0.058, -0.087, -0.032,  0.07 , -0.059,  0.141,
       -0.249,  0.215, -0.12 , -0.089,  0.102, -0.071, -0.089,  0.052,
        0.213, -0.089, -0.232, -0.07 ,  0.   ])

In [136]:
model.intercept_

array([-0.10903675])

In [140]:
#Hard prediction
model.predict(X_train)

array([0, 1, 1, ..., 1, 0, 1])

In [141]:
#Soft prediction
model.predict_proba(X_train)

array([[0.90452087, 0.09547913],
       [0.32067001, 0.67932999],
       [0.36631915, 0.63368085],
       ...,
       [0.46838898, 0.53161102],
       [0.9574582 , 0.0425418 ],
       [0.30127077, 0.69872923]])

In [146]:
#Soft prediction
y_pred = model.predict_proba(X_train)[:,1]

In [147]:
y_pred

array([0.09547913, 0.67932999, 0.63368085, ..., 0.53161102, 0.0425418 ,
       0.69872923])

In [149]:
churn_decision = (y_pred >= 0.5)

In [150]:
churn_decision

array([False,  True,  True, ...,  True, False,  True])

In [151]:
y_pred = model.predict_proba(X_val)[:,1]

In [152]:
churn_decision = (y_pred >= 0.5)

In [153]:
churn_decision

array([False, False, False, ..., False,  True,  True])

In [155]:
df_val[churn_decision].customerid

3       8433-wxgna
8       3440-jpscl
11      2637-fkfsy
12      7228-omtpn
19      6711-fldfb
           ...    
1397    5976-jcjrh
1398    2034-cgrhz
1399    5276-kqwhg
1407    6521-yytyi
1408    3049-solay
Name: customerid, Length: 311, dtype: object

In [156]:
y_val

array([0, 0, 0, ..., 0, 1, 1])

In [157]:
churn_decision.astype(int)

array([0, 0, 0, ..., 0, 1, 1])

In [158]:
(y_val == churn_decision).mean()

0.8034066713981547

In [159]:
# Model interpretation

In [160]:
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

{'contract=month-to-month': 0.475,
 'contract=one_year': -0.175,
 'contract=two_year': -0.408,
 'dependents=no': -0.03,
 'dependents=yes': -0.078,
 'deviceprotection=no': 0.063,
 'deviceprotection=no_internet_service': -0.089,
 'deviceprotection=yes': -0.081,
 'gender=female': -0.034,
 'gender=male': -0.073,
 'internetservice=dsl': -0.335,
 'internetservice=fiber_optic': 0.316,
 'internetservice=no': -0.089,
 'monthlycharges': 0.004,
 'multiplelines=no': -0.258,
 'multiplelines=no_phone_service': 0.141,
 'multiplelines=yes': 0.009,
 'onlinebackup=no': 0.063,
 'onlinebackup=no_internet_service': -0.089,
 'onlinebackup=yes': -0.081,
 'onlinesecurity=no': 0.266,
 'onlinesecurity=no_internet_service': -0.089,
 'onlinesecurity=yes': -0.284,
 'paperlessbilling=no': -0.231,
 'paperlessbilling=yes': 0.124,
 'partner=no': -0.166,
 'partner=yes': 0.058,
 'paymentmethod=bank_transfer_(automatic)': -0.087,
 'paymentmethod=credit_card_(automatic)': -0.032,
 'paymentmethod=electronic_check': 0.07,
 

In [161]:
#Using the model

In [162]:
dicts_full_train = df_full_train[categorical + numerical].to_dict(orient= 'records')

In [163]:
dv = DictVectorizer(sparse = False)

In [164]:
X_full_train = dv.fit_transform(dicts_full_train)

In [165]:
y_full_train = df_full_train.churn.values

In [166]:
model = LogisticRegression()
model.fit(X_full_train, y_full_train)

In [167]:
dicts_test = df_test[categorical + numerical].to_dict(orient= 'records')

In [168]:
# Only transform here: dv was fitted on the full training data, and the model
# was trained on that feature layout. Re-fitting the vectorizer on the test
# rows would rebuild the feature mapping from the test set, so the columns
# would no longer be guaranteed to line up with the model's coefficients.
X_test = dv.transform(dicts_test)

In [171]:
y_pred = model.predict_proba(X_test)[:,1]

In [172]:
churn_decision = (y_pred >=0.5)

In [173]:
(churn_decision == y_test).mean()

0.815471965933286