### 3.2 Data Preparation

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv('Telco-Customer-Churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [15]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [16]:
df.totalcharges = pd.to_numeric(df.totalcharges,errors='coerce')

In [17]:
str_columns = list(df.dtypes[df.dtypes == 'object'].index)
for col in str_columns:
    df[col] = df[col].str.lower().str.replace(' ','_')

In [18]:
df.isnull().sum()
df.totalcharges = df.totalcharges.fillna(0)
df.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [19]:
df.churn

0        no
1        no
2       yes
3        no
4       yes
       ... 
7038     no
7039     no
7040     no
7041    yes
7042     no
Name: churn, Length: 7043, dtype: object

In [20]:
df.churn = (df.churn == 'yes').astype(int)

### 3.3 Setting up the validation framework

In [47]:
from sklearn.model_selection import train_test_split
df_fulltrain, df_test = train_test_split(df, test_size = 0.2, random_state =1)
df_train, df_val = train_test_split(df_fulltrain, test_size = 0.25, random_state =1)
df_fulltrain.shape, df_test.shape, df_val.shape
df.nunique()
numerical = ['tenure', 'monthlycharges', 'totalcharges']
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
       'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod']
fields = categorical + numerical
df_fulltrain.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
y_train = df_train.churn
y_val = df_val.churn
y_test = df_test.churn
df_train = df_train[fields]
df_val = df_val[fields]
df_test = df_test[fields]

### 3.4 Training your model

In [48]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse = False)
dict_train = df_train.to_dict(orient='records')
dv.fit(dict_train)
X_train = dv.transform(dict_train)
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=100000)
model.fit(X_train, y_train)

In [51]:
w = model.coef_[0]
w

array([ 6.84563865e-01,  3.85627805e-02, -6.82106720e-01,  5.59874182e-02,
       -1.49674921e-02,  1.13934134e-01, -1.59961569e-01,  8.70473614e-02,
        3.94140853e-02,  1.60584095e-03, -4.97473553e-01,  6.98455049e-01,
       -1.59961569e-01, -1.79930514e-02, -1.87166364e-01,  6.61224966e-02,
        1.62063794e-01,  1.16725547e-01, -1.59961569e-01,  8.42559483e-02,
        2.85032504e-01, -1.59961569e-01, -8.40510084e-02, -1.61238079e-01,
        2.02258005e-01, -4.45575410e-02,  8.55774672e-02, -5.15350530e-02,
       -2.96985505e-03,  1.06354594e-01, -1.08297599e-02,  6.61224966e-02,
       -2.51025705e-02,  1.93985926e-01, -9.41528635e-02, -1.59961569e-01,
        2.95134358e-01, -5.39335450e-02, -1.59961569e-01,  2.54915040e-01,
        2.35033033e-01, -1.59961569e-01, -3.40515382e-02, -6.85347146e-02,
        3.95932211e-04])

In [52]:
w0 = model.intercept_[0]
w0

np.float64(-0.045323382519725265)

In [55]:
dict_val = df_val.to_dict(orient='records')
X_val = dv.transform(dict_val)
predict = (model.predict_proba(X_val)[:,1].round(2)) > 0.5
(predict == y_val).mean()

np.float64(0.8076650106458482)

### 3.5 Retrain and Use model

In [66]:
dict_fulltrain = df_fulltrain[fields].to_dict(orient='records')
X_fulltrain = dv.transform(dict_fulltrain)
y_fulltrain = df_fulltrain.churn.values
model = LogisticRegression(max_iter=100000)
model.fit(np.concatenate([X_train,X_val]), np.concatenate([y_train,y_val]))
w = model.coef_[0]
w0 = model.intercept_[0]
w0

np.float64(-0.12307907358470903)

In [67]:
dict_test = df_test.to_dict(orient='records')
X_test = dv.transform(dict_test)
predict = (model.predict_proba(X_test)[:,1].round(2)) > 0.5
(predict == y_test).mean()

np.float64(0.8133427963094393)