#Data Preparation
#- Download data from Kaggle
#- Look at the data

#- Check if all the columns read correctly
#- Check if the churn variable needs any preparation

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv(r'C:\Users\Spike\Desktop\Alexey ML Zoomcamp\Telco Churn\WA_Fn-UseC_-Telco-Customer-Churn.csv')


In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [5]:
#- Make the column names and values look uniform

In [6]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [7]:
df.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,no
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,no
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,yes
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,...,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,no
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.7,151.65,yes


In [8]:
df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [9]:
df.totalcharges.head()

0      29.85
1     1889.5
2     108.15
3    1840.75
4     151.65
Name: totalcharges, dtype: object

In [10]:
tc = pd.to_numeric(df.totalcharges, errors='coerce')

In [11]:
tc.isnull().sum()

11

In [12]:
df.totalcharges.isna().head()

0    False
1    False
2    False
3    False
4    False
Name: totalcharges, dtype: bool

In [13]:
df[tc.isnull()][['customerid', 'totalcharges']]

Unnamed: 0,customerid,totalcharges
488,4472-lvygi,_
753,3115-czmzd,_
936,5709-lvoeq,_
1082,4367-nuyao,_
1340,1371-dwpaz,_
3331,7644-omvmy,_
3826,3213-vvolg,_
4380,2520-sgtta,_
5218,2923-arzlg,_
6670,4075-wkniu,_


In [14]:
df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce')

In [15]:
df.totalcharges = df.totalcharges.fillna(0)

In [16]:
(df.churn == 'no').astype(int).head()

0    1
1    1
2    0
3    1
4    0
Name: churn, dtype: int32

In [17]:
# Setting up the validation 
#Perform the train/validation/test split with Scikit-Learn

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
train_test_split(df, test_size= 0.2, random_state=1)

[      customerid  gender  seniorcitizen partner dependents  tenure  \
 1814  5442-pptjy    male              0     yes        yes      12   
 5946  6261-rcvns  female              0      no         no      42   
 3881  2176-osjuv    male              0     yes         no      71   
 2389  6161-erdgd    male              0     yes        yes      71   
 3676  2364-ufrom    male              0      no         no      30   
 611   4765-oxppd  female              0     yes        yes       9   
 588   0742-lafqk    male              0     yes         no      72   
 2687  5213-twwju    male              0      no         no      28   
 1550  9957-yodkz    male              1     yes         no       6   
 2890  1732-vhubq  female              1     yes        yes      47   
 1072  7771-zonat    male              0      no         no      22   
 5150  7017-vfuly  female              0     yes         no       2   
 4643  1230-qajdw    male              0      no         no       3   
 997  

In [37]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state =1)

In [38]:
len(df_full_train), len(df_test)

(5634, 1409)

In [39]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [40]:
len(df_train), len(df_val), len(df_test)

(4225, 1409, 1409)

#Arrange dataframe in chronological order

In [24]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [41]:
y_train = df_train.churn.values
y_val = df_val.churn.values
y_test = df_test.churn.values

In [42]:
del df_train['churn']
del df_val['churn']
del df_test['churn']


In [27]:
#EDA
#Check missing values
#Look at the target variable (churn)
#Look at numerical and categorical variables

In [43]:
df_full_train = df_full_train.reset_index(drop=True)
df_full_train

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,5442-pptjy,male,0,yes,yes,12,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.70,258.35,no
1,6261-rcvns,female,0,no,no,42,yes,no,dsl,yes,...,yes,yes,no,yes,one_year,no,credit_card_(automatic),73.90,3160.55,yes
2,2176-osjuv,male,0,yes,no,71,yes,yes,dsl,yes,...,no,yes,no,no,two_year,no,bank_transfer_(automatic),65.15,4681.75,no
3,6161-erdgd,male,0,yes,yes,71,yes,yes,dsl,yes,...,yes,yes,yes,yes,one_year,no,electronic_check,85.45,6300.85,no
4,2364-ufrom,male,0,no,no,30,yes,no,dsl,yes,...,no,yes,yes,no,one_year,no,electronic_check,70.40,2044.75,no
5,4765-oxppd,female,0,yes,yes,9,yes,no,dsl,yes,...,yes,yes,no,no,month-to-month,no,mailed_check,65.00,663.05,yes
6,0742-lafqk,male,0,yes,no,72,yes,yes,fiber_optic,yes,...,yes,no,no,yes,two_year,yes,electronic_check,99.15,7422.10,no
7,5213-twwju,male,0,no,no,28,yes,no,fiber_optic,no,...,yes,yes,no,yes,month-to-month,no,electronic_check,91.00,2626.15,no
8,9957-yodkz,male,1,yes,no,6,yes,yes,fiber_optic,no,...,yes,no,no,no,month-to-month,yes,electronic_check,80.80,457.10,no
9,1732-vhubq,female,1,yes,yes,47,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,no,bank_transfer_(automatic),70.55,3309.25,yes


In [44]:
df_full_train.churn.value_counts()

no     4113
yes    1521
Name: churn, dtype: int64

In [30]:
#check in percentage

In [45]:
df_full_train.churn.value_counts(normalize=True)

no     0.730032
yes    0.269968
Name: churn, dtype: float64

In [32]:
#churn rate = 0.269968

In [70]:
(df_full_train.churn == 'no').astype(int).head()

0    1
1    0
2    1
3    1
4    1
Name: churn, dtype: int32

In [51]:
(df_full_train.churn == 'yes').astype(int).head()

0    0
1    1
2    0
3    0
4    0
Name: churn, dtype: int32

In [52]:
df_full_train.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [71]:
global_churn_rate = df_full_train.churn.mean()
round(global_churn_rate, 2)

TypeError: Could not convert noyesnononoyesnononoyesnononoyesnononononoyesnonoyesnonononoyesnononoyesnononoyesyesyesnoyesyesnononononoyesyesnononoyesnonoyesnonononoyesnonoyesyesnononononononononononoyesnononoyesyesnoyesnoyesnononoyesyesnonoyesyesnononononoyesnonononononononoyesnonononononoyesyesnononononoyesnonononoyesnononononoyesyesnonononononoyesyesnoyesyesnononononoyesnoyesnonoyesyesyesnoyesnonononononononoyesyesnononononononoyesnonononononoyesnoyesyesnoyesyesnoyesnonononononononoyesnonononononononoyesyesyesnononononononononoyesnoyesnonononoyesnonononononononoyesnononononoyesnonoyesnononoyesyesyesnononoyesnonononoyesnononononononoyesnonononoyesnoyesnononononononononoyesnononononononononononononononononoyesnononononoyesnononoyesnonononononononoyesyesnonononononoyesyesnoyesyesnononononononononoyesnonoyesnonoyesnonoyesyesnoyesyesnononoyesnoyesnoyesyesnonononononononoyesnonononononononononoyesnoyesnonononoyesnononoyesnononoyesnonoyesnononoyesnonoyesnoyesnoyesnoyesyesnonoyesyesnononononononononononoyesnoyesnonoyesyesnoyesnononoyesnonononononoyesnonoyesnonononoyesnononoyesnonononoyesnonononononoyesnoyesyesyesnoyesyesyesnononoyesyesnononononononononononoyesnoyesnononoyesnoyesnonononoyesnoyesnoyesnonononononoyesyesnoyesnononononononononononoyesyesyesnonoyesnonoyesnonoyesnonononoyesnonoyesnononoyesnonononononononononoyesnonoyesyesyesnononoyesyesnoyesnonononononononononononononononoyesnoyesnononoyesyesnonononoyesyesnonoyesnononononoyesnonononononononoyesnonononoyesnononoyesyesnonononoyesnonoyesyesnoyesyesnonoyesnononononononononononoyesnononoyesyesnononononononononoyesnonononoyesnononononoyesnononoyesyesyesnonoyesyesyesyesnoyesnonononononononoyesnonononononoyesyesnoyesnonoyesnonoyesnononoyesnoyesnonoyesyesnononoyesnoyesnonononononoyesnonoyesnononoyesyesnonoyesnononononoyesyesnononononoyesnonoyesnonoyesnononononoyesyesnonoyesnononoyesyesnonononononoyesnononononoyesnononoyesyesnononononoyesnononononoyesnoyesnonoyesnonononononoyesnononoyesnononoyesnononoyesnonononoyesyesnonoyesyesyesnonoyesnonononoyesnononoyesyesyesnoyesyesnoyesnoyesnoyesnoyesnononoyesnonononoyesnoyesnoyesyesyesnonononononoyesnonononoyesnoyesnoyesnonononononononoyesnoyesnononononoyesyesnoyesnononononoyesyesyesnonononononononoyesnonoyesyesnoyesnononononononoyesnonononoyesnononoyesnonononoyesnonoyesyesnononononononononononononononoyesnonononononoyesnoyesyesnononoyesnonononononononoyesnonononononononoyesnononononononononononoyesnoyesyesnononoyesnoyesnoyesnonoyesnononononononoyesnoyesnonononononononononoyesnononoyesnononoyesnonononononononononononononononononononoyesnoyesnonoyesnoyesnoyesnononoyesyesnonononononoyesnononoyesnonoyesnononononoyesnoyesnonononoyesnonoyesnonononononononoyesyesyesnoyesyesnononoyesyesyesnoyesnonononoyesyesnononoyesnonononoyesnonoyesnoyesnonoyesnonoyesnonononoyesnononoyesnonononoyesyesyesnonoyesnononononononoyesnoyesnoyesyesnonoyesnonononononononoyesyesyesnononononoyesnoyesnoyesnonononononononononoyesnoyesyesnoyesnonoyesnoyesnononoyesnononononononoyesnononononononononoyesnonononoyesnononononoyesyesnoyesnonononononoyesnononoyesnonoyesnonononononononononoyesnononononononoyesnoyesnonoyesnoyesnoyesnoyesnonoyesnononoyesnoyesnononoyesnonoyesnoyesnoyesnonononoyesyesnonoyesnonoyesnonononononononoyesnoyesnononononoyesnoyesnonoyesyesnononoyesnonoyesnononononoyesnonoyesyesnoyesyesnoyesyesnoyesnononoyesyesnonononononoyesnononoyesnonoyesnononoyesyesnonononononononoyesnonoyesnonononononononononoyesnonoyesyesyesnonoyesnonononoyesnoyesnononononononoyesnoyesyesyesyesnonoyesyesnonononoyesnoyesyesyesnoyesyesnoyesyesnonoyesyesnoyesyesyesnononoyesnoyesnononononononoyesnonononoyesnonononoyesnonoyesnonoyesnonoyesnonononononoyesyesnononoyesyesyesnoyesyesnonononoyesnonononoyesnononononoyesnonononoyesnononononoyesyesnoyesnoyesnonoyesnonoyesnonoyesyesnonononononononononoyesnonononoyesyesnononononoyesnoyesnononoyesnonononononononononononononoyesnononoyesnoyesnononoyesnoyesnonononononoyesnononoyesnononononoyesnoyesyesnoyesnoyesyesnoyesyesyesnonoyesyesnonononononoyesyesyesnonononononononononoyesnonononononoyesnoyesnononononoyesnonoyesnononononononononoyesnoyesnononononoyesnoyesyesyesnononoyesnonoyesnonononononononononononononononononononoyesyesyesyesyesnoyesnononoyesnononoyesnonononoyesnonononononononoyesnononoyesnonononoyesyesnonoyesnonononononononoyesnononononoyesyesnoyesnononononononoyesnoyesnononoyesnonoyesnonoyesnoyesnoyesyesnoyesnonononononononoyesyesnononononoyesnonoyesnonononoyesnonononoyesnoyesnonoyesnoyesnononononononononononoyesnoyesnononononoyesyesnonononononoyesnoyesnonoyesnonoyesyesnoyesyesnonononoyesnononoyesnonononoyesyesnonononononoyesnononoyesyesnoyesnoyesnoyesnonononononononoyesyesnonononononoyesnononononononononoyesnonononononononoyesnoyesnoyesnononononononoyesyesnoyesnonononononononononoyesnonoyesnononononoyesnononononononononoyesnonoyesnononoyesnononononononoyesyesyesyesnoyesyesnonoyesyesnonoyesyesnonoyesnonoyesnonoyesnononoyesnonononoyesyesnononononononononoyesnoyesyesyesnonoyesnonononononononononoyesyesnoyesnononononononoyesnonononoyesnoyesnoyesnononoyesnononoyesyesnoyesnonononononononononononononoyesnoyesnononononononoyesnonononoyesnonononoyesyesyesnononononoyesyesnonoyesnoyesnonononononoyesnonoyesnoyesnononononononononononononononoyesnonononononoyesnononoyesnononononoyesnononononoyesyesnonononoyesnonononononononoyesnoyesnonoyesyesnonononoyesnoyesyesnonononoyesnoyesnonoyesyesyesnonononoyesyesyesnonononoyesnononoyesnoyesnononoyesnoyesnonoyesyesyesnonoyesnoyesnononononononononoyesyesyesnoyesyesnoyesyesnononononoyesnonoyesyesnoyesnonoyesyesyesnononoyesyesyesyesnonoyesnoyesnonoyesnononononononononoyesnonononononoyesnononoyesnoyesnonoyesnononononononononoyesnonoyesnonononononononoyesyesnonononononoyesnononononoyesnonoyesnonononoyesnoyesyesnononoyesnoyesnoyesnoyesnoyesnoyesyesnoyesyesnononoyesnononononoyesnononononononononononononoyesnoyesyesnonononononononononononoyesnonoyesnoyesnoyesnononononononononononoyesnonononononononononoyesyesnonoyesnononononononoyesnoyesnoyesyesnonononononoyesyesnonoyesyesyesnoyesnonoyesnononononononononoyesyesnonoyesnoyesnonoyesnonoyesyesnononononononoyesnonononononononoyesnonoyesnonononoyesnononoyesnonoyesnoyesyesnoyesyesnoyesyesnononoyesnononoyesnoyesnonoyesnononononononononoyesyesnonoyesnononoyesnoyesyesyesnonononoyesnononoyesyesnoyesnononoyesnoyesyesnonononoyesnoyesnonoyesyesnoyesyesnonoyesyesnoyesnonoyesyesnonoyesyesnononononononoyesnononoyesyesnononononononononononononoyesnonoyesyesnonoyesnonoyesyesnononoyesnonoyesnonononoyesnoyesyesyesnononoyesnoyesnoyesnoyesnonoyesnononononononononononononoyesnonoyesnonoyesnonoyesnonononononoyesnoyesnonononononononononononononononononoyesnononoyesnononoyesyesnonononononononoyesnononoyesnonoyesnonononononononononononoyesyesnonononononoyesyesnononoyesyesyesnoyesyesnoyesnonononoyesyesnonoyesnononoyesyesnononononoyesnononoyesnononononoyesnonononoyesnoyesnonononoyesyesnononoyesnoyesnononononoyesnononoyesyesnononoyesyesnoyesyesnononononononononononoyesnoyesnononoyesyesyesnoyesnonononononoyesnononononononononononononononononononononononoyesnonononoyesyesnonoyesnonoyesnonononononononoyesyesyesyesnononononoyesyesyesnononoyesyesnononononoyesnononoyesnonononononononoyesnonononononoyesnononononoyesnononononononoyesnoyesnoyesyesnoyesnoyesnoyesnononononoyesnoyesnononoyesnonononononoyesyesyesnoyesnoyesnoyesnonoyesnononononoyesnonoyesnonononononoyesnoyesnonononoyesnononononoyesyesyesnoyesnonoyesnoyesnoyesyesnonoyesyesnoyesyesnononoyesnonoyesyesyesyesnonoyesnononoyesnonononoyesnoyesnonoyesyesnoyesyesnoyesyesyesyesyesnonoyesyesnonononoyesnonononononoyesnoyesnonononononononoyesnonononoyesyesyesyesnononononoyesyesyesyesnoyesnonoyesnonononononoyesnononononoyesnonoyesnoyesyesnonononononoyesnonononononononononononoyesnoyesnononoyesnonononononononononononoyesnonononoyesnononoyesnoyesnononononononononononoyesnoyesnonononoyesnoyesnononoyesnonononononononoyesnoyesnoyesnononoyesnononoyesnonononoyesnonononoyesyesnonononoyesnononoyesnonononononoyesnoyesyesnononononononononononononoyesnonononononononononoyesnononoyesnononononononoyesnoyesnonoyesnoyesnonononoyesnonononononononoyesnonoyesyesnonononononononononononononoyesnoyesyesyesyesnoyesyesnoyesnonoyesyesnonoyesnonononoyesnonononoyesnoyesnoyesnoyesyesnonononononoyesyesnonononononoyesnononoyesnononoyesnonononoyesnononononononononoyesnonononoyesyesnonononononononononoyesnonononoyesnononononononoyesnonoyesnonononoyesnonononononononoyesnoyesnoyesyesnononononononononononononoyesnonononoyesyesyesnononoyesnonoyesnonononoyesnoyesnononononoyesnonononononoyesnonononoyesnonononononononoyesnoyesnononononononoyesnononononononoyesnononoyesnononoyesnonoyesnonononoyesyesyesnoyesnoyesnoyesyesnononoyesnonononoyesnoyesyesnoyesnoyesnoyesnononononononoyesnonononoyesnoyesnononononoyesyesnonononononoyesyesyesnononoyesnononononoyesnonononononoyesnoyesnononoyesnonoyesnonononoyesyesnoyesnonononoyesnononononononoyesnononoyesnoyesnoyesyesnoyesyesyesnonoyesnoyesnonononoyesnonoyesnonoyesnonoyesnononononononoyesnononononoyesnononoyesnonononononoyesnonononoyesnoyesyesnonononononoyesyesyesnonoyesnoyesyesnononoyesnonononononononoyesnonononononoyesnononononononononononoyesnononononoyesyesnononoyesnononononononononoyesyesyesnononononononoyesyesnonoyesnononoyesnonoyesyesnononononononononoyesnonoyesyesnonoyesyesnonononononononononononoyesnononononononoyesyesnonoyesnonoyesnonoyesnoyesnononononononononononononononononoyesnononononononononoyesnonononoyesnoyesyesyesnonoyesnoyesyesnoyesnonoyesyesnonononononononoyesnononoyesnononoyesnononoyesnoyesnonoyesnonoyesnonononononononononononononononononoyesnoyesnoyesnoyesyesnoyesyesnonoyesnonononononononoyesnonononoyesnononoyesyesnoyesnoyesnoyesnoyesyesnoyesnoyesnononoyesnononoyesnononoyesnononoyesyesnononononononononoyesnononoyesyesyesnonononononoyesnonononononoyesnoyesnonononononononoyesnonoyesyesnonoyesnoyesnonononononononononoyesnononononoyesnononononononononononononononoyesnononoyesyesyesnonoyesnoyesyesyesnoyesnonoyesnonononoyesnononoyesnoyesyesnonononoyesyesnononononononoyesnoyesnononoyesnononononoyesnoyesnononononoyesnoyesnononononononononononoyesnononononoyesnonoyesnononononononoyesnoyesyesnononononononoyesnononoyesnonoyesnoyesnoyesyesnononoyesnonononononoyesnoyesnononononononononononoyesnoyesnononononononoyesnonononononononononoyesnoyesyesnoyesnoyesnoyesyesyesyesnoyesnonononoyesnonoyesnononononoyesnononononononoyesnonoyesyesnonononononononoyesyesyesyesnonononononoyesyesnonononoyesnonononoyesyesnononononononoyesnoyesyesnonoyesnonoyesnonononoyesyesnonononoyesnononoyesyesnoyesyesnonononoyesnononoyesnonononononononononoyesnoyesnononononononononononoyesnononoyesnononoyesnonoyesyesnoyesyesnonononononoyesnoyesyesnonononononoyesnoyesyesnoyesnonoyesnonoyesnoyesyesnoyesnonoyesnonononononononoyesyesyesyesnononononononononoyesnononoyesnononononononononononoyesnononononononoyesnononononoyesnoyesnonononoyesyesnonononoyesnonononononoyesnonononononononononoyesnonononononoyesnonononononononononononononoyesnoyesnonononononononononononononononoyesnonononoyesnoyesnoyesnoyesyesyesnonononononononoyesyesyesnoyesyesnononoyesnonononononononononoyesyesyesnonononononononononoyesnoyesnoyesyesnonononoyesnononononononononononoyesnononononononoyesnoyesyesyesnonoyesyesnonononononoyesnononoyesnonoyesnononoyesnonoyesyesnoyesnoyesyesnononononononoyesnonononoyesnonononononononoyesnoyesnononoyesnononoyesnonononononononononononononoyesnononoyesnonononononoyesnoyesnonononoyesnonononononoyesyesnonoyesnononoyesyesnononoyesnonoyesyesnononononononoyesnononoyesnonononononononoyesyesnoyesnonoyesnoyesnonononoyesnononononononoyesnoyesyesyesyesnoyesnononoyesyesnonononoyesnonoyesyesnoyesyesyesnonoyesnoyesyesnonononoyesyesyesnonononoyesyesyesyesnononononoyesyesyesnoyesnononononoyesnonononoyesnonononononoyesyesnoyesnoyesnonononoyesnonononoyesnononoyesnonononoyesnoyesnonoyesnonoyesnononoyesyesnonoyesnonoyesnononononoyesnononononoyesnonoyesnonononoyesnonoyesnoyesnoyesnonononononononoyesnonononononononononononoyesyesnonononononononoyesnononononononononoyesnoyesyesnoyesnoyesnonoyesnonononononoyesnoyesnonononononononoyesyesyesnononononononoyesnoyesnonoyesnoyesyesyesnonoyesnoyesnonoyesnononononononononoyesnononoyesnonononoyesnonoyesnoyesnononononoyesnonononoyesnononononononoyesnonononoyesnonononoyesnoyesnonononononononononononoyesyesnoyesyesyesnononoyesnonoyesnononoyesnoyesnononoyesyesnoyesnonoyesyesnonoyesnononononononoyesnonoyesnononoyesyesnoyesyesyesyesnonononononoyesnoyesnonononononoyesnonoyesyesnonononononononoyesnononononononononoyesnonoyesnonoyesyesnoyesnononononoyesnoyesnononononoyesnonoyesyesyesnonononoyesyesnononononononoyesnoyesnonoyesnonoyesyesnononononononononoyesyesyesnonononononononoyesnoyesnonoyesnonononoyesnononononononoyesnonononononononoyesnoyesnonoyesnononoyesnononoyesyesyesyesyesyesyesnonononoyesyesnoyesyesnononononoyesnononoyesnoyesnonoyesnononoyesnononoyesnonononononoyesnonoyesnonoyesyesyesyesyesyesyesyesnoyesyesnoyesnononononoyesnoyesyesno to numeric

In [None]:
#27% of our users are churning

In [61]:
df_full_train.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                object
dtype: object

In [62]:
numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [63]:
df_full_train.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [64]:
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
        'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod']

In [65]:
df_full_train[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

In [None]:
#Feature importance: Churn rate and risk ratio
#Part of EDA - identifying which features affect our target variable 
#Churn rate, Risk ratio, Mutual information

#churn rate

In [66]:
df_full_train.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,5442-pptjy,male,0,yes,yes,12,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.7,258.35,no
1,6261-rcvns,female,0,no,no,42,yes,no,dsl,yes,...,yes,yes,no,yes,one_year,no,credit_card_(automatic),73.9,3160.55,yes
2,2176-osjuv,male,0,yes,no,71,yes,yes,dsl,yes,...,no,yes,no,no,two_year,no,bank_transfer_(automatic),65.15,4681.75,no
3,6161-erdgd,male,0,yes,yes,71,yes,yes,dsl,yes,...,yes,yes,yes,yes,one_year,no,electronic_check,85.45,6300.85,no
4,2364-ufrom,male,0,no,no,30,yes,no,dsl,yes,...,no,yes,yes,no,one_year,no,electronic_check,70.4,2044.75,no


In [72]:
churn_female= df_full_train[df_full_train.gender == 'female'].churn.mean()
churn_female

TypeError: Could not convert yesyesyesnoyesyesnoyesnonononoyesnonononoyesnonoyesnoyesnononoyesnononononoyesnononoyesyesnoyesyesyesnoyesyesnononononoyesnononononononononoyesnonononoyesnononoyesnoyesnonoyesnonoyesnononononoyesnononoyesnonoyesnoyesyesyesnononononononoyesnonononononoyesnononononoyesnononoyesyesnonoyesnonononononononononononononononononononoyesnonononoyesyesnonoyesyesnoyesyesnononononoyesnoyesnonoyesnoyesnoyesnoyesnoyesnononononononoyesnonoyesnoyesyesnoyesnonoyesnonoyesnononononononoyesnoyesnonononoyesnoyesnononononoyesnoyesnononononoyesyesyesyesyesnononoyesnononononononoyesnonoyesnonoyesnononoyesyesnonononononoyesnonoyesnoyesnonoyesnonoyesnonoyesnonononoyesnoyesyesnononononononononononononoyesnonononoyesnonoyesnonononoyesyesnoyesnoyesyesnonononononononoyesnononononononononoyesnononononononoyesyesyesnoyesnonononoyesnonononoyesnonoyesnonononoyesyesnononoyesnoyesnononononoyesnononoyesnononoyesnoyesnononoyesyesnoyesnonononoyesnonoyesnonononoyesnononoyesnonoyesnonoyesnononoyesnonoyesyesyesyesyesyesnonononoyesyesyesyesnoyesnoyesnoyesnononononoyesyesyesnonononoyesnononoyesnononononononoyesnonononoyesyesnononononoyesnoyesnonononononononononoyesnoyesyesnonononononoyesnononoyesnoyesnonononononononononoyesnononoyesnoyesnoyesnonononononononononononoyesnonononononononononononoyesnonoyesnononoyesnonononoyesnonoyesnonoyesnonoyesnononononoyesnoyesnonononononononoyesyesnonoyesnononononoyesnononononoyesnoyesnonononononoyesnononoyesnonononononoyesnononononononoyesnonoyesnononoyesyesnononononoyesnoyesnononoyesnonononoyesnoyesyesnonoyesnoyesnonononoyesnoyesnonoyesnonononoyesnonoyesnonononononononoyesnoyesnononononononononononononoyesnoyesnononononononoyesnoyesnonononononoyesyesnoyesnonononononononoyesnoyesnoyesyesnononoyesnoyesyesnoyesyesnoyesnoyesnononoyesnononoyesnononononononononononoyesnoyesyesnoyesyesnoyesnonononoyesyesnononoyesyesnononononononononononononononononononononononoyesnononononoyesyesnonononononononoyesyesnoyesyesnonoyesnoyesyesnononononoyesnonoyesnoyesnonoyesnonononononoyesnonononononoyesyesyesnonononononononononononoyesyesyesnoyesnoyesnononononononononoyesnononoyesnoyesnononononononononononononononononoyesyesnonononononoyesyesnonoyesnoyesnonoyesyesnononononononononoyesnoyesnonoyesyesnonononoyesyesnonoyesnoyesyesnoyesnonononoyesnonononoyesnononoyesnononononononoyesyesnononononononononononononoyesyesnonononoyesyesnononononononononononoyesnonononoyesnoyesnononononoyesyesnoyesyesnoyesnoyesyesnoyesnonoyesyesnoyesyesnonononoyesyesyesnonononononoyesyesnononononoyesnoyesnonoyesyesnononononononononononononononononononoyesyesnonoyesyesnonoyesyesnononononononononononononononoyesnonononoyesnonoyesyesnonononononononoyesnoyesyesnononoyesnoyesyesyesnonoyesyesnonononoyesnonoyesyesnoyesnoyesnononononononononoyesnononononoyesyesnoyesyesyesnononoyesyesnonoyesyesnonononoyesnoyesnononononononononoyesnononoyesnononononoyesnonoyesnonoyesyesnoyesyesyesnonoyesyesnonononononononononoyesnononononoyesyesnonononononononoyesnonoyesyesnoyesnononononoyesnoyesnononoyesyesnoyesyesnoyesyesnonononononoyesnoyesnonoyesnononononononoyesyesnonoyesnoyesnoyesyesnonononoyesnonononoyesnoyesnonononoyesyesnoyesyesyesnoyesnononononoyesnonoyesyesnoyesyesnoyesyesnoyesyesnonononoyesnonoyesnonononononononoyesnoyesnoyesnonononoyesnononoyesyesyesnononoyesyesyesnonoyesnonononononononoyesnonoyesnonononononononononononononononoyesnononononononoyesnonononoyesyesnononoyesnoyesyesyesyesyesnononoyesnonoyesnononoyesnonoyesnonoyesnoyesyesnonononoyesnononononononoyesyesnonoyesyesyesnonononoyesnononononononononononoyesnonoyesnonononononoyesnoyesyesyesyesnononoyesnononoyesnonononoyesnononoyesyesyesyesnoyesnonoyesnoyesyesyesnonoyesnononononoyesyesnonononoyesnonoyesnononononoyesnoyesnoyesnonoyesyesyesnonononoyesnonoyesyesnonoyesyesyesnoyesyesnonononononononononoyesyesnononoyesyesnonononononoyesnonoyesnononononononononononoyesnoyesnonononoyesnonononononononononoyesyesnononoyesnoyesnonoyesnonononononoyesnoyesnonoyesyesnononononoyesnonononononoyesnononononononononononoyesnonononoyesnonoyesnononononononoyesnononononononononononononoyesnoyesyesnononoyesnonoyesnonoyesnoyesnoyesnonoyesnonoyesnononononoyesnonononononoyesnoyesnonononononoyesnonononononoyesnononononoyesnonononoyesnoyesnoyesyesnonononononononoyesnonoyesnonoyesnononononononoyesnonononoyesnononononononononoyesnonononoyesyesnoyesyesnonoyesnononoyesnoyesnoyesnonononononononoyesyesnononononononoyesnoyesnonononoyesnonononononononononoyesnonononoyesyesyesnoyesnonononoyesnononoyesnonononoyesnonononononoyesnonononononoyesyesyesnoyesnoyesyesnononononononononononoyesnonononononononononononoyesnonoyesyesnonononoyesyesyesnononononononoyesyesnononononononoyesnonoyesnonoyesnononononononoyesnonononononoyesnoyesyesnoyesyesyesnononoyesnonononoyesnoyesnoyesnoyesnonononononononononoyesnonoyesyesyesnonononononoyesyesyesnonoyesnononononononononononoyesnoyesyesnonoyesnononoyesnonononoyesnoyesnonoyesnononononononoyesnononononononoyesnononoyesyesyesnoyesnononoyesnoyesnonoyesyesnoyesyesnonoyesnoyesnononononononononononononoyesnonoyesnononononononoyesnoyesnononoyesnononoyesyesnonononoyesnonoyesnononononoyesyesnoyesnoyesnonononoyesnononoyesnononononononoyesnononoyesyesyesnoyesnononononononoyesnoyesnononoyesnononoyesnononoyesnoyesnononononoyesnononononoyesnononononononononoyesyesnononoyesnoyesnononononoyesyesnononoyesyesyesyesnononononoyesyesyesnonononononoyesyesnononononoyesnononoyesnonoyesnononoyesnononoyesnonononononoyesnononoyesnononononononoyesnononononononoyesnoyesyesnononoyesnoyesnoyesnononononononoyesyesnonononoyesnoyesnonononononononoyesnonononoyesyesyesnonoyesnonononononoyesnoyesnononoyesnonoyesnoyesnoyesyesnononoyesnonoyesnonononoyesnononononononononononononononoyesyesyesnonononoyesnoyesnoyesnonoyesnonoyesyesnonononononoyesnonononoyesnoyesnoyesnononononoyesyesyesyesnoyesnononoyesnoyesyesnoyesyesyesnoyesnononoyesnonononoyesyesnoyesyesyesyesnonononononoyesnonononoyesnoyesnonononononoyesnoyesyesnoyesyesyesnoyesnonononononoyesnononoyesnoyesnononononononononononoyesyesnonononononononononoyesyesnoyesnonononoyesnononononoyesnoyesnonoyesyesyesnonononononoyesnononononoyesnononoyesnoyesnononononoyesyesnonononononononoyesnoyesnononononononoyesnoyesnonoyesnoyesnonoyesnonoyesnonononononoyesyesnonononononononononoyesyesyesyesnonononononoyesnonoyesnononoyesnonoyesnononononoyesyesyesnonoyesnoyesnoyesnoyesnononoyesnoyesnonononoyesnonoyesyesyesnonoyesyesnonononononoyesnonoyesnononoyesnonoyesnoyesyesyesnononono to numeric

In [None]:
churn_male = df_full_train[df_full_train.gender == 'male'].churn.mean()
churn_male

In [None]:
#compare churn rate in gender to global churn rate

In [None]:
globlal_churn_rate = df_full_train.churn.mean()
global_churn_rate

In [None]:
#churn rate of people with partners

In [None]:
churn_partner = df_full_train[df_full_train.partner == 'yes'].churn.mean()
churn_partner

In [None]:
#churn rate of people with partners with no partners

In [None]:
churn_no_partner = df_full_train[df_full_train.partner == 'no'].churn.mean()
churn_no_partner

In [None]:
global_churn_rate - churn_partner

In [None]:
global_churn_rate - churn_no_partner

In [None]:
global_churn_rate - churn_female

In [None]:
#Global - GROUP(Churn)
#if difference greater >0 (means this group is less likely to churn)
#if difference greater <0 (means this group is more likely to churn)

In [None]:
# Risk ratio

In [None]:
churn_no_partner / global_churn_rate

In [None]:
churn_partner / global_churn_rate

In [None]:
#Risk ratio = Group churn rate/ Global churn rate
# RR > 1 = more likely to churn
# RR < 1 = less likely to churn

In [None]:
df_full_train.groupby('gender').churn.mean()

In [None]:
df_group = df_full_train.groupby('gender').churn.agg(['mean', 'count'])
df_group

In [None]:
df_group['diff'] = df_group['mean'] - global_churn_rate
df_group['risk'] = df_group['mean'] / global_churn_rate
df_group

In [None]:
#Difference and risk ratio for each categorical columns

In [60]:
for c in categorical:
    print(c)

NameError: name 'categorical' is not defined

In [58]:
from IPython.display import display

In [59]:
for c in categorical:
    print(c)
    df_group = df_full_train.groupby(c).churn.agg(['mean','count'])
    df_group['diff'] =df_group['mean'] - global_churn_rate
    df_group['risk'] =df_group['mean'] / global_churn_rate
    display(df_group)
    print()
    print()
    print()

NameError: name 'categorical' is not defined

In [None]:
#Feature importance: Mutual information

In [73]:
from sklearn.metrics import mutual_info_score

In [None]:
#learn the information you learn about 'churn' by observing 'contract'

In [74]:
mutual_info_score(df_full_train.churn, df_full_train.contract)

0.0983203874041556

In [None]:
#Compare churn to gender and see wha we can learn from it

In [75]:
mutual_info_score(df_full_train.gender, df_full_train.churn)

0.0001174846211139946

In [76]:
mutual_info_score(df_full_train.partner, df_full_train.churn)

0.009967689095399745

In [77]:
def mutual_info_churn_score(series):
    return mutual_info_score(series, df_full_train.churn)

In [78]:
df_full_train[categorical].apply(mutual_info_churn_score)

gender              0.000117
seniorcitizen       0.009410
partner             0.009968
dependents          0.012346
phoneservice        0.000229
multiplelines       0.000857
internetservice     0.055868
onlinesecurity      0.063085
onlinebackup        0.046923
deviceprotection    0.043453
techsupport         0.061032
streamingtv         0.031853
streamingmovies     0.031581
contract            0.098320
paperlessbilling    0.017589
paymentmethod       0.043210
dtype: float64

In [79]:
#order of importance to churn
mi = df_full_train[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

contract            0.098320
onlinesecurity      0.063085
techsupport         0.061032
internetservice     0.055868
onlinebackup        0.046923
deviceprotection    0.043453
paymentmethod       0.043210
streamingtv         0.031853
streamingmovies     0.031581
paperlessbilling    0.017589
dependents          0.012346
partner             0.009968
seniorcitizen       0.009410
multiplelines       0.000857
phoneservice        0.000229
gender              0.000117
dtype: float64

In [None]:
#Feature importance: Correlation

In [80]:
df_full_train.tenure.max()

72

In [81]:
df_full_train.tenure.min()

0

In [82]:
df_full_train[numerical].corrwith(df_full_train.churn)

TypeError: ("unsupported operand type(s) for /: 'str' and 'int'", 'occurred at index tenure')

In [None]:
#One hot encoding
#use scikit-learn to encode categorical features

In [None]:
#Logistic regression
#binary classification
#linear vs logistic regression

In [None]:
#Training logistic regression with Scikit-learn
#train a model with Scikit-learn
#Apply it to the validation dataset
#Calculate the accuracy

In [None]:
#Model Interpretation
#Look at the coefficients 
#Train a smaller model with fewer features

In [None]:
#Using the model

In [None]:
#Summary
#Feature importance - risk, mutual information, correlation
#one-hot encoding can be implemented with DictVectorizer
#logistic regression - linear model like linear regression
#output of log reg - probability
#interpretation of weights is similar to linear regression
