# Machine Learning: Classification

### 3.1 Churn Prediction

In [53]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [54]:
 df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [55]:
# Flip the first five rows so each customer becomes a column;
# this way every field of a single record is visible without horizontal scrolling.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [56]:
# Normalise naming conventions: everything lowercase, spaces replaced by underscores.

# 1) column labels
df.columns = df.columns.str.lower().str.replace(' ', '_')

# 2) values of every column pandas stored with dtype 'object'
categorical_columns = df.dtypes[df.dtypes == 'object'].index.tolist()

for col in categorical_columns:
    df[col] = (
        df[col]
        .str.lower()
        .str.replace(' ', '_')
    )


In [57]:
# Verify the inconsistencies have been removed: labels and string values
# should now be lowercase with underscores instead of spaces.

df.head(n=5).T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [58]:
# inspect the dtype pandas assigned to each column

df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [59]:
# "totalcharges" was read as dtype object rather than a numeric type, so look at it directly.
# Suspicion: besides numbers it holds non-numeric strings (whitespace, underscores, ...).
df['totalcharges']

0         29.85
1        1889.5
2        108.15
3       1840.75
4        151.65
         ...   
7038     1990.5
7039     7362.9
7040     346.45
7041      306.6
7042     6844.5
Name: totalcharges, Length: 7043, dtype: object

In [60]:
# attempting to convert "totalcharges" to numbers raises an error (kept commented out so the notebook still runs top to bottom)
# pd.to_numeric(df.totalcharges)

# ValueError: Unable to parse string "_" at position 488

In [61]:
# errors='coerce' makes pandas substitute NaN for every value it cannot parse
# instead of raising; the result is kept in `tc` so `df` is left untouched for now.

tc = pd.to_numeric(df['totalcharges'], errors='coerce')

In [62]:
# count how many total-charges entries are missing (NaN) in the coerced series
tc.isna().sum()


np.int64(11)

In [63]:
# Show the customer id and raw total-charges value for the rows
# where the coerced series `tc` is missing.

df.loc[tc.isnull(), ['customerid', 'totalcharges']]

Unnamed: 0,customerid,totalcharges
488,4472-lvygi,_
753,3115-czmzd,_
936,5709-lvoeq,_
1082,4367-nuyao,_
1340,1371-dwpaz,_
3331,7644-omvmy,_
3826,3213-vvolg,_
4380,2520-sgtta,_
5218,2923-arzlg,_
6670,4075-wkniu,_


In [64]:
# Convert the column itself to numeric; errors='coerce' turns every
# unparseable value into NaN rather than raising.

df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')

In [65]:
# replace the missing (NaN) total charges with zero

df['totalcharges'] = df['totalcharges'].fillna(value=0)

In [66]:
# Confirm the fill: `tc` still marks the rows that were missing,
# so selecting them from `df` should now show the filled-in values.

df.loc[tc.isnull(), ['customerid', 'totalcharges']]

Unnamed: 0,customerid,totalcharges
488,4472-lvygi,0.0
753,3115-czmzd,0.0
936,5709-lvoeq,0.0
1082,4367-nuyao,0.0
1340,1371-dwpaz,0.0
3331,7644-omvmy,0.0
3826,3213-vvolg,0.0
4380,2520-sgtta,0.0
5218,2923-arzlg,0.0
6670,4075-wkniu,0.0


In [69]:
# look at the raw values of the target column, churn
df['churn']

0        no
1        no
2       yes
3        no
4       yes
       ... 
7038     no
7039     no
7040     no
7041    yes
7042     no
Name: churn, Length: 7043, dtype: object

In [71]:
# preview on the first 5 rows: comparing against 'yes' maps yes/no to True/False
df['churn'].eq('yes').head()

0    False
1    False
2     True
3    False
4     True
Name: churn, dtype: bool

In [73]:
# preview: cast the True/False comparison result to integers, giving 1's and 0's
df['churn'].eq('yes').astype('int').head()

0    0
1    0
2    1
3    0
4    1
Name: churn, dtype: int64

In [75]:
# Write the 1/0 encoding back into churn: 1 where the value is 'yes', 0 otherwise.
# The dtype guard makes this cell safe to re-run: once churn is numeric, comparing it
# with 'yes' again would be False everywhere and silently overwrite every label with 0.
if not pd.api.types.is_numeric_dtype(df['churn']):
    df['churn'] = (df['churn'] == 'yes').astype('int')

###  3.2 Data Preparation

###  3.3 Setting Up Validation Framework

### 3.4  Exploratory Data Analysis (EDA)

### 3.5 Feature importance: Churn rate and risk ratio

**Feature importance analysis (part of EDA) - identifying which features affect our target variable**

* Churn rate
* Risk ratio
* Mutual information (covered later)