# Chapter 3 - Machine Learning for Classification. 

## Project: Telco Customer Churn Predictor. 

In [1]:
# Importing Python Packages. 

# Data Manipulation. 
import pandas as pd 
import numpy as np

# Data Visualization Packages. 
import seaborn as sns 
import matplotlib.pyplot as plt

# Data Gathering Package. 
import wget 

# Other
%matplotlib inline

## Data Gathering. 

In [2]:
# Loading dataset. 
telco_data = pd.read_csv('data/telco_customer_churn.csv')

# Viewing dataset.
telco_data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Viewing the shape of the dataset. 
print(f'There are {telco_data.shape[0]} rows and {telco_data.shape[1]} columns')

There are 7043 rows and 21 columns


In [4]:
# List of data columns 
telco_data.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [5]:
# Data structure. 
telco_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [6]:
# Checking for Null value.
telco_data.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [7]:
# Removing upper case from column name.
column = telco_data.columns.str.lower()

# Replacing columns name with new column name.
telco_data.columns = column

In [8]:
# Checking column names
telco_data.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [9]:
# Checking each feature and their unique values. 
column_info = {}
for col in column: 
    column_info[col] = [telco_data[col].nunique(), telco_data[col].unique()[0:5]]
    print("Columns name:", col, '\n')
    print(f"Unique values:  {telco_data[col].unique()[0:5]}\n")
    print(f"Number of unique values:  {telco_data[col].nunique()}\n\n")

Columns name: customerid 

Unique values:  ['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' '7795-CFOCW' '9237-HQITU']

Number of unique values:  7043


Columns name: gender 

Unique values:  ['Female' 'Male']

Number of unique values:  2


Columns name: seniorcitizen 

Unique values:  [0 1]

Number of unique values:  2


Columns name: partner 

Unique values:  ['Yes' 'No']

Number of unique values:  2


Columns name: dependents 

Unique values:  ['No' 'Yes']

Number of unique values:  2


Columns name: tenure 

Unique values:  [ 1 34  2 45  8]

Number of unique values:  73


Columns name: phoneservice 

Unique values:  ['No' 'Yes']

Number of unique values:  2


Columns name: multiplelines 

Unique values:  ['No phone service' 'No' 'Yes']

Number of unique values:  3


Columns name: internetservice 

Unique values:  ['DSL' 'Fiber optic' 'No']

Number of unique values:  3


Columns name: onlinesecurity 

Unique values:  ['No' 'Yes' 'No internet service']

Number of unique values:  3


Columns n

In [10]:
# Checking dict 
column_info

{'customerid': [7043,
  array(['7590-VHVEG', '5575-GNVDE', '3668-QPYBK', '7795-CFOCW',
         '9237-HQITU'], dtype=object)],
 'gender': [2, array(['Female', 'Male'], dtype=object)],
 'seniorcitizen': [2, array([0, 1], dtype=int64)],
 'partner': [2, array(['Yes', 'No'], dtype=object)],
 'dependents': [2, array(['No', 'Yes'], dtype=object)],
 'tenure': [73, array([ 1, 34,  2, 45,  8], dtype=int64)],
 'phoneservice': [2, array(['No', 'Yes'], dtype=object)],
 'multiplelines': [3, array(['No phone service', 'No', 'Yes'], dtype=object)],
 'internetservice': [3, array(['DSL', 'Fiber optic', 'No'], dtype=object)],
 'onlinesecurity': [3,
  array(['No', 'Yes', 'No internet service'], dtype=object)],
 'onlinebackup': [3,
  array(['Yes', 'No', 'No internet service'], dtype=object)],
 'deviceprotection': [3,
  array(['No', 'Yes', 'No internet service'], dtype=object)],
 'techsupport': [3, array(['No', 'Yes', 'No internet service'], dtype=object)],
 'streamingtv': [3, array(['No', 'Yes', 'No inter

In [11]:
# Checking on our target feature. 
telco_data.churn.head()

0     No
1     No
2    Yes
3     No
4    Yes
Name: churn, dtype: object

In [12]:
# converting churn to int.
telco_data['churn'] = pd.to_numeric(telco_data['churn'] == 'Yes').astype(int)

# Checking Feature. 
telco_data['churn'].unique()

array([0, 1])

In [13]:
# viewing columns with  ' ' empty space. 
telco_data.query("totalcharges == ' '")

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
488,4472-LVYGI,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,0
753,3115-CZMZD,Male,0,No,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,,0
936,5709-LVOEQ,Female,0,Yes,Yes,0,Yes,No,DSL,Yes,...,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,0
1082,4367-NUYAO,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,,0
1340,1371-DWPAZ,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,,0
3331,7644-OMVMY,Male,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.85,,0
3826,3213-VVOLG,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.35,,0
4380,2520-SGTTA,Female,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.0,,0
5218,2923-ARZLG,Male,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,One year,Yes,Mailed check,19.7,,0
6670,4075-WKNIU,Female,0,Yes,Yes,0,Yes,Yes,DSL,No,...,Yes,Yes,Yes,No,Two year,No,Mailed check,73.35,,0


In [14]:
# Replacing ' ' with 0
telco_data['totalcharges'] = telco_data['totalcharges'].replace(' ', 0)

# checking for ' ' in Total Charges column. 
telco_data.query("totalcharges == ' '")

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn


In [15]:
# Converting Total charges to float data type. 
telco_data['totalcharges'] = telco_data['totalcharges'].astype(float)

# Checking Changes made. 
telco_data['totalcharges'].dtypes

dtype('float64')

## Data Validataion. 

In [16]:
# import sklearn data validation package. 
from sklearn.model_selection import train_test_split

In [17]:
telco_data.columns 

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [18]:
# Spliting Data to train and test dataset. 
telco_full_train, telco_test = train_test_split(
    telco_data, test_size = 0.2, random_state = 1)

# Checking data shape
telco_full_train.shape, telco_test.shape 

((5634, 21), (1409, 21))

In [19]:
# Spliting data into trianing and validataion dataset. 
telco_train, telco_val = train_test_split(
    telco_full_train, test_size = 0.25, random_state = 1)


# Checking data shape
telco_train.shape, telco_val.shape 

((4225, 21), (1409, 21))

In [20]:
# Extrat Y feature from each of the dataset.
telco_full_ytrain = telco_full_train['churn']
telco_ytrain = telco_train['churn']
telco_yval = telco_val['churn']
telco_ytest = telco_test['churn']

# Delete y from main dataset. 
del telco_full_train['churn']
del telco_train['churn']
del telco_val['churn']
del telco_test['churn']

# Checking the columns for each dataset. 
print(telco_full_train.columns, '\n')
print(telco_train.columns, '\n')
print(telco_val.columns, '\n')
print(telco_test.columns, '\n')


Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges'],
      dtype='object') 

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges'],
      dtype='object') 

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmo

## Exploratory Data Analysis (EDA).

In [22]:
# Checking for the present of null values.
telco_train.isna().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
dtype: int64

In [46]:
# Viewing the target feature distribution.
telco_data['churn'].value_counts(normalize = True).round(2)

0    0.73
1    0.27
Name: churn, dtype: float64

In [27]:
# Viewing the data structure for dataset. 
telco_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerid        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   seniorcitizen     7043 non-null   int64  
 3   partner           7043 non-null   object 
 4   dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   phoneservice      7043 non-null   object 
 7   multiplelines     7043 non-null   object 
 8   internetservice   7043 non-null   object 
 9   onlinesecurity    7043 non-null   object 
 10  onlinebackup      7043 non-null   object 
 11  deviceprotection  7043 non-null   object 
 12  techsupport       7043 non-null   object 
 13  streamingtv       7043 non-null   object 
 14  streamingmovies   7043 non-null   object 
 15  contract          7043 non-null   object 
 16  paperlessbilling  7043 non-null   object 


In [37]:
# checking seniorcitizen break down.
print(telco_data.seniorcitizen.value_counts())

# converting data type to object. 
telco_data['seniorcitizen'] = telco_data['seniorcitizen'].astype(object)
telco_full_train['seniorcitizen'] = telco_full_train['seniorcitizen'].astype(object)
telco_train['seniorcitizen'] = telco_train['seniorcitizen'].astype(object)
telco_val['seniorcitizen'] = telco_val['seniorcitizen'].astype(object)
telco_test['seniorcitizen'] = telco_test['seniorcitizen'].astype(object)

0    5901
1    1142
Name: seniorcitizen, dtype: int64


In [40]:
# checking senior citizen data sturcture. 
telco_data['seniorcitizen'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 7043 entries, 0 to 7042
Series name: seniorcitizen
Non-Null Count  Dtype 
--------------  ----- 
7043 non-null   object
dtypes: object(1)
memory usage: 55.1+ KB


In [41]:
telco_train.dtypes == 'object'

customerid           True
gender               True
seniorcitizen        True
partner              True
dependents           True
tenure              False
phoneservice         True
multiplelines        True
internetservice      True
onlinesecurity       True
onlinebackup         True
deviceprotection     True
techsupport          True
streamingtv          True
streamingmovies      True
contract             True
paperlessbilling     True
paymentmethod        True
monthlycharges      False
totalcharges        False
dtype: bool

In [43]:
# Grouping the data by data struture. 

# Object or categorical features. 
categorical = telco_train.columns[telco_train.dtypes == 'object'].to_list()

# Checking the list
categorical

['customerid',
 'gender',
 'seniorcitizen',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod']

In [45]:
# Numerical features. 
numeric = telco_train.columns[telco_train.dtypes != 'object'].to_list()

# Checking the list
numeric

['tenure', 'monthlycharges', 'totalcharges']

In [None]:
# Checking the distri