# Chapter 3 - Machine Learning for Classification. 

## Project: Telco Customer Churn Predictor. 

In [1]:
# Importing Python Packages. 

# Data Manipulation. 
import pandas as pd 
import numpy as np

# Data Visualization Packages. 
import seaborn as sns 
import matplotlib.pyplot as plt

# Data Gathering Package. 
import wget 

# Other
%matplotlib inline

## Data Gathering. 

In [2]:
# Loading dataset. 
telco_data = pd.read_csv('data/telco_customer_churn.csv')

# Viewing dataset.
telco_data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Viewing the shape of the dataset. 
print(f'There are {telco_data.shape[0]} rows and {telco_data.shape[1]} columns')

There are 7043 rows and 21 columns


In [4]:
# List of data columns 
telco_data.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [5]:
# Data structure. 
telco_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [6]:
# Checking for Null value.
telco_data.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [7]:
# Removing upper case from column name.
column = telco_data.columns.str.lower()

# Replacing columns name with new column name.
telco_data.columns = column

In [8]:
# Checking column names
telco_data.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [9]:
# Checking each feature and their unique values. 
column_info = {}
for col in column: 
    column_info[col] = [telco_data[col].nunique(), telco_data[col].unique()[0:5]]
    print("Columns name:", col, '\n')
    print(f"Unique values:  {telco_data[col].unique()[0:5]}\n")
    print(f"Number of unique values:  {telco_data[col].nunique()}\n\n")

Columns name: customerid 

Unique values:  ['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' '7795-CFOCW' '9237-HQITU']

Number of unique values:  7043


Columns name: gender 

Unique values:  ['Female' 'Male']

Number of unique values:  2


Columns name: seniorcitizen 

Unique values:  [0 1]

Number of unique values:  2


Columns name: partner 

Unique values:  ['Yes' 'No']

Number of unique values:  2


Columns name: dependents 

Unique values:  ['No' 'Yes']

Number of unique values:  2


Columns name: tenure 

Unique values:  [ 1 34  2 45  8]

Number of unique values:  73


Columns name: phoneservice 

Unique values:  ['No' 'Yes']

Number of unique values:  2


Columns name: multiplelines 

Unique values:  ['No phone service' 'No' 'Yes']

Number of unique values:  3


Columns name: internetservice 

Unique values:  ['DSL' 'Fiber optic' 'No']

Number of unique values:  3


Columns name: onlinesecurity 

Unique values:  ['No' 'Yes' 'No internet service']

Number of unique values:  3


Columns n

In [10]:
# Checking dict 
column_info

{'customerid': [7043,
  array(['7590-VHVEG', '5575-GNVDE', '3668-QPYBK', '7795-CFOCW',
         '9237-HQITU'], dtype=object)],
 'gender': [2, array(['Female', 'Male'], dtype=object)],
 'seniorcitizen': [2, array([0, 1], dtype=int64)],
 'partner': [2, array(['Yes', 'No'], dtype=object)],
 'dependents': [2, array(['No', 'Yes'], dtype=object)],
 'tenure': [73, array([ 1, 34,  2, 45,  8], dtype=int64)],
 'phoneservice': [2, array(['No', 'Yes'], dtype=object)],
 'multiplelines': [3, array(['No phone service', 'No', 'Yes'], dtype=object)],
 'internetservice': [3, array(['DSL', 'Fiber optic', 'No'], dtype=object)],
 'onlinesecurity': [3,
  array(['No', 'Yes', 'No internet service'], dtype=object)],
 'onlinebackup': [3,
  array(['Yes', 'No', 'No internet service'], dtype=object)],
 'deviceprotection': [3,
  array(['No', 'Yes', 'No internet service'], dtype=object)],
 'techsupport': [3, array(['No', 'Yes', 'No internet service'], dtype=object)],
 'streamingtv': [3, array(['No', 'Yes', 'No inter

In [11]:
# Checking on our target feature. 
telco_data.churn.head()

0     No
1     No
2    Yes
3     No
4    Yes
Name: churn, dtype: object

In [12]:
# converting churn to int.
telco_data['churn'] = pd.to_numeric(telco_data['churn'] == 'Yes').astype(int)

# Checking Feature. 
telco_data['churn'].unique()

array([0, 1])

In [13]:
# viewing columns with  ' ' empty space. 
telco_data.query("totalcharges == ' '")

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
488,4472-LVYGI,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,0
753,3115-CZMZD,Male,0,No,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,,0
936,5709-LVOEQ,Female,0,Yes,Yes,0,Yes,No,DSL,Yes,...,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,0
1082,4367-NUYAO,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,,0
1340,1371-DWPAZ,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,,0
3331,7644-OMVMY,Male,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.85,,0
3826,3213-VVOLG,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.35,,0
4380,2520-SGTTA,Female,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.0,,0
5218,2923-ARZLG,Male,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,One year,Yes,Mailed check,19.7,,0
6670,4075-WKNIU,Female,0,Yes,Yes,0,Yes,Yes,DSL,No,...,Yes,Yes,Yes,No,Two year,No,Mailed check,73.35,,0


In [14]:
# Replacing ' ' with 0
telco_data['totalcharges'] = telco_data['totalcharges'].replace(' ', 0)

# checking for ' ' in Total Charges column. 
telco_data.query("totalcharges == ' '")

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn


In [15]:
# Converting Total charges to float data type. 
telco_data['totalcharges'] = telco_data['totalcharges'].astype(float)

# Checking Changes made. 
telco_data['totalcharges'].dtypes

dtype('float64')

In [16]:
# Checking for duplicate
telco_data.duplicated().sum()

0

In [17]:
# Removing columns that aren't useful.
del telco_data['customerid']

# Checking column list. 
telco_data.columns

Index(['gender', 'seniorcitizen', 'partner', 'dependents', 'tenure',
       'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity',
       'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv',
       'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod',
       'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

## Data Validataion. 

In [18]:
# import sklearn data validation package. 
from sklearn.model_selection import train_test_split

In [19]:
telco_data.columns 

Index(['gender', 'seniorcitizen', 'partner', 'dependents', 'tenure',
       'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity',
       'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv',
       'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod',
       'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [20]:
# Spliting Data to train and test dataset. 
telco_full_train, telco_test = train_test_split(
    telco_data, test_size = 0.2, random_state = 1)

# Checking data shape
telco_full_train.shape, telco_test.shape 

((5634, 20), (1409, 20))

In [21]:
# Spliting data into trianing and validataion dataset. 
telco_train, telco_val = train_test_split(
    telco_full_train, test_size = 0.25, random_state = 1)


# Checking data shape
telco_train.shape, telco_val.shape 

((4225, 20), (1409, 20))

In [22]:
# Extrat Y feature from each of the dataset.
telco_full_ytrain = telco_full_train['churn'].values
telco_ytrain = telco_train['churn'].values
telco_yval = telco_val['churn'].values
telco_ytest = telco_test['churn'].values

# Delete y from main dataset. 
del telco_full_train['churn']
del telco_train['churn']
del telco_val['churn']
del telco_test['churn']

# Checking the columns for each dataset. 
print(telco_full_train.columns, '\n')
print(telco_train.columns, '\n')
print(telco_val.columns, '\n')
print(telco_test.columns, '\n')


Index(['gender', 'seniorcitizen', 'partner', 'dependents', 'tenure',
       'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity',
       'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv',
       'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod',
       'monthlycharges', 'totalcharges'],
      dtype='object') 

Index(['gender', 'seniorcitizen', 'partner', 'dependents', 'tenure',
       'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity',
       'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv',
       'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod',
       'monthlycharges', 'totalcharges'],
      dtype='object') 

Index(['gender', 'seniorcitizen', 'partner', 'dependents', 'tenure',
       'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity',
       'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv',
       'streamingmovies', 'contract', 'paperlessbilling', 'pa

## Exploratory Data Analysis (EDA).

In [23]:
# Checking for the present of null values.
telco_train.isna().sum()

gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
dtype: int64

In [24]:
# Viewing the target feature distribution.
telco_data['churn'].value_counts(normalize = True).round(2)

0    0.73
1    0.27
Name: churn, dtype: float64

In [25]:
# Viewing the data structure for dataset. 
telco_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   seniorcitizen     7043 non-null   int64  
 2   partner           7043 non-null   object 
 3   dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   phoneservice      7043 non-null   object 
 6   multiplelines     7043 non-null   object 
 7   internetservice   7043 non-null   object 
 8   onlinesecurity    7043 non-null   object 
 9   onlinebackup      7043 non-null   object 
 10  deviceprotection  7043 non-null   object 
 11  techsupport       7043 non-null   object 
 12  streamingtv       7043 non-null   object 
 13  streamingmovies   7043 non-null   object 
 14  contract          7043 non-null   object 
 15  paperlessbilling  7043 non-null   object 
 16  paymentmethod     7043 non-null   object 


In [26]:
# checking seniorcitizen break down.
print(telco_data.seniorcitizen.value_counts())

# converting data type to object. 
telco_data['seniorcitizen'] = telco_data['seniorcitizen'].astype(object)
telco_full_train['seniorcitizen'] = telco_full_train['seniorcitizen'].astype(object)
telco_train['seniorcitizen'] = telco_train['seniorcitizen'].astype(object)
telco_val['seniorcitizen'] = telco_val['seniorcitizen'].astype(object)
telco_test['seniorcitizen'] = telco_test['seniorcitizen'].astype(object)

0    5901
1    1142
Name: seniorcitizen, dtype: int64


In [27]:
# checking senior citizen data sturcture. 
telco_data['seniorcitizen'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 7043 entries, 0 to 7042
Series name: seniorcitizen
Non-Null Count  Dtype 
--------------  ----- 
7043 non-null   object
dtypes: object(1)
memory usage: 55.1+ KB


In [28]:
telco_train.dtypes == 'object'

gender               True
seniorcitizen        True
partner              True
dependents           True
tenure              False
phoneservice         True
multiplelines        True
internetservice      True
onlinesecurity       True
onlinebackup         True
deviceprotection     True
techsupport          True
streamingtv          True
streamingmovies      True
contract             True
paperlessbilling     True
paymentmethod        True
monthlycharges      False
totalcharges        False
dtype: bool

In [29]:
# Grouping the data by data struture. 

# Object or categorical features. 
categorical = telco_train.columns[telco_train.dtypes == 'object'].to_list()

# Checking the list
categorical

['gender',
 'seniorcitizen',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod']

In [30]:
# Numerical features. 
numeric = telco_train.columns[telco_train.dtypes != 'object'].to_list()

# Checking the list
numeric

['tenure', 'monthlycharges', 'totalcharges']

In [31]:
# Checking the distribution of categorical features.
for col in categorical:
    print(telco_data[col].value_counts(normalize = True).round(2), '\n\n')
    

Male      0.5
Female    0.5
Name: gender, dtype: float64 


0    0.84
1    0.16
Name: seniorcitizen, dtype: float64 


No     0.52
Yes    0.48
Name: partner, dtype: float64 


No     0.7
Yes    0.3
Name: dependents, dtype: float64 


Yes    0.9
No     0.1
Name: phoneservice, dtype: float64 


No                  0.48
Yes                 0.42
No phone service    0.10
Name: multiplelines, dtype: float64 


Fiber optic    0.44
DSL            0.34
No             0.22
Name: internetservice, dtype: float64 


No                     0.50
Yes                    0.29
No internet service    0.22
Name: onlinesecurity, dtype: float64 


No                     0.44
Yes                    0.34
No internet service    0.22
Name: onlinebackup, dtype: float64 


No                     0.44
Yes                    0.34
No internet service    0.22
Name: deviceprotection, dtype: float64 


No                     0.49
Yes                    0.29
No internet service    0.22
Name: techsupport, dtype: float64 


In [32]:
# Checking the distribution of numeric feature.
for col in numeric: 
    print(telco_data[col].describe(), '\n\n')

count    7043.000000
mean       32.371149
std        24.559481
min         0.000000
25%         9.000000
50%        29.000000
75%        55.000000
max        72.000000
Name: tenure, dtype: float64 


count    7043.000000
mean       64.761692
std        30.090047
min        18.250000
25%        35.500000
50%        70.350000
75%        89.850000
max       118.750000
Name: monthlycharges, dtype: float64 


count    7043.000000
mean     2279.734304
std      2266.794470
min         0.000000
25%       398.550000
50%      1394.550000
75%      3786.600000
max      8684.800000
Name: totalcharges, dtype: float64 




## Feature Importance. 

In [33]:
# What is the mean Global churn rate 
glober_churn = telco_data.churn.mean()

# Check value. 
glober_churn

0.2653698707936959

In [34]:
# Checking the mean of contract
contract_churn = telco_data.groupby('contract')['churn'].mean()
contract_churn

contract
Month-to-month    0.427097
One year          0.112695
Two year          0.028319
Name: churn, dtype: float64

In [35]:
# Get the mean difference 
mean_diff = glober_churn - contract_churn

# Check the values.
mean_diff

contract
Month-to-month   -0.161727
One year          0.152675
Two year          0.237051
Name: churn, dtype: float64

>This implies that customer that have a month to month plan are more likely to churn than customers with one or two years contract. 

In [36]:
# Using risk ratio to derive relationship.
risk_ratio = contract_churn/glober_churn

# Check the values. 
risk_ratio

contract
Month-to-month    1.609440
One year          0.424672
Two year          0.106714
Name: churn, dtype: float64

> The above reinforce that that customers with a month to month contract are more likely to churn than customers with a one year contract, of which are also more likely to churn than customers with a two year contract. 

In [37]:
# Duplicating mean dif and risk ration metrics to features. 

# Importing display package from ipython.
from IPython.display import display

# Dupliacte loop
for col in categorical: 
    print(col)
    telco_group = telco_data.groupby(col)['churn'].agg(['mean', 'count'])
    telco_group['mean_diff'] = glober_churn - telco_group['mean']
    telco_group['risk_ratio'] = telco_group['mean']/glober_churn
    display(telco_group)
    print()
    print()

gender


Unnamed: 0_level_0,mean,count,mean_diff,risk_ratio
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,0.269209,3488,-0.003839,1.014466
Male,0.261603,3555,0.003766,0.985807




seniorcitizen


Unnamed: 0_level_0,mean,count,mean_diff,risk_ratio
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.236062,5901,0.029308,0.889557
1,0.416813,1142,-0.151443,1.570686




partner


Unnamed: 0_level_0,mean,count,mean_diff,risk_ratio
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.32958,3641,-0.06421,1.241964
Yes,0.196649,3402,0.068721,0.741038




dependents


Unnamed: 0_level_0,mean,count,mean_diff,risk_ratio
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.312791,4933,-0.047422,1.1787
Yes,0.154502,2110,0.110868,0.582215




phoneservice


Unnamed: 0_level_0,mean,count,mean_diff,risk_ratio
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.249267,682,0.016103,0.939319
Yes,0.267096,6361,-0.001726,1.006506




multiplelines


Unnamed: 0_level_0,mean,count,mean_diff,risk_ratio
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.250442,3390,0.014927,0.943749
No phone service,0.249267,682,0.016103,0.939319
Yes,0.286099,2971,-0.020729,1.078114




internetservice


Unnamed: 0_level_0,mean,count,mean_diff,risk_ratio
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DSL,0.189591,2421,0.075779,0.714441
Fiber optic,0.418928,3096,-0.153558,1.578656
No,0.07405,1526,0.19132,0.279044




onlinesecurity


Unnamed: 0_level_0,mean,count,mean_diff,risk_ratio
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.417667,3498,-0.152297,1.573906
No internet service,0.07405,1526,0.19132,0.279044
Yes,0.146112,2019,0.119258,0.550597




onlinebackup


Unnamed: 0_level_0,mean,count,mean_diff,risk_ratio
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.399288,3088,-0.133918,1.504645
No internet service,0.07405,1526,0.19132,0.279044
Yes,0.215315,2429,0.050055,0.811377




deviceprotection


Unnamed: 0_level_0,mean,count,mean_diff,risk_ratio
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.391276,3095,-0.125906,1.474456
No internet service,0.07405,1526,0.19132,0.279044
Yes,0.225021,2422,0.040349,0.847951




techsupport


Unnamed: 0_level_0,mean,count,mean_diff,risk_ratio
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.416355,3473,-0.150985,1.56896
No internet service,0.07405,1526,0.19132,0.279044
Yes,0.151663,2044,0.113706,0.571517




streamingtv


Unnamed: 0_level_0,mean,count,mean_diff,risk_ratio
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.335231,2810,-0.069861,1.263261
No internet service,0.07405,1526,0.19132,0.279044
Yes,0.300702,2707,-0.035332,1.133143




streamingmovies


Unnamed: 0_level_0,mean,count,mean_diff,risk_ratio
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.336804,2785,-0.071434,1.269188
No internet service,0.07405,1526,0.19132,0.279044
Yes,0.299414,2732,-0.034044,1.128291




contract


Unnamed: 0_level_0,mean,count,mean_diff,risk_ratio
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Month-to-month,0.427097,3875,-0.161727,1.60944
One year,0.112695,1473,0.152675,0.424672
Two year,0.028319,1695,0.237051,0.106714




paperlessbilling


Unnamed: 0_level_0,mean,count,mean_diff,risk_ratio
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.163301,2872,0.102069,0.615371
Yes,0.335651,4171,-0.070281,1.264842




paymentmethod


Unnamed: 0_level_0,mean,count,mean_diff,risk_ratio
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bank transfer (automatic),0.167098,1544,0.098271,0.629681
Credit card (automatic),0.152431,1522,0.112939,0.57441
Electronic check,0.452854,2365,-0.187484,1.706502
Mailed check,0.191067,1612,0.074303,0.720003






### Feature Importance: Using Mutual Score. 

In [38]:
# Import metric package.
from sklearn.metrics import mutual_info_score

In [39]:
# Testing metric 
mutual_info_score(telco_data['churn'], telco_data['contract'])

0.09845305342598942

> With a value of 0.098 there seems to be a stronge relationship between the contract and churn, impling that the longer a contract the less likely the customer will churn. 

In [40]:
# Implimenting it to the whole dataset. 

# creating a function that takes only one argement. 
def mutual_score(series):
    return mutual_info_score(series, telco_data.churn)
    
# Deriving the mutual score. 
mi = telco_data[categorical].apply(mutual_score)

# Sorting the data in decending order. 
mi = mi.sort_values(ascending = False).to_frame(name = 'MI')

# Viewing values. 
mi.head(20)

Unnamed: 0,MI
contract,0.098453
onlinesecurity,0.064677
techsupport,0.063021
internetservice,0.055574
onlinebackup,0.046792
paymentmethod,0.044519
deviceprotection,0.043917
streamingmovies,0.032001
streamingtv,0.031908
paperlessbilling,0.019194


> The table above shoul the degree of relationship between each featur and the target variable from most significant to least significant. 

### Feature Importance: Correclation Co-efficent. 

In [41]:
# Viewing Numeric features. 
telco_data[numeric].head()

Unnamed: 0,tenure,monthlycharges,totalcharges
0,1,29.85,29.85
1,34,56.95,1889.5
2,2,53.85,108.15
3,45,42.3,1840.75
4,2,70.7,151.65


In [42]:
# Viewing the correlaction co-effient of data.
telco_data[numeric].corrwith(telco_data.churn)

tenure           -0.352229
monthlycharges    0.193356
totalcharges     -0.198324
dtype: float64

> This implies that there is a negative relationship between tenure and churn, connoting that the longer a customer uses our service the less likely they will churn. 

## One Hot Encoding using Scikit Learn. 

In [43]:
# Import scikit learn packages. 
from sklearn.feature_extraction import DictVectorizer

In [44]:
# Converting data to dict 
dict_train = telco_train.to_dict(orient='records')

# View outcome. 
dict_train

[{'gender': 'Female',
  'seniorcitizen': 0,
  'partner': 'Yes',
  'dependents': 'Yes',
  'tenure': 72,
  'phoneservice': 'Yes',
  'multiplelines': 'Yes',
  'internetservice': 'Fiber optic',
  'onlinesecurity': 'Yes',
  'onlinebackup': 'Yes',
  'deviceprotection': 'Yes',
  'techsupport': 'Yes',
  'streamingtv': 'Yes',
  'streamingmovies': 'Yes',
  'contract': 'Two year',
  'paperlessbilling': 'Yes',
  'paymentmethod': 'Electronic check',
  'monthlycharges': 115.5,
  'totalcharges': 8425.15},
 {'gender': 'Male',
  'seniorcitizen': 0,
  'partner': 'No',
  'dependents': 'No',
  'tenure': 10,
  'phoneservice': 'Yes',
  'multiplelines': 'Yes',
  'internetservice': 'Fiber optic',
  'onlinesecurity': 'No',
  'onlinebackup': 'Yes',
  'deviceprotection': 'Yes',
  'techsupport': 'No',
  'streamingtv': 'No',
  'streamingmovies': 'Yes',
  'contract': 'Month-to-month',
  'paperlessbilling': 'Yes',
  'paymentmethod': 'Electronic check',
  'monthlycharges': 95.25,
  'totalcharges': 1021.55},
 {'gender

In [45]:
# Creating an instance of DictVectorizer.
dv = DictVectorizer(sparse = False)


In [46]:
# Fitting the instance with the dict_train data. 
dv.fit(dict_train)

In [47]:
# Tranform the data 
x_train = dv.transform(dict_train)

In [48]:
# Getting feature names. 
dv.get_feature_names_out()

array(['contract=Month-to-month', 'contract=One year',
       'contract=Two year', 'dependents=No', 'dependents=Yes',
       'deviceprotection=No', 'deviceprotection=No internet service',
       'deviceprotection=Yes', 'gender=Female', 'gender=Male',
       'internetservice=DSL', 'internetservice=Fiber optic',
       'internetservice=No', 'monthlycharges', 'multiplelines=No',
       'multiplelines=No phone service', 'multiplelines=Yes',
       'onlinebackup=No', 'onlinebackup=No internet service',
       'onlinebackup=Yes', 'onlinesecurity=No',
       'onlinesecurity=No internet service', 'onlinesecurity=Yes',
       'paperlessbilling=No', 'paperlessbilling=Yes', 'partner=No',
       'partner=Yes', 'paymentmethod=Bank transfer (automatic)',
       'paymentmethod=Credit card (automatic)',
       'paymentmethod=Electronic check', 'paymentmethod=Mailed check',
       'phoneservice=No', 'phoneservice=Yes', 'seniorcitizen',
       'streamingmovies=No', 'streamingmovies=No internet service',

In [49]:
# Convert validataion data to dict. 
dict_val = telco_val.to_dict(orient = 'records')

In [50]:
# Tranform the validation dataset.
x_val = dv.transform(dict_val)

In [56]:
x_val

array([[0.0000e+00, 0.0000e+00, 1.0000e+00, ..., 1.0000e+00, 7.1000e+01,
        4.9734e+03],
       [1.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        2.0750e+01],
       [1.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        2.0350e+01],
       ...,
       [1.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0000e+00, 1.8000e+01,
        1.0581e+03],
       [1.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        9.3300e+01],
       [1.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 3.0000e+00,
        2.9285e+02]])

## Training Logistic Regression Model with Scikit Learn. 

In [51]:
# Importing regression package.
from sklearn.linear_model import LogisticRegression

In [52]:
# Creating an Instance of the model.
model = LogisticRegression()

# Train model with training dataset. 
model.fit(x_train, telco_ytrain)

In [58]:
# Bias weight. 
model.intercept_[0]

-0.10904168456915862

In [None]:
# Weight Values. 
model.

In [53]:
# Hard Predict with train dataset. 
model.predict(x_train)

array([0, 1, 1, ..., 1, 0, 1])

In [55]:
# Soft Predicting with validation dataset. 
model.predict_proba(x_val,telco_yval)


TypeError: predict_proba() takes 2 positional arguments but 3 were given