# Chapter 3 - Machine Learning for Classification. 

## Project: Telco Customer Churn Predictor. 

In [1]:
# Importing Python Packages. 

# Data Manipulation. 
import pandas as pd 
import numpy as np

# Data Visualization Packages. 
import seaborn as sns 
import matplotlib.pyplot as plt

# Data Gathering Package. 
import wget 

# Other
%matplotlib inline

## Data Gathering. 

In [2]:
# Loading dataset. 
telco_data = pd.read_csv('data/telco_customer_churn.csv')

# Viewing dataset.
telco_data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Viewing the shape of the dataset. 
print(f'There are {telco_data.shape[0]} rows and {telco_data.shape[1]} columns')

There are 7043 rows and 21 columns


In [4]:
# List of data columns 
telco_data.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [5]:
# Data structure. 
telco_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [6]:
# Checking for Null value.
telco_data.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [7]:
# Removing upper case from column name.
column = telco_data.columns.str.lower()

# Replacing columns name with new column name.
telco_data.columns = column

In [8]:
# Checking column names
telco_data.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [9]:
# Checking each feature and their unique values. 
column_info = {}
for col in column: 
    column_info[col] = [telco_data[col].nunique(), telco_data[col].unique()[0:5]]
    print("Columns name:", col, '\n')
    print(f"Unique values:  {telco_data[col].unique()[0:5]}\n")
    print(f"Number of unique values:  {telco_data[col].nunique()}\n\n")

Columns name: customerid 

Unique values:  ['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' '7795-CFOCW' '9237-HQITU']

Number of unique values:  7043


Columns name: gender 

Unique values:  ['Female' 'Male']

Number of unique values:  2


Columns name: seniorcitizen 

Unique values:  [0 1]

Number of unique values:  2


Columns name: partner 

Unique values:  ['Yes' 'No']

Number of unique values:  2


Columns name: dependents 

Unique values:  ['No' 'Yes']

Number of unique values:  2


Columns name: tenure 

Unique values:  [ 1 34  2 45  8]

Number of unique values:  73


Columns name: phoneservice 

Unique values:  ['No' 'Yes']

Number of unique values:  2


Columns name: multiplelines 

Unique values:  ['No phone service' 'No' 'Yes']

Number of unique values:  3


Columns name: internetservice 

Unique values:  ['DSL' 'Fiber optic' 'No']

Number of unique values:  3


Columns name: onlinesecurity 

Unique values:  ['No' 'Yes' 'No internet service']

Number of unique values:  3


Columns n

In [10]:
# Checking dict 
column_info

{'customerid': [7043,
  array(['7590-VHVEG', '5575-GNVDE', '3668-QPYBK', '7795-CFOCW',
         '9237-HQITU'], dtype=object)],
 'gender': [2, array(['Female', 'Male'], dtype=object)],
 'seniorcitizen': [2, array([0, 1], dtype=int64)],
 'partner': [2, array(['Yes', 'No'], dtype=object)],
 'dependents': [2, array(['No', 'Yes'], dtype=object)],
 'tenure': [73, array([ 1, 34,  2, 45,  8], dtype=int64)],
 'phoneservice': [2, array(['No', 'Yes'], dtype=object)],
 'multiplelines': [3, array(['No phone service', 'No', 'Yes'], dtype=object)],
 'internetservice': [3, array(['DSL', 'Fiber optic', 'No'], dtype=object)],
 'onlinesecurity': [3,
  array(['No', 'Yes', 'No internet service'], dtype=object)],
 'onlinebackup': [3,
  array(['Yes', 'No', 'No internet service'], dtype=object)],
 'deviceprotection': [3,
  array(['No', 'Yes', 'No internet service'], dtype=object)],
 'techsupport': [3, array(['No', 'Yes', 'No internet service'], dtype=object)],
 'streamingtv': [3, array(['No', 'Yes', 'No inter

In [11]:
# Checking on our target feature. 
telco_data.churn.head()

0     No
1     No
2    Yes
3     No
4    Yes
Name: churn, dtype: object

In [12]:
# converting churn to int.
telco_data['churn'] = pd.to_numeric(telco_data['churn'] == 'Yes').astype(int)

# Checking Feature. 
telco_data['churn'].unique()

array([0, 1])

In [13]:
# Converting strings with binary value to int. 
for col in columns: 
    if telco_data[col].nunique() == 2: 
        telco_data[col].

SyntaxError: invalid syntax (600084747.py, line 4)

## Data Validataion. 

In [None]:
# import sklearn data validation package. 
from sklearn.model_selection import train_test_split