## We start by importing the necessary libraries for the cleaning

In [2]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline

## Load the data and give an overview of the dataset

In [5]:
# Read the Telco customer-churn CSV (path relative to the notebook) into a dataframe
df = pd.read_csv(filepath_or_buffer="WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [6]:
# Display 10 randomly chosen rows to get a feel for the data
df.sample(n=10)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
4577,0471-ARVMX,Female,1,Yes,No,62,Yes,Yes,Fiber optic,Yes,...,Yes,No,Yes,Yes,One year,Yes,Electronic check,104.85,6312.9,No
3183,7890-VYYWG,Male,1,Yes,No,3,No,No phone service,DSL,No,...,No,No,Yes,No,Month-to-month,Yes,Mailed check,36.45,93.7,Yes
2348,2683-JXWQQ,Male,0,Yes,Yes,61,No,No phone service,DSL,Yes,...,Yes,No,No,No,Month-to-month,No,Bank transfer (automatic),33.6,2117.2,No
4732,1052-QJIBV,Female,0,Yes,Yes,71,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),19.9,1397.3,No
3292,7284-BUYEC,Female,0,No,No,5,No,No phone service,DSL,No,...,No,No,Yes,Yes,Month-to-month,Yes,Credit card (automatic),50.95,229.4,No
2497,5702-SKUOB,Female,0,Yes,No,4,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Mailed check,19.6,93.45,No
3356,2235-ZGKPT,Female,0,Yes,Yes,4,Yes,Yes,DSL,No,...,No,No,No,No,Month-to-month,No,Mailed check,50.85,239.55,Yes
2344,9796-MVYXX,Female,1,No,No,14,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,Two year,No,Mailed check,39.7,692.35,No
236,0621-JFHOL,Female,0,No,No,10,No,No phone service,DSL,No,...,No,Yes,No,No,Two year,Yes,Mailed check,29.6,299.05,No
4891,8404-GFGCZ,Male,0,Yes,No,4,Yes,Yes,DSL,No,...,Yes,No,Yes,No,Month-to-month,No,Electronic check,65.6,250.1,No


In [4]:
# Dimensions of the dataframe as (number of rows, number of columns)
df.shape

(7043, 21)

In [7]:
# The column names of the dataframe
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [8]:
# Summary of each column: non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


### The 'TotalCharges' column is of an object type instead of float64. This needs to be fixed

In [9]:
# Demonstration: a plain cast of 'TotalCharges' to float64 is expected to fail
# because some entries are blank strings rather than numbers.
# The error is caught and printed (instead of left uncaught) so that
# "Restart Kernel -> Run All" can continue past this cell.
try:
    df['TotalCharges'] = df['TotalCharges'].astype(np.float64)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: could not convert string to float: ''

### We need to replace the blank ' ' strings with NaN so that Pandas recognizes them as missing values

In [22]:
# Solution adapted from Stack Overflow:
# https://stackoverflow.com/questions/13445241/replacing-blank-values-white-space-with-nan-in-pandas
# Replace empty or whitespace-only strings with NaN. The pattern ^\s*$ also
# matches the empty string '', which str.isspace() does not, and the
# replacement is vectorized rather than calling a Python lambda per row.
df['TotalCharges'] = df['TotalCharges'].replace(r'^\s*$', np.nan, regex=True)

In [25]:
# With the blanks turned into NaN, the column can now be cast to 64-bit floats
df['TotalCharges'] = df['TotalCharges'].astype('float64')

In [26]:
# Re-inspect the column summary after the 'TotalCharges' conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


### Now that we fixed the "TotalCharges" column let's check the null values

In [28]:
# Count the missing values in each column
df.isna().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

### The number of null values is really minimal with regard to the size of the dataframe
### this allows us to drop records with null values without affecting the overall dataframe

In [32]:
# Drop every row that contains at least one missing value
df = df.dropna(axis=0, how='any')

In [33]:
# Confirm the row count and non-null counts after dropping rows with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   object 
 4   Dependents        7032 non-null   object 
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   object 
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   object 
 9   OnlineSecurity    7032 non-null   object 
 10  OnlineBackup      7032 non-null   object 
 11  DeviceProtection  7032 non-null   object 
 12  TechSupport       7032 non-null   object 
 13  StreamingTV       7032 non-null   object 
 14  StreamingMovies   7032 non-null   object 
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 


### Now the data is clean and ready for further analysis