## We start by importing the necessary libraries for the cleaning

In [18]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline

## Load the data and give an overview of the dataset

In [25]:
# Read the raw Telco customer-churn CSV (path is relative to the notebook's
# working directory) into the dataframe used by every cell below.
df = pd.read_csv(filepath_or_buffer="WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [20]:
# View a random sample of the dataset.
# A fixed random_state makes the sampled rows identical on every run, so the
# table displayed below stays reproducible after Restart Kernel -> Run All.
df.sample(10, random_state=42)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
6557,8709-KRDVL,Female,0,No,No,33,Yes,Yes,Fiber optic,No,...,No,No,Yes,Yes,Month-to-month,Yes,Electronic check,100.0,3320.6,No
4578,7766-CLTIC,Female,0,No,No,10,Yes,No,DSL,No,...,No,Yes,No,Yes,Month-to-month,Yes,Mailed check,60.95,629.55,No
3872,2018-PZKMU,Male,0,Yes,Yes,9,Yes,No,Fiber optic,No,...,Yes,Yes,Yes,Yes,Month-to-month,No,Electronic check,103.1,970.45,No
541,2782-LFZVW,Female,0,No,No,11,Yes,Yes,DSL,No,...,No,No,No,Yes,Month-to-month,Yes,Mailed check,58.95,601.6,No
870,1937-OTUKY,Female,0,Yes,No,72,Yes,Yes,Fiber optic,Yes,...,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),98.2,7015.9,No
6145,5010-IPEAQ,Female,0,Yes,Yes,67,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),54.2,3623.95,No
4404,1307-ATKGB,Male,0,No,No,24,Yes,No,Fiber optic,No,...,Yes,No,No,Yes,Month-to-month,Yes,Electronic check,89.55,2187.15,No
6199,2856-NNASM,Male,1,No,No,43,Yes,No,Fiber optic,No,...,No,No,Yes,Yes,Month-to-month,Yes,Mailed check,89.55,3856.75,Yes
40,8865-TNMNX,Male,0,Yes,Yes,10,Yes,No,DSL,No,...,No,No,No,No,One year,No,Mailed check,49.55,475.7,No
169,1159-WFSGR,Female,0,Yes,Yes,16,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Electronic check,20.4,321.4,No


In [21]:
# Dimensions of the dataframe as a (rows, columns) tuple
df.shape

(7043, 21)

In [22]:
# The column names of the dataframe
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [23]:
# Summary of the raw dataframe: index range, non-null count and dtype per column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


### The 'TotalCharges' column is of an object type instead of float64. This needs to be fixed

In [24]:
# Demonstrate the problem: a direct cast to float fails because some entries
# of 'TotalCharges' are strings that cannot be parsed as numbers.
# The error is caught and printed so that the notebook still runs from top to
# bottom; the column is left unchanged when the cast fails.
try:
    df['TotalCharges'] = df['TotalCharges'].astype(np.float64)
except ValueError as exc:
    print(f"ValueError: {exc}")

ValueError: could not convert string to float: 

### We need to replace the blank strings (' ') with NaN so that pandas recognizes them as missing values

In [26]:
# Replace blank entries of 'TotalCharges' with NaN so pandas treats them as missing.
# Approach adapted from Stack Overflow:
# https://stackoverflow.com/questions/13445241/replacing-blank-values-white-space-with-nan-in-pandas
# The check is vectorised (no per-row Python lambda) and also catches empty
# strings, which str.isspace() does not, so the later float cast cannot trip
# over them. Non-blank values are passed through untouched.
df['TotalCharges'] = df['TotalCharges'].mask(
    df['TotalCharges'].astype(str).str.strip().eq(''),
    np.nan,
)

In [27]:
# With the blanks replaced by NaN, the column can now be cast to 64-bit floats.
df['TotalCharges'] = df['TotalCharges'].astype('float64')

In [None]:
# Re-inspect the dataframe summary after converting 'TotalCharges' to float
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


### Now that we fixed the "TotalCharges" column let's check the null values

In [None]:
# Number of missing values in each column
df.isna().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

### The number of null values is minimal relative to the size of the dataframe,
### which allows us to drop the records with null values without affecting the overall dataframe

In [28]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

### Let's drop the customerID column since it's irrelevant

In [29]:
# Remove the identifier column; it is not used in the analysis.
df = df.drop(columns=['customerID'])

In [30]:
# Final summary of the cleaned dataframe: row count, remaining columns and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7032 non-null   object 
 1   SeniorCitizen     7032 non-null   int64  
 2   Partner           7032 non-null   object 
 3   Dependents        7032 non-null   object 
 4   tenure            7032 non-null   int64  
 5   PhoneService      7032 non-null   object 
 6   MultipleLines     7032 non-null   object 
 7   InternetService   7032 non-null   object 
 8   OnlineSecurity    7032 non-null   object 
 9   OnlineBackup      7032 non-null   object 
 10  DeviceProtection  7032 non-null   object 
 11  TechSupport       7032 non-null   object 
 12  StreamingTV       7032 non-null   object 
 13  StreamingMovies   7032 non-null   object 
 14  Contract          7032 non-null   object 
 15  PaperlessBilling  7032 non-null   object 
 16  PaymentMethod     7032 non-null   object 


### Now the data is clean and ready for further analysis