In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic

# Validating the data

In [3]:
# Read the Telco customer-churn CSV (path is relative to the notebook's
# working directory) and preview the first five rows.
data = pd.read_csv(filepath_or_buffer="WA_Fn-UseC_-Telco-Customer-Churn.csv")
data.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Checking Total Count and types of Values

In [4]:
# Print the row count plus each column's non-null count and dtype,
# to see which columns were read as text (object) and need encoding.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


### Getting the Detailed Information about the Numeric data

In [5]:
# Summary statistics for the numeric columns only. TotalCharges does not
# appear in this summary because it was read as strings (object dtype),
# not as a number.
data.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


#### Knowing the Column Names

In [6]:
# Show the column labels, to get the exact spelling used when indexing below.
data.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

### Checking for the Null Values

In [7]:
# Count missing (NaN/None) entries per column.
# NOTE(review): this only detects real nulls; placeholder strings such as
# blanks in an object column would not be counted here - TODO confirm
# for TotalCharges, which is stored as text.
data.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

### Converting the Non Numeric Data to Numeric data

In [8]:
# Preview the raw categorical values (Yes/No, Male/Female, ...) before
# they are replaced with integer codes in the cells below.
data.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Replacing values of type object with integers

In [9]:
# Encode the prediction target: 1 indicates Yes (churned), 0 indicates No.
#
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on data['Churn']: that chained in-place call
# raises a FutureWarning in pandas 2.2 and never updates `data` under
# Copy-on-Write (the default from pandas 3.0).
# astype('int64') pins the dtype rather than relying on replace() to
# downcast the object column; it also keeps the cell safe to re-run.
data['Churn'] = data['Churn'].replace({'Yes': 1, 'No': 0}).astype('int64')

In [10]:
# Encode gender: 1 indicates Male, 0 indicates Female.
#
# Assign the result back rather than using replace(..., inplace=True) on
# data['gender']; the chained in-place form warns in pandas 2.2 and has no
# effect on `data` under Copy-on-Write (pandas 3.0 default).
# astype('int64') fixes the resulting dtype explicitly.
data['gender'] = data['gender'].replace({'Male': 1, 'Female': 0}).astype('int64')

In [11]:
# Encode Partner: 1 indicates Yes, 0 indicates No.
#
# Assign the result back rather than using replace(..., inplace=True) on
# data['Partner']; the chained in-place form warns in pandas 2.2 and has no
# effect on `data` under Copy-on-Write (pandas 3.0 default).
# astype('int64') fixes the resulting dtype explicitly.
data['Partner'] = data['Partner'].replace({'Yes': 1, 'No': 0}).astype('int64')

In [12]:
# Encode Dependents: 1 indicates Yes, 0 indicates No.
#
# Assign the result back rather than using replace(..., inplace=True) on
# data['Dependents']; the chained in-place form warns in pandas 2.2 and has
# no effect on `data` under Copy-on-Write (pandas 3.0 default).
# astype('int64') fixes the resulting dtype explicitly.
data['Dependents'] = data['Dependents'].replace({'Yes': 1, 'No': 0}).astype('int64')

In [13]:
# Encode PhoneService: 1 indicates Yes, 0 indicates No.
#
# Assign the result back rather than using replace(..., inplace=True) on
# data['PhoneService']; the chained in-place form warns in pandas 2.2 and
# has no effect on `data` under Copy-on-Write (pandas 3.0 default).
# astype('int64') fixes the resulting dtype explicitly.
data['PhoneService'] = data['PhoneService'].replace({'Yes': 1, 'No': 0}).astype('int64')

In [14]:
# Encode MultipleLines: 1 indicates Yes; both 'No' and 'No phone service'
# are mapped to 0.
#
# Assign the result back rather than using replace(..., inplace=True) on
# data['MultipleLines']; the chained in-place form warns in pandas 2.2 and
# has no effect on `data` under Copy-on-Write (pandas 3.0 default).
# astype('int64') fixes the resulting dtype explicitly.
multiple_lines_codes = {'Yes': 1, 'No': 0, 'No phone service': 0}
data['MultipleLines'] = data['MultipleLines'].replace(multiple_lines_codes).astype('int64')

In [15]:
# Encode InternetService as a has-internet flag: 1 indicates DSL or
# Fiber optic, 0 indicates No.
# NOTE(review): this deliberately collapses DSL and Fiber optic into one
# code, so the type of connection is no longer recoverable from the column.
#
# Assign the result back rather than using replace(..., inplace=True) on
# data['InternetService']; the chained in-place form warns in pandas 2.2
# and has no effect on `data` under Copy-on-Write (pandas 3.0 default).
# astype('int64') fixes the resulting dtype explicitly.
internet_service_codes = {'DSL': 1, 'Fiber optic': 1, 'No': 0}
data['InternetService'] = data['InternetService'].replace(internet_service_codes).astype('int64')

In [31]:
# Encode OnlineSecurity: 1 indicates Yes; both 'No' and
# 'No internet service' are mapped to 0.
#
# Assign the result back rather than using replace(..., inplace=True) on
# data['OnlineSecurity']; the chained in-place form warns in pandas 2.2 and
# has no effect on `data` under Copy-on-Write (pandas 3.0 default).
# astype('int64') fixes the resulting dtype explicitly.
online_security_codes = {'Yes': 1, 'No': 0, 'No internet service': 0}
data['OnlineSecurity'] = data['OnlineSecurity'].replace(online_security_codes).astype('int64')


In [40]:
# Encode DeviceProtection: 1 indicates Yes; both 'No' and
# 'No internet service' are mapped to 0.
#
# Assign the result back rather than using replace(..., inplace=True) on
# data['DeviceProtection']; the chained in-place form warns in pandas 2.2
# and has no effect on `data` under Copy-on-Write (pandas 3.0 default).
# astype('int64') fixes the resulting dtype explicitly.
device_protection_codes = {'Yes': 1, 'No': 0, 'No internet service': 0}
data['DeviceProtection'] = data['DeviceProtection'].replace(device_protection_codes).astype('int64')


In [44]:
# Encode TechSupport: 1 indicates Yes; both 'No' and
# 'No internet service' are mapped to 0.
#
# Assign the result back rather than using replace(..., inplace=True) on
# data['TechSupport']; the chained in-place form warns in pandas 2.2 and
# has no effect on `data` under Copy-on-Write (pandas 3.0 default).
# astype('int64') fixes the resulting dtype explicitly.
tech_support_codes = {'Yes': 1, 'No': 0, 'No internet service': 0}
data['TechSupport'] = data['TechSupport'].replace(tech_support_codes).astype('int64')

In [47]:
# Encode StreamingTV: 1 indicates Yes; both 'No' and
# 'No internet service' are mapped to 0.
#
# Assign the result back rather than using replace(..., inplace=True) on
# data['StreamingTV']; the chained in-place form warns in pandas 2.2 and
# has no effect on `data` under Copy-on-Write (pandas 3.0 default).
# astype('int64') fixes the resulting dtype explicitly.
streaming_tv_codes = {'Yes': 1, 'No': 0, 'No internet service': 0}
data['StreamingTV'] = data['StreamingTV'].replace(streaming_tv_codes).astype('int64')

In [48]:
# Encode StreamingMovies: 1 indicates Yes; both 'No' and
# 'No internet service' are mapped to 0.
#
# Assign the result back rather than using replace(..., inplace=True) on
# data['StreamingMovies']; the chained in-place form warns in pandas 2.2
# and has no effect on `data` under Copy-on-Write (pandas 3.0 default).
# astype('int64') fixes the resulting dtype explicitly.
streaming_movies_codes = {'Yes': 1, 'No': 0, 'No internet service': 0}
data['StreamingMovies'] = data['StreamingMovies'].replace(streaming_movies_codes).astype('int64')

In [22]:
# Encode PaperlessBilling: 1 indicates Yes, 0 indicates No.
#
# Assign the result back rather than using replace(..., inplace=True) on
# data['PaperlessBilling']; the chained in-place form warns in pandas 2.2
# and has no effect on `data` under Copy-on-Write (pandas 3.0 default).
# astype('int64') fixes the resulting dtype explicitly.
data['PaperlessBilling'] = data['PaperlessBilling'].replace({'Yes': 1, 'No': 0}).astype('int64')

In [23]:
# Encode PaymentMethod into two groups: 1 indicates a check payment
# (Electronic check, Mailed check), 2 indicates an automatic payment
# (Bank transfer, Credit card).
# NOTE(review): the four original methods are merged into two codes, so the
# individual method is no longer recoverable from the column.
#
# Assign the result back rather than using replace(..., inplace=True) on
# data['PaymentMethod']; the chained in-place form warns in pandas 2.2 and
# has no effect on `data` under Copy-on-Write (pandas 3.0 default).
# astype('int64') fixes the resulting dtype explicitly.
payment_method_codes = {
    'Electronic check': 1,
    'Mailed check': 1,
    'Bank transfer (automatic)': 2,
    'Credit card (automatic)': 2,
}
data['PaymentMethod'] = data['PaymentMethod'].replace(payment_method_codes).astype('int64')

In [24]:
# Encode Contract by length: 1 indicates Month-to-month, 2 indicates
# One year, 3 indicates Two year.
#
# Assign the result back rather than using replace(..., inplace=True) on
# data['Contract']; the chained in-place form warns in pandas 2.2 and has
# no effect on `data` under Copy-on-Write (pandas 3.0 default).
# astype('int64') fixes the resulting dtype explicitly.
contract_codes = {'Month-to-month': 1, 'One year': 2, 'Two year': 3}
data['Contract'] = data['Contract'].replace(contract_codes).astype('int64')

In [32]:
# Preview the first ten rows to eyeball the integer codes produced by the
# replacement cells above.
data.head(10)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,0,0,1,0,1,0,0,1,0,...,0,0,0,0,1,1,1,29.85,29.85,0
1,5575-GNVDE,1,0,0,0,34,1,0,1,1,...,1,0,0,0,2,0,1,56.95,1889.5,0
2,3668-QPYBK,1,0,0,0,2,1,0,1,1,...,0,0,0,0,1,1,1,53.85,108.15,1
3,7795-CFOCW,1,0,0,0,45,0,0,1,1,...,1,1,0,0,2,0,2,42.3,1840.75,0
4,9237-HQITU,0,0,0,0,2,1,0,1,0,...,0,0,0,0,1,1,1,70.7,151.65,1
5,9305-CDSKC,0,0,0,0,8,1,1,1,0,...,1,0,1,1,1,1,1,99.65,820.5,1
6,1452-KIOVK,1,0,0,1,22,1,1,1,0,...,0,0,1,0,1,1,2,89.1,1949.4,0
7,6713-OKOMC,0,0,0,0,10,0,0,1,1,...,0,0,0,0,1,0,1,29.75,301.9,0
8,7892-POOKP,0,0,1,0,28,1,1,1,0,...,1,1,1,1,1,1,1,104.8,3046.05,1
9,6388-TABGU,1,0,0,1,62,1,0,1,1,...,0,0,0,0,2,0,2,56.15,3487.95,0


In [49]:
# Re-check column dtypes after encoding; the replaced columns should now
# report an integer dtype instead of object.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   int64  
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   int64  
 4   Dependents        7043 non-null   int64  
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   int64  
 7   MultipleLines     7043 non-null   int64  
 8   InternetService   7043 non-null   int64  
 9   OnlineSecurity    7043 non-null   int64  
 10  OnlineBackup      7043 non-null   int64  
 11  DeviceProtection  7043 non-null   int64  
 12  TechSupport       7043 non-null   int64  
 13  StreamingTV       7043 non-null   int64  
 14  StreamingMovies   7043 non-null   int64  
 15  Contract          7043 non-null   int64  
 16  PaperlessBilling  7043 non-null   int64  


In [36]:
# Encode OnlineBackup: 1 indicates Yes; both 'No' and
# 'No internet service' are mapped to 0.
#
# NOTE(review): this cell is placed after the data.info() check whose saved
# output already lists OnlineBackup as int64, so it was executed out of
# order. Move it up next to the other encoding cells so that
# Restart Kernel -> Run All reproduces that output.
#
# Assign the result back rather than using replace(..., inplace=True) on
# data['OnlineBackup']; the chained in-place form warns in pandas 2.2 and
# has no effect on `data` under Copy-on-Write (pandas 3.0 default).
# astype('int64') fixes the resulting dtype explicitly.
online_backup_codes = {'Yes': 1, 'No': 0, 'No internet service': 0}
data['OnlineBackup'] = data['OnlineBackup'].replace(online_backup_codes).astype('int64')

In [60]:
# List the distinct TotalCharges values in order of first appearance.
# They are strings rather than floats, so this column still needs a
# numeric conversion before it can be used as a number.
data['TotalCharges'].drop_duplicates().tolist()

['29.85',
 '1889.5',
 '108.15',
 '1840.75',
 '151.65',
 '820.5',
 '1949.4',
 '301.9',
 '3046.05',
 '3487.95',
 '587.45',
 '326.8',
 '5681.1',
 '5036.3',
 '2686.05',
 '7895.15',
 '1022.95',
 '7382.25',
 '528.35',
 '1862.9',
 '39.65',
 '202.25',
 '20.15',
 '3505.1',
 '2970.3',
 '1530.6',
 '4749.15',
 '30.2',
 '6369.45',
 '1093.1',
 '6766.95',
 '181.65',
 '1874.45',
 '20.2',
 '45.25',
 '7251.7',
 '316.9',
 '3548.3',
 '3549.25',
 '1105.4',
 '475.7',
 '4872.35',
 '418.25',
 '4861.45',
 '981.45',
 '3906.7',
 '97',
 '144.15',
 '4217.8',
 '4254.1',
 '3838.75',
 '1426.4',
 '1752.65',
 '633.3',
 '4456.35',
 '1752.55',
 '6311.2',
 '7076.35',
 '894.3',
 '7853.7',
 '4707.1',
 '5450.7',
 '2962',
 '957.1',
 '857.25',
 '244.1',
 '3650.35',
 '2497.2',
 '930.9',
 '887.35',
 '49.05',
 '1090.65',
 '7099',
 '1424.6',
 '177.4',
 '6139.5',
 '2688.85',
 '482.25',
 '2111.3',
 '1216.6',
 '79.35',
 '565.35',
 '496.9',
 '4327.5',
 '973.35',
 '918.75',
 '2215.45',
 '1057',
 '927.1',
 '1009.25',
 '2570.2',
 '74.7',