# Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd

In [279]:
# Load the Telco customer-churn data from a CSV path relative to the working directory.
csv_path = 'dataset/Telco-Customer-Churn.csv'
dataset = pd.read_csv(csv_path)

## Description

In [280]:
# Preview the first five rows to see the columns and the kind of values they hold.
dataset.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Checking for Duplicate Values

In [None]:
#Count the rows that are exact duplicates of an earlier row (0 means there are none)
duplicate_rows = dataset.duplicated()
duplicate_rows.sum()

### Checking for Missing Values

In [None]:
# TotalCharges holds text; an entry that is empty once whitespace is stripped
# is a missing charge. `mask` is True for rows that contain a real value and
# False for the blank ones.
mask = dataset['TotalCharges'].str.strip().astype(bool)

# `~` is the boolean-inversion operator for a mask (unary minus is arithmetic
# negation and is rejected by NumPy for boolean arrays).
print('Empty Strings', dataset[~mask])

In [None]:
#Swapping the empty strings (rows where mask is False) for a parseable float value
dataset.loc[~mask, 'TotalCharges'] = '0.00'

# Features are every column except the first and the last; the target is the
# last column. Both are taken as plain NumPy arrays.
X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1].values


### Converting the relevant string values to float values

In [None]:
#The last column of our input features is TotalCharges, and it is still in string format.
#Therefore we parse that column into floats and write it back into X.
str_column = X[:, -1]

#convert the string column to float
float_column = str_column.astype(np.float64)

X[:, -1] = float_column

#Spot-check a single row after the conversion
#(presumably one whose TotalCharges was blank - TODO confirm)
print(X[488])

### Encoding categorical independent data

In [None]:
# Show the first feature row before any encoding is applied.
print(X[0, :])

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

print(X[0])

#Positions (within X) of the categorical columns that get one-hot encoded;
#every other column is passed through unchanged and placed after the encoded ones.
categorical_columns = [0,2,3,5,6,7,8,9,10,11,12,13,14,15,16]

#Encoding categorical columns to meaningful binary values.
#sparse_threshold=0 forces a dense result: if the transformer returned a SciPy
#sparse matrix, np.array(...) would wrap it in a 0-d object array instead of
#producing a 2-D feature matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_columns)],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

In [None]:
# Show the first feature row after one-hot encoding.
print(X[0, :])

### Encoding categorical dependent data

In [273]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct target labels, then replace each label with its integer code.
# The fitted encoder stays available as `le`.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [274]:
# Show the target vector after label encoding (integer class codes).
print(y)

[0 0 1 ... 0 1 0]


### Normalization

In [270]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the full X before the train/test split,
# so test-set statistics influence the scaling. Consider splitting first and
# fitting the scaler on the training portion only.
sc = StandardScaler()
sc.fit(X)

X = sc.transform(X)

In [None]:
# Show the first feature row after scaling.
print(X[0, :])

### Splitting the dataset into training and testing sets

In [275]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
TEST_SIZE = 0.3
RANDOM_STATE = 1

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)
