# Classification

## Data Preparation

- Download the data, read it with pandas
- Look at the data
- Make column names and values look uniform
- Check if all columns read correctly
- Check if the churn variable needs any preparation

In [41]:
# import libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [3]:
# Enable viewing all columns regardless how many
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [4]:
# load the dataset
dataset = pd.read_csv("Telco-Customer-Churn.csv")
df = pd.DataFrame(dataset)
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [42]:
# make all column names and values uniform in lowercase letters and replace spaces with underscores
df.columns = df.columns.str.lower().str.replace(" ", "_")

categorical_columns = list(df.dtypes[df.dtypes == "object"].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(" ", "_")

In [43]:
df.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,yes,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,0
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,no,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,0
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,yes,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,0
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,no,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,0
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,no,no,no,no,no,month-to-month,yes,electronic_check,70.7,151.65,0


In [44]:
# the totalcharges column contains numerical values but there are some features that makes it seem to have objects.
# when we try to convert those values to numericals they raise an error, that error is overriden by use of "coerce" 
tc = pd.to_numeric(df.totalcharges, errors='coerce')

In [45]:
# converting the totalcharges column to numeric values
df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce')

In [46]:
# filling the null values in totalcharges column with zeros
df.totalcharges = df.totalcharges.fillna(0)

In [50]:
# check the churn variable
df.churn.head()

0    0
1    0
2    0
3    0
4    0
Name: churn, dtype: int32

In [51]:
# churn values are categorical (yes, no), convert them to numericals
df.churn = (df.churn == "yes").astype(int)

## Setting up the Validation Framework

In [52]:
# import train_test_split algorithm
from sklearn.model_selection import train_test_split

In [53]:
# spit the dataset into full train and test
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [54]:
# check the lengths of both full_train and test sets
len(df_full_train), len(df_test)

(5634, 1409)

In [55]:
# now split full_train into train and validation sets
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [56]:
# check the lengths of train, validation and test sets
len(df_train), len(df_val), len(df_test)

(4225, 1409, 1409)

In [57]:
# reset index of all datasets
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [58]:
# create target variables from all datasets
y_train = df_train.churn.values
y_val = df_val.churn.values
y_test = df_test.churn.values

In [59]:
# delete the target variable from all sets so that it is not accidentally used as X variable
del df_train["churn"]
del df_val["churn"]
del df_test["churn"]