## Data Loading

In [1]:
import numpy as np
import pandas as pd
# Load the raw customer table; read_csv already defaults to a comma delimiter.
churn = pd.read_csv('dataset.csv')

## Data Analysis

In [2]:
# Record the frame's shape as a (rows, columns) tuple and show it.
data_size = churn.shape
print(data_size)

(3333, 21)


In [3]:
# Materialise the column labels as a plain Python list for later reference.
churn_col_names = churn.columns.tolist()
print(churn_col_names)

['state', 'account length', 'area code', 'phone number', 'international plan', 'voice mail plan', 'number vmail messages', 'total day minutes', 'total day calls', 'total day charge', 'total eve minutes', 'total eve calls', 'total eve charge', 'total night minutes', 'total night calls', 'total night charge', 'total intl minutes', 'total intl calls', 'total intl charge', 'customer service calls', 'churn']


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
print(churn.describe())

       account length    area code  number vmail messages  total day minutes  \
count     3333.000000  3333.000000            3333.000000        3333.000000   
mean       101.064806   437.182418               8.099010         179.775098   
std         39.822106    42.371290              13.688365          54.467389   
min          1.000000   408.000000               0.000000           0.000000   
25%         74.000000   408.000000               0.000000         143.700000   
50%        101.000000   415.000000               0.000000         179.400000   
75%        127.000000   510.000000              20.000000         216.400000   
max        243.000000   510.000000              51.000000         350.800000   

       total day calls  total day charge  total eve minutes  total eve calls  \
count      3333.000000       3333.000000        3333.000000      3333.000000   
mean        100.435644         30.562307         200.980348       100.114311   
std          20.069084          9.25943

In [5]:
# Preview the first five rows of the raw data.
print(churn.head())

  state  account length  area code phone number international plan  \
0    KS             128        415     382-4657                 no   
1    OH             107        415     371-7191                 no   
2    NJ             137        415     358-1921                 no   
3    OH              84        408     375-9999                yes   
4    OK              75        415     330-6626                yes   

  voice mail plan  number vmail messages  total day minutes  total day calls  \
0             yes                     25              265.1              110   
1             yes                     26              161.6              123   
2              no                      0              243.4              114   
3              no                      0              299.4               71   
4              no                      0              166.7              113   

   total day charge  ...  total eve calls  total eve charge  \
0             45.07  ...           

## Target Identification

In [6]:
# The 'churn' column is the label to predict; list the distinct values it takes.
churn_target = churn['churn']
print(pd.unique(churn_target))

[False  True]


## Feature Identification

In [9]:
# 'phone number' is unique per row, so it is unlikely to help prediction.
# 'churn' is the target itself and must not appear among the features.

cols_to_drop = ['phone number', 'churn']
churn_feature = churn.drop(columns=cols_to_drop)
print(churn_feature.head())

  state  account length  area code international plan voice mail plan  \
0    KS             128        415                 no             yes   
1    OH             107        415                 no             yes   
2    NJ             137        415                 no              no   
3    OH              84        408                yes              no   
4    OK              75        415                yes              no   

   number vmail messages  total day minutes  total day calls  \
0                     25              265.1              110   
1                     26              161.6              123   
2                      0              243.4              114   
3                      0              299.4               71   
4                      0              166.7              113   

   total day charge  total eve minutes  total eve calls  total eve charge  \
0             45.07              197.4               99             16.78   
1             27.47   

## Data Preprocessing

In [10]:
# Categorical data: pick out the object-dtype columns of the raw frame.

churn_categorical = churn.select_dtypes(include='object')
print(churn_categorical.head(2))

  state phone number international plan voice mail plan
0    KS     382-4657                 no             yes
1    OH     371-7191                 no             yes


In [11]:
# Changing yes/no values to boolean.
# The comparison reads from the untouched `churn` frame rather than from
# `churn_feature`, so re-running this cell yields the same result. Comparing the
# already-converted boolean columns to 'yes' would turn every value into False.

yes_no_cols = ['international plan', 'voice mail plan']
churn_feature[yes_no_cols] = churn[yes_no_cols] == 'yes'
print(churn_feature.head(2))

  state  account length  area code  international plan  voice mail plan  \
0    KS             128        415               False             True   
1    OH             107        415               False             True   

   number vmail messages  total day minutes  total day calls  \
0                     25              265.1              110   
1                     26              161.6              123   

   total day charge  total eve minutes  total eve calls  total eve charge  \
0             45.07              197.4               99             16.78   
1             27.47              195.5              103             16.62   

   total night minutes  total night calls  total night charge  \
0                244.7                 91               11.01   
1                254.4                103               11.45   

   total intl minutes  total intl calls  total intl charge  \
0                10.0                 3                2.7   
1                13.7        

In [12]:
#Label Encoding
# Replaces each distinct 'area code' value with an integer label, in place on
# churn_feature.
# NOTE(review): area code looks like a nominal category, so integer labels may
# suggest an ordering that does not exist — consider one-hot encoding; confirm.

from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
churn_feature['area code'] = label_encoder.fit_transform(churn_feature['area code'])
print(churn_feature.head(2))

  state  account length  area code  international plan  voice mail plan  \
0    KS             128          1               False             True   
1    OH             107          1               False             True   

   number vmail messages  total day minutes  total day calls  \
0                     25              265.1              110   
1                     26              161.6              123   

   total day charge  total eve minutes  total eve calls  total eve charge  \
0             45.07              197.4               99             16.78   
1             27.47              195.5              103             16.62   

   total night minutes  total night calls  total night charge  \
0                244.7                 91               11.01   
1                254.4                103               11.45   

   total intl minutes  total intl calls  total intl charge  \
0                10.0                 3                2.7   
1                13.7        

In [22]:
#One Hot Encoding

print('Churn data size before One Hot Encoding :', churn_feature.shape)
print('No. of unique states :', len(churn_feature['state'].unique()))

# `columns` names the features to expand into indicator columns; `prefix` sets the
# leading part of each generated column name.
churn_dumm = pd.get_dummies(churn_feature, columns=['state'], prefix=['state'])
print('Churn data size after One Hot Encoding :', churn_dumm.shape)

# Convert to a float64 numpy matrix. numpy is already imported as `np` by the
# notebook's first cell, so it is not imported again here.
churn_matrix = churn_dumm.to_numpy(dtype=np.float64)


Churn data size before One Hot Encoding : (3333, 19)
No. of unique states : 51
Churn data size after One Hot Encoding : (3333, 69)


In [23]:
#Imputing Missing values

from sklearn.impute import SimpleImputer
# Mean imputation of NaN entries; the other constructor arguments stay at their defaults.
imp = SimpleImputer(strategy='mean')
# Fit the imputer on the matrix and fill the gaps in a single call.
churn_matrix = imp.fit_transform(churn_matrix)

In [None]:
#Standardization

from sklearn.preprocessing import StandardScaler
#Standardi