In [2]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.metrics as m

In [3]:
# Importing the dataset
dataset = pd.read_csv('Churn_Modelling.csv')
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values
print(X[1:2])
dataset.head()

[[608 'Spain' 'Female' 41 1 83807.86 1 0 1 112542.58]]


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
onehotencoder = OneHotEncoder(categorical_features = [1])
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]

In [5]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## KNN 

In [6]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5, n_jobs=2)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=2, n_neighbors=5, p=2,
           weights='uniform')

In [7]:
y_pred = knn.predict(X_test)
m.accuracy_score(y_test, y_pred)

0.76449999999999996

## Bagging

In [8]:
from sklearn.ensemble import BaggingClassifier

bag = BaggingClassifier(knn, max_samples=20, max_features=7, n_jobs=2, oob_score=True, n_estimators=20)
bag.fit(X_train, y_train)

BaggingClassifier(base_estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=2, n_neighbors=5, p=2,
           weights='uniform'),
         bootstrap=True, bootstrap_features=False, max_features=7,
         max_samples=20, n_estimators=20, n_jobs=2, oob_score=True,
         random_state=None, verbose=0, warm_start=False)

In [9]:
bag.oob_score_

0.79600000000000004

In [10]:
y_pred = bag.predict(X_test)
m.accuracy_score(y_test, y_pred)

0.79749999999999999

## Ensemble(Random Forest)

In [11]:
from sklearn.ensemble import RandomForestClassifier

rforest = RandomForestClassifier(n_estimators=50, oob_score=True)

In [12]:
rforest.fit(X_train, y_train)

rforest.oob_score_

0.86075000000000002

In [13]:
y_pred = rforest.predict(X_test)
m.accuracy_score(y_test, y_pred)

0.86699999999999999

## AdaBoost

In [14]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

Logit = LogisticRegression()

boost = AdaBoostClassifier(n_estimators=100, learning_rate=1)

In [15]:
boost.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
          n_estimators=100, random_state=None)

In [16]:
y_pred = boost.predict(X_test)
m.accuracy_score(y_test, y_pred)

0.86699999999999999

## Gradient Tree Boosting

In [17]:
from sklearn.ensemble import GradientBoostingClassifier

gboost = GradientBoostingClassifier(warm_start=True)

In [18]:
gboost.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=True)

In [19]:
y_pred = gboost.predict(X_test)
m.accuracy_score(y_test, y_pred)

0.86750000000000005

In [20]:
pd.Series(gboost.feature_importances_).sort_values()

8     0.001915
1     0.008766
5     0.027140
3     0.039650
9     0.048741
0     0.063826
10    0.087993
2     0.106927
7     0.153929
4     0.223851
6     0.237265
dtype: float64

## Voting Classifier

In [21]:
from sklearn.ensemble import VotingClassifier


vote = VotingClassifier(estimators = [('knn',knn), ('LR' , Logit) , ('Rfor', rforest)], voting='hard')

In [22]:
vote.fit(X_train, y_train)

VotingClassifier(estimators=[('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=2, n_neighbors=5, p=2,
           weights='uniform')), ('LR', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1,...timators=50, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [23]:
y_pred = vote.predict(X_test)
m.accuracy_score(y_test, y_pred)

0.81200000000000006

## Neural Network

In [24]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [25]:
X_train.shape

(8000, 11)

In [26]:
from keras.models import Sequential
from keras.layers import Dense

nn = Sequential()

Using TensorFlow backend.


In [27]:
nn.add(Dense(units=6, activation='relu', input_shape = (11,),kernel_initializer='uniform'))

In [28]:
nn.add(Dense(units=6, activation='relu',kernel_initializer='uniform'))

In [29]:
nn.add(Dense(units=1, activation='sigmoid',kernel_initializer='uniform'))

In [30]:
nn.compile(optimizer='adam', metrics=['accuracy'], loss='binary_crossentropy')

In [31]:
nn.fit(X_train, y_train, epochs=50, batch_size=15)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1c28bab1160>

In [37]:
y_pred = nn.predict(X_test)
y_pred = (y_pred > 0.5)
m.accuracy_score(y_test, y_pred)

0.86099999999999999