In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
data = pd.read_csv('Churn_Modelling.csv')
data.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [3]:
data.drop(['RowNumber', 'Surname'], axis=1, inplace=True)
data.columns
print(data.shape)

(10000, 12)


In [4]:
data_encoded = pd.get_dummies(data)
print(data_encoded.shape)

(10000, 15)


In [5]:
data_encoded.drop(['Gender_Male', 'Geography_Spain'], axis=1, inplace=True)
data_encoded.columns

Index(['CustomerId', 'CreditScore', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited', 'Geography_France', 'Geography_Germany', 'Gender_Female'],
      dtype='object')

In [6]:
y = data_encoded.Exited
print(type(y))

<class 'pandas.core.series.Series'>


In [7]:
data_encoded.drop(['Exited'], axis=1, inplace=True)
print(data_encoded.shape)
print(type(data_encoded))

(10000, 12)
<class 'pandas.core.frame.DataFrame'>


In [8]:
data_encoded.columns

Index(['CustomerId', 'CreditScore', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Geography_France', 'Geography_Germany', 'Gender_Female'],
      dtype='object')

In [9]:
data_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
CustomerId           10000 non-null int64
CreditScore          10000 non-null int64
Age                  10000 non-null int64
Tenure               10000 non-null int64
Balance              10000 non-null float64
NumOfProducts        10000 non-null int64
HasCrCard            10000 non-null int64
IsActiveMember       10000 non-null int64
EstimatedSalary      10000 non-null float64
Geography_France     10000 non-null uint8
Geography_Germany    10000 non-null uint8
Gender_Female        10000 non-null uint8
dtypes: float64(2), int64(7), uint8(3)
memory usage: 732.5 KB


In [11]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(data_encoded)

  return self.partial_fit(X, y)


MinMaxScaler(copy=True, feature_range=(0, 1))

In [12]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

In [13]:
X_t, X_test, y_t, y_test = train_test_split(data_encoded, y, test_size=0.10, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_t, y_t, test_size=0.10, random_state=1)

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
Dt = DecisionTreeClassifier(class_weight='balanced', random_state=1)

In [17]:
from sklearn.metrics import fbeta_score
cv_results = cross_val_score(Dt, X_train, y_train, cv=10)
print(cv_results)
Dt.fit(X_train,y_train)
y_pred_val = Dt.predict_proba(X_val)[:,1]

[0.79654747 0.77654321 0.81111111 0.79012346 0.78518519 0.81111111
 0.7691358  0.79753086 0.79753086 0.80098888]


In [18]:
fscore = fbeta_score(y_val, y_pred_val, beta=1)

roc_score = roc_auc_score(y_val, y_pred_val)
print("roc-auc score on validation set:",roc_score )
print('Fbeta score for Decision tree', fscore)

roc-auc score on validation set: 0.6506142506142507
Fbeta score for Decision tree 0.4432989690721649


In [19]:
from sklearn.ensemble import AdaBoostClassifier
dt = DecisionTreeClassifier(max_depth=5)
adb = AdaBoostClassifier(base_estimator= dt, n_estimators=100)
adb.fit(X_train,y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1.0, n_estimators=100, random_state=None)

In [20]:
y_pred_proba_val = adb.predict_proba(X_val)[:,1]
roc_auc_score_adb = roc_auc_score(y_val, y_pred_proba_val)
y_pred_adb = adb.predict(X_val)
print('ROC-AUC score for Adaboost Classifier', roc_auc_score_adb)
print('FBeta score for Adaboost classifier', fbeta_score(y_val, y_pred_adb, beta=0.5))

ROC-AUC score for Adaboost Classifier 0.704948024948025
FBeta score for Adaboost classifier 0.46153846153846156


In [29]:
from sklearn.ensemble import GradientBoostingClassifier
sgbt = GradientBoostingClassifier(max_depth=6, subsample=0.8, max_features=0.2, n_estimators=300, random_state=1)

In [30]:
sgbt.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=6,
              max_features=0.2, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=300,
              n_iter_no_change=None, presort='auto', random_state=1,
              subsample=0.8, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [31]:
sgbt_y = sgbt.predict(X_val)
sgby_y_prob = sgbt.predict_proba(X_val)[:,1]

In [32]:
print('Fbets score for Stochastic gradient boosting', fbeta_score(y_val, sgbt_y, beta=1))
print('ROC-AUC score for Stochastic gradient boosting', roc_auc_score(y_val, sgby_y_prob))

Fbets score for Stochastic gradient boosting 0.5590062111801243
ROC-AUC score for Stochastic gradient boosting 0.8467813267813268
