In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import warnings
warnings.filterwarnings("ignore")

## Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

## Model evaluators
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve

In [24]:
df = pd.read_csv("Bank Customer Churn Prediction.csv")
df.head()

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [25]:
df.describe()

Unnamed: 0,customer_id,credit_score,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [26]:
df.isna().sum()

customer_id         0
credit_score        0
country             0
gender              0
age                 0
tenure              0
balance             0
products_number     0
credit_card         0
active_member       0
estimated_salary    0
churn               0
dtype: int64

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customer_id       10000 non-null  int64  
 1   credit_score      10000 non-null  int64  
 2   country           10000 non-null  object 
 3   gender            10000 non-null  object 
 4   age               10000 non-null  int64  
 5   tenure            10000 non-null  int64  
 6   balance           10000 non-null  float64
 7   products_number   10000 non-null  int64  
 8   credit_card       10000 non-null  int64  
 9   active_member     10000 non-null  int64  
 10  estimated_salary  10000 non-null  float64
 11  churn             10000 non-null  int64  
dtypes: float64(2), int64(8), object(2)
memory usage: 937.6+ KB


## Transforming Categorical Columns

In [28]:
columns = df.columns

from sklearn.compose import make_column_selector as selector
categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(df)

no_categorical = columns.drop(categorical_columns)
categorical_columns


['country', 'gender']

In [29]:
data_categorical = df[categorical_columns]
data_categorical.head()

Unnamed: 0,country,gender
0,France,Female
1,Spain,Female
2,France,Female
3,France,Female
4,Spain,Female


In [30]:
no_categorical

Index(['customer_id', 'credit_score', 'age', 'tenure', 'balance',
       'products_number', 'credit_card', 'active_member', 'estimated_salary',
       'churn'],
      dtype='object')

In [31]:
Feature = df[no_categorical]

# Concatanate with categorical values
df_clean = pd.concat([Feature,pd.get_dummies(data_categorical)], axis=1)
df_clean


Unnamed: 0,customer_id,credit_score,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn,country_France,country_Germany,country_Spain,gender_Female,gender_Male
0,15634602,619,42,2,0.00,1,1,1,101348.88,1,1,0,0,1,0
1,15647311,608,41,1,83807.86,1,0,1,112542.58,0,0,0,1,1,0
2,15619304,502,42,8,159660.80,3,1,0,113931.57,1,1,0,0,1,0
3,15701354,699,39,1,0.00,2,0,0,93826.63,0,1,0,0,1,0
4,15737888,850,43,2,125510.82,1,1,1,79084.10,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,15606229,771,39,5,0.00,2,1,0,96270.64,0,1,0,0,0,1
9996,15569892,516,35,10,57369.61,1,1,1,101699.77,0,1,0,0,0,1
9997,15584532,709,36,7,0.00,1,0,1,42085.58,1,1,0,0,1,0
9998,15682355,772,42,3,75075.31,2,1,0,92888.52,1,0,1,0,0,1


## Separate Attributes and Predictable Variable

In [32]:
# Attributes
X = df_clean.drop("churn", axis=1)
# Scaling Features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()                                         
X = scaler.fit_transform(X)

# Predictable Variable
y = df_clean['churn']

X
#y

array([[-0.78321342, -0.32622142,  0.29351742, ..., -0.57380915,
         1.09598752, -1.09598752],
       [-0.60653412, -0.44003595,  0.19816383, ...,  1.74273971,
         1.09598752, -1.09598752],
       [-0.99588476, -1.53679418,  0.29351742, ..., -0.57380915,
         1.09598752, -1.09598752],
       ...,
       [-1.47928179,  0.60498839, -0.27860412, ..., -0.57380915,
         1.09598752, -1.09598752],
       [-0.11935577,  1.25683526,  0.29351742, ..., -0.57380915,
        -0.91241915,  0.91241915],
       [-0.87055909,  1.46377078, -1.04143285, ..., -0.57380915,
         1.09598752, -1.09598752]])

## Split Train and Test dataset

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=44)
X_train

array([[-0.75426966,  0.79123035, -0.4693113 , ..., -0.57380915,
        -0.91241915,  0.91241915],
       [-1.66886207, -0.44003595,  0.29351742, ..., -0.57380915,
        -0.91241915,  0.91241915],
       [ 0.23782587, -0.20206011, -0.4693113 , ..., -0.57380915,
         1.09598752, -1.09598752],
       ...,
       [ 0.54405703,  0.0048754 ,  0.5795782 , ..., -0.57380915,
        -0.91241915,  0.91241915],
       [ 0.91771242, -1.16431025, -0.66001848, ..., -0.57380915,
         1.09598752, -1.09598752],
       [ 0.84318431, -0.83321343, -0.37395771, ..., -0.57380915,
        -0.91241915,  0.91241915]])

## Classification Models

In [35]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.cross_decomposition import PLSCanonical
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier




# Put models in a dictionary
models = {"KNN": KNeighborsClassifier(),
          "Logistic Regression": LogisticRegression(), 
          "Random Forest": RandomForestClassifier(),
          "Decision Tree": DecisionTreeClassifier(),
          "SVM":svm.SVC(), "gnb": GaussianNB(),
          "LDA": LinearDiscriminantAnalysis(),
          "SGD": SGDClassifier(),"AdaBoost": AdaBoostClassifier()}

# Create function to fit and score models
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """
    Fits and evaluates given machine learning models.
    models : a dict of different Scikit-Learn machine learning models
    X_train : training data
    X_test : testing data
    y_train : labels assosciated with training data
    y_test : labels assosciated with test data
    """
    # Random seed for reproducible results
    np.random.seed(42)
    # Make a list to keep model scores
    model_scores = {}
    # Loop through models
    for name, model in models.items():
        # Fit the model to the data
        model.fit(X_train, y_train)
        # Evaluate the model and append its score to model_scores
        model_scores[name] = model.score(X_test, y_test)
    return model_scores

In [36]:
model_scores = fit_and_score(models=models,
                             X_train=X_train,
                             X_test=X_test,
                             y_train=y_train,
                             y_test=y_test)
model_scores

{'KNN': 0.8203333333333334,
 'Logistic Regression': 0.8106666666666666,
 'Random Forest': 0.8606666666666667,
 'Decision Tree': 0.795,
 'SVM': 0.8566666666666667,
 'gnb': 0.8173333333333334,
 'LDA': 0.806,
 'SGD': 0.806,
 'AdaBoost': 0.856}

#### Random Forest is the model with the best performance, but SVM and AdaBoost can be explored with Hyperparameters Techiniques too!

## Random Forest Classifier Hyperparameters Tuning

In [40]:
# Different RandomForestClassifier hyperparameters
rf_grid = {"n_estimators": np.arange(10, 1000, 50),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2)}

### Randomized SearchCV Hyperparameters Techinique

In [41]:
# Setup random seed
np.random.seed(42)

# Setup random hyperparameter search for RandomForestClassifier
randomCV_RF = RandomizedSearchCV(RandomForestClassifier(),
                           param_distributions=rf_grid,
                           cv=5,
                           n_iter=20,
                           verbose=True) #n_job=-1

# Fit random hyperparameter search model
randomCV_RF.fit(X_train, y_train);

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [42]:
randomCV_RF.best_params_

{'n_estimators': 910,
 'min_samples_split': 18,
 'min_samples_leaf': 1,
 'max_depth': 10}

In [44]:
# Evaluate the randomized search random forest model
randomCV_RF.score(X_test, y_test)

0.864

#### Build Random Forest Classifier model with RandomizedSearchCV's best parameters and test on Test Data to evaluate performance

In [123]:
RF_model_random = RandomForestClassifier(n_estimators = 910,
                                         min_samples_split = 18,
                                         min_samples_leaf = 1,
                                         max_depth = 10)
# Train
RF_model_random.fit(X_train, y_train)

# Make Predictions
predictions = RF_model_random.predict(X_test)

In [124]:
from sklearn.metrics import jaccard_score, f1_score, accuracy_score
accuracy_score(y_test, predictions)

0.8636666666666667

### Grid SearchCV Hyperparameters Techinique

In [126]:
# Different RandomForestClassifier hyperparameters
RF_grid = {"n_estimators": np.arange(10, 50),
           "max_depth": [None, 3],
           "min_samples_split": [1,2,3],
           "min_samples_leaf": [2,3,5]}

# Setup grid hyperparameter search for LogisticRegression
gridCV_RF = GridSearchCV(RandomForestClassifier(),
                          param_grid=RF_grid,
                          cv=5,
                          verbose=True)

# Fit grid hyperparameter search model
gridCV_RF.fit(X_train, y_train);

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


In [127]:
gs_RF.best_params_

{'max_depth': None,
 'min_samples_leaf': 1,
 'min_samples_split': 7,
 'n_estimators': 49}

In [128]:
gridCV_RF.score(X_test, y_test)

0.8623333333333333

#### Build Random Forest Classifier model with GridSearchCV's best parameters and test on Test Data to evaluate performance

In [152]:
RF_model_grid = RandomForestClassifier(max_depth= None,
                                     min_samples_leaf= 1,
                                     min_samples_split= 7,
                                     n_estimators= 49)
# Train
RF_model_grid.fit(X_train, y_train)

# Make Predictions
predictions = RF_model_grid.predict(X_test)
predictions

unique, counts = np.unique(predictions, return_counts=True)

dict(zip(unique, counts))

{0: 2619, 1: 381}

In [130]:
from sklearn.metrics import jaccard_score, f1_score, accuracy_score
accuracy_score(y_test, predictions)

0.8643333333333333

## Using Neural Networks to Predict Churn Customers

In [131]:
import keras
from keras.models import Sequential
from keras.layers import Dense

In [132]:
n_cols = X.shape[1]
n_cols

14

In [133]:
# define classification model
def classification_model():
    # create model
    model = Sequential()
    model.add(Dense(10, activation='relu', input_shape=(n_cols,)))
    model.add(Dense(20, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    # compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [134]:
# build the model
model = classification_model()

# train
np.random.seed(45)
model.fit(X_train, y_train, epochs=50, verbose=1, batch_size=10)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1d0caaa5f90>

In [135]:
# evaluate the keras model on Test Data
_, accuracy = model.evaluate(X_test, y_test)
print('Accuracy on Test Data: %.2f' % (accuracy*100))

Accuracy on Test Data: 84.23


In [141]:
predictions_probability = model.predict(X_test)
predictions_probability = predictions_probability[0::]
predictions_probability



array([[0.11921655],
       [0.43128192],
       [0.7859904 ],
       ...,
       [0.28601742],
       [0.16873099],
       [0.03376225]], dtype=float32)

## SVM Classifier Hyperparameters Tuning

## AdaBoost Hyperparameters Tuning