In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf 
# Notebook-wide seed; also passed to train_test_split and SMOTE further down
random_state = 10
np.random.seed(random_state)  # seed NumPy's global random generator
tf.random.set_seed(random_state)  # seed TensorFlow's global random generator


# Matplotlib for visualization
from matplotlib import pyplot as plt
# display plots in the notebook
%matplotlib inline

# Seaborn for easier visualization
import seaborn as sns
### sns.set_style('darkgrid')

# store elements as dictionary keys and their counts as dictionary values
from collections import Counter

# scikit-learn
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder, LabelBinarizer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline

# Function for creating model pipelines - imblearn
from imblearn.pipeline import make_pipeline as imbl_pipe

# Over-sampling using SMOTE
from imblearn.over_sampling import SMOTE

# Classification metrics
from sklearn.metrics import confusion_matrix, classification_report

# Keras
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.models import Sequential

In [2]:
# Load the analytical base table from the working directory and preview it
abt = pd.read_csv("analytical_base_table.csv")
print(f"Dataframe dimensions: {abt.shape}")
abt.head()

Dataframe dimensions: (10000, 11)


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# Target vector: the "Exited" column
y = abt["Exited"]

# Feature matrix: every remaining column
X = abt.drop(columns=["Exited"])

# Sanity-check the dimensions of both objects
print(X.shape, y.shape)

(10000, 10) (10000,)


In [4]:
# Names of all numeric feature columns
num_columns = list(X.select_dtypes(include='number').columns)
num_columns

['CreditScore',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary']

In [5]:
# Names of all object-typed (categorical) feature columns
cat_columns = list(X.select_dtypes(include='object').columns)
cat_columns

['Geography', 'Gender']

In [6]:
def class_count(a):
    """Tabulate how often each class label occurs in ``a``.

    Parameters
    ----------
    a : sized iterable of hashable labels
        Target values, e.g. a pandas Series, a 1-D NumPy array or a list.

    Returns
    -------
    pandas.DataFrame
        One row per distinct label with the columns ``Exited`` (the label),
        ``Count`` (number of occurrences, int64) and ``%`` (share of all
        observations, rounded to two decimals), sorted by ``Count`` in
        descending order.
    """
    counter = Counter(a)
    # Build the frame column by column so the labels keep their own dtype
    # instead of being coerced together with the counts into one NumPy array.
    dff = pd.DataFrame({
        'Exited': list(counter.keys()),
        'Count': list(counter.values()),
    })
    dff['Count'] = dff['Count'].astype('int64')
    # len() also works for plain lists, which have no .shape attribute
    total = len(a)
    dff['%'] = (dff['Count'] / total * 100).round(2)
    return dff.sort_values('Count', ascending=False)

In [7]:
# Inspect the class distribution of the target
class_count(y)

Unnamed: 0,Exited,Count,%
1,0,7963,79.63
0,1,2037,20.37


In [8]:
# Split X and y into train (70%) and test (30%) sets.
# `random_state` is the notebook-wide seed defined in the first cell;
# stratifying on y keeps the class ratio identical in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=random_state,
    stratify=y,
)

# Print number of observations in X_train, X_test, y_train, and y_test
print(len(X_train), len(X_test), len(y_train), len(y_test))

7000 3000 7000 3000


In [9]:
# Check dtypes and non-null counts of the training features
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7000 entries, 8061 to 4741
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      7000 non-null   int64  
 1   Geography        7000 non-null   object 
 2   Gender           7000 non-null   object 
 3   Age              7000 non-null   int64  
 4   Tenure           7000 non-null   int64  
 5   Balance          7000 non-null   float64
 6   NumOfProducts    7000 non-null   int64  
 7   HasCrCard        7000 non-null   int64  
 8   IsActiveMember   7000 non-null   int64  
 9   EstimatedSalary  7000 non-null   float64
dtypes: float64(2), int64(6), object(2)
memory usage: 601.6+ KB


In [10]:
# Positional indices of the numerical columns within X
num_features = [X.columns.get_loc(name) for name in num_columns]
print(num_features)

[0, 3, 4, 5, 6, 7, 8, 9]


In [11]:
# Positional indices of the categorical columns within X
cat_features = [X.columns.get_loc(name) for name in cat_columns]
print(cat_features)

[1, 2]


In [12]:
# Define column transformer: min-max scale the numerical columns and
# one-hot encode the categorical ones.
# Columns are addressed by integer position rather than by name —
# presumably so that plain NumPy input without column names can be
# transformed as well (verify against the prediction cells further down).
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2;
# adjust when upgrading the environment.
preprocess = make_column_transformer(
    (MinMaxScaler(), num_features),
    (OneHotEncoder(sparse=False), cat_features)
)
preprocess

ColumnTransformer(transformers=[('minmaxscaler', MinMaxScaler(),
                                 [0, 3, 4, 5, 6, 7, 8, 9]),
                                ('onehotencoder', OneHotEncoder(sparse=False),
                                 [1, 2])])

In [13]:
# Preprocess X_train to get the shape of the transformed feature matrix
# (presumably its column count is the input dimension the Keras model
# expects — confirm against keras_model.create_model).
# Note: this call fits `preprocess` on the training data.
X_train_pp = preprocess.fit_transform(X_train)

print(X_train_pp.shape)

(7000, 13)


In [14]:
# Binarize the target; the result is a 2-D integer array with one column
lb = LabelBinarizer()

y_train_lb = lb.fit_transform(y_train)  # learn the classes from the training labels
y_test_lb = lb.transform(y_test)  # reuse the fitted binarizer for the test labels
y_train_lb

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [1]])

In [15]:
# Confirm the binarized targets are column vectors
print(y_train_lb.shape, y_test_lb.shape)

(7000, 1) (3000, 1)


In [17]:
# Import the model-building function "create_model" from the script "keras_model.py"
import keras_model

In [18]:
# Wrap the Keras build function so it can act as the final estimator of the pipeline.
# NOTE(review): tensorflow.keras.wrappers.scikit_learn is deprecated upstream
# (the truncated warning in this cell's output likely stems from it) — consider SciKeras.
keras_clf = KerasClassifier(build_fn=keras_model.create_model, verbose=0)

  """Entry point for launching an IPython kernel.


In [19]:
# Define model with pipeline: preprocessing -> SMOTE over-sampling -> Keras classifier.
# The imblearn pipeline is used because it accepts a sampler (SMOTE) as a step.
model = imbl_pipe(preprocess,
                  SMOTE(sampling_strategy='auto', random_state=random_state),
                  keras_clf)

model

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('minmaxscaler',
                                                  MinMaxScaler(),
                                                  [0, 3, 4, 5, 6, 7, 8, 9]),
                                                 ('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  [1, 2])])),
                ('smote', SMOTE(random_state=10)),
                ('kerasclassifier',
                 <keras.wrappers.scikit_learn.KerasClassifier object at 0x7f4690794450>)])

In [20]:
# List the pipeline's parameters; nested names are prefixed with the step name
model.get_params()

{'columntransformer': ColumnTransformer(transformers=[('minmaxscaler', MinMaxScaler(),
                                  [0, 3, 4, 5, 6, 7, 8, 9]),
                                 ('onehotencoder', OneHotEncoder(sparse=False),
                                  [1, 2])]),
 'columntransformer__minmaxscaler': MinMaxScaler(),
 'columntransformer__minmaxscaler__clip': False,
 'columntransformer__minmaxscaler__copy': True,
 'columntransformer__minmaxscaler__feature_range': (0, 1),
 'columntransformer__n_jobs': None,
 'columntransformer__onehotencoder': OneHotEncoder(sparse=False),
 'columntransformer__onehotencoder__categories': 'auto',
 'columntransformer__onehotencoder__drop': None,
 'columntransformer__onehotencoder__dtype': numpy.float64,
 'columntransformer__onehotencoder__handle_unknown': 'error',
 'columntransformer__onehotencoder__sparse': False,
 'columntransformer__remainder': 'drop',
 'columntransformer__sparse_threshold': 0.3,
 'columntransformer__transformer_weights': None,
 'c

In [21]:
# Define parameters for Keras classifier.
# Keys use the "<pipeline step>__<parameter>" convention; 3 n_units values
# x 5 dropout values (with a single epochs/verbose setting) = 15 candidates.
# n_units and dropout are presumably arguments of keras_model.create_model — verify.

param_grid = {
    'kerasclassifier__epochs': [10],
    'kerasclassifier__n_units': [64, 128, 100],
    #'kerasclassifier__init': [ 'uniform', 'zeros', 'normal', ], 
    #'kerasclassifier__batch_size':[4, 16, 32],
    #'kerasclassifier__optimizer':['RMSprop', 'Adam', 'Adamax', 'sgd'],
    'kerasclassifier__dropout': [0.5, 0.3, 0.2, 0.1, 0],
    'kerasclassifier__verbose': [0]
}

In [22]:
# Create the GridSearchCV estimator: 5-fold cross-validation over every
# combination in param_grid, using 4 parallel jobs.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# pipeline's default score (presumably accuracy) — given the class imbalance,
# consider an explicit metric such as F1 or recall.
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(model, param_grid, verbose=3, cv= 5, n_jobs=4)

In [23]:
# Train the model with GridSearch on the raw training features and the
# binarized training labels
grid.fit(X_train, y_train_lb)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('minmaxscaler',
                                                                         MinMaxScaler(),
                                                                         [0, 3,
                                                                          4, 5,
                                                                          6, 7,
                                                                          8,
                                                                          9]),
                                                                        ('onehotencoder',
                                                                         OneHotEncoder(sparse=False),
                                                                         [1,
                                                                          2])

In [24]:
# Best mean cross-validated score and the parameter combination that achieved it
print(f"Best Score: {grid.best_score_}  using:\n{grid.best_params_}")

Best Score: 0.7891428589820861  using:
{'kerasclassifier__dropout': 0.3, 'kerasclassifier__epochs': 10, 'kerasclassifier__n_units': 128, 'kerasclassifier__verbose': 0}


In [25]:
# Score of the tuned pipeline on the training data and on the held-out test data
print(f"Training Data Score: {grid.score(X_train, y_train)}")
print(f"Testing Data Score: {grid.score(X_test, y_test)}")

Training Data Score: 0.7327142953872681
Testing Data Score: 0.6970000267028809


In [26]:
# Make predictions on the test set with the hypertuned model
pred = grid.predict(X_test)
pred

array([0, 0, 0, ..., 1, 0, 0])

In [27]:
# Confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test, pred)
print(cm)

[[1592  797]
 [ 112  499]]


In [28]:
# Row-normalized confusion matrix: each row (actual class) sums to 1.
# The result is stored under a new name so the raw counts in `cm` stay
# intact and re-running this cell does not normalize already-normalized values.
cm_norm = np.around(cm / cm.sum(axis=1, keepdims=True), 2)
print(cm_norm)

[[0.67 0.33]
 [0.18 0.82]]


In [29]:
# Classification report: per-class precision, recall and F1 on the test set
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.93      0.67      0.78      2389
           1       0.39      0.82      0.52       611

    accuracy                           0.70      3000
   macro avg       0.66      0.74      0.65      3000
weighted avg       0.82      0.70      0.73      3000



In [30]:
# Compare the first ten predictions with the corresponding true labels
print(f"Predicted classes: \t{list(pred[:10])}")
print(f"Actual Labels: \t\t{list(y_test[:10])}")

Predicted classes: 	[0, 0, 0, 0, 0, 1, 0, 0, 0, 1]
Actual Labels: 		[1, 0, 0, 0, 0, 1, 0, 0, 0, 0]


In [31]:
# Predict the single test record at position 5 and compare with its true label
pred1 = grid.predict(X_test.iloc[5:6])
print(f"Predicted classes: \t{list(pred1)}")
print(f"Actual Labels: \t\t{list(y_test.iloc[5:6])}")

Predicted classes: 	[1]
Actual Labels: 		[1]


In [32]:
# First test record as a 2-D array with a single row (no column names)
X_sh = X_test.iloc[0,:].values.reshape(1,-1)
X_sh

array([[638, 'France', 'Male', 36, 6, 188455.19, 1, 0, 0, 47031.4]],
      dtype=object)

In [33]:
# One row, one column per input feature
X_sh.shape

(1, 10)

In [34]:
# Predict from the plain array version of the first test record and
# compare with the first true label
pred_sh = grid.predict(X_sh)
print(f"Predicted classes: \t{list(pred_sh)}")
print(f"Actual Labels: \t\t{list(y_test.iloc[:1])}")

Predicted classes: 	[0]
Actual Labels: 		[1]


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


In [35]:
import joblib
# Keep a handle on the best estimator found by the grid search.
# This is the fitted pipeline itself, not the GridSearchCV wrapper.
# NOTE: it is the same object as grid.best_estimator_, not a copy, so later
# changes to it (e.g. detaching the Keras model) also affect `grid`;
# re-run the grid search or reload the saved model if the original is needed.

dl_model_s = grid.best_estimator_

In [None]:
type(dl_model_s)  # -> imblearn.pipeline.Pipeline
# Save the Keras model first; it is stored separately from the pipeline,
# presumably because it cannot be pickled together with it (verify).
# NOTE(review): this cell has no execution count. Make sure it runs before the
# pipeline is reloaded, otherwise a stale .h5 from an earlier session is used.
dl_model_s.named_steps['kerasclassifier'].model.save('../models/keras_model.h5')

In [36]:
# Detach the Keras model from the wrapper before the pipeline is dumped with joblib
dl_model_s.named_steps['kerasclassifier'].model = None

In [38]:
# Persist the pipeline (its Keras wrapper no longer holds a model at this point).
# NOTE(review): this is written to the working directory while the Keras model
# goes to ../models/ — confirm the differing locations are intended.
joblib.dump(dl_model_s, 'imblearn_pipeline.sav')

['imblearn_pipeline.sav']

In [39]:
from tensorflow.keras.models import load_model

In [40]:
# Reload the dumped pipeline; only load files from a trusted source this way
dl_model = joblib.load('imblearn_pipeline.sav')

In [43]:
# Re-attach the separately saved Keras model to the reloaded pipeline
dl_model.named_steps['kerasclassifier'].model = load_model('../models/keras_model.h5')

In [44]:
# Score the reloaded pipeline on the test data.
# NOTE(review): the recorded value (~0.647) differs from grid.score on the
# same data (~0.697) — check that the loaded .h5 file was saved in this run.
print(dl_model.score(X_test, y_test))

0.6466666460037231


In [45]:
# Classification metrics: confusion matrix of the reloaded pipeline on the test set
# (this reassigns `cm`)
predl = dl_model.predict(X_test)
cm = confusion_matrix(y_test, predl)
print(cm)

[[1805  584]
 [ 135  476]]


In [46]:
# Treat the first X_test row as a new, unseen record
X_test.iloc[:1]

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
7884,638,France,Male,36,6,188455.19,1,0,0,47031.4


In [47]:
# Predict the first test record with the reloaded pipeline and compare
# with its true label
pred_new = dl_model.predict(X_test.iloc[:1])
print(f"Predicted classes: \t{list(pred_new)}")
print(f"Actual Labels: \t\t{list(y_test.iloc[:1])}")

Predicted classes: 	[0]
Actual Labels: 		[1]


In [48]:
# The same record as a NumPy array without column names
X_test[:1].to_numpy()

array([[638, 'France', 'Male', 36, 6, 188455.19, 1, 0, 0, 47031.4]],
      dtype=object)

In [49]:
# Predict from the array form; scikit-learn warns about the missing feature
# names but the prediction still runs
pred_new1 = dl_model.predict(X_test[:1].to_numpy())
pred_new1

  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


array([0])