In [35]:
import pandas as pd
from IPython.display import display
from sklearn.model_selection import train_test_split
import numpy as np
import random
from certifai.common.utils.encoding import CatEncoder
from keras.callbacks import ReduceLROnPlateau,EarlyStopping,ModelCheckpoint
from keras.models import Sequential
from keras import regularizers
from keras import optimizers
from keras.layers import Dense, Dropout
import os
import random
import tensorflow as tf
seed = 42

os.environ['PYTHONHASHSEED']=str(seed)
np.random.seed(seed)
random.seed(seed)
tf.random.set_random_seed(seed)

base_path = '..'
all_data_file = f"{base_path}/datasets/german_credit_eval.csv"

df = pd.read_csv(all_data_file)

cat_columns = [
    'checkingstatus',
    'history',
    'purpose',
    'savings',
    'employ',
    'status',
    'others',
    'property',
    'age',
    'otherplans',
    'housing',
    'job',
    'telephone',
    'foreign'
    ]

label_column = 'outcome'

# Separate outcome
y = df[label_column]
X = df.drop(label_column, axis=1)

encoder = CatEncoder(cat_columns, X)

# NN separate for probability scoring label 
y_model = np.asarray([(1,0) if x==1 else (0,1) for x in y ])

best_model_path = os.path.join('.', 'best_model_keras')
es = EarlyStopping(monitor='accuracy', mode='min', verbose=1,
                   patience=100, min_delta=0.0001)

rlp = ReduceLROnPlateau(monitor='accuracy', factor=0.02, patience=20, verbose=1, mode='min',
                        min_delta=0.001, cooldown=1, min_lr=0.0001)
mcp = ModelCheckpoint(best_model_path, monitor='accuracy', verbose=1,
                      save_best_only=True, save_weights_only=False, mode='max')

model = Sequential()
model.add(Dense(60, input_dim=encoder(X.values).shape[1], activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(2, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(encoder(X.values), y_model, epochs=100, batch_size=10, callbacks=[mcp,rlp,es],validation_split=0.2)
_, accuracy = model.evaluate(encoder(X.values), y_model)
print('Accuracy: %.2f' % (accuracy*100))

Train on 800 samples, validate on 200 samples
Epoch 1/100

Epoch 00001: accuracy improved from -inf to 0.70375, saving model to ./best_model_keras
Epoch 2/100

Epoch 00002: accuracy improved from 0.70375 to 0.73375, saving model to ./best_model_keras
Epoch 3/100

Epoch 00003: accuracy improved from 0.73375 to 0.76812, saving model to ./best_model_keras
Epoch 4/100

Epoch 00004: accuracy improved from 0.76812 to 0.79438, saving model to ./best_model_keras
Epoch 5/100

Epoch 00005: accuracy improved from 0.79438 to 0.79750, saving model to ./best_model_keras
Epoch 6/100

Epoch 00006: accuracy improved from 0.79750 to 0.80500, saving model to ./best_model_keras
Epoch 7/100

Epoch 00007: accuracy improved from 0.80500 to 0.81687, saving model to ./best_model_keras
Epoch 8/100

Epoch 00008: accuracy improved from 0.81687 to 0.82375, saving model to ./best_model_keras
Epoch 9/100

Epoch 00009: accuracy improved from 0.82375 to 0.82938, saving model to ./best_model_keras
Epoch 10/100

Epoch 0


Epoch 00036: accuracy did not improve from 0.97500
Epoch 37/100

Epoch 00037: accuracy did not improve from 0.97500
Epoch 38/100

Epoch 00038: accuracy did not improve from 0.97500
Epoch 39/100

Epoch 00039: accuracy did not improve from 0.97500
Epoch 40/100

Epoch 00040: accuracy did not improve from 0.97500
Epoch 41/100

Epoch 00041: accuracy did not improve from 0.97500
Epoch 42/100

Epoch 00042: accuracy improved from 0.97500 to 0.97625, saving model to ./best_model_keras
Epoch 43/100

Epoch 00043: accuracy improved from 0.97625 to 0.97938, saving model to ./best_model_keras
Epoch 44/100

Epoch 00044: accuracy did not improve from 0.97938
Epoch 45/100

Epoch 00045: accuracy did not improve from 0.97938
Epoch 46/100

Epoch 00046: accuracy did not improve from 0.97938
Epoch 47/100

Epoch 00047: accuracy did not improve from 0.97938
Epoch 48/100

Epoch 00048: accuracy improved from 0.97938 to 0.98062, saving model to ./best_model_keras
Epoch 49/100

Epoch 00049: accuracy did not impr


Epoch 00075: accuracy did not improve from 0.98750
Epoch 76/100

Epoch 00076: accuracy did not improve from 0.98750
Epoch 77/100

Epoch 00077: accuracy improved from 0.98750 to 0.98813, saving model to ./best_model_keras
Epoch 78/100

Epoch 00078: accuracy improved from 0.98813 to 0.98875, saving model to ./best_model_keras
Epoch 79/100

Epoch 00079: accuracy improved from 0.98875 to 0.98937, saving model to ./best_model_keras
Epoch 80/100

Epoch 00080: accuracy did not improve from 0.98937
Epoch 81/100

Epoch 00081: accuracy improved from 0.98937 to 0.99000, saving model to ./best_model_keras
Epoch 82/100

Epoch 00082: accuracy did not improve from 0.99000
Epoch 83/100

Epoch 00083: accuracy did not improve from 0.99000
Epoch 84/100

Epoch 00084: accuracy did not improve from 0.99000
Epoch 85/100

Epoch 00085: accuracy did not improve from 0.99000
Epoch 86/100

Epoch 00086: accuracy did not improve from 0.99000
Epoch 87/100

Epoch 00087: accuracy did not improve from 0.99000
Epoch 88

In [36]:
# initial value counts of outcome label
df['outcome'].value_counts()

1    700
2    300
Name: outcome, dtype: int64

In [37]:
y_preds = model.predict(encoder(X.values))
y_classes = np.argmax(y_preds,axis=1)
y_actual = np.where(y==1, 0,1)
np.mean(y_actual == y_classes)

0.948

In [38]:
class_1_threshold = 0.95
class_2_threshold = 0.4

In [39]:
y_class_1 = y_preds[y_preds[:,0] > class_2_threshold][:,0]
y_class_1.shape

(718,)

In [40]:
y_class_2 = y_preds[y_preds[:,1] > class_1_threshold][:,1]
y_class_2.shape

(165,)

In [41]:
y_class_1 = y_preds[y_preds[:,0] > class_2_threshold][:,0]
np.sum(y_class_1 < class_1_threshold)

161

### creating new label

In [42]:
# loan_granted 1
new_label_1 = y_preds[y_preds[:,0] > class_1_threshold][:,0]
new_label_1.shape

(557,)

In [43]:
#loan_denied 2
new_label_2 = y_preds[y_preds[:,0] < class_2_threshold][:,0]
new_label_2.shape

(282,)

In [44]:
# further_inspection 3
new_label_3 = y_preds[(y_preds[:,0] > class_2_threshold) & (y_preds[:,0] < class_1_threshold)][:,0]
new_label_3.shape

(161,)

### arrange the new labels to be appended to dataset

In [45]:
new_label = []
for z in y_preds:
    prob_class_1 = z[0]
    if prob_class_1 > class_1_threshold:
        new_label.append(1)
    elif prob_class_1 < class_2_threshold:
        new_label.append(2)
    else:
        new_label.append(3)

In [46]:
np.mean(y==new_label)

0.81

### create dataset with new labels

In [47]:
df_new = df.copy()
df_new[label_column] = new_label

### check new outcome label counts

In [48]:
df_new['outcome'].value_counts()

1    557
2    282
3    161
Name: outcome, dtype: int64

### dump the new dataset as csv

In [49]:
save_as = f"{base_path}/datasets/german_credit_eval_multiclass.csv"
df_new.to_csv(save_as,index=False)

### check file is saved correctly

In [50]:
!ls $base_path/datasets | grep multiclass

german_credit_eval_multiclass.csv
