### Requirements to generate german credit multiclass csv

- keras (tensorflow backend)
- numpy
- certifai_common(encoder for model training)
- scikit-learn

In [1]:
import keras as keras_version_check
import tensorflow as tf_version_check
print(f'keras version {keras_version_check.__version__}')
print(f'tensorflow version {tf_version_check.__version__}')

2023-01-19 20:02:14.805447: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


keras version 2.11.0
tensorflow version 2.11.0


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import random
from certifai.common.utils.encoding import CatEncoder
from keras.callbacks import ReduceLROnPlateau,EarlyStopping,ModelCheckpoint
from keras.models import Sequential
from keras import regularizers
from keras import optimizers
from keras.layers import Dense, Dropout
import os
import random
import tensorflow as tf
seed = 42

os.environ['PYTHONHASHSEED']=str(seed)
np.random.seed(seed)
random.seed(seed)

try:
    tf.random.set_seed(seed)
except AttributeError:
    tf.set_random_seed(seed)

base_path = '../..'
all_data_file = f"{base_path}/datasets/german_credit_eval.csv"

df = pd.read_csv(all_data_file)

cat_columns = [
    'checkingstatus',
    'history',
    'purpose',
    'savings',
    'employ',
    'status',
    'others',
    'property',
    'age',
    'otherplans',
    'housing',
    'job',
    'telephone',
    'foreign'
    ]

label_column = 'outcome'

# Separate outcome
y = df[label_column]
X = df.drop(label_column, axis=1)

encoder = CatEncoder(cat_columns, X)

# NN separate for probability scoring label 
y_model = np.asarray([(1,0) if x==1 else (0,1) for x in y ])

best_model_path = os.path.join('.', 'best_model_keras')
es = EarlyStopping(monitor='accuracy', mode='min', verbose=1,
                   patience=100, min_delta=0.0001)

rlp = ReduceLROnPlateau(monitor='accuracy', factor=0.02, patience=20, verbose=1, mode='min',
                        min_delta=0.001, cooldown=1, min_lr=0.0001)
mcp = ModelCheckpoint(best_model_path, monitor='accuracy', verbose=1,
                      save_best_only=True, save_weights_only=False, mode='max')

model = Sequential()
model.add(Dense(60, input_dim=encoder(X.values).shape[1], activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(2, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(encoder(X.values), y_model, epochs=100, batch_size=10, callbacks=[mcp,rlp,es],validation_split=0.2)
_, accuracy = model.evaluate(encoder(X.values), y_model)
print('Accuracy: %.2f' % (accuracy*100))

Epoch 1/100


2023-01-19 20:02:54.724123: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1: accuracy improved from -inf to 0.62500, saving model to ./best_model_keras
Epoch 2/100
Epoch 2: accuracy improved from 0.62500 to 0.71125, saving model to ./best_model_keras
Epoch 3/100
Epoch 3: accuracy improved from 0.71125 to 0.75125, saving model to ./best_model_keras
Epoch 4/100
Epoch 4: accuracy improved from 0.75125 to 0.76875, saving model to ./best_model_keras
Epoch 5/100
Epoch 5: accuracy improved from 0.76875 to 0.77500, saving model to ./best_model_keras
Epoch 6/100
Epoch 6: accuracy improved from 0.77500 to 0.78875, saving model to ./best_model_keras
Epoch 7/100
Epoch 7: accuracy improved from 0.78875 to 0.79750, saving model to ./best_model_keras
Epoch 8/100
Epoch 8: accuracy did not improve from 0.79750
Epoch 9/100
Epoch 9: accuracy improved from 0.79750 to 0.81125, saving model to ./best_model_keras
Epoch 10/100
Epoch 10: accuracy improved from 0.81125 to 0.81750, saving model to ./best_model_keras
Epoch 11/100
Epoch 11: accuracy improved from 0.81750 to 0.8237

In [3]:
# initial value counts of outcome label
df['outcome'].value_counts()

1    700
2    300
Name: outcome, dtype: int64

In [4]:
y_preds = model.predict(encoder(X.values))
y_classes = np.argmax(y_preds,axis=1)
y_actual = np.where(y==1, 0,1)
np.mean(y_actual == y_classes)



0.943

### creating new label

In [5]:
# loan_granted 1
new_label_1 = y_preds[y_preds[:,0] > 0.98][:,0]
new_label_1.shape

(598,)

In [6]:
#loan_denied 2
new_label_2 = y_preds[y_preds[:,0] < 0.3][:,0]
new_label_2.shape

(282,)

In [7]:
# further_inspection 3
new_label_3 = y_preds[(y_preds[:,0] > 0.3) & (y_preds[:,0] < 0.98)][:,0]
new_label_3.shape

(120,)

### arrange the new labels to be appended to dataset

In [8]:
new_label = []
for z in y_preds:
    prob_class_1 = z[0]
    if prob_class_1 > 0.98:
        new_label.append(1)
    elif prob_class_1 < 0.3:
        new_label.append(2)
    else:
        new_label.append(3)

In [9]:
# test for sanity of new labels
np.mean(y==new_label)

0.845

### create dataset with new labels

In [10]:
df_new = df.copy()
df_new[label_column] = new_label

### check new outcome label counts

In [11]:
df_new['outcome'].value_counts()

1    598
2    282
3    120
Name: outcome, dtype: int64

### dump the new dataset as csv

In [12]:
save_as = f"{base_path}/datasets/german_credit_eval_multiclass.csv"
df_new.to_csv(save_as,index=False)

### check file is saved correctly

In [13]:
!ls $base_path/datasets | grep multiclass.csv

german_credit_eval_multiclass.csv
