In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import arff
from sklearn import preprocessing
from sklearn.utils import class_weight
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np

In [2]:
dataset =  pd.DataFrame(arff.loadarff('data/4year.arff')[0])
dataset = dataset.drop(columns=["Attr37"])
dataset = dataset.dropna()
dataset.describe()

Unnamed: 0,Attr1,Attr2,Attr3,Attr4,Attr5,Attr6,Attr7,Attr8,Attr9,Attr10,...,Attr55,Attr56,Attr57,Attr58,Attr59,Attr60,Attr61,Attr62,Attr63,Attr64
count,8091.0,8091.0,8091.0,8091.0,8091.0,8091.0,8091.0,8091.0,8091.0,8091.0,...,8091.0,8091.0,8091.0,8091.0,8091.0,8091.0,8091.0,8091.0,8091.0,8091.0
mean,0.048761,0.516031,0.180543,2.745438,81.0556,0.065842,0.060676,2.739699,1.598251,0.464641,...,8532.902,0.027347,-0.01269,0.980542,1.014384,92.263964,11.682148,309.515,7.014592,26.655851
std,0.183423,0.398833,0.377479,6.615317,12317.55,5.053292,0.189032,8.635029,1.181109,0.400167,...,81255.26,1.449081,9.676144,2.182341,28.758891,1996.909462,69.396327,14697.5,9.441743,375.161393
min,-5.9655,0.002567,-12.135,0.021061,-228990.0,-117.42,-5.9655,-1.8482,0.000469,-12.473,...,-713220.0,-121.0,-597.42,-0.086331,-284.38,0.038646,-12.656,1.0821,0.0,0.0
25%,0.003552,0.275615,0.022997,1.0519,-50.3685,0.0,0.005163,0.448995,1.0123,0.301865,...,38.1725,0.005436,0.012816,0.89668,0.0,5.41475,4.43795,45.529,3.022,2.03585
50%,0.042728,0.47153,0.19276,1.566,-2.379,0.0,0.050727,1.0756,1.1451,0.50581,...,1271.2,0.042501,0.10064,0.9605,0.012904,9.501,6.3654,75.564,4.8272,3.9453
75%,0.109865,0.67836,0.384555,2.68875,48.431,0.093481,0.126375,2.53845,1.8891,0.698495,...,5677.95,0.10675,0.23566,0.99507,0.24514,19.3055,9.8296,120.75,8.00805,8.699
max,3.1233,13.063,0.99112,385.57,1034100.0,322.2,3.1233,388.59,16.603,1.577,...,6123700.0,0.99926,226.76,187.0,1661.0,161540.0,4718.6,1301600.0,337.29,19721.0


In [3]:
# get labels and data from the data set
labels = np.array(dataset['class'].values, dtype=np.int32)
data = dataset.loc[:, "Attr1":"Attr64"].values

# normalize the data
min_max_scaler = preprocessing.MinMaxScaler()
data = min_max_scaler.fit_transform(data)
    
# create a random order
order = np.arange(labels.size)
np.random.shuffle(order)
labels = labels[order]
data = data[order]

# split into test and train dataset
split_point = int(len(labels) * 0.8)
train_data = data[:split_point]
test_data = data[split_point:]
train_labels = labels[:split_point]
test_labels = labels[split_point:]

# create also a validation dataset
split_point = int(len(train_labels) * 0.9)
val_data = train_data[split_point:]
val_labels = train_labels[split_point:]
train_data = train_data[:split_point]
train_labels = train_labels[:split_point]

# calculate the class weights
num_of_bankrupt = np.sum(train_labels)
num_of_samples = len(train_labels)
class_w = {0: num_of_bankrupt/num_of_samples, 1: (num_of_samples-num_of_bankrupt)/num_of_samples}

In [4]:
model = tf.keras.Sequential()
model.add(layers.Dense(32, input_dim=63, activation='tanh'))
#model.add(layers.LeakyReLU(alpha=0.1))
model.add(layers.Dense(16, input_dim=63, activation='tanh'))
#model.add(layers.LeakyReLU(alpha=0.1))
model.add(layers.Dense(2, activation='softmax'))
model.compile(optimizer=tf.train.AdamOptimizer(0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(train_data, 
          train_labels, 
          epochs=10, 
          batch_size=64, 
          validation_data=(val_data, val_labels), 
          class_weight=class_w)

Instructions for updating:
Colocations handled automatically by placer.
Train on 5824 samples, validate on 648 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a3cb40828>

In [5]:
model.evaluate(test_data, test_labels, batch_size=64)



[0.32748675977764047, 0.95676345]

In [6]:
bank_test_data = test_data[test_labels == 1]
bank_test_label = np.ones(len(bank_test_data))
model.evaluate(bank_test_data, bank_test_label, batch_size=64)



[1.149758219718933, 0.0952381]

In [7]:
no_bank_test_data = test_data[test_labels == 0]
no_bank_test_label = np.zeros(len(no_bank_test_data))
model.evaluate(no_bank_test_data, no_bank_test_label, batch_size=64)



[0.3055873341059942, 0.9797083]