# Default of Credit Card Clients Dataset
## 1: Clean Data, Save Case Base and Models

In [1]:
import pickle
import pandas as pd
import numpy as np

from keras.models import Sequential
from keras.layers import Dense, Activation

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix

Using TensorFlow backend.


In [2]:
# Load the raw spreadsheet into a DataFrame.
RAW_DATA_PATH = "default of credit card clients.xls"
df = pd.read_excel(RAW_DATA_PATH)

In [3]:
# Pull the target column out of the frame and discard the row identifier,
# leaving only predictor columns in df.
y = df.pop("credible")
df = df.drop(columns=["ID"])

# Fully one-hot encode the categorical columns to get a high-dimensional
# space; a single call appends the dummy groups in the listed order.
df = pd.get_dummies(df, columns=["SEX", "EDUCATION", "MARRIAGE"])

In [4]:
# Scale every feature column into the range [-1, 1].
# NOTE(review): the scaler is fit on the full matrix, before the train/test
# split performed in a later cell, so held-out rows contribute to the
# min/max used for scaling -- confirm this is intended.
scaler = MinMaxScaler(feature_range=(-1, 1), copy=True)
scaler.fit(df.values)
X = scaler.transform(df.values)

# np.asarray (rather than y.values) keeps this cell re-runnable: on a second
# execution y is already an ndarray, which has no .values attribute.
y = np.asarray(y)



## Make Case Base

In [5]:
# Persist the encoded frame (keeps the column names etc. for later use).
PROCESSED_DF_PATH = "processed_df.csv"
df.to_csv(PROCESSED_DF_PATH, index=False)

In [6]:
# Make case base: hold out 20% of the rows for testing; the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Train a Keras MLP

In [7]:
# Single-hidden-layer MLP: the hidden layer is as wide as the input and is
# followed by one sigmoid unit. Activations are kept as separate layers so
# the layer stack matches an add()-by-add() construction.
n_features = X_train.shape[1]
model = Sequential([
    Dense(n_features, input_dim=n_features),
    Activation("relu"),
    Dense(1),
    Activation("sigmoid"),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# NOTE(review): no random seed is set in this notebook before training, so
# fitted weights and the scores below may vary between runs.
model.fit(X_train, y_train, batch_size=8, epochs=30)

# Evaluate on both splits and print the entry at index 1 of the metric list
# (the one metric requested in compile()).
for split_label, features, targets in (
    ("Training Set:", X_train, y_train),
    ("Test Set:", X_test, y_test),
):
    scores = model.evaluate(features, targets)
    print(split_label, "\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Training Set: 
acc: 82.21%
Test Set: 
acc: 81.75%


In [8]:
# Brute-force neighbour search for maximum reliability in experiments;
# n_neighbors=1 means each prediction comes from the single closest training row.
knn_clf = KNeighborsClassifier(algorithm="brute", n_neighbors=1)
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform')

In [9]:
# Predict the held-out rows with the 1-NN classifier and report its accuracy.
knn_predictions_test = knn_clf.predict(X_test)
knn_test_accuracy = accuracy_score(y_test, knn_predictions_test)
print("k-NN Accuracy Test:", knn_test_accuracy)

k-NN Accuracy Test: 0.732


In [10]:
# Confusion matrix of the k-NN test predictions (default label order, unweighted).
confusion_matrix(y_true=y_test, y_pred=knn_predictions_test)

array([[3852,  835],
       [ 773,  540]])

In [11]:
# Confusion matrix of the MLP on the test set.
# Sequential.predict_classes is no longer available in recent Keras/TensorFlow
# releases; thresholding the sigmoid output at 0.5 is the equivalent for a
# single-unit output, and ravel() hands scikit-learn a flat label vector.
nn_predictions_test = (model.predict(X_test) > 0.5).astype("int32").ravel()
confusion_matrix(y_test, nn_predictions_test)

array([[4459,  228],
       [ 867,  446]])

In [12]:
# Serialise the fitted k-NN classifier. The context manager guarantees the
# file handle is flushed and closed, even if pickling raises.
with open('k-nn_model.sav', 'wb') as knn_file:
    pickle.dump(knn_clf, knn_file)

In [13]:
# Save the trained Keras model to NN.h5.
model.save(filepath="NN.h5")

In [14]:
# Persist the train/test split arrays; np.save appends the ".npy" extension.
for array_name, array in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    np.save(array_name, array)