In [52]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd

## load dataset for training and testing

In [53]:
train = pd.read_csv('./../train_balanced.csv')
# train.drop(['Unnamed: 0'], axis=1, inplace=True)

test = pd.read_csv('./../test_balanced.csv')
# test.drop(['Unnamed: 0'], axis=1, inplace=True)

In [54]:
train.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,120000,0,2,1,25,0,0,0,-2,-2,...,0,0,0,6000,0,0,0,0,0,0
1,30000,0,3,1,41,1,2,0,0,2,...,30566,29229,26260,0,2000,2200,0,1100,1100,0
2,20000,0,2,1,32,3,2,2,7,7,...,2400,2400,2400,0,0,0,0,0,0,1
3,200000,0,1,2,28,-1,-1,2,-1,-1,...,1117,1117,0,26286,0,1117,1117,0,1128,0
4,50000,1,1,2,28,0,0,-1,-1,-1,...,11132,0,0,5224,19906,11132,0,0,0,0


In [55]:
X_train = train.iloc[:, 0:23].values
y_train = train.iloc[:, 23].values
# print(y_train)
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)

In [56]:
X_test = test.iloc[:, 0:23].values
y_test = test.iloc[:, 23].values

X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1)

In [57]:
train.head(10)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,120000,0,2,1,25,0,0,0,-2,-2,...,0,0,0,6000,0,0,0,0,0,0
1,30000,0,3,1,41,1,2,0,0,2,...,30566,29229,26260,0,2000,2200,0,1100,1100,0
2,20000,0,2,1,32,3,2,2,7,7,...,2400,2400,2400,0,0,0,0,0,0,1
3,200000,0,1,2,28,-1,-1,2,-1,-1,...,1117,1117,0,26286,0,1117,1117,0,1128,0
4,50000,1,1,2,28,0,0,-1,-1,-1,...,11132,0,0,5224,19906,11132,0,0,0,0
5,260000,1,1,1,51,1,-2,-2,-2,-2,...,6682,1718,1512,0,0,6882,1734,1512,0,0
6,110000,0,3,1,46,0,0,2,2,0,...,100188,102110,107014,8942,3800,0,3685,6580,4703,1
7,80000,0,1,1,34,-1,2,2,2,2,...,5092,6008,5618,0,1500,0,1000,0,1000,1
8,140000,0,2,1,27,-2,-2,-2,-2,-2,...,0,2226,0,0,1650,0,2226,0,656,1
9,130000,0,1,2,26,0,0,0,0,0,...,10079,10289,10495,1152,1164,360,372,376,540,1


In [58]:
from sklearn.preprocessing import StandardScaler
from pickle import dump , load

columns_to_standardize = list(range(0, 23))
scaler = StandardScaler()

X_train_np = X_train.numpy()
X_test_np = X_test.numpy()
print(X_test_np.shape)

# Fit the scaler on the training data columns
scaler.fit(X_train_np[:, columns_to_standardize])

# Standardize the selected columns in both X_train and X_test
X_train_np[:, columns_to_standardize] = scaler.transform(X_train_np[:, columns_to_standardize])
X_test_np[:, columns_to_standardize] = scaler.transform(X_test_np[:, columns_to_standardize])

# Convert back to PyTorch tensors
X_train = torch.from_numpy(X_train_np).type(torch.float) 
X_test = torch.from_numpy(X_test_np).type(torch.float)


# save standard scaler model.
# save model
dump(scaler, open('StandardScaler.pkl', 'wb'))

# load model
scaler = load(open('StandardScaler.pkl', 'rb'))

(2655, 23)


In [59]:
X_train

tensor([[-0.2711, -0.8309,  0.1622,  ..., -0.3148, -0.2964, -0.2806],
        [-0.9755, -0.8309,  1.4549,  ..., -0.3148, -0.2203, -0.2126],
        [-1.0538, -0.8309,  0.1622,  ..., -0.3148, -0.2964, -0.2806],
        ...,
        [-0.6624,  1.2036,  0.1622,  ..., -0.0819, -0.1228, -0.2806],
        [ 0.1986, -0.8309,  0.1622,  ...,  1.4956, -0.2272, -0.2188],
        [-0.8190, -0.8309,  0.1622,  ..., -0.2614, -0.1580, -0.2393]])

In [60]:
class TaiwaneseCreditClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden1 = nn.Linear(23, 64)
        self.act1 = nn.ReLU()
        self.hidden2 = nn.Linear(64, 128)
        self.act2 = nn.ReLU()
        self.hidden3 = nn.Linear(128, 32)
        self.act3 = nn.ReLU()
        self.hidden4 = nn.Linear(32, 16)
        self.act4 = nn.ReLU()
        self.output = nn.Linear(16, 1)
        self.act_output = nn.Sigmoid()
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        x = self.dropout(self.act1(self.hidden1(x)))
        x = self.dropout(self.act2(self.hidden2(x)))
        x = self.dropout(self.act3(self.hidden3(x)))
        x = self.dropout(self.act4(self.hidden4(x)))
        # x = self.output(x)
        x = self.act_output(self.output(x))
        return x

model = TaiwaneseCreditClassifier()
print(model)

# loss function and optimizer
loss_fn = nn.BCELoss() # binary cross entropy
optimizer = optim.Adam(model.parameters(), lr=0.001)

TaiwaneseCreditClassifier(
  (hidden1): Linear(in_features=23, out_features=64, bias=True)
  (act1): ReLU()
  (hidden2): Linear(in_features=64, out_features=128, bias=True)
  (act2): ReLU()
  (hidden3): Linear(in_features=128, out_features=32, bias=True)
  (act3): ReLU()
  (hidden4): Linear(in_features=32, out_features=16, bias=True)
  (act4): ReLU()
  (output): Linear(in_features=16, out_features=1, bias=True)
  (act_output): Sigmoid()
  (dropout): Dropout(p=0.1, inplace=False)
)


In [61]:
def evaluate(model, X_test, y_test):
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test)
        # print("y_pred :",  y_pred)
        # print("y_round  :",  y_pred.round())
    accuracy = (y_pred.round() == y_test).float().mean()
    # print(y_pred, y_test)
    return accuracy

In [62]:
n_epochs = 50
batch_size = 128


accuracy = evaluate(model, X_test, y_test)
print('accuracy before training {}'.format(accuracy))
for epoch in range(n_epochs):
    optimizer.zero_grad()
    for i in range(0, len(X_train), batch_size):
        Xbatch = X_train[i:i+batch_size]
        y_pred = model(Xbatch)
        ybatch = y_train[i:i+batch_size]
        loss = loss_fn(y_pred, ybatch)
        # print(loss)
        loss.backward()
        optimizer.step()
    # print(loss.tolist())
    # evaluate model on each epoch
    accuracy = evaluate(model, X_test, y_test)
    print(f'Finished epoch {epoch},  accuracy {accuracy}')

accuracy before training 0.4971751272678375
Finished epoch 0,  accuracy 0.5905838012695312
Finished epoch 1,  accuracy 0.6463276743888855
Finished epoch 2,  accuracy 0.647834300994873
Finished epoch 3,  accuracy 0.6508474349975586
Finished epoch 4,  accuracy 0.6813559532165527
Finished epoch 5,  accuracy 0.6858757138252258
Finished epoch 6,  accuracy 0.644444465637207
Finished epoch 7,  accuracy 0.6470809578895569
Finished epoch 8,  accuracy 0.677589476108551
Finished epoch 9,  accuracy 0.6802259683609009
Finished epoch 10,  accuracy 0.6726930141448975
Finished epoch 11,  accuracy 0.6158192157745361
Finished epoch 12,  accuracy 0.6455743908882141
Finished epoch 13,  accuracy 0.6817325949668884
Finished epoch 14,  accuracy 0.6813559532165527
Finished epoch 15,  accuracy 0.661770224571228
Finished epoch 16,  accuracy 0.6726930141448975
Finished epoch 17,  accuracy 0.6911487579345703
Finished epoch 18,  accuracy 0.6922787427902222
Finished epoch 19,  accuracy 0.6821092367172241
Finished e

In [63]:
accuracy = evaluate(model, X_test, y_test)
print(accuracy)

tensor(0.6945)


## Save/load model and evaluate 

In [64]:
torch.save(model, 'Taiwanese_credit_model')

In [65]:
model = torch.load('Taiwanese_credit_model')
model.eval()
accuracy = evaluate(model, X_test, y_test)
print(accuracy)

tensor(0.6945)


## create util functions for evolution

In [66]:
def scale_input_data(scaler, df, columns_to_standardize = list(range(23))):
    numpy_array = torch.tensor(df.iloc[:, 0:23].values, dtype=torch.float32)
    numpy_array = numpy_array.numpy()
    numpy_array[:, columns_to_standardize] = scaler.transform(numpy_array[:, columns_to_standardize])
    return numpy_array
    

In [67]:
scale_input_data(scaler, test, columns_to_standardize = list(range(23)))

array([[ 0.04203123, -0.83086455, -1.1305646 , ..., -0.14366117,
        -0.15796189, -0.28059798],
       [-0.584143  ,  1.2035656 , -1.1305646 , ..., -0.15949832,
        -0.29640576, -0.21879433],
       [-0.8972301 ,  1.2035656 , -1.1305646 , ..., -0.3147646 ,
        -0.08873996, -0.0333833 ],
       ...,
       [ 0.7464772 ,  1.2035656 ,  0.16218762, ..., -0.3147646 ,
         0.13277024,  0.10258478],
       [ 1.3726513 , -0.83086455, -1.1305646 , ..., -0.12844507,
        -0.08873996, -0.15699065],
       [ 0.35511833,  1.2035656 , -1.1305646 , ..., -0.28402188,
        -0.26899388, -0.25612375]], dtype=float32)

In [68]:
X_test_np.shape

(2655, 23)

In [51]:
import numpy as np
# columns_to_standardize = [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13]
# 
# def eval_model(model, input):
#         # print(input)
#         model.eval()
#         with torch.no_grad():
#             prob = model(input)
#             # print(prob)
#         return 1 - prob.tolist()[0][0]
# 
# for i in range(20):
#     x_in = test.iloc[i, 0:14].values.reshape(-1, 14)
#     # print(x_in)
#     x_in = torch.tensor(x_in, dtype=torch.float32)
#     x_in = x_in.numpy()
#     x_in[:, columns_to_standardize] = scaler.transform(x_in[:, columns_to_standardize])
#     
#     # Convert back to PyTorch tensors
#     x_in = torch.from_numpy(x_in).type(torch.float) 
#     # print(x_in)
#     
#     print( eval_model(model, x_in))