In [46]:
# This model was my first attempt at this project.
# It is far from ideal, and parts of the process are unreasonable.

import numpy as np
import pandas as pd
import torch

# Pre-processing: load the CSV and turn every column into a number so the
# whole table fits in a single float tensor.
data_path = "./data_training.csv"
data_pd = pd.read_csv(data_path)

# Split the columns by dtype: object columns on one side, everything else on the other.
numerical_data = data_pd.select_dtypes(exclude=["object"]).to_numpy()
categorical_data = data_pd.select_dtypes(include=["object"])

# One-hot encode the object columns.
categorical_data_onehot = pd.get_dummies(categorical_data).to_numpy()

# Column order in the merged table: numeric columns first, one-hot columns after them.
merged_data = np.hstack((numerical_data, categorical_data_onehot))

# Convert the merged array to a float32 tensor; the bare name below displays it.
data = torch.from_numpy(merged_data).float()
data

tensor([[43.,  3., 33.,  ...,  0.,  0.,  0.],
        [49.,  4., 33.,  ...,  0.,  0.,  0.],
        [29.,  1., 36.,  ...,  0.,  0.,  0.],
        ...,
        [46.,  3., 41.,  ...,  0.,  0.,  1.],
        [49.,  4., 38.,  ...,  0.,  0.,  0.],
        [41.,  2., 33.,  ...,  0.,  0.,  1.]])

In [47]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Training on device {device}.")

Training on device cuda.


In [48]:
# Pull the label out of the merged table: it is column 14, taken for every row.
ground_truth = data.select(1, 14)
# Show the labels together with their shape and value range as a sanity check.
ground_truth, ground_truth.shape, torch.min(ground_truth), torch.max(ground_truth)

(tensor([1., 1., 1.,  ..., 1., 1., 1.]),
 torch.Size([8101]),
 tensor(1.),
 tensor(2.))

In [49]:
# Remove the label column (index 14) from the feature matrix so the target is
# not fed to the model as an input.
# NOTE: this rebinds `data`, so re-running the cell removes one more column.
label_col = 14
features_before = data[:, :label_col]
features_after = data[:, label_col + 1:]
data = torch.cat([features_before, features_after], dim=1)
data.shape

torch.Size([8101, 37])

In [50]:
# Turn the integer labels into one-hot rows.
ground_truth = ground_truth.long()
# Encode over three slots (indices 0, 1, 2), then drop slot 0 so that
# label 1 -> [1, 0] and label 2 -> [0, 1].
onehot_all_slots = torch.nn.functional.one_hot(ground_truth, num_classes=3)
truth_onehot = onehot_all_slots.to(torch.get_default_dtype())[:, 1:]
truth_onehot,truth_onehot.shape,truth_onehot[7]

(tensor([[1., 0.],
         [1., 0.],
         [1., 0.],
         ...,
         [1., 0.],
         [1., 0.],
         [1., 0.]]),
 torch.Size([8101, 2]),
 tensor([0., 1.]))

In [51]:
# Hold-out split: shuffle the row indices once and reserve 20% of them for validation.
VAL_FRACTION = 0.2
SPLIT_SEED = 42

n_samples = data.shape[0]
n_val = int(VAL_FRACTION * n_samples)
n_train = n_samples - n_val

# A dedicated, seeded generator makes the split identical on every run without
# touching PyTorch's global random state.
split_rng = torch.Generator().manual_seed(SPLIT_SEED)
shuffled_indices = torch.randperm(n_samples, generator=split_rng)

# Slice from the front by n_train instead of from the back by -n_val: when
# n_val == 0, `[:-0]` is empty and `[-0:]` is the whole tensor, which would
# swap the two partitions.
train_indices = shuffled_indices[:n_train]    # first 80% of the shuffled indices -> training
val_indices = shuffled_indices[n_train:]      # remaining 20% -> validation

# Split features and one-hot labels with the same indices and move them to the training device.
train_set_x = data[train_indices].to(device=device)
train_set_y = truth_onehot[train_indices].to(device=device)
val_set_x = data[val_indices].to(device=device)
val_set_y = truth_onehot[val_indices].to(device=device)

train_set_x.shape,train_set_y.shape,val_set_x.shape,val_set_y.shape

(torch.Size([6481, 37]),
 torch.Size([6481, 2]),
 torch.Size([1620, 37]),
 torch.Size([1620, 2]))

In [56]:
import torch.nn as nn
import torch.optim as optim

# construct a model
model = nn.Sequential(
    nn.Linear(37, 1024),
    nn.ReLU(),
    nn.Linear(1024, 512),
    nn.ReLU(),
    nn.Linear(512, 128),
    nn.ReLU(),
    nn.Linear(128, 2),
    nn.Softmax(dim=1)
)

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-2)     # use Adam becuase the data set is not normalized

def training_loop(n_epochs, optimizer, model, loss_fn,x_train,x_val,y_train,y_val):
    for epoch in range(1, n_epochs + 1):
        # train and calculate the loss
        y_train_pred = model(x_train)
        loss_train = loss_fn(y_train_pred, y_train)
        y_val_pred = model(x_val)
        loss_val = loss_fn(y_val_pred, y_val)
        
        # Auto_grad
        optimizer.zero_grad()
        loss_train.backward()
        optimizer.step()
        
        if epoch == 1 or epoch % 100 == 0:
            print('Epoch {}, Training loss {}, Validation loss {}'.format(
                epoch, float(loss_train), float(loss_val)))
            correct = 0
            i = 0
            while(i < 1620):
                if((y_val_pred[i] == y_val[i]).all()):    # check the prediction
                    correct += 1
                i += 1
            total = y_val.size(0)
            accuracy = correct / total
            print(f"Accuracy: {accuracy * 100:.2f}%")

# Run 3000 full-batch epochs. The model is moved to the training device as part
# of building the arguments (nn.Module.to returns the module itself).
run_config = dict(
    n_epochs=3000,
    optimizer=optimizer,
    model=model.to(device=device),
    loss_fn=loss_fn,
    x_train=train_set_x,
    y_train=train_set_y,
    x_val=val_set_x,
    y_val=val_set_y,
)
training_loop(**run_config)



Epoch 1, Training loss 1.1160508394241333, Validation loss 1.1303942203521729
Accuracy: 5.19%
Epoch 100, Training loss 0.4748108685016632, Validation loss 0.470669150352478
Accuracy: 84.26%
Epoch 200, Training loss 0.4748108685016632, Validation loss 0.470669150352478
Accuracy: 84.26%
Epoch 300, Training loss 0.4748108685016632, Validation loss 0.470669150352478
Accuracy: 84.26%
Epoch 400, Training loss 0.4748108685016632, Validation loss 0.470669150352478
Accuracy: 84.26%
Epoch 500, Training loss 0.4748108685016632, Validation loss 0.470669150352478
Accuracy: 84.26%
Epoch 600, Training loss 0.4748108685016632, Validation loss 0.470669150352478
Accuracy: 84.26%
Epoch 700, Training loss 0.4748108685016632, Validation loss 0.470669150352478
Accuracy: 84.26%
Epoch 800, Training loss 0.4748108685016632, Validation loss 0.470669150352478
Accuracy: 84.26%
Epoch 900, Training loss 0.4748108685016632, Validation loss 0.470669150352478
Accuracy: 84.26%
Epoch 1000, Training loss 0.47481086850166