In [5]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score

In [6]:
# Read the four pre-split arrays in one pass; the tuple order matches the path order.
train_data, train_label, test_data, test_label = (
    np.load(path)
    for path in ('train_data.npy', 'train_label.npy', 'test_data.npy', 'test_label.npy')
)
# Presumably the predictions saved by the earlier decision-tree notebook (name-based
# guess; verify against the file that wrote tree_pred.csv). Parsed CSV -> NumPy array.
tree_pred = pd.read_csv('tree_pred.csv').to_numpy()

# Decision Tree (Continued)

In [7]:
# Fraction of test rows where the loaded tree predictions match the true labels.
accuracy_score(y_true=test_label, y_pred=tree_pred)

0.44954128440366975

In [8]:
# Per-class breakdown: rows index the true labels, columns the predicted labels.
confusion_matrix(y_true=test_label, y_pred=tree_pred)

array([[ 28,   3,   0,  18,   8,   0,  19,   5],
       [  3,  24,   1,  11,  14,   0,  16,   0],
       [  0,   0,  25,   8,  15,   1,   1,   0],
       [ 12,   2,   5,  85,  30,   0,  32,   3],
       [ 10,   5,   7,  34,  71,   0,  37,   0],
       [  0,   0,   7,   3,  12,   1,   1,   0],
       [ 11,   4,   1,  15,  23,   0, 106,   0],
       [ 10,   0,   0,  20,   4,   0,   9,   3]])

# Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
# Random forest with default hyperparameters; random_state pins the seed so the
# reported accuracy is repeatable across runs.
rf = RandomForestClassifier(random_state=1000)
rf.fit(train_data, train_label)
pred = rf.predict(test_data)
# scikit-learn's signature is accuracy_score(y_true, y_pred): pass the ground truth
# first, as the other model cells in this notebook do. Accuracy is symmetric, so the
# printed value is unchanged, but the call now reads correctly and stays safe if it is
# later swapped for an asymmetric metric.
accuracy_score(test_label, pred)

0.5045871559633027

# Gradient Boosting

In [44]:
from sklearn.ensemble import GradientBoostingClassifier

In [63]:
# Gradient boosting with default hyperparameters and a fixed seed.
# fit() returns the estimator itself, so construction and fitting are chained.
gbm = GradientBoostingClassifier(random_state=999).fit(train_data, train_label)
pred = gbm.predict(test_data)
accuracy_score(y_true=test_label, y_pred=pred)

0.4875491480996068

# XGBoost

In [46]:
import xgboost as xgb

In [51]:
# Wrap the arrays in XGBoost's DMatrix container. Only the training matrix carries
# labels; the test matrix is used for prediction only.
dtrain = xgb.DMatrix(train_data, label=train_label)
dtest = xgb.DMatrix(test_data)

In [67]:
# Booster configuration: softmax objective over 8 classes, seeded for repeatability.
parameters = {
    'objective': 'multi:softmax',
    'num_class': 8,
    'seed': 424,
}
# No explicit round count is passed, so xgb.train uses its default number of rounds.
bst = xgb.train(params=parameters, dtrain=dtrain)
pred = bst.predict(data=dtest)
accuracy_score(y_true=test_label, y_pred=pred)

0.48230668414154654

# Neural Network

In [81]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [78]:
class BillboardDataset(Dataset):
    """In-memory dataset pairing a feature matrix with one label per row.

    Args:
        data: Feature matrix. May be a pandas DataFrame (anything exposing
            ``to_numpy()``), a NumPy array, a tensor, or a nested sequence.
        label: Per-row targets, in any of the same container types.
        data_dtype: dtype the features are stored as. Defaults to ``torch.float32``,
            the default parameter dtype of ``nn.Linear``; NumPy's default float64
            would otherwise raise a dtype-mismatch error in the first layer.
        label_dtype: dtype the labels are stored as. Defaults to ``torch.long``,
            which ``nn.CrossEntropyLoss`` requires for class-index targets.

    Raises:
        ValueError: if ``data`` and ``label`` disagree on the number of rows.
    """
    def __init__(self, data, label,
                 data_dtype=torch.float32, label_dtype=torch.long) -> None:
        super(BillboardDataset, self).__init__()
        self.data = self._to_tensor(data, data_dtype)
        self.label = self._to_tensor(label, label_dtype)
        # Fail at construction rather than with an index error mid-epoch.
        if self.label.dim() == 0 or self.data.size(0) != self.label.size(0):
            raise ValueError(
                f'data and label must have the same number of rows, got '
                f'{tuple(self.data.shape)} and {tuple(self.label.shape)}'
            )

    @staticmethod
    def _to_tensor(values, dtype):
        """Copy ``values`` into a new tensor of ``dtype`` regardless of container type."""
        if isinstance(values, torch.Tensor):
            return values.detach().clone().to(dtype)
        # DataFrames/Series need unwrapping; ndarrays (e.g. from np.load) have no
        # to_numpy() and are passed to torch.tensor directly.
        if hasattr(values, 'to_numpy'):
            values = values.to_numpy()
        return torch.tensor(values, dtype=dtype)

    def __len__(self):
        """Number of rows (size of the first feature dimension)."""
        return self.data.size(0)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.data[index], self.label[index]

In [79]:
# Wrap the train and test splits so they can be fed to a DataLoader.
train_dataset = BillboardDataset(data=train_data, label=train_label)
test_dataset = BillboardDataset(data=test_data, label=test_label)

In [80]:
class NN(nn.Module):
    """Fully connected classifier: input_dim -> 32 -> 64 -> 64 -> num_class.

    ReLU follows each of the three hidden layers. The output layer is left linear so
    the network emits raw, unnormalized class scores (logits).

    Args:
        input_dim: Number of input features per sample.
        num_class: Number of output classes (width of the logits).
    """
    def __init__(self, input_dim, num_class) -> None:
        super(NN, self).__init__()
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(input_dim, 32)
        self.fc2 = nn.Linear(32, 64)
        self.fc3 = nn.Linear(64, 64)
        self.out = nn.Linear(64, num_class)

    def forward(self, x):
        """Map a batch of feature rows to a ``(batch, num_class)`` tensor of logits."""
        h1 = self.relu(self.fc1(x))
        h2 = self.relu(self.fc2(h1))
        h3 = self.relu(self.fc3(h2))
        # No activation here: nn.CrossEntropyLoss applies log-softmax itself and
        # expects unbounded scores. A ReLU would clamp every negative score to 0 and
        # zero its gradient, so the affected classes could not be pushed down.
        logits = self.out(h3)
        return logits

In [None]:
def eval(model: nn.Module, dataset: Dataset) -> float:
    """Compute top-1 classification accuracy of ``model`` over ``dataset``.

    NOTE: the name shadows Python's built-in ``eval``; it is kept so existing calls
    keep working.

    Args:
        model: Module mapping a batch of features to ``(batch, num_class)`` scores.
        dataset: Dataset yielding ``(features, label)`` pairs.

    Returns:
        The fraction of samples whose arg-max prediction equals the label, as a plain
        Python float.

    Raises:
        ZeroDivisionError: if ``dataset`` is empty.

    Side effects:
        Switches ``model`` to evaluation mode and leaves it there; call
        ``model.train()`` afterwards to resume training.
    """
    dataloader = DataLoader(dataset, batch_size=128)
    model.eval()
    correct = 0
    total = 0
    # Inference only: skip building the autograd graph to save memory and time.
    with torch.no_grad():
        for data, label in dataloader:
            logits = model(data)
            pred = torch.argmax(logits, 1)
            # Align label to pred's shape so (batch, 1) labels cannot silently
            # broadcast into a (batch, batch) comparison.
            matches = pred == label.reshape(pred.shape)
            # .item() keeps the running count a Python int, so the result is a float.
            correct += matches.sum().item()
            total += len(data)
    return correct / total

def train(model: nn.Module,
          train_dataset: BillboardDataset,
          val_dataset: BillboardDataset,
          num_epochs: int=50,
          lr: float=0.001,
          batch_size: int=64):
    
    model.train()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr)
    train_dataloader = DataLoader(train_dataset, batch_size)

    for i in tqdm(range(num_epochs), desc='Training'):
        print(f'Training epoch {i}')
        logits = model(train_dataloader)