In [65]:
import torch
import pickle
import os
import traceback

import torch.nn as nn
import polars as pl
import numpy as np
import xgboost as xgb

from tqdm import tqdm
from time import time
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [51]:
df_all = pl.read_csv('features.csv')
df_all.shape

(85450, 21)

In [52]:
X_all = df_all.drop('LABEL').to_numpy()
y_all = df_all['LABEL'].apply(lambda y: 0 if y == -1 else 1).to_numpy()
X_all.shape, y_all.shape

((85450, 20), (85450,))

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.33, random_state=447128, shuffle=True, stratify=y_all)

# 1. XGB

In [69]:
start = time()

xgb_model = xgb.XGBRegressor(objective="binary:logistic", random_state=6786122)
xgb_model.fit(X_train, y_train)

pred_values = xgb_model.predict(X_test)
pred_labels = np.round(pred_values)

print(classification_report(y_test, pred_labels))
delta_time = time() - start
print(f'Time taken = {delta_time:.3f}s')

              precision    recall  f1-score   support

           0       0.92      0.92      0.92     19321
           1       0.84      0.84      0.84      8878

    accuracy                           0.90     28199
   macro avg       0.88      0.88      0.88     28199
weighted avg       0.90      0.90      0.90     28199

Time taken = 5.257s


# 2. MLP

In [54]:
# Environment variables
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

N_EPOCHS = 300
LEARNING_RATE = 0.0005
DROPOUT_RATE = 0.3

In [55]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.dropout = nn.Dropout(0.3)
        self.fc1 = nn.Linear(20, 128)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x):
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [56]:
def train_mlp(
        model: nn.Module, 
        dataloader_train: DataLoader, 
        criterion,
        optimizer: torch.optim.Optimizer,
        n_epochs: int,
        X_valid: torch.Tensor,
        y_valid: torch.Tensor):
    # Load validation set to device
    X_valid = X_valid.to(DEVICE)
    y_valid = y_valid.to(DEVICE)

    train_loss_history = []
    valid_loss_history = []

    start_time = time()
    for epoch in range(n_epochs):
        running_train_loss = 0.0
        # Batches
        for inputs, labels in tqdm(dataloader_train):
            model.train()
            # Copy batch to device
            inputs = inputs.to(DEVICE)
            labels = labels.to(DEVICE)

            # Rremove gradients from previous batch
            optimizer.zero_grad()
            # Predict using current model state
            outputs = model(inputs)

            # Compute batch loss
            loss = criterion(outputs, labels)
            # Backpropagate
            loss.backward()
            optimizer.step()

            # Add training loss
            running_train_loss += loss.item()
        # Epoch training loss (divide by num of batches)
        epoch_train_loss = running_train_loss / len(dataloader_train) 
        train_loss_history.append(epoch_train_loss)

        # Epoch validation loss
        with torch.inference_mode():
            outputs_valid = model(X_valid)
            epoch_valid_loss = criterion(outputs_valid, y_valid).item()
            # epoch_valid_loss = loss_valid.item() * X_valid.size(0)
            valid_loss_history.append(epoch_valid_loss)
        print(f"Epoch {epoch+1}/{n_epochs}: Train Loss = {epoch_train_loss:.4f}, Validation Loss = {epoch_valid_loss:.4f}")

    print(f"Took {time() - start_time:.2f} seconds to train in total")
    return train_loss_history, valid_loss_history

In [57]:
def predict(model_with_logit: nn.Module, X: torch.Tensor) -> torch.Tensor:
    was_training = model_with_logit.training
    model_with_logit.eval()
    with torch.inference_mode():
        # Predicted probabilities
        pred_probs = torch.sigmoid(model_with_logit(X.to(DEVICE)))
    # Predicted labels
    if was_training is True:
        model_with_logit.train()
    return torch.round(pred_probs)

def predict_probs(model_with_logit: nn.Module, X: torch.Tensor) -> torch.Tensor:
    was_training = model_with_logit.training
    model_with_logit.eval()
    with torch.inference_mode():
        # Predicted probabilities
        pred_probs = torch.sigmoid(model_with_logit(X.to(DEVICE)))
    # Predicted labels
    if was_training is True:
        model_with_logit.train()
    return pred_probs

In [58]:
def get_accuracy(model_with_logit: nn.Module, X: torch.Tensor, y: torch.Tensor):
    X = X.to(DEVICE)
    y = y.to(DEVICE)
    # Predict labels using model
    pred_labels = predict(model_with_logit, X).to('cpu')
    X = X.to('cpu')
    y = y.to('cpu')
    # Compare predicted with ground truth
    is_equal_tensor = torch.eq(pred_labels.squeeze(), y.squeeze())
    accuracy = is_equal_tensor.sum() / len(X)

    diff = (pred_labels.squeeze() - y.squeeze())
    fpos = diff[diff == -1].shape[0]
    fneg = diff[diff == 1].shape[0]
    
    return accuracy.item(), fpos, fneg

Train and test MLP model

In [62]:
 # Create training and validation tensors
X_train_tensor = torch.from_numpy(X_train).float()
y_train_tensor = torch.from_numpy(y_train.reshape(-1, 1)).float()
X_test_tensor = torch.from_numpy(X_test).float()
y_test_tensor = torch.from_numpy(y_test.reshape(-1, 1)).float()

# Create tensor dataset for training
dataset_train = TensorDataset(X_train_tensor, y_train_tensor)
# torch.manual_seed(2298)
dataloader_train = DataLoader(dataset_train, batch_size=32, shuffle=True)

# Create model
model = MLP().to(DEVICE)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005) # lr=0.00005 seems best with Adam
# Train model
start = time()
train_loss_history, valid_loss_history = train_mlp(
    model, 
    dataloader_train, 
    criterion, 
    optimizer, 
    n_epochs=20, 
    X_valid=X_test_tensor, 
    y_valid=y_test_tensor
)

# Compute accuracy, false positives and false negatives
accuracy_train, fpos_train, fneg_train = get_accuracy(model, X_train_tensor, y_train_tensor)
accuracy_valid, fpos_valid, fneg_valid = get_accuracy(model, X_test_tensor, y_test_tensor)

print(f'Best Training Loss = {np.min(train_loss_history):.4f}')
print(f'Training Accuracy = {accuracy_train:.4f}')
print(f'Training False Positive = {fpos_train}')
print(f'Training False Negative = {fneg_train}')  

print(f'\nBest Test Loss = {np.min(valid_loss_history):.4f}')
print(f'Test Accuracy = {accuracy_valid:.4f}')
print(f'Test False Positive = {fpos_valid}')
print(f'Test False Negative = {fneg_valid}')


100%|██████████| 1790/1790 [00:07<00:00, 240.12it/s]


Epoch 1/20: Train Loss = 4.8669, Validation Loss = 0.7831


100%|██████████| 1790/1790 [00:06<00:00, 262.70it/s]


Epoch 2/20: Train Loss = 0.8089, Validation Loss = 1.0413


100%|██████████| 1790/1790 [00:06<00:00, 261.20it/s]


Epoch 3/20: Train Loss = 0.7304, Validation Loss = 0.7306


100%|██████████| 1790/1790 [00:06<00:00, 260.19it/s]


Epoch 4/20: Train Loss = 0.6670, Validation Loss = 0.6594


100%|██████████| 1790/1790 [00:06<00:00, 256.77it/s]


Epoch 5/20: Train Loss = 0.6421, Validation Loss = 0.6453


100%|██████████| 1790/1790 [00:06<00:00, 256.44it/s]


Epoch 6/20: Train Loss = 0.6071, Validation Loss = 0.5979


100%|██████████| 1790/1790 [00:06<00:00, 264.06it/s]


Epoch 7/20: Train Loss = 0.5918, Validation Loss = 0.5753


100%|██████████| 1790/1790 [00:06<00:00, 265.46it/s]


Epoch 8/20: Train Loss = 0.5817, Validation Loss = 0.5662


100%|██████████| 1790/1790 [00:06<00:00, 265.62it/s]


Epoch 9/20: Train Loss = 0.5798, Validation Loss = 0.5666


100%|██████████| 1790/1790 [00:06<00:00, 270.81it/s]


Epoch 10/20: Train Loss = 0.5757, Validation Loss = 0.5818


100%|██████████| 1790/1790 [00:06<00:00, 272.96it/s]


Epoch 11/20: Train Loss = 0.5727, Validation Loss = 0.5879


100%|██████████| 1790/1790 [00:06<00:00, 270.77it/s]


Epoch 12/20: Train Loss = 0.5746, Validation Loss = 0.5638


100%|██████████| 1790/1790 [00:06<00:00, 271.88it/s]


Epoch 13/20: Train Loss = 0.5709, Validation Loss = 0.5616


100%|██████████| 1790/1790 [00:06<00:00, 272.56it/s]


Epoch 14/20: Train Loss = 0.5746, Validation Loss = 0.5628


100%|██████████| 1790/1790 [00:06<00:00, 279.06it/s]


Epoch 15/20: Train Loss = 0.5746, Validation Loss = 0.5663


100%|██████████| 1790/1790 [00:06<00:00, 272.88it/s]


Epoch 16/20: Train Loss = 0.5762, Validation Loss = 0.6262


100%|██████████| 1790/1790 [00:06<00:00, 275.22it/s]


Epoch 17/20: Train Loss = 0.5752, Validation Loss = 0.5746


100%|██████████| 1790/1790 [00:06<00:00, 271.73it/s]


Epoch 18/20: Train Loss = 0.5708, Validation Loss = 0.5643


100%|██████████| 1790/1790 [00:06<00:00, 267.25it/s]


Epoch 19/20: Train Loss = 0.5719, Validation Loss = 0.5650


100%|██████████| 1790/1790 [00:06<00:00, 273.57it/s]


Epoch 20/20: Train Loss = 0.5762, Validation Loss = 0.5691
Took 135.24 seconds to train in total
Best Training Loss = 0.5708
Training Accuracy = 0.6913
Training False Positive = 17095
Training False Negative = 577

Best Test Loss = 0.5616
Test Accuracy = 0.6912
Test False Positive = 8418
Test False Negative = 289
