In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Subset, DataLoader
from torchvision.datasets import DatasetFolder
import numpy as np
from math import ceil
from sklearn.model_selection import train_test_split

## Defining the model, data and transforms

In [2]:
# Define the transform according to the assignment, parameterized by the expected length
class BinaryTransform:
    def __init__(self, input_length: int) -> None:
        """Initializes the single length parameter that defines this transform."""
        self.input_length = input_length

    def __call__(self, binary_data: bytes) -> torch.Tensor:
        """Returns the (1,input_length) sized Tensor of values between 0 and 1.
        The extra dimension is needed for the model's 2D pooling to work properly
        (not to be confused with the batch dimension that should be also added,
        the final batched model input being of shape (N, 1, L))."""
        
        binary_array = np.frombuffer(binary_data, dtype=np.uint8) 
        l = len(binary_array)

        # Pad or truncate the binary data based on < or > case
        # In the == case there is nothing further to do
        if l < self.input_length:
            padding = np.zeros(self.input_length - l, dtype=np.uint8)
            binary_array = np.concatenate((binary_array, padding))
        elif l > self.input_length:
            # In this case the input should be split into non-overlapping windows...
            window_size = ceil(l / self.input_length)
            padding = np.zeros(self.input_length * window_size - l, dtype=np.uint8)
            binary_array = np.concatenate((binary_array, padding))
            # ...and for each window its mean should be taken
            binary_array = binary_array.reshape(-1, window_size)
            binary_array = np.mean(binary_array, axis=1)
            
        # Scale the data to [0, 1]
        scaled_data = binary_array / 255.0
        tensor = torch.tensor(scaled_data, dtype=torch.float32)
        # Add extra dimension for the model to work properly.
        return tensor.unsqueeze(0)

In [3]:
# Building the model according to the assignment on Teams
class ConvNet(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.conv = nn.Conv1d(1, 16, kernel_size=(10,), stride=(1,))
        self.relu = nn.ReLU()
        self.pooling = nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        self.linear = nn.Linear(65496, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.pooling(self.relu(self.conv(x)))
        x = x.view(-1, 65496) # flatten
        x = self.sigmoid(self.linear(x))
        return x.reshape(-1) # keep only the batch dimension

In [4]:
# Define expected input length and transform
input_length = 2**14
transform = BinaryTransform(input_length)

In [5]:
# Using Torchvision's DatasetFolder for easier preprocessing fit for the folder structure
TRAIN_DATA_PATH = "data/train"

train_dataset = DatasetFolder(
    root=TRAIN_DATA_PATH,
    loader=lambda x: open(x, 'rb').read(), # file reading in binary mode
    extensions=('',),
    transform=transform
)

# Splitting the train set further into train and validation sets by defining index subsets
np.random.seed(42)
train_indices = np.arange(len(train_dataset))
np.random.shuffle(train_indices)
train_indices, val_indices = train_test_split(train_indices, test_size=0.3, random_state=42)

# Define the train and validation subsets
train_subset = Subset(train_dataset, train_indices)
val_subset = Subset(train_dataset, val_indices)

## Training the model

In [6]:
# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [7]:
# Data loaders
train_loader = DataLoader(train_subset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_subset, batch_size=64)
# Model
model = ConvNet().to(device)
# Optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.BCELoss()

In [8]:
model

ConvNet(
  (conv): Conv1d(1, 16, kernel_size=(10,), stride=(1,))
  (relu): ReLU()
  (pooling): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (linear): Linear(in_features=65496, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [9]:
# Training loop - even 10 epochs should yield 99% on the validation set without serious overfit
torch.manual_seed(42)

for epoch in range(10):
    model.train()
    # Define accumulating values
    total_train_loss = 0.
    total_train_samples = 0
    # Forward pass along data
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        y_pred = model(X)
        loss = loss_fn(y_pred, y.float()) # type conversion needed
        loss.backward()
        optimizer.step()

        total_train_loss += loss.detach().item()
        total_train_samples += len(X)

    # Validation
    model.eval()
    total_val_loss = 0.
    total_val_correct = 0
    total_val_samples = 0

    with torch.no_grad():
        for X, y in val_loader:
            X, y = X.to(device), y.to(device)
            y_pred = model(X)
            loss = loss_fn(y_pred, y.float())
            pred_label = (y_pred > 0.5)

            total_val_loss += loss.item()
            total_val_correct += pred_label.eq(y).sum().item()
            total_val_samples += len(X)

    train_bce = total_train_loss/total_train_samples
    val_bce = total_val_loss/total_val_samples
    val_acc = total_val_correct / total_val_samples

    print(f"Epoch {epoch}: Train BCE: {train_bce:.8f} Val. BCE: {val_bce:.8f} Val. Acc. {val_acc:.8f}")


Epoch 0: Train BCE: 0.00900685 Val. BCE: 0.00395540 Val. Acc. 0.90788224
Epoch 1: Train BCE: 0.00300047 Val. BCE: 0.00241600 Val. Acc. 0.95251662
Epoch 2: Train BCE: 0.00191229 Val. BCE: 0.00184717 Val. Acc. 0.96296296
Epoch 3: Train BCE: 0.00143395 Val. BCE: 0.00153979 Val. Acc. 0.96866097
Epoch 4: Train BCE: 0.00118846 Val. BCE: 0.00165777 Val. Acc. 0.96866097
Epoch 5: Train BCE: 0.00097784 Val. BCE: 0.00118758 Val. Acc. 0.98005698
Epoch 6: Train BCE: 0.00088844 Val. BCE: 0.00082961 Val. Acc. 0.98670465
Epoch 7: Train BCE: 0.00067490 Val. BCE: 0.00074352 Val. Acc. 0.99050332
Epoch 8: Train BCE: 0.00050040 Val. BCE: 0.00100447 Val. Acc. 0.98290598
Epoch 9: Train BCE: 0.00048660 Val. BCE: 0.00061377 Val. Acc. 0.99430199


In [10]:
# Saving model for later use
torch.save(model.state_dict(), "model.pt")

## Evaluation on test set

In [11]:
from torchmetrics.classification import BinaryAUROC, BinaryStatScores

In [12]:
# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [13]:
# Loading the test set
TEST_DATA_PATH = "data/test"

test_dataset = DatasetFolder(
    root=TEST_DATA_PATH,
    loader=lambda x: open(x, 'rb').read(),
    extensions=('',),
    transform=transform
)
test_loader = DataLoader(test_dataset, batch_size=64)

In [14]:
# Loading model from file: no need to run the training loop adain if the runtime is restarted
model = ConvNet().to(device)
model.load_state_dict(torch.load("model.pt"))
model

ConvNet(
  (conv): Conv1d(1, 16, kernel_size=(10,), stride=(1,))
  (relu): ReLU()
  (pooling): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (linear): Linear(in_features=65496, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [15]:
# Testing loop with additional metrics
roc_metric = BinaryAUROC()
stat_scores = BinaryStatScores()

total_test_correct = 0
total_test_samples = 0

model.eval()

with torch.no_grad():
    for X, y in test_loader:
        X, y = X.to(device), y.to(device)
        y_pred = model(X)
        pred_label = (y_pred > 0.5)

        total_test_correct += pred_label.eq(y).sum().item()
        total_test_samples += len(X)

        roc_metric.update(pred_label.float(), y.float())
        stat_scores.update(pred_label.float(), y.float())

In [16]:
roc_auc = roc_metric.compute()
tp, fp, tn, fn, _ = stat_scores.compute()

In [17]:
print(f"Mean Acc. on test set: {total_test_correct / total_test_samples:.8f}")
print(f"ROC AUC: {roc_auc:.8f}")
print(f"TPR: {(tp / (tp + fn)).item():.8f}")
print(f"TNR: {(tn / (tn + fp)).item():.8f}")
print(f"FPR: {(fp / (fp + tn)).item():.8f}")
print(f"FNR: {(fn / (fn + tp)).item():.8f}")

Mean Acc. on test set: 0.99487179
ROC AUC: 0.99487185
TPR: 0.99487180
TNR: 0.99487180
FPR: 0.00512821
FNR: 0.00512821
