In [41]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import DatasetFolder
from torch.utils.tensorboard import SummaryWriter
import numpy as np
from math import ceil
from sklearn.model_selection import train_test_split
from torchsummary import summary
from torchmetrics.classification import AUROC, StatScores

In [14]:
class BinaryTransform:
    def __init__(self, input_length):
        self.input_length = input_length

    def __call__(self, binary_data):
        binary_data = np.frombuffer(binary_data, dtype=np.uint8)
        
        l = len(binary_data)

        # Pad or truncate the binary data
        if l < self.input_length:
            padding = np.zeros(self.input_length - l, dtype=np.uint8)
            binary_data = np.concatenate((binary_data, padding))
        elif l > self.input_length:
            excess = ceil(l / self.input_length)
            padding = np.zeros(self.input_length * excess - l, dtype=np.uint8)
            binary_data = np.concatenate((binary_data, padding))
            binary_data = binary_data.reshape(len(binary_data)//excess, -1)
            binary_data = np.mean(binary_data, axis=1)
            
        # Scale the data to [0, 1]
        scaled_data = binary_data / 255.0
        tensor = torch.tensor(scaled_data, dtype=torch.float32)
        return tensor.unsqueeze(0)

In [5]:
# There are two versions of the assigment, so we created two versions, either of them works

# Assignment on Teams
class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        self.conv1 = nn.Conv1d(1, 16, kernel_size=(10,), stride=(1,))
        self.fc1 = nn.Linear(65496, 1)  # Adjust the input size based on your data size

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        x = x.view(-1, 65496)
        return F.sigmoid(self.fc1(x))

# Assignmanet on Moodle (Linear input size adjusted to meet expected output dimension)
# class ConvNet(nn.Module):
#     def __init__(self):
#         super(ConvNet, self).__init__()
#         self.conv1 = nn.Conv1d(1, 16, kernel_size=(10,), stride=(1,))
#         self.fc1 = nn.Linear(2*65488, 1)  # Adjust the input size based on your data size

#     def forward(self, x):
#         x = F.relu(self.conv1(x))
#         x = F.max_pool1d(x, kernel_size=4, stride=2, padding=0, dilation=1, ceil_mode=False)
#         x = x.view(-1, 2*65488)
#         return F.sigmoid(self.fc1(x))

In [7]:
# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
# Define data paths
train_data_path = "data/train"
test_data_path = "data/test"

In [16]:
# Define the input length and instantiate the transform
input_length = 16384
transform = BinaryTransform(input_length)

In [17]:
# Create dataset
train_dataset = DatasetFolder(root=train_data_path, loader=lambda x: open(x, 'rb').read(), extensions=('',), transform=transform)
test_dataset = DatasetFolder(root=test_data_path, loader=lambda x: open(x, 'rb').read(), extensions=('',), transform=transform)

In [18]:
# Split dataset into train and validation sets
indices = np.arange(len(train_dataset))
np.random.shuffle(indices)
train_indices, val_indices = train_test_split(indices, test_size=0.3, random_state=42)

In [19]:
train_loader = DataLoader(torch.utils.data.Subset(train_dataset, train_indices), batch_size=64, shuffle=True)
val_loader = DataLoader(torch.utils.data.Subset(train_dataset, val_indices), batch_size=64)
test_loader = DataLoader(test_dataset, batch_size=64)

In [23]:
# Instantiate model
model = ConvNet().to(device)

In [24]:
summary(model,(1,16384))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv1d-1            [-1, 16, 16375]             176
            Linear-2                    [-1, 1]          65,497
Total params: 65,673
Trainable params: 65,673
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.06
Forward/backward pass size (MB): 2.00
Params size (MB): 0.25
Estimated Total Size (MB): 2.31
----------------------------------------------------------------


In [25]:
# Define optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

In [14]:
# Set up TensorBoard
writer = SummaryWriter()

In [15]:
# Training loop
best_val_loss = float('inf')
patience = 4
counter = 0

In [16]:
for epoch in range(100):  # You can adjust the number of epochs
    model.train()
    train_loss = 0.0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data).squeeze()
        loss = criterion(output, target.float())
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validate
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(val_loader):
            data, target = data.to(device), target.to(device)
            output = model(data).squeeze()
            loss = criterion(output, target.float())
            val_loss += loss.item()
            total += target.size(0)
            pred_label = output > 0.5
            correct += pred_label.eq(target).sum().item()
        
    # Write to TensorBoard
    writer.add_scalar('Loss/train', train_loss/len(train_loader.dataset), epoch)
    writer.add_scalar('Loss/val', val_loss/len(val_loader.dataset), epoch)
    writer.add_scalar('Accuracy/val', 100.*correct/total, epoch)

    print(f'Epoch {epoch}, Train Loss: {train_loss/len(train_loader.dataset)}, Val Loss: {val_loss/len(val_loader.dataset)}, Val Acc: {100.*correct/total}')

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping")
            break

Epoch 0, Train Loss: 0.015413649771385285, Val Loss: 0.0047024543780094085, Val Acc: 87.55935422602089
Epoch 1, Train Loss: 0.0030089689037350554, Val Loss: 0.002972913504439315, Val Acc: 94.01709401709402
Epoch 2, Train Loss: 0.0020407564971424673, Val Loss: 0.0025795027825907084, Val Acc: 94.3019943019943
Epoch 3, Train Loss: 0.0016685363191824693, Val Loss: 0.002696676627296781, Val Acc: 93.44729344729345
Epoch 4, Train Loss: 0.0012676939959563906, Val Loss: 0.001959722160104333, Val Acc: 95.53656220322887
Epoch 5, Train Loss: 0.0009891373754657522, Val Loss: 0.0012483850956415971, Val Acc: 97.72079772079772
Epoch 6, Train Loss: 0.0008004888809541158, Val Loss: 0.0017632039163017544, Val Acc: 95.82146248812916
Epoch 7, Train Loss: 0.0007825787453982322, Val Loss: 0.0011610416067518287, Val Acc: 97.62583095916429
Epoch 8, Train Loss: 0.0005888685013221006, Val Loss: 0.0010606287434654698, Val Acc: 97.81576448243115
Epoch 9, Train Loss: 0.0006392580124606046, Val Loss: 0.0013218092832

In [17]:
# Close TensorBoard writer
writer.close()

In [18]:
torch.save(model.state_dict(), "model.pt")

In [8]:
saved_model = ConvNet().to(device)
saved_model.load_state_dict(torch.load("model.pt"))

<All keys matched successfully>

In [43]:
roc_metric = AUROC(task="binary")
stat_scores = StatScores(task="binary")

test_loss = 0
total = 0
correct = 0

with torch.no_grad():
    for batch_idx, (data, target) in enumerate(test_loader):
        data, target = data.to(device), target.to(device)
        output = saved_model(data).squeeze()
        loss = criterion(output, target.float())
        test_loss += loss.item()
        total += target.size(0)
        pred_label = output > 0.5
        correct += pred_label.eq(target).sum().item()
        roc_metric.update(pred_label.float(), target.float())
        stat_scores.update(pred_label.float(), target.float())

In [59]:
"Mean acc. on test set: ", correct / total

('Mean acc. on test set: ', 0.9923076923076923)

In [61]:
"ROC AUC: ", roc_metric.compute().item()

('ROC AUC: ', 0.992307722568512)

In [54]:
# tp, fp, tn, fn = stat_scores.compute()
scores = stat_scores.compute()
tp = scores[0]
fp = scores[1]
tn = scores[2]
fn = scores[3]

In [62]:
"TPR: ", (tp / (tp + fn)).item()

('TPR: ', 0.9948717951774597)

In [63]:
"TNR: ", (tn / (tn + fp)).item()

('TNR: ', 0.9897435903549194)

In [65]:
"FPR: ", (fp / (fp + tn)).item()

('FPR: ', 0.010256410576403141)

In [64]:
"FNR: ", (fn / (fn + tp)).item()

('FNR: ', 0.0051282052882015705)