In [1]:
import torch
from PIL import Image
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.metrics import roc_auc_score
from torch.nn.functional import relu
from torchvision.io import read_image
import matplotlib.pyplot as plt

In [2]:
# Root folder that holds every split used in this notebook.
DATA_PATH = Path('/').joinpath('home', 'user', 'data')
# Training images: one sub-folder per entry of TARGETS.
TRAIN_DATA_PATH = DATA_PATH.joinpath('ML', 'train')
# NOTE(review): unlike the training split, this path has no 'ML' component —
# confirm that this is the intended location of the test images.
TEST_DATA_PATH = DATA_PATH.joinpath('test')
# Class sub-folder names; the integer after the last underscore is the label.
TARGETS = [
    'target_0',
    'target_1',
]

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Define the dataset and data loader

In [4]:
class CamelyonDataset(torch.utils.data.Dataset):
    """Image-classification dataset read from one sub-folder per class.

    Every ``*.jpg`` file found directly under ``data_path / target`` becomes
    one sample. Its label is the integer that follows the last underscore of
    the folder name (``'target_1'`` -> ``1``).

    Args:
        data_path: ``pathlib.Path`` of the folder containing the class
            sub-folders.
        targets: Names of the class sub-folders to read.
        transform: Optional callable applied to the ``(image, label)`` tuple
            before it is returned by ``__getitem__``.

    Attributes:
        samples: ``pandas.DataFrame`` with one row per image and the columns
            ``filepath`` and ``label``.

    Raises:
        AssertionError: If a class sub-folder contains no ``*.jpg`` file.
    """

    def __init__(self, data_path, targets, transform=None):
        self.transform = transform
        samples = []
        labels = []
        for target in targets:
            # Path.glob makes no ordering promise; sorting keeps sample indices
            # (and therefore the order of an unshuffled loader's predictions)
            # identical from one run or machine to the next.
            data_paths = sorted((data_path / target).glob('*.jpg'))
            label = int(target.split('_')[-1])
            assert len(data_paths) > 0, f'Wrong data path: {data_path / target}'
            samples.extend(data_paths)
            labels.extend([label] * len(data_paths))
        self.samples = pd.DataFrame({'filepath': samples, 'label': labels})

    def __len__(self):
        """Return the number of images found across all class sub-folders."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load the sample at position ``idx``.

        Returns:
            ``(image, label)`` — the image as a float tensor placed on
            ``torch_device`` and the integer class label — or whatever
            ``transform`` returns for that tuple when a transform was given.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_path = self.samples.iloc[idx, 0]
        image = read_image(str(img_path.resolve())).to(torch_device)
        # Scale pixel intensities by 1/255 (maps 0-255 values into [0, 1];
        # assumes read_image yields 8-bit values — TODO confirm).
        image = (1.0 / 255.0) * image

        label = self.samples.iloc[idx, 1]
        sample = image, label

        if self.transform:
            sample = self.transform(sample)

        return sample

In [5]:
# Both splits read the same class sub-folders; only the root folder differs.
dataset = CamelyonDataset(data_path=TRAIN_DATA_PATH, targets=TARGETS)
test_dataset = CamelyonDataset(data_path=TEST_DATA_PATH, targets=TARGETS)

In [6]:
# Training batches are reshuffled on every pass over the loader.
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=0,
)
# Test batches keep the dataset order so predictions line up with the files.
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=0,
)

In [7]:
# Evaluation code
def evaluate(model, loader):
    """Score ``model`` on every batch of ``loader`` with ROC-AUC.

    The model is switched to eval mode and left in that mode; callers that
    keep training must call ``model.train()`` again.

    Args:
        model: Module called as ``model(inputs)`` that returns one raw logit
            per sample.
        loader: Iterable yielding ``(inputs, labels)`` batches, with
            ``labels`` being a tensor.

    Returns:
        Tuple ``(all_labels, all_preds, auc)`` where ``all_labels`` is the
        list of labels cast to float, ``all_preds`` the list of sigmoid
        probabilities in the same order, and ``auc`` the value of
        ``roc_auc_score(all_labels, all_preds)``.
    """
    model.eval()

    all_labels = []
    all_preds = []

    with torch.no_grad():
        for inputs, labels in loader:
            # Apply a sigmoid on the predictions
            preds = torch.sigmoid(model(inputs))

            # Flatten before converting: a model that squeezes its output
            # returns a 0-d tensor for a single-sample batch, and tolist() on
            # that yields a bare float that list.extend cannot consume.
            all_preds.extend(preds.reshape(-1).cpu().tolist())
            all_labels.extend(labels.float().reshape(-1).cpu().tolist())

    return all_labels, all_preds, roc_auc_score(all_labels, all_preds)

## Define the model

In [8]:
class TorchModel(torch.nn.Module):
    """Small CNN that outputs one raw logit per image.

    Two convolution + ReLU + max-pool stages are followed by three fully
    connected layers. The first linear layer expects 16 x 53 x 53 feature
    maps, which is what the convolutional stack produces for 3-channel
    224 x 224 inputs (224 -> 222 -> 111 -> 107 -> 53).

    No sigmoid is applied here; the output is meant to be fed to a
    logit-based loss or to ``torch.sigmoid``.
    """

    def __init__(
        self,
    ):
        super().__init__()
        self.conv1 = torch.nn.Conv2d(3, 6, kernel_size=3)
        self.pool = torch.nn.MaxPool2d(2, 2)
        self.conv2 = torch.nn.Conv2d(6, 16, kernel_size=5)
        self.fc1 = torch.nn.Linear(16 * 53 * 53, 120)
        self.fc2 = torch.nn.Linear(120, 84)
        self.fc3 = torch.nn.Linear(84, 1)

    def forward(self, x: torch.Tensor):
        """Map a ``(batch, 3, H, W)`` image batch to logits of shape ``(batch,)``."""
        x = self.pool(relu(self.conv1(x)))
        x = self.pool(relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = relu(self.fc1(x))
        x = relu(self.fc2(x))
        x = self.fc3(x)
        # Drop only the trailing size-1 feature axis. A bare squeeze() would
        # also remove the batch axis when the batch holds a single image,
        # giving a 0-d tensor that no longer matches the (1,)-shaped labels.
        return x.squeeze(-1)

## Train and evaluate the model

**DISCLAIMER**: I should split separate validation and test sets off the training data and leave the provided test set untouched until the submission. Evaluating on the test set after every epoch, as the loop below does, will otherwise overfit the test set.

In [None]:
n_epochs = 15

# Seed the global RNG so that weight initialisation and batch shuffling are
# repeatable after a kernel restart.
torch.manual_seed(0)

model = TorchModel().to(torch_device)

# The model outputs raw logits, so the sigmoid is folded into the loss.
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Per-epoch ROC-AUC history; one entry is appended per completed epoch.
test_perfs = list()
train_perfs = list()

for epoch in range(n_epochs):
    # evaluate() leaves the model in eval mode, so re-enable training mode.
    model.train()
    for inputs, batch_labels in dataloader:
        # Explicit transfer: a no-op when the batch is already on the device.
        inputs = inputs.to(torch_device)
        batch_labels = batch_labels.to(torch_device).float()

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        # reshape(-1) keeps logits 1-D even when the last batch holds a single
        # sample, so their shape always matches batch_labels.
        outputs = model(inputs).reshape(-1)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

    # Index the score instead of unpacking into `_`, which would shadow
    # IPython's last-output variable.
    train_score = evaluate(model, loader=dataloader)[2]
    train_perfs.append(train_score)

    # `labels` / `predictions` hold the test-set results of the latest epoch
    # and are displayed by later cells.
    labels, predictions, test_score = evaluate(model, loader=test_dataloader)
    test_perfs.append(test_score)
    print(f'Epoch {epoch+1}/{n_epochs} - Train AUC: {train_score} - Test AUC: {test_score}')

Epoch 1/15 - Train AUC: 0.8564383821365915 - Test AUC: 0.741283032141758
Epoch 2/15 - Train AUC: 0.9101253660944353 - Test AUC: 0.7387192032579104
Epoch 3/15 - Train AUC: 0.9744453884300959 - Test AUC: 0.7188989392368471


## Display the results

In [None]:
# Render matplotlib figures directly in the notebook output.
%matplotlib inline

In [None]:
# Plot the results
# The x axis is derived from the recorded history rather than from n_epochs,
# so the curves can still be drawn when training stopped before the last epoch
# (x and y would otherwise differ in length and plt.plot would raise).
plt.title('ROC-AUC')
plt.plot(list(range(len(train_perfs))), train_perfs, label='Train AUC')
plt.plot(list(range(len(test_perfs))), test_perfs, label='Test AUC')
plt.legend(loc='lower right')
plt.ylabel('AUC')
plt.xlabel('epochs')
plt.show()

## Kaggle submission

In [None]:
# Sigmoid probabilities for the test set, as returned by the training loop's
# most recent test-set `evaluate` call.
predictions

In [None]:
# Expected to hold the test-set labels returned alongside `predictions` by the
# training loop's most recent test-set `evaluate` call.
labels