In [None]:
from util import *

import torch
import torchvision
import numpy as np
import pandas as pd
from copy import deepcopy
import argparse
from tqdm import tqdm
import matplotlib.pyplot as plt


In [None]:
#Made by Rasmus Hannibal Tirsgaard
### Setup MNIST dataset
# Per-image preprocessing: convert to a tensor, then normalise the single
# channel with mean 0.5 and std 0.5.
transform = torchvision.transforms.Compose(
    [
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=(0.5,), std=(0.5,)),
    ]
)
# MNIST training split stored under ./data (download=True allows torchvision
# to fetch it when needed).
train_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    transform=transform,
    download=True,
)

In [None]:
# Independent copy of the full training set; the next cell narrows it down to
# the validation subset by re-indexing its `data` and `targets`.
val_dataset = deepcopy(train_dataset)

In [None]:
# Train/validation split: one random permutation of all sample indices is
# drawn; the first `train_size` entries are kept for training (used by a later
# cell) and the remainder becomes the validation set.
# NOTE(review): `val_split` is not defined in the visible cells -- presumably
# it comes from `from util import *`; confirm.
n_total = len(train_dataset)
train_size = int((1 - val_split) * n_total)
val_size = n_total - train_size
indexes = torch.randperm(n_total).tolist()

# Define validation set
indexes_val = indexes[train_size:]
val_dataset.data = val_dataset.data[indexes_val]
val_dataset.targets = val_dataset.targets[indexes_val]
# Validation is evaluated in fixed order with large batches.
val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=1024,
    shuffle=False,
)

In [None]:
#Made by Rasmus Hannibal Tirsgaard
# Define training set
# Keep only the first `train_size` entries of the shared permutation, i.e. the
# samples that were not assigned to the validation set.
indexes_train = indexes[:train_size]
train_dataset.data = train_dataset.data[indexes_train]
train_dataset.targets = train_dataset.targets[indexes_train]

In [None]:
#Made by Rasmus Hannibal Tirsgaard
# Split training data into labelled and unlabelled
# NOTE: `unlabelled_size` is overwritten here -- it is read as a multiplier of
# the training-set length (presumably a fraction; confirm where it is defined)
# and replaced by the resulting sample count. Together with the in-place
# re-indexing of `train_dataset` below, this makes the cell unsafe to re-run
# without re-running the cells above it.
unlabelled_size = int(unlabelled_size * len(train_dataset))
indexes_train = torch.randperm(len(train_dataset)).tolist()  # Redefine indexes_train

# Slice the permutation once: the head forms the unlabelled pool, the tail the
# initial labelled training set.
indexes_unlabelled = indexes_train[:unlabelled_size]
indexes_labelled = indexes_train[unlabelled_size:]

# The pool is a deep copy so that re-indexing it does not touch train_dataset.
unlabbelled_dataset = deepcopy(train_dataset)
unlabbelled_dataset.targets = unlabbelled_dataset.targets[indexes_unlabelled]
unlabbelled_dataset.data = unlabbelled_dataset.data[indexes_unlabelled]
train_dataset.targets = train_dataset.targets[indexes_labelled]
train_dataset.data = train_dataset.data[indexes_labelled]

# Snapshots of the initial split; later cells restore from these before each
# active-learning strategy and when building the baseline.
start_train_dataset = deepcopy(train_dataset)  # Save for baseline
start_unlabbelled_dataset = deepcopy(unlabbelled_dataset)  # Save for baseline

In [None]:
#Made by Rasmus Hannibal Tirsgaard
# Setup model
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Randomly initialised ResNet-18 with a 10-output classification head.
model = torchvision.models.resnet18(pretrained=False)
model.fc = torch.nn.Linear(in_features=model.fc.in_features, out_features=10)
# Modify input layer to accept 1 channel
model.conv1 = torch.nn.Conv2d(
    in_channels=1,
    out_channels=64,
    kernel_size=(7, 7),
    stride=(2, 2),
    padding=(3, 3),
    bias=False,
)

# Snapshot of the initial weights, taken before the model is moved to `device`;
# later cells reload it to restart training from the same initialisation.
model_parameters = deepcopy(model.state_dict())
model = model.to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
## Run active learning Uncertainty
# Per-round history: size of the labelled set and the accuracies returned by
# train_model for that round.
datapoint_list_US = []
accuracy_list_US = []

# Restore the initial labelled / unlabelled split saved in the start_* copies.
train_dataset.targets = start_train_dataset.targets
train_dataset.data = start_train_dataset.data
unlabbelled_dataset.targets = start_unlabbelled_dataset.targets
unlabbelled_dataset.data = start_unlabbelled_dataset.data
for i in range(label_iterations):
    print(i)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    model.load_state_dict(model_parameters)  # Important to reset the model each time
    # Reset the optimizer together with the model: reusing the Adam instance
    # would carry its accumulated state from earlier rounds (and earlier
    # strategies) into a run that is meant to start from scratch.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    accuracies = train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=num_epochs, val_interval=10)
    datapoint_list_US.append(len(train_dataset))
    accuracy_list_US.append(accuracies)
    # No acquisition after the final round: nothing would be trained on it.
    if i < label_iterations - 1:
        train_dataset, unlabbelled_dataset = label_iteration_uncertanty_sampling(model, train_dataset, unlabbelled_dataset, device, top_frac=0.001)

In [None]:
## Run active learning Margin
# Per-round history: size of the labelled set and the accuracies returned by
# train_model for that round.
datapoint_list_MAR = []
accuracy_list_MAR = []

# Restore the initial labelled / unlabelled split saved in the start_* copies.
train_dataset.targets = start_train_dataset.targets
train_dataset.data = start_train_dataset.data
unlabbelled_dataset.targets = start_unlabbelled_dataset.targets
unlabbelled_dataset.data = start_unlabbelled_dataset.data
for i in range(label_iterations):
    print(i)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    model.load_state_dict(model_parameters)  # Important to reset the model each time
    # Reset the optimizer together with the model: reusing the Adam instance
    # would carry its accumulated state from earlier rounds (and earlier
    # strategies) into a run that is meant to start from scratch.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    accuracies = train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=num_epochs, val_interval=10)
    datapoint_list_MAR.append(len(train_dataset))
    accuracy_list_MAR.append(accuracies)
    # No acquisition after the final round: nothing would be trained on it.
    if i < label_iterations - 1:
        train_dataset, unlabbelled_dataset = label_iteration_margin_based(model, train_dataset, unlabbelled_dataset, device, top_frac=0.001)

# Reduce each round to its maximum over the last axis of the accuracy history.
datapoints_MAR = np.array(datapoint_list_MAR)
accuracies_MAR = np.array(accuracy_list_MAR).max(-1)

# Persist right away; N (largest labelled-set size) is part of the file name.
N = np.max(datapoints_MAR)
pd.DataFrame(np.vstack((datapoints_MAR, accuracies_MAR)).T).to_excel(f'./results/MNIST_{N}_margin_based.xlsx',index = False)



In [None]:
## Run active learning BADL
# Per-round history: size of the labelled set and the accuracies returned by
# train_model for that round.
datapoint_list_BADL = []
accuracy_list_BADL = []

# Restore the initial labelled / unlabelled split saved in the start_* copies.
train_dataset.targets = start_train_dataset.targets
train_dataset.data = start_train_dataset.data
unlabbelled_dataset.targets = start_unlabbelled_dataset.targets
unlabbelled_dataset.data = start_unlabbelled_dataset.data
for i in range(label_iterations):
    print(i)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    model.load_state_dict(model_parameters)  # Important to reset the model each time
    # Reset the optimizer together with the model: reusing the Adam instance
    # would carry its accumulated state from earlier rounds (and earlier
    # strategies) into a run that is meant to start from scratch.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    accuracies = train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=num_epochs, val_interval=10)
    datapoint_list_BADL.append(len(train_dataset))
    accuracy_list_BADL.append(accuracies)
    # No acquisition after the final round: nothing would be trained on it.
    if i < label_iterations - 1:
        train_dataset, unlabbelled_dataset = label_iteration_BADL(model, train_dataset, unlabbelled_dataset, device, top_frac=0.001)

# Reduce each round to its maximum over the last axis of the accuracy history.
datapoints_BADL = np.array(datapoint_list_BADL)
accuracies_BADL = np.array(accuracy_list_BADL).max(-1)

# Persist right away; N (largest labelled-set size) is part of the file name.
N = np.max(datapoints_BADL)
pd.DataFrame(np.vstack((datapoints_BADL, accuracies_BADL)).T).to_excel(f'./results/MNIST_{N}_bayesian.xlsx',index = False)

In [None]:
## Run active learning KMEANS
# Per-round history: size of the labelled set and the accuracies returned by
# train_model for that round.
datapoint_list_KMEANS = []
accuracy_list_KMEANS = []

# Restore the initial labelled / unlabelled split saved in the start_* copies.
train_dataset.targets = start_train_dataset.targets
train_dataset.data = start_train_dataset.data
unlabbelled_dataset.targets = start_unlabbelled_dataset.targets
unlabbelled_dataset.data = start_unlabbelled_dataset.data
for i in range(label_iterations):
    print(i)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    model.load_state_dict(model_parameters)  # Important to reset the model each time
    # Reset the optimizer together with the model: reusing the Adam instance
    # would carry its accumulated state from earlier rounds (and earlier
    # strategies) into a run that is meant to start from scratch.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    accuracies = train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=num_epochs, val_interval=10)
    datapoint_list_KMEANS.append(len(train_dataset))
    accuracy_list_KMEANS.append(accuracies)
    # No acquisition after the final round: nothing would be trained on it.
    if i < label_iterations - 1:
        train_dataset, unlabbelled_dataset = label_iteration_cluster_KMEANS(model, train_dataset, unlabbelled_dataset, device, top_frac=0.001)

In [None]:
#Made by Rasmus Hannibal Tirsgaard
# Add baseline accuracy (no active learning)
# Number of samples the preceding active-learning run added on top of the
# initial labelled set; must be computed before train_dataset is overwritten.
n_datapoints = len(train_dataset) - len(start_train_dataset)

# We reuse the initial training set to reduce run to run variance
# The baseline set is the initial labelled data plus the first `n_datapoints`
# entries of the initial unlabelled pool, taken in stored order.
train_dataset.targets = torch.cat(
    (start_train_dataset.targets, start_unlabbelled_dataset.targets[:n_datapoints])
)
train_dataset.data = torch.cat(
    (start_train_dataset.data, start_unlabbelled_dataset.data[:n_datapoints])
)

# Start the baseline from the same initial weights as every other run.
model.load_state_dict(model_parameters)

In [None]:
#Made by Rasmus Hannibal Tirsgaard
# Train model
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=1024, shuffle=False)
# Give the baseline a fresh optimizer: the shared Adam instance still holds the
# state accumulated during the active-learning runs above, which would bias a
# run that is supposed to start from the initial weights.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
baseline_accuracy = train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=num_epochs, val_interval=10)

In [None]:
# Plot the accuracy
# For every strategy: x = labelled-set size per round, y = maximum over the
# last axis of that round's accuracy history.
datapoints_US = np.array(datapoint_list_US)
accuracies_US = np.array(accuracy_list_US).max(-1)
plt.figure(figsize=(8,5))
plt.plot(datapoints_US, accuracies_US, label='Uncertanty based AL')

datapoints_MAR = np.array(datapoint_list_MAR)
accuracies_MAR = np.array(accuracy_list_MAR).max(-1)
plt.plot(datapoints_MAR, accuracies_MAR, label='Margin based AL')

datapoints_BADL = np.array(datapoint_list_BADL)
accuracies_BADL = np.array(accuracy_list_BADL).max(-1)
# Plot the BADL series itself (the margin curve was previously drawn a second
# time under this label).
plt.plot(datapoints_BADL, accuracies_BADL, label='Bayesian AL')

datapoints_KMEANS = np.array(datapoint_list_KMEANS)
accuracies_KMEANS = np.array(accuracy_list_KMEANS).max(-1)
plt.plot(datapoints_KMEANS, accuracies_KMEANS, label='K-means AL')

# Baseline drawn as a horizontal reference line across the x-range of the
# uncertainty-sampling run.
plt.hlines(max(baseline_accuracy), min(datapoints_US), max(datapoints_US), label=f'Baseline Accuracy ({len(train_dataset)} datapoints)', color='red')
# The notebook trains on torchvision's MNIST, so the title names MNIST.
plt.title('Active learning on the MNIST data set', fontsize = 22)
plt.xlabel('Datapoints used for AL model', fontsize = 16)
plt.ylabel('Accuracy', fontsize = 16)
plt.legend(fontsize = 14)

plt.xticks(fontsize = 12)
plt.yticks(fontsize = 12)


plt.tight_layout()
plt.savefig('figs/2_Comparison_3_MNIST.png')
plt.show()

In [None]:
# Save data for later
# One spreadsheet per strategy with two columns (labelled-set size, accuracy);
# N, the largest labelled-set size of that run, is embedded in the file name.

import pandas as pd

# Uncertainty sampling
N = np.max(datapoints_US)
table_US = pd.DataFrame(np.column_stack((datapoints_US, accuracies_US)))
table_US.to_excel(f'./results/MNIST_{N}_uncertanty_sampling.xlsx', index=False)

# Margin based
N = np.max(datapoints_MAR)
table_MAR = pd.DataFrame(np.column_stack((datapoints_MAR, accuracies_MAR)))
table_MAR.to_excel(f'./results/MNIST_{N}_margin_based.xlsx', index=False)

# Bayesian (BADL)
N = np.max(datapoints_BADL)
table_BADL = pd.DataFrame(np.column_stack((datapoints_BADL, accuracies_BADL)))
table_BADL.to_excel(f'./results/MNIST_{N}_bayesian.xlsx', index=False)

# K-means
N = np.max(datapoints_KMEANS)
table_KMEANS = pd.DataFrame(np.column_stack((datapoints_KMEANS, accuracies_KMEANS)))
table_KMEANS.to_excel(f'./results/MNIST_{N}_kmeans.xlsx', index=False)