# **Import Libraries** #

In [None]:
#For Import Dataset
from google.colab import drive

#For Data Augmentation
import os
import torch
# import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm
from google.colab import files

# For Datasets and Dataloader
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, models, transforms
from torchvision.io import read_image

#For EfficientNet Model Architecture
! pip install efficientnet_pytorch
import pandas as pd
import json
from PIL import Image
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from efficientnet_pytorch import EfficientNet

#For ensemble model
import collections

#For Training & Validation
import time
import copy
#import shutil
import numpy as np
from sklearn.model_selection import StratifiedKFold

  from IPython.utils import traitlets as _traitlets


Traceback (most recent call last):
  File "/home/tanwp/anaconda3/envs/ALLClassification/bin/pip", line 7, in <module>
    from pip._internal.cli.main import main
  File "/home/tanwp/anaconda3/envs/ALLClassification/lib/python3.8/site-packages/pip/_internal/cli/main.py", line 10, in <module>
    from pip._internal.cli.autocompletion import autocomplete
  File "/home/tanwp/anaconda3/envs/ALLClassification/lib/python3.8/site-packages/pip/_internal/cli/autocompletion.py", line 9, in <module>
    from pip._internal.cli.main_parser import create_main_parser
  File "/home/tanwp/anaconda3/envs/ALLClassification/lib/python3.8/site-packages/pip/_internal/cli/main_parser.py", line 7, in <module>
    from pip._internal.cli import cmdoptions
  File "/home/tanwp/anaconda3/envs/ALLClassification/lib/python3.8/site-packages/pip/_internal/cli/cmdoptions.py", line 23, in <module>
    from pip._internal.cli.progress_bars import BAR_TYPES
  File "/home/tanwp/anaconda3/envs/ALLClassification/lib

#**Directories & Folders**
Directories are created slowly, so you may need to wait for the folders to appear. Refresh the file browser to see the newly created folders.

In [None]:
#Directory to store base EfficientNet model information
#`-p` creates any missing parent directories (testing_results was never created, so the
#baseStats/ensembleStats commands failed) and does not error if the directory already exists.
!mkdir -p /content/baseEfficientNet/training_results/weights #Weights
!mkdir -p /content/baseEfficientNet/training_results/stats #Training stats
!mkdir -p /content/baseEfficientNet/testing_results/baseStats #Testing stats for base models
!mkdir -p /content/baseEfficientNet/testing_results/ensembleStats #Testing stats for ensemble model

#Directory to store Noisy Student model information
!mkdir -p /content/noisyStudent/training_results/weights #Weights
!mkdir -p /content/noisyStudent/training_results/stats #Training stats
!mkdir -p /content/noisyStudent/testing_results/baseStats #Testing stats for base models
!mkdir -p /content/noisyStudent/testing_results/ensembleStats #Testing stats for ensemble model

mkdir: cannot create directory ‘/content/baseEfficientNet/testing_results/baseStats’: No such file or directory
mkdir: cannot create directory ‘/content/baseEfficientNet/testing_results/ensembleStats’: No such file or directory
mkdir: cannot create directory ‘/content/noisyStudent/testing_results/baseStats’: No such file or directory
mkdir: cannot create directory ‘/content/noisyStudent/testing_results/ensembleStats’: No such file or directory


In [None]:
#Mount Google Drive at /content/drive so the datasets and results stored there can be accessed.
#Ensure the Data_main.zip and weights.zip files are uploaded to the Google Drive folder first.
drive.mount('/content/drive')

Mounted at /content/drive


# **Import Datasets** # 
Make sure to upload Data_main.zip and weights.zip to your Google Drive folder before running the cells below.

In [None]:
# Unzip the Data_main archive from Google Drive into the current working directory.
!unzip /content/drive/MyDrive/Data_main.zip -d .

In [None]:
##Run this cell to debug and check datasets.

#subset for debugging
#!unzip /content/drive/MyDrive/Data_main_subset.zip -d .

# check no. of images in train_images
# !ls Data_main/images | wc -l

# check no. of images in test_images 
#!ls Data_main/held_out_test/test_images | wc -l

10114


# **Classes & Class Utils** #
Defines custom class for image datasets and base EfficientNet architecture.

##CellsDataset Class##
Custom class for image dataset, required for usage of dataloaders.

In [None]:
#Custom CellsDataset class for dataloader to process images.
class CellsDataset(Dataset):
    """Image dataset backed by a label table and an image directory.

    Args:
        label_file: Either a pandas DataFrame of labels, or anything
            ``pd.read_csv`` accepts (e.g. a path to a label CSV). The table
            must have a 'Patient_ID' column holding image file names; items
            are read positionally with the file name in column 1 and the
            label in column 2.
        img_dir: Directory containing the image files.

    Rows whose 'Patient_ID' is not a file in ``img_dir`` are dropped.
    """

    def __init__(self, label_file, img_dir):
        # initialise image directory
        self.img_dir = img_dir

        # initialise labels
        # An explicit type check replaces the former bare `except:`, which
        # silently treated *any* CSV failure (e.g. a missing file) as
        # "label_file is a DataFrame" and failed later with an unrelated error.
        if isinstance(label_file, pd.DataFrame):
            self.img_labels = label_file #Label df files.
        else:
            self.img_labels = pd.read_csv(label_file) #Label csv files.

        # remove rows without an image path
        present = set(os.listdir(self.img_dir))
        self.img_labels = self.img_labels.loc[self.img_labels['Patient_ID'].isin(present)]

        # Built on first item access and then reused for every item.
        self._transform = None

    def __len__(self):
        return len(self.img_labels)

    def __getitem__(self, idx):
        if self._transform is None:
            #Normalises images according to pretrained EfficientNet-B0 values
            self._transform = transforms.Compose(
                [transforms.ToTensor(),
                 transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 1])

        # The context manager closes the underlying file once the tensor has
        # been produced, so long runs do not accumulate open file handles.
        with Image.open(img_path) as image:
            tensor = self._transform(image)
        label = self.img_labels.iloc[idx, 2]

        return tensor, label

##EfficientNet Class##
Custom class for base EfficientNet model.

In [None]:
#Custom class for EfficientNet model.
class BaseEfficientNet(nn.Module):
  """Pretrained EfficientNet-B0 feature extractor with a small classification head.

  Args:
    num_classes: Number of output units of the final linear layer
      (defaults to 2, the value that was previously hard-coded).
    dropout_rate: Dropout probability applied before the final linear layer
      (defaults to 0.5, the value that was previously hard-coded).

  Attribute names (`model`, `avgpool`, `dropout`, `fc`) are unchanged, so
  previously saved state dicts keep the same keys.
  """
  def __init__(self, num_classes=2, dropout_rate=0.5):
    super().__init__()
    #Import pre-trained b0 efficientnet - loads pre-trained weights for base model
    self.model = EfficientNet.from_pretrained('efficientnet-b0')

    self.avgpool = nn.AdaptiveAvgPool2d(1)
    self.dropout = nn.Dropout(dropout_rate)
    # 1280 input features: the size of the pooled, flattened feature vector fed to this layer.
    self.fc = nn.Linear(1280, num_classes)

  def forward(self, x):
    """Return the raw (un-normalised) outputs of the final linear layer for a batch `x`."""
    x = x.float()
    x = self.model.extract_features(x)
    x = self.avgpool(x)  # pool each feature map down to a single value
    x = x.flatten(start_dim=1)
    x = self.dropout(x) #Randomly remove nodes to prevent overfitting
    x = self.fc(x)

    return x

## Ensemble Model Utils ##

In [None]:
#Function to create an Ensemble model, returns a dictionary of (model name, model).
def createEnsemble(model_class, weights_path):
    """Build one model per weight file found in `weights_path`.

    Args:
        model_class: A model instance used as a template (each ensemble
            member is an independent deep copy of it). A model class is also
            accepted and is instantiated once with no arguments.
        weights_path: Directory containing the saved state dicts. Only file
            names containing 'pth' are loaded.

    Returns:
        dict mapping 'model1', 'model2', ... to the loaded models, numbered
        in sorted file-name order.

    Raises:
        FileNotFoundError: if no weight file is found in `weights_path`.
    """
    template = model_class() if isinstance(model_class, type) else model_class

    #Pull all files containing weights from each training fold.
    #Sorted so the model numbering is reproducible (os.listdir order is arbitrary).
    weight_files = sorted(name for name in os.listdir(weights_path) if 'pth' in name)
    if not weight_files:
        raise FileNotFoundError(f"No weight files found in {weights_path}")

    #Retrieve weights of each base model.
    ensemble = dict()
    for number, name in enumerate(weight_files, start=1):
        path = f"{weights_path}/{name}"
        # Each member must be its own object: loading every state dict into
        # the same instance would leave all entries holding the last file's weights.
        member = copy.deepcopy(template)
        member.load_state_dict(torch.load(path, map_location = torch.device('cpu')), strict = False)
        ensemble[f'model{number}'] = member

    return ensemble

In [None]:
#Function to predict using ensemble model via max Voting classification.
def get_ensembleVoting(ensemble_model, test_input):
  """Return the per-sample majority-vote class over all models in the ensemble.

  Args:
    ensemble_model: dict of name -> model (e.g. the output of createEnsemble).
    test_input: Batch tensor fed to every model. The models are moved to the
      device this tensor lives on.

  Returns:
    1-D tensor with one predicted class index per sample in the batch.
  """
  target_device = test_input.device

  #Obtain base model predictions.
  preds = list()
  with torch.no_grad():
    for model in ensemble_model.values():
      model.to(target_device) #Send model to CPU/GPU
      model.eval() #Set model to evaluation mode
      preds.append(model(test_input).argmax(dim=1))

  #Find vote that has the maximum number and output max Voting prediction.
  # Tensors hash by identity, so counting them with collections.Counter gives
  # every model exactly one "vote" and always returns the first model's output.
  # Stacking to (n_models, batch) and taking the mode over the model axis
  # performs the vote independently for each sample.
  votes = torch.stack(preds)
  final_pred = torch.mode(votes, dim=0).values

  return final_pred

# **Training Base Models**

## **Initialise Training Parameters** ##

In [None]:
#Training parameters.
learning_rate = 0.001 # initial learning rate passed to the SGD optimizer
momentum = 0.9 # SGD momentum
step_size = 7 # scheduler step size
gamma = 0.1 # learning rate decay

#EfficientNet and Noisy Student parameters.
output_layer = 2 # NOTE(review): not referenced in the visible cells
criterion = nn.CrossEntropyLoss() #Loss function.
seed = torch.manual_seed(42) # seeds torch's random number generator
num_fold = 5 # fix at 5
batch_size = 5
num_workers = 8 # to change based on machine used
aug_rounds = 5 # only referenced by the commented-out augmentation code
CV_states = 42 # random_state for the stratified K-fold split
epochs = 100 # to change
patience = 50 # add patience for early stopping

#Use CUDA to run training, else use CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device in use: {device}")

Device in use: cuda


##**Load Pre-Trained EfficientNet Model**##

In [None]:
#Specify EfficientNet model as base model.
#NOTE: the fold loops below build their own model/optimizer/scheduler, replacing these objects.
base_model = BaseEfficientNet()
base_model = base_model.to(device)

#Optimize with Stochastic Gradient descent
optimizer = optim.SGD(base_model.parameters(), lr=learning_rate, momentum=momentum)

#Decay the learning rate.
#StepLR is constructed with step_size and gamma from the training-parameters cell.
base_scheduler = StepLR(optimizer, step_size=step_size, gamma=gamma) 

Loaded pretrained weights for efficientnet-b0


## **Generate Folds for K-Fold CV** ##

In [None]:
#Initialise labels file.
label_csv = "Data_main/train_labels.csv"

# Define stratified K-fold cross-validation function.
# shuffle=True with a fixed random_state is used for the split.
strat_kfold = StratifiedKFold(n_splits=num_fold, shuffle=True, random_state=CV_states)

#Obtain indexes for stratified k fold.
label_df = pd.read_csv(label_csv)
# X is an all-zero placeholder with one entry per label row; the fold loops only
# use the row indices returned by split(X, y) to select rows of label_df.
X = np.zeros(len(label_df))
y = label_df['labels']

## **Training base EfficientNet Model** ##



In [None]:
# Base model output directories
# These are relative paths, so they resolve against the notebook's current working directory.
weight_path = "baseEfficientNet/training_results/weights"
stats_path = "baseEfficientNet/training_results/stats"
# `version` is embedded in the log and weight file names written by the training functions.
version = 'noisy_debug' # change every round to prevent overriding of weights

In [None]:
#Training Function.
def train_baseModel(model, criterion, optimizer, scheduler, num_epochs=epochs, fold=0):
    """Train `model` and save the weights of the best validation-accuracy epoch.

    Besides its arguments, this function reads the module-level names
    `dataloaders`, `dataset_sizes`, `device`, `stats_path`, `weight_path`,
    `version` and `patience`.

    Args:
        model: Network to train; its forward pass must return raw class scores.
        criterion: Loss called as ``criterion(outputs, labels)`` on the raw
            model outputs.
        optimizer: Optimizer stepped after every training batch.
        scheduler: Learning-rate scheduler stepped once per epoch, after the
            training phase.
        num_epochs: Maximum number of epochs to run.
        fold: Zero-based fold index, used only in the output file names.

    Returns:
        The same `model` object, in the state of the last epoch that ran.
        The best-epoch weights are written to the weight file, not loaded back.
    """
    # specify output files
    log_file = f"{stats_path}/fold{fold+1}_{version}_log.log"
    weight_file = f"{weight_path}/fold{fold+1}_{version}_weights.pth"

    since = time.time()

    #Initialise model.
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    # Early-stopping state must persist across epochs. It used to be reset to
    # 0 inside the phase loop, so the counter could never exceed 1 and a
    # "lowest" loss of 0.0 could never be improved on.
    lowest_loss = float('inf')
    p_count = 0
    stop_early = False

    for epoch in range(num_epochs):
        epoch_start = time.time()
        epoch_no = epoch+1
        print('Epoch {}/{}'.format(epoch_no, num_epochs))
        print('-' * 10)
        with open(log_file, 'a') as log:
            log.write('Epoch {}/{}, '.format(epoch_no, num_epochs))

        # Each epoch has a training and validation phase
        for phase in ['training', 'validation']:
            is_training = phase == 'training'
            if is_training:
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to validation mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                #Set parameter gradients to 0.
                optimizer.zero_grad()

                #Iterate forward.
                #Track history only if in training phase.
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    hard_preds = outputs.argmax(dim=-1)
                    # The loss receives the raw outputs: nn.CrossEntropyLoss
                    # applies log-softmax itself, so passing softmax
                    # probabilities would normalise twice.
                    loss = criterion(outputs, labels)

                    #Backward propagate.
                    #Update parameters only if in training phase.
                    if is_training:
                        loss.backward()
                        optimizer.step()

                #Output training statistics.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(hard_preds == labels.data).item()

            if is_training:
                scheduler.step()

            #Calculates epoch statistics.
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))
            with open(log_file, 'a') as log:
                log.write('{} Loss: {:.4f} Acc: {:.4f}, '.format(
                    phase, epoch_loss, epoch_acc))

            #Validation phase
            if phase == 'validation':
                with open(log_file, 'a') as log:
                    log.write('\n')

                #Save best model weights if epoch gives best accuracy
                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_model_wts = copy.deepcopy(model.state_dict())

                #Add patience for early stopping
                if epoch_loss < lowest_loss:
                    lowest_loss = epoch_loss
                    p_count = 0
                else:
                    p_count += 1
                    if p_count > patience:
                        print(f'val loss did not decrease after {patience} epochs')
                        stop_early = True

        epoch_end = time.time()
        epoch_time = epoch_end - epoch_start
        print('Epoch Time {:.0f}m {:.0f}s\n'.format(
            epoch_time // 60, epoch_time % 60))
        with open(log_file, 'a') as log:
            log.write('Time {:.0f}m {:.0f}s\n'.format(
                epoch_time // 60, epoch_time % 60))

        print()

        # Leave the epoch loop itself; a `break` inside the phase loop would
        # only skip the rest of the current epoch.
        if stop_early:
            break

    #Output runtime of model.
    time_elapsed = time.time() - since
    print('Time {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    with open(log_file, 'a') as log:
        log.write('Time {:.0f}m {:.0f}s\n'.format(
            time_elapsed // 60, time_elapsed % 60))
        log.write('Best val Acc: {:4f}'.format(best_acc))

    #Save weights of the best epoch model into training directory.
    torch.save(best_model_wts, weight_file)

    return model

In [None]:
#Fold code.
#Builds a fresh model, optimizer, scheduler, datasets and dataloaders for every fold.
#NOTE: the train_baseModel call below is commented out, so as written this cell trains nothing;
#it leaves the last fold's objects bound to the module-level names that train_baseModel reads.
for fold, (train_idx, valid_idx) in enumerate(strat_kfold.split(X, y)):
    fold_no = fold+1
    print(f"FOLD {fold_no}")

    base_model = BaseEfficientNet()
    base_model = base_model.to(device)
    optimizer = optim.SGD(base_model.parameters(), lr=learning_rate, momentum=momentum)
    base_scheduler = StepLR(optimizer, step_size=step_size, gamma=gamma) 

    # Column order matters: CellsDataset reads the file name from column 1 and the label from column 2.
    train_df = label_df.iloc[train_idx][["Patient_no", "Patient_ID", "labels"]]
    valid_df = label_df.iloc[valid_idx][["Patient_no", "Patient_ID", "labels"]]
    train_set = CellsDataset(train_df, "Data_main/images")
    valid_set = CellsDataset(valid_df, "Data_main/images")

    #Add augmentation data to original training set.
    # aug_dir = "../test_augment/"
    # aug_df = augment(rounds=aug_rounds, op_dir=aug_dir, labels=train_df)
    # aug_set = CellsDataset(aug_df, aug_dir)
    # train_aug_set = torch.utils.data.ConcatDataset([train_set, aug_set])

    #Load dataset onto dataloader
    valid_loader = torch.utils.data.DataLoader(valid_set, batch_size=batch_size, num_workers=num_workers, shuffle=True)
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, num_workers=num_workers, shuffle=True)
    # train_baseModel looks these two dicts up by the phase names 'training' and 'validation'.
    dataloaders = {'training': train_loader, 'validation': valid_loader}
    dataset_sizes = {'training': len(train_set), 'validation': len(valid_set)}
    class_names = [0, 1] # hard code labels based on labels in csv
    
    #train data on n epochs
    #trained_baseModel = train_baseModel(base_model, criterion, optimizer, base_scheduler, num_epochs=epochs, fold=fold)

    #Delete augmented training directory before next training fold to save disk space.
    #shutil.rmtree(aug_dir)

## **Training Noisy Student** ##

In [None]:
# Noisy student output directories
# Rebinds the module-level weight_path/stats_path set earlier for the base model.
weight_path = "/content/noisyStudent/training_results/weights"
stats_path = "/content/noisyStudent/training_results/stats"
# With the line below commented out, `version` keeps the value assigned in the base-model cell.
#version = 'v2' # change every round to prevent overriding of weights

In [None]:
#Training Function for noisy student.
def noisyStudentTraining(model, dataloader, dataset_sizes, criterion, optimizer, scheduler, stats_path, weight_path, num_epochs = 50, fold = 0):
    """Train `model` and save the weights of the best validation-accuracy epoch.

    Besides its arguments, this function reads the module-level names
    `device`, `version` and `patience`.

    Args:
        model: Network to train; its forward pass must return raw class scores.
        dataloader: dict with 'training' and 'validation' data loaders.
        dataset_sizes: dict with the number of samples for the same two keys.
        criterion: Loss called as ``criterion(outputs, labels)`` on the raw
            model outputs.
        optimizer: Optimizer stepped after every training batch.
        scheduler: Learning-rate scheduler stepped once per epoch, after the
            training phase.
        stats_path: Directory the log file is appended to.
        weight_path: Directory the best weights are saved to.
        num_epochs: Maximum number of epochs to run.
        fold: Zero-based fold index, used only in the output file names.

    Returns:
        The same `model` object, in the state of the last epoch that ran.
        The best-epoch weights are written to the weight file, not loaded back.
    """
    # specify output files
    log_file = f"{stats_path}/fold{fold+1}_{version}_log.log"
    weight_file = f"{weight_path}/fold{fold+1}_{version}_weights.pth"

    since = time.time()

    #Initialise model.
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    # Early-stopping state must persist across epochs. It used to be reset to
    # 0 inside the phase loop, so the counter could never exceed 1 and a
    # "lowest" loss of 0.0 could never be improved on.
    lowest_loss = float('inf')
    p_count = 0
    stop_early = False

    for epoch in range(num_epochs):
        epoch_start = time.time()
        epoch_no = epoch+1
        print('Epoch {}/{}'.format(epoch_no, num_epochs))
        print('-' * 10)
        with open(log_file, 'a') as log:
            log.write('Epoch {}/{}, '.format(epoch_no, num_epochs))

        # Each epoch has a training and validation phase
        for phase in ['training', 'validation']:
            is_training = phase == 'training'
            if is_training:
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to validation mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloader[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                #Set parameter gradients to 0.
                optimizer.zero_grad()

                #Iterate forward.
                #Track history only if in training phase.
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    hard_preds = outputs.argmax(dim=-1)
                    # The loss receives the raw outputs: nn.CrossEntropyLoss
                    # applies log-softmax itself, so passing softmax
                    # probabilities would normalise twice.
                    loss = criterion(outputs, labels)

                    #Backward propagate.
                    #Update parameters only if in training phase.
                    if is_training:
                        loss.backward()
                        optimizer.step()

                #Output training statistics.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(hard_preds == labels.data).item()

            if is_training:
                scheduler.step()

            #Calculates epoch statistics.
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))
            with open(log_file, 'a') as log:
                log.write('{} Loss: {:.4f} Acc: {:.4f}, '.format(
                    phase, epoch_loss, epoch_acc))

            #Validation phase
            if phase == 'validation':
                with open(log_file, 'a') as log:
                    log.write('\n')

                #Save best model weights if epoch gives best accuracy
                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_model_wts = copy.deepcopy(model.state_dict())

                #Add patience for early stopping
                if epoch_loss < lowest_loss:
                    lowest_loss = epoch_loss
                    p_count = 0
                else:
                    p_count += 1
                    if p_count > patience:
                        print(f'val loss did not decrease after {patience} epochs')
                        stop_early = True

        epoch_end = time.time()
        epoch_time = epoch_end - epoch_start
        print('Epoch Time {:.0f}m {:.0f}s\n'.format(
            epoch_time // 60, epoch_time % 60))
        with open(log_file, 'a') as log:
            log.write('Time {:.0f}m {:.0f}s\n'.format(
                epoch_time // 60, epoch_time % 60))

        print()

        # Leave the epoch loop itself; a `break` inside the phase loop would
        # only skip the rest of the current epoch.
        if stop_early:
            break

    #Output runtime of model.
    time_elapsed = time.time() - since
    print('Time {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    with open(log_file, 'a') as log:
        log.write('Time {:.0f}m {:.0f}s\n'.format(
            time_elapsed // 60, time_elapsed % 60))
        log.write('Best val Acc: {:4f}'.format(best_acc))

    #Save weights of the best epoch model into training directory.
    torch.save(best_model_wts, weight_file)

    return model

In [None]:
#Prepare data for training on each fold for noisy student model.
#For every fold: train a teacher on the labelled half, then repeatedly pseudolabel the
#unlabelled half with the current model and train on those pseudolabels.
for fold, (train_idx, valid_idx) in enumerate(strat_kfold.split(X, y)):

    print("FOLD {}".format(fold+1))

    base_model = BaseEfficientNet()
    base_model = base_model.to(device)
    optimizer = optim.SGD(base_model.parameters(), lr=learning_rate, momentum=momentum)
    base_scheduler = StepLR(optimizer, step_size=step_size, gamma=gamma)

    #Split training into labelled(50%)/unlabelled(50%)
    #Even row indices are treated as labelled, odd ones as unlabelled.
    labelled_idx = [index for index in train_idx if index % 2 == 0]
    unlabelled_idx = [index for index in train_idx if index % 2 != 0]

    label_columns = ["Patient_no", "Patient_ID", "labels"]
    labelled_df = label_df.iloc[labelled_idx][label_columns]
    unlabelled_df = label_df.iloc[unlabelled_idx][label_columns]
    valid_df = label_df.iloc[valid_idx][label_columns]

    # to change directory
    image_dir = "Data_main/images"
    valid_set = CellsDataset(valid_df, image_dir)
    labelled_set = CellsDataset(labelled_df, image_dir)
    unlabelled_set = CellsDataset(unlabelled_df, image_dir)

    #Load dataset onto dataloader
    valid_loader = torch.utils.data.DataLoader(valid_set, batch_size=batch_size, num_workers=num_workers, pin_memory=True, shuffle=True)
    label_loader = torch.utils.data.DataLoader(labelled_set, batch_size=batch_size, num_workers=num_workers, pin_memory=True, shuffle=True)
    #shuffle=False: predictions are written back to the label table by position, so the
    #loader must visit the images in table order or every pseudolabel lands on the wrong row.
    unlabel_loader = torch.utils.data.DataLoader(unlabelled_set, batch_size=batch_size, num_workers=num_workers, pin_memory=True, shuffle=False)

    label_dataloader = {'training': label_loader, 'validation': valid_loader}
    dataset_sizes_l = {'training': len(labelled_set), 'validation': len(valid_set)}
    class_names = [0, 1] # hard code labels based on labels in csv

    #Training noisy student model.
    models = 3
    #Update variable with unlabelled images and inferred pseudolabels
    #Taken from the dataset's own (filtered) table so its length matches the number of
    #predictions, and copied so that adding a column does not write into a slice.
    pseudolabel_df = unlabelled_set.img_labels[["Patient_no", "Patient_ID"]].copy()

    for loop in range(models):
        if loop == 0:
            #Training first teacher model on original labelled data.
            print("TRAINING TEACHER:")
            noisyStudentTraining(base_model, label_dataloader, dataset_sizes_l, criterion, optimizer, base_scheduler, stats_path, weight_path, num_epochs = 50, fold=fold)

        else:
            #Obtain predictions on unlabelled data to obtain pseudolabels.
            base_model.eval()

            infer_pseudolabels = []
            #Generate pseudolabels from unlabelled data
            for inputs, _ in unlabel_loader:
                inputs = inputs.to(device)
                with torch.no_grad():
                    output = base_model(inputs)
                    hard_preds = output.argmax(dim = -1)
                    infer_pseudolabels.extend(hard_preds.cpu().numpy())

            #Add pseudolabel column from inferred pseudolabels
            pseudolabel_df['labels'] = infer_pseudolabels
            #Same image directory the unlabelled images were read from
            #(this previously pointed at the "Data_main_subset" debug folder).
            pseudolabel_set = CellsDataset(pseudolabel_df, image_dir)
            pseudolabel_loader = torch.utils.data.DataLoader(pseudolabel_set, batch_size=batch_size, shuffle=True)
            pseudo_dataloader = {'training': pseudolabel_loader, 'validation': valid_loader}
            dataset_sizes_u = {'training': len(pseudolabel_set), 'validation': len(valid_set)}

            #Fresh optimizer and scheduler for each student round. The scheduler is stepped
            #once per epoch, so after a 50-epoch run with step_size=7 and gamma=0.1 the old
            #one has decayed the learning rate seven times; reusing it would leave the
            #student with an effectively zero learning rate.
            optimizer = optim.SGD(base_model.parameters(), lr=learning_rate, momentum=momentum)
            base_scheduler = StepLR(optimizer, step_size=step_size, gamma=gamma)

            #Train student models on pseudolabelled data.
            print(f"TRAINING STUDENT LOOP: {loop}")
            noisyStudentTraining(base_model, pseudo_dataloader, dataset_sizes_u, criterion, optimizer, base_scheduler, stats_path, weight_path, num_epochs = 50, fold=fold)


## **Create Ensemble Models** ##

In [None]:
#Load weights of base models for ensemble models.
# !unzip /content/drive/MyDrive/8Nov_weight.zip -d /content/baseEfficientNet/training_results/weights #base EfficientNet Model
# baseEfficientNet_weights = "/content/baseEfficientNet/training_results/weights/8Nov_weight"
# NOTE: despite the variable name, the active path below points at a Noisy Student weights folder.
baseEfficientNet_weights = "NoisyStudent/weights/10Nov_Noisy_weight"
#Create ensemble models.
# createEnsemble receives a model instance and the weights directory and returns a dict of models.
base_ensembleModel = createEnsemble(BaseEfficientNet(), baseEfficientNet_weights)

Loaded pretrained weights for efficientnet-b0


# **Testing and Evaluation** #

## **Testing Function** ##

In [None]:
#Testing function.
def test_model(model, stats_path, ensemble=False, fold = ''):
  """Evaluate a model on the test set, print the statistics and append them to a log file.

  Reads the module-level `test_dataloader` and `device`. Class 1 is treated
  as the positive class and class 0 as the negative class.

  Args:
    model: A single model when `ensemble` is false, otherwise the dict of
      models expected by get_ensembleVoting.
    stats_path: Prefix of the log file; results are appended to
      "<stats_path>_stats.log".
    ensemble: Whether `model` is an ensemble dict.
    fold: Header line written to the log for a base model.

  Returns:
    None.
  """
  def _pct(numerator, denominator):
    # Percentage that yields NaN instead of dividing by zero when a class is absent.
    return 100 * numerator / denominator if denominator else float('nan')

  #Message to indicate if evaluating base or ensemble model.
  if ensemble:
    print("Evaluating ensemble model...")
  else:
    print("Evaluating base models...")
    # Evaluation mode is set here as well, so a model that was just loaded is
    # never scored in training mode (get_ensembleVoting does this for ensembles).
    model.eval()

  TP = FP = FN = TN = 0

  with torch.no_grad(): #Disable backpropagation
    for inputs, labels in test_dataloader:  #Iterate over testing data.
      inputs = inputs.to(device)
      labels = labels.to(device)

      if ensemble: #Testing ensemble model
        prediction = get_ensembleVoting(model, inputs)
      else: #Testing base model
        prediction = model(inputs).argmax(dim=1)

      #Evaluate predictions and accumulate the confusion-matrix counts.
      # Each count is kept in its own variable. The previous matrix stored
      # FP/FN in one pair of cells and read them back from the other, so the
      # reported "False Positives" and "False Negatives" were swapped.
      predicted_pos = prediction == 1
      predicted_neg = prediction == 0
      actual_pos = labels == 1
      actual_neg = labels == 0
      TP += int((predicted_pos & actual_pos).sum())
      FP += int((predicted_pos & actual_neg).sum())
      FN += int((predicted_neg & actual_pos).sum())
      TN += int((predicted_neg & actual_neg).sum())

  #Calculate test statistics once, after all batches have been counted.
  accuracy = _pct(TP + TN, TP + FP + TN + FN)
  sensitivity = _pct(TP, TP + FN)
  specificity = _pct(TN, TN + FP)
  PPV = _pct(TP, TP + FP)
  NPV = _pct(TN, TN + FN)
  if PPV + sensitivity > 0:
    F1 = 2 * (PPV * sensitivity) / (PPV + sensitivity)
  else:
    F1 = float('nan')

  #Print test statistics.
  print('-----------------------')
  print('PREDICTION STATISTICS')
  print('-----------------------')
  print('True Positives: ' + str(TP))
  print('False Positives: ' + str(FP))
  print('False Negatives: ' + str(FN))
  print('True Negatives: ' + str(TN))

  print('-----------------------')
  print('EVALUATION STATISTICS')
  print('-----------------------')
  print('Accuracy: %f %%' % (accuracy))
  print('Sensitivity: %f %%' % (sensitivity))
  print('Specificity: %f %%' % (specificity))
  print('PPV: %f %%' % (PPV))
  print('NPV: %f %%' % (NPV))
  print('F1 Score: %f %%' % (F1))

  #Save statistics as log files.
  stat_file = f"{stats_path}_stats.log"
  header = 'ENSEMBLE MODEL:' if ensemble else fold

  with open(stat_file, 'a') as log:
    log.write(header + '\n')
    #Save test statistics
    log.write(
      'True Positives: ' + str(TP) + '\n' +
      'False Positives: ' + str(FP) + '\n' +
      'False Negatives: ' + str(FN) + '\n' +
      'True Negatives: ' + str(TN) + '\n' + '\n' +
      'Accuracy: %f %%' % (accuracy) + '\n' +
      'Sensitivity: %f %%' % (sensitivity) + '\n' +
      'Specificity: %f %%' % (specificity) + '\n' +
      'PPV: %f %%' % (PPV) + '\n' +
      'NPV: %f %%' % (NPV) + '\n' +
      'F1 Score: %f %%' % (F1) + '\n')

## Evaluating ensemble models

In [None]:
#Initialise testing set files
test_path = "Data_main/held_out_test/test_labels.csv"
# Column order matters: CellsDataset reads the file name from column 1 and the label from column 2.
test_df = pd.read_csv(test_path)[["Patient_no", "Patient_ID", "labels"]]
# Shifts the index labels by one; CellsDataset accesses rows positionally (iloc).
test_df.index = test_df.index + 1
test_set = CellsDataset(test_df, "Data_main/held_out_test/test_images")

#Initialise paths for testing_stats
# NOTE: this first baseStats_path is overwritten two lines below.
baseStats_path = "/content/baseEfficientNet/testing_results/baseStats"
# ensembleStats_path = "/content/baseEfficientNet/testing_results/ensembleStats"
baseStats_path = "NoisyStudent/stats"
ensembleStats_path = "NoisyStudent/stats"

#Dataloader for testing input.
# test_model reads this loader through the module-level name `test_dataloader`.
test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=True)

#Evaluate base models.
# model1 = base_ensembleModel['model1']
# model2 = base_ensembleModel['model2']
# model3 = base_ensembleModel['model3']
# model4 = base_ensembleModel['model4']
# model5 = base_ensembleModel['model5']

# test_basemodel1 = test_model(model1, stats_path=baseStats_path, ensemble=False)
# test_basemodel2 = test_model(model2, stats_path=baseStats_path, ensemble=False)
# test_basemodel3 = test_model(model3, stats_path=baseStats_path, ensemble=False)
# test_basemodel4 = test_model(model4, stats_path=baseStats_path, ensemble=False)
# test_basemodel5 = test_model(model5, stats_path=baseStats_path, ensemble=False)

#Evaluate ensemble models.
# test_model has no return statement, so the assigned value is None; results are printed and logged.
test_baseEnsembleModel = test_model(base_ensembleModel, stats_path=ensembleStats_path, ensemble=True)

Evaluating ensemble model...
-----------------------
PREDICTION STATISTICS
-----------------------
True Positives: 1760
False Positives: 76
False Negatives: 143
True Negatives: 435
-----------------------
EVALUATION STATISTICS
-----------------------
Accuracy: 90.927920 %
Sensitivity: 95.860566 %
Specificity: 75.259516 %
PPV: 92.485549 %
NPV: 85.127202 %
F1 Score: 94.142819 %


## Evaluating Individual Fold's models

In [None]:
# Noisy student outputs
weight_dir = "NoisyStudent/weights/10Nov_Noisy_weight"

# os.listdir returns entries in arbitrary order and also lists sub-directories
# (e.g. notebook checkpoint folders); sort for a reproducible evaluation order
# and keep regular files only, since torch.load cannot read a directory.
weights = sorted(
    w for w in os.listdir(weight_dir)
    if os.path.isfile(os.path.join(weight_dir, w))
)
for weight in weights:
  pth = weight_dir + '/' + weight
  model = BaseEfficientNet()
  # map_location lets a checkpoint saved on a GPU load on whatever `device` is.
  model.load_state_dict(torch.load(pth, map_location=device))
  # load to gpu
  model.to(device)

  # One stats path per checkpoint, named after the file without its extension.
  path = baseStats_path + '/' + weight.split('.')[0]
  test_model(model, stats_path=path, ensemble=False)

Loaded pretrained weights for efficientnet-b0
Evaluating base models...
-----------------------
PREDICTION STATISTICS
-----------------------
True Positives: 1636
False Positives: 200
False Negatives: 138
True Negatives: 440
-----------------------
EVALUATION STATISTICS
-----------------------
Accuracy: 85.998343 %
Sensitivity: 89.106754 %
Specificity: 76.124567 %
PPV: 92.220970 %
NPV: 68.750000 %
F1 Score: 90.637119 %
Loaded pretrained weights for efficientnet-b0
Evaluating base models...
-----------------------
PREDICTION STATISTICS
-----------------------
True Positives: 1642
False Positives: 194
False Negatives: 123
True Negatives: 455
-----------------------
EVALUATION STATISTICS
-----------------------
Accuracy: 86.868268 %
Sensitivity: 89.433551 %
Specificity: 78.719723 %
PPV: 93.031161 %
NPV: 70.107858 %
F1 Score: 91.196890 %
Loaded pretrained weights for efficientnet-b0
Evaluating base models...


  specificity = (100*TN)/(TN+FN)


-----------------------
PREDICTION STATISTICS
-----------------------
True Positives: 1649
False Positives: 187
False Negatives: 127
True Negatives: 451
-----------------------
EVALUATION STATISTICS
-----------------------
Accuracy: 86.992543 %
Sensitivity: 89.814815 %
Specificity: 78.027682 %
PPV: 92.849099 %
NPV: 70.689655 %
F1 Score: 91.306755 %
Loaded pretrained weights for efficientnet-b0
Evaluating base models...
-----------------------
PREDICTION STATISTICS
-----------------------
True Positives: 1612
False Positives: 224
False Negatives: 136
True Negatives: 442
-----------------------
EVALUATION STATISTICS
-----------------------
Accuracy: 85.086993 %
Sensitivity: 87.799564 %
Specificity: 76.470588 %
PPV: 92.219680 %
NPV: 66.366366 %
F1 Score: 89.955357 %
Loaded pretrained weights for efficientnet-b0
Evaluating base models...
-----------------------
PREDICTION STATISTICS
-----------------------
True Positives: 1643
False Positives: 193
False Negatives: 135
True Negatives: 443
-

In [None]:
# Base Model outputs
weight_dir = "/content/baseEfficientNet/training_results/weights/8Nov_weight"
baseStats_path = "/content/baseEfficientNet/testing_results/baseStats"

# os.listdir returns entries in arbitrary order and also lists sub-directories;
# sort for a reproducible evaluation order and keep regular files only.
weights = sorted(
    w for w in os.listdir(weight_dir)
    if os.path.isfile(os.path.join(weight_dir, w))
)
for weight in weights:
  pth = weight_dir + '/'+ weight
  model = BaseEfficientNet()
  # Deserialise on the CPU first so GPU-saved checkpoints load anywhere ...
  model.load_state_dict(torch.load(pth, map_location = torch.device('cpu')))
  # ... then move to the active device, as the noisy-student evaluation cell does.
  model.to(device)

  # One stats path per checkpoint, named after the file without its extension.
  path = baseStats_path + '/' + weight.split('.')[0]
  test_model(model, stats_path=path, ensemble=False)

Loaded pretrained weights for efficientnet-b0
Evaluating base models...
-----------------------
PREDICTION STATISTICS
-----------------------
True Positives: 1637
False Positives: 199
False Negatives: 127
True Negatives: 451
-----------------------
EVALUATION STATISTICS
-----------------------
Accuracy: 86.495443 %
Sensitivity: 89.161220 %
Specificity: 78.027682 %
PPV: 92.800454 %
NPV: 69.384615 %
F1 Score: 90.944444 %
Loaded pretrained weights for efficientnet-b0
Evaluating base models...
-----------------------
PREDICTION STATISTICS
-----------------------
True Positives: 1643
False Positives: 193
False Negatives: 130
True Negatives: 448
-----------------------
EVALUATION STATISTICS
-----------------------
Accuracy: 86.619718 %
Sensitivity: 89.488017 %
Specificity: 77.508651 %
PPV: 92.667795 %
NPV: 69.890796 %
F1 Score: 91.050152 %
Loaded pretrained weights for efficientnet-b0
Evaluating base models...
-----------------------
PREDICTION STATISTICS
-----------------------
True Positiv

  specificity = (100*TN)/(TN+FN)


-----------------------
PREDICTION STATISTICS
-----------------------
True Positives: 1637
False Positives: 199
False Negatives: 134
True Negatives: 444
-----------------------
EVALUATION STATISTICS
-----------------------
Accuracy: 86.205468 %
Sensitivity: 89.161220 %
Specificity: 76.816609 %
PPV: 92.433653 %
NPV: 69.051322 %
F1 Score: 90.767951 %


# **Miscellaneous** #

##**Initialise Pre-Trained MPL Model**##

In [None]:
#MPL model
# s_model, t_model = BaseEfficientNet(), BaseEfficientNet()
# s_model, t_model = s_model.to(device), t_model.to(device)
# #Optimize with Stochastic Gradient descent
# s_optimizer = optim.SGD(s_model.parameters(), lr=learning_rate, momentum=momentum)
# t_optimizer = optim.SGD(t_model.parameters(), lr=learning_rate, momentum=momentum)
# #Decay the learning rate by a factor of 0.1 for every 7 epochs.
# s_scheduler = StepLR(s_optimizer, step_size=step_size, gamma=gamma)
# t_scheduler = StepLR(t_optimizer, step_size=step_size, gamma=gamma)

## Ensemble EfficientNet Class ##

In [None]:
#Custom class for Ensembl model using models generated from all training folds.
class EnsblEfficientNet(nn.Module):
  """Stacked ensemble over five base models.

  Every base model receives the same input. Each output is flattened, passed
  through ReLU and a shared ``sub_classifier``; the five results are
  concatenated and passed through ReLU and a final ``classifier``.

  Args:
    models: iterable yielding at least five ``nn.Module`` base models, each
      producing 2 values per sample (the input width of ``sub_classifier``).
      Any iterable is accepted (list, tuple, ``dict.values()``, ...). Only the
      first five are used; extras are ignored.
    nb_classes: number of output classes of the ensemble.

  Raises:
    IndexError: if fewer than five models are supplied.
  """
  def __init__(self, models, nb_classes=2):
    super(EnsblEfficientNet, self).__init__()
    # Materialise first: views such as dict.values() cannot be indexed.
    models = list(models)
    if len(models) < 5:
      raise IndexError(
          "EnsblEfficientNet needs 5 base models, got {}".format(len(models)))

    # Attribute names are kept as-is so state_dict keys stay stable.
    self.modelA = models[0]
    self.modelB = models[1]
    self.modelC = models[2]
    self.modelD = models[3]
    self.modelE = models[4]

    # Remove last linear layer
    # self.modelA.fc = nn.Identity()
    # self.modelB.fc = nn.Identity()
    # self.modelC.fc = nn.Identity()
    # self.modelD.fc = nn.Identity()
    # self.modelE.fc = nn.Identity()

    # Create new classifier
    self.sub_classifier = nn.Linear(2, nb_classes) # classify hard labels
    # Input is the concatenation of five sub_classifier outputs, each
    # nb_classes wide (a fixed width of 10 only works for nb_classes == 2).
    self.classifier = nn.Linear(nb_classes*5, nb_classes)

  def forward(self, x):
    """Return the ensemble output for batch ``x`` (one row per sample)."""
    members = (self.modelA, self.modelB, self.modelC, self.modelD, self.modelE)

    per_model = []
    for member in members:
      #clone to make sure x is not changed by inplace methods
      out = member(x.clone())
      out = out.view(out.size(0), -1)
      per_model.append(self.sub_classifier(F.relu(out)))

    #Concatenate models to form ensembl model.
    x = torch.cat(per_model, dim=1)
    #Obtain prediction.
    x = self.classifier(F.relu(x))
    return x

## **Training Meta-Pseudo Label Model** ##

In [None]:
def train_metaModel(s_model, t_model, criterion, t_optimizer, s_optimizer, t_scheduler, s_scheduler, num_epochs=epochs):
    """Train a student/teacher pair with Meta Pseudo Labels and keep the best student.

    Per training step the teacher labels an unlabelled batch, the student is
    updated on those pseudo labels, and the change in the student's loss on the
    labelled batch is fed back into the teacher's loss. After each epoch the
    student is scored on the validation loader; the weights with the highest
    validation accuracy are saved to ``weight_file``.

    Reads these notebook-level globals: ``dataloaders`` (``'training'`` is
    ``[labelled_loader, unlabelled_loader]``, ``'validation'`` is a loader),
    ``dataset_sizes['validation']``, ``device``, ``patience``, ``log_file``
    and ``weight_file``.

    :param s_model: student network (the model that is evaluated and saved).
    :param t_model: teacher network producing pseudo labels.
    :param criterion: loss taking ``(model_outputs, integer_targets)``; it is
        given raw model outputs everywhere (assumed to be a logits-based loss
        such as ``nn.CrossEntropyLoss`` -- confirm against the caller).
    :param t_optimizer: optimizer for the teacher.
    :param s_optimizer: optimizer for the student.
    :param t_scheduler: LR scheduler for the teacher, stepped once per epoch.
    :param s_scheduler: LR scheduler for the student, stepped once per epoch.
    :param num_epochs: maximum number of epochs.
    :return: ``s_model`` with the best-validation-accuracy weights loaded.
    """
    since = time.time()

    # Initialise model.
    best_model_wts = copy.deepcopy(s_model.state_dict())
    best_acc = 0.0
    best_epoch = 0

    # Early-stopping state has to persist across epochs to be meaningful.
    lowest_loss = float('inf')
    p_count = 0

    labelled_loader = dataloaders['training'][0]
    unlabelled_loader = dataloaders['training'][1]

    for epoch in range(num_epochs):
        epoch_no = epoch + 1
        print('Epoch {}/{}'.format(epoch_no, num_epochs))
        print('-' * 10)

        # ---------------- Training phase ----------------
        s_model.train()  # Set model to training mode
        t_model.train()

        # One pass over the labelled loader per epoch. Unlabelled batches come
        # from a persistent iterator that is restarted when it runs out, rather
        # than building two fresh iterators on every step.
        unlabelled_iter = iter(unlabelled_loader)
        for images_l, targets in labelled_loader:
            try:
                images_u, _ = next(unlabelled_iter)
            except StopIteration:
                unlabelled_iter = iter(unlabelled_loader)
                images_u, _ = next(unlabelled_iter)

            images_l, targets = images_l.to(device), targets.to(device)
            images_u = images_u.to(device)
            n_labelled = images_l.shape[0]

            # Teacher and student both see labelled + unlabelled images at once.
            all_images = torch.cat((images_l, images_u))

            # Set parameter gradients to 0.
            s_optimizer.zero_grad()
            t_optimizer.zero_grad()

            # Teacher forward pass, split back into labelled / unlabelled parts.
            t_outputs = t_model(all_images)
            t_outputs_l = t_outputs[:n_labelled]
            t_outputs_u = t_outputs[n_labelled:]

            # Hard pseudo labels for the unlabelled part (integer indices, so
            # no gradient flows through them).
            soft_preds = torch.softmax(t_outputs_u, dim=-1)
            _, pseudo_preds = torch.max(soft_preds, dim=-1)

            # Teacher's supervised loss on the labelled part.
            t_loss_l = criterion(t_outputs_l, targets)

            # Student forward pass, split the same way.
            s_outputs = s_model(all_images)
            s_outputs_l = s_outputs[:n_labelled]
            s_outputs_u = s_outputs[n_labelled:]

            # Student's labelled loss before its update (used only as a number).
            s_loss_l_old = criterion(s_outputs_l, targets).detach()

            # Student learns from the teacher's pseudo labels.
            s_loss = criterion(s_outputs_u, pseudo_preds)
            s_loss.backward()
            s_optimizer.step()

            # Student's labelled loss after its update; no graph is needed.
            with torch.no_grad():
                s_loss_l_new = criterion(s_model(images_l), targets)
            dot_product = s_loss_l_old - s_loss_l_new

            # Feedback term: the teacher's logits must stay attached to the
            # graph here, otherwise this term contributes no gradient at all.
            t_loss_mpl = criterion(t_outputs_u, pseudo_preds) * dot_product
            t_loss = t_loss_l + t_loss_mpl

            # Update teacher parameters based on loss (feedback loop)
            t_loss.backward()
            t_optimizer.step()

        s_scheduler.step()
        t_scheduler.step()

        # ---------------- Validation phase ----------------
        s_model.eval()  # Set model to validation mode

        running_loss = 0.0
        running_corrects = 0

        # Run prediction for student model using validation set.
        with torch.no_grad():
            for inputs, labels in dataloaders['validation']:
                inputs = inputs.to(device)
                labels = labels.to(device)

                outputs = s_model(inputs)
                _, hard_preds = torch.max(outputs, dim=-1)
                # Same input convention as in training: raw outputs, not
                # softmaxed probabilities.
                loss = criterion(outputs, labels)

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(hard_preds == labels).item()

        # Calculates epoch statistics.
        epoch_loss = running_loss / dataset_sizes['validation']
        epoch_acc = running_corrects / dataset_sizes['validation']

        print('{} Loss: {:.4f} Acc: {:.4f}'.format(
            'validation', epoch_loss, epoch_acc))

        # Save best model weights if epoch gives best accuracy
        if epoch_acc >= best_acc:
            best_acc = epoch_acc
            best_epoch = epoch
            best_model_wts = copy.deepcopy(s_model.state_dict())

        # Early stopping on validation loss.
        if epoch_loss < lowest_loss:
            lowest_loss = epoch_loss
            p_count = 0
        else:
            p_count += 1
            if p_count > patience:
                print(f'val loss did not decrease after {patience} epochs')
                break  # leaves the epoch loop, i.e. actually stops training

        print()

    # Output runtime of model.
    time_elapsed = time.time() - since
    print('Time {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
    with open(log_file, 'a') as log:
        log.write('Time {:.0f}m {:.0f}s\n'.format(time_elapsed // 60, time_elapsed % 60))
        log.write('Best val Acc: {:4f} at epoch {}'.format(best_acc, best_epoch))

    # Save weights of the best epoch model into training directory.
    torch.save(best_model_wts, weight_file)

    # Hand back the student that was trained here, restored to its best epoch.
    s_model.load_state_dict(best_model_wts)
    return s_model

In [None]:
#Prepare data for training on each fold for mpl model.
# NOTE(review): s_model, t_model, the optimizers and the schedulers are not
# re-created in this cell, so unless they are rebuilt elsewhere every fold
# continues from the previous fold's state -- confirm this is intended.
for fold, (train_idx, valid_idx) in enumerate(strat_kfold.split(X, y)):
    print("FOLD {}".format(fold+1))

    #Split training rows into labelled(50%)/unlabelled(50%) by row-index parity.
    #(An earlier 40%/60% split used: index % 5 == 0 or index % 5 == 1.)
    labelled_idx = [index for index in train_idx if index % 2 == 0]
    unlabelled_idx = [index for index in train_idx if index % 2 != 0]

    labelled_df = label_df.iloc[labelled_idx][["Patient_no", "Patient_ID", "labels"]]
    unlabelled_df = label_df.iloc[unlabelled_idx][["Patient_no", "Patient_ID", "labels"]]
    valid_df = label_df.iloc[valid_idx][["Patient_no", "Patient_ID", "labels"]]

    # to change directory
    labelled_set = CellsDataset(labelled_df, "Data_main_subset/images")
    unlabelled_set = CellsDataset(unlabelled_df, "Data_main_subset/images")
    valid_set = CellsDataset(valid_df, "Data_main_subset/images")

    # #Add augmentation data to original training set.
    # aug_dir = "../test_augment/"
    # aug_df = augment(rounds=aug_rounds, op_dir=aug_dir, labels=train_df)
    # aug_set = CellsDataset(aug_df, aug_dir)
    # train_aug_set = torch.utils.data.ConcatDataset([train_set, aug_set])

    #Load dataset onto dataloader
    valid_loader = torch.utils.data.DataLoader(valid_set, batch_size=batch_size,pin_memory=True, shuffle=True)
    labeltrng_loader = torch.utils.data.DataLoader(labelled_set, batch_size=batch_size, pin_memory=True, shuffle=True)
    unlabeltrng_loader = torch.utils.data.DataLoader(unlabelled_set, batch_size=batch_size, shuffle=True)
    # new format for dataloaders
    # dataloaders = {'training_lab': valid_loader, 'training_unlab': train_loader, 'validation': valid_loader}
    dataloaders = {'training': [labeltrng_loader, unlabeltrng_loader], 'validation': valid_loader}
    # 'training' holds batch counts (len of a loader); 'validation' holds the sample count.
    dataset_sizes = {'training': [len(labeltrng_loader), len(unlabeltrng_loader)], 'validation': len(valid_set)}
    class_names = [0, 1] # hard code labels based on labels in csv

    # train data on n epochs
    trained_mplModel = train_metaModel(s_model, t_model, criterion, t_optimizer, s_optimizer, t_scheduler, s_scheduler, num_epochs=epochs)

    #Delete augmented training directory before next training fold to save disk space.
    #shutil.rmtree(aug_dir)

## **Test Models** ##

Output directories

In [None]:
# Directory whose entries are listed and loaded as per-fold weight files in the
# individual-fold evaluation cell.
weight_path = "BaseModel/training_results/weights/weights_3Nov"
# Passed to test_model as stats_path in the individual-fold evaluation cell.
stats_path = "BaseModel/training_results/stats"
version = 'v1' # change every round to prevent overriding of weights

## Put weights into ensemble model

In [None]:
#Ensembl code for base model.
# #Pull all files containing weights from each training fold.
# # dir = "/content/BaseModel/training_results/weights/"
# file_paths = os.listdir(weight_path)

# #Store models weights in dictionary.
# base_models = {}
# for pth in file_paths:
#   if 'pth' not in pth:
#     continue
#   path = f"{weight_path}/{pth}"
#   model = BaseEfficientNet()
#   model.load_state_dict(torch.load(path), strict = False)
#   base_models[pth] = model

# #Create base ensembl model.
# baseModels = list(base_models.values())
# base_ensembl_model = EnsblEfficientNet(baseModels)
# #Set base ensembl model to evaluation mode
# base_ensembl_model.eval()
# #Freeze weights in ensembl model
# for param in base_ensembl_model.parameters():
#     param.requires_grad = False


In [None]:
#Ensembl code for MPL model.
#Pull all files containing weights from each training fold.
# (named mpl_weight_dir rather than `dir`, which would shadow the builtin)
mpl_weight_dir = "MPLModel/training_results/weights"
# Sorted so that the fold -> ensemble slot assignment is reproducible.
file_paths = sorted(os.listdir(mpl_weight_dir))

#Store models weights in dictionary.
metapseudo_models = {}
for weight_name in file_paths:
  # Skip anything that is not a weight file, as the other weight-loading cells do.
  if 'pth' not in weight_name:
    continue
  path = os.path.join(mpl_weight_dir, weight_name)
  # EfficientNet() cannot be built without arguments; the MPL student/teacher
  # are BaseEfficientNet instances, so the checkpoints are loaded into that.
  model = BaseEfficientNet()
  # NOTE(review): strict=False silently ignores missing/unexpected keys.
  model.load_state_dict(torch.load(path), strict = False)
  # Keyed by file name without extension. This must go into
  # metapseudo_models; `models` is the imported torchvision module.
  metapseudo_models[weight_name.split('.')[0]] = model

#Create MPL ensembl model.
# EnsblEfficientNet indexes its argument, so pass a list, not a dict view.
mpl_models = list(metapseudo_models.values())
mpl_ensembl_model = EnsblEfficientNet(mpl_models)

## Evaluating individual fold's model performance

In [None]:
# Keep only entries whose name contains 'pth', in sorted order. This builds a
# new list: removing items from a list while iterating over it skips the
# entry that follows each removal.
weights = sorted(w for w in os.listdir(weight_path) if 'pth' in w)

# Iterate through each fold's weights
for weight in weights:
  print(f"Evaluating: {weight}")
  model = BaseEfficientNet()
  # NOTE(review): strict=False silently ignores missing/unexpected keys.
  model.load_state_dict(torch.load(f"{weight_path}/{weight}"), strict = False)
  # The fold tag is the part of the file name before the first underscore.
  fold = weight.split('_')[0]
  # NOTE(review): other cells call test_model(model, stats_path=..., ensemble=...)
  # without a loader argument -- confirm which signature is current.
  test_model(test_loader, model, stats_path=stats_path, fold=fold)

# Data Augmentation #

In [None]:
#Function for augmentation details.
def augmentation(img):
    """Apply one random affine transform to an image array.

    The array is converted to a float64 tensor with its last axis moved to the
    front, transformed, cast to int64 and returned with that axis moved back
    to the end.

    :param img: numpy array of the image (assumed channels-last -- confirm
        against the caller).
    :return: int64 tensor of the transformed image, same axis order as ``img``.
    """
    # numpy -> float64 tensor, last axis first for the transform.
    channels_first = torch.moveaxis(torch.from_numpy(img).double(), -1, 0)

    random_affine = transforms.RandomAffine(
        degrees=(0, 360),       # Rotates image.
        translate=(0.1, 0.1),   # Offsets X/Y axes.
        scale=(0.95, 1.05),     # Scales size.
    )

    # Cast to int64, then restore the original axis order.
    warped = random_affine(channels_first).long()
    return torch.moveaxis(warped, 0, -1)

## Augmentation Functions ##

## Testing Augmentation ##

In [None]:
#Create new directory to store augmentation test results.
! mkdir test_augment

#Copy 3 images each from ALL and Hem classes into test_augment directory.
! find /content/ZB4171_LeukemiaImageClassification-Ongoing-/Data_Subset/training_data/fold_0/all -type f | head -3 | xargs cp -t /content/test_augment
! find /content/ZB4171_LeukemiaImageClassification-Ongoing-/Data_Subset/training_data/fold_0/hem -type f | head -3 | xargs cp -t /content/test_augment

In [None]:
!rename /content/test_augment/augmented/*.bmp /content/test_augment/augmented/*.png

In [None]:
#Initialise input/output directory for testing.
# Source images are read from ip_dir; augmented copies are written to op_dir,
# a sub-directory of ip_dir.
ip_dir = "/content/test_augment/"
op_dir = "/content/test_augment/augmented/"
# Path of the label csv handed to augment(); rows whose image cannot be read
# from ip_dir are reported by augment() and skipped.
labels = "/content/ZB4171_LeukemiaImageClassification-Ongoing-/Data_main/labels.csv"
#Run data augmentation code: 5 augmented copies per readable image.
augment(rounds=5, ip_dir=ip_dir, op_dir=op_dir, labels=labels)

In [None]:
# Visualise images created from augmentation test.
# Grid layout: one row per sample -- the original image followed by its 5
# augmented copies (ALL on the first row, HEM on the second).
fig = plt.figure(figsize=(12, 4))
row = 2
column = 6

# (title prefix, image name without extension) for each grid row.
samples = [("ALL", "UID_28_34_5_all"), ("HEM", "UID_H11_4_1_hem")]

for r, (tag, stem) in enumerate(samples):
  first_cell = r * column + 1  # subplot indices are 1-based

  # plt.imread is used instead of cv2.imread: cv2 is not imported in this
  # notebook, and cv2 returns BGR while imshow expects RGB.
  fig.add_subplot(row, column, first_cell)
  plt.imshow(plt.imread(f"/content/test_augment/{stem}.bmp"))
  plt.axis('off')
  plt.title(f"{tag}_Org")

  for i in range(5):
    fig.add_subplot(row, column, first_cell + 1 + i)
    # augment() saves its output as "<round>_<name>.bmp".
    plt.imshow(plt.imread(f"/content/test_augment/augmented/{i}_{stem}.bmp"))
    plt.axis('off')
    plt.title(f"{tag}_{i}")

In [None]:
#Augmentation function.
def augment(rounds, ip_dir="../Data_main/images/", op_dir="../Data_main/temp_aug/", labels=None):
    """
    Saves augmented images into op_dir given ip_dir of images and csv file/df.

    For every row of the label table the image named by "Patient_ID" is read
    from ip_dir, augmented `rounds` times with augmentation(), and each result
    is written to op_dir as "<round>_<name>.bmp". Rows whose image cannot be
    read are reported and skipped.

    :param rounds: no.of augmented images generated per image.
    :param ip_dir: directory of image files.
    :param op_dir: directory for augmented images (created if missing).
    :param labels: csv file path or DataFrame with the columns
        "Patient_no", "Patient_ID" and "labels".
    :return: DataFrame with columns "Patient_no", "Patient_ID", "labels",
        one row per augmented image saved.
    :raises ValueError: if labels is None.
    """
    #Create augmentation directory.
    os.makedirs(op_dir, exist_ok=True)

    #Table to read images for augmentation.
    if labels is None:
        raise ValueError("labels must be a csv path or a DataFrame, got None")
    if isinstance(labels, pd.DataFrame):
        labels_df = labels # For df inputs.
    else:
        labels_df = pd.read_csv(labels) # For csv inputs.

    #Rows of [patient_no, augmented name, label] for the returned dataframe.
    aug_l = []

    #Iterate through labels_df to augment images.
    for _, row in tqdm(labels_df.iterrows(), total=len(labels_df)):
        bmp = row["Patient_ID"]
        stem = bmp.split('.')[0] # get name w/o .bmp extension
        label = row["labels"]
        patient_no = row["Patient_no"]

        # os.path.join works whether or not ip_dir ends with a separator.
        try:
            img = plt.imread(os.path.join(ip_dir, bmp))
        except OSError:
            # Missing or unreadable file: report it and move on.
            print('{} not in image folder'.format(bmp))
            continue

        for i in range(rounds):
            augmented = augmentation(img).detach().numpy().astype('uint8')
            aug_name = "{}_{}".format(i, stem)

            #Add augmented image info to dataframe.
            # NOTE(review): the recorded name has no ".bmp" extension, while
            # the input "Patient_ID" values are used as full file names above
            # -- confirm what the consumer of this table expects.
            aug_l.append([patient_no, aug_name, label])
            #Save augmented image in bmp format.
            plt.imsave(os.path.join(op_dir, aug_name + '.bmp'), augmented)

    aug_df = pd.DataFrame(aug_l, columns=["Patient_no", "Patient_ID", "labels"])

    return aug_df