In [None]:
# Shell command (run in a terminal, not in the Python kernel): ask the scheduler
# for an interactive session on the `short.qg` queue with one `rtx8000` GPU.
# NOTE(review): queue and resource names are site-specific -- confirm against the cluster docs.
qlogin -q short.qg -l gpu=1[affinity=true],gputype=rtx8000

In [None]:
#!/bin/bash
# Prepares the shell for the notebook code below: enters the project checkout,
# resets the module environment, loads Python, activates the project virtual
# environment and finally starts an interactive IPython session.
# NOTE(review): all paths are absolute and user-specific; adjust before reuse.

# Work from the project root.
cd /well/win/users/hsv459/agemapper

# Start from a clean module environment before loading the Python toolchain.
module purge
module load Python/3.7.4-GCCcore-8.3.0

# Activate the virtual environment.
source /well/win/users/hsv459/python/functionmapper-skylakeA100/bin/activate

# continue to use your python venv as normal
   
# Launch the interactive interpreter in which the remaining cells are run.
ipython



In [None]:
import os
import torch
import torch.nn as nn
import torch.utils.data as data
from torch.optim import lr_scheduler
import numpy as np
from datetime import datetime
from AgeMapper import AgeMapper
from utils.data_utils import get_datasets_dynamically
from utils.settings import Settings
from utils.misc import my_KLDivLoss, create_folder, mae
from utils.logging_functions import LogWriter
from utils.early_stopping import EarlyStopping
# Newly created floating point tensors default to torch.FloatTensor.
torch.set_default_tensor_type(torch.FloatTensor)

import pandas as pd

# NOTE(review): not referenced anywhere in the cells shown here; the Solver
# builds its own ".pth.tar" file name -- confirm whether 'path.tar' is intended.
checkpoint_extension = 'path.tar'

# Parse the experiment configuration once and expose each section under its
# own module-level name; the sections are read in the order listed below.
settings_file_name = 'AM0-55c.ini'
settings = Settings(settings_file_name)
data_parameters, training_parameters, network_parameters, misc_parameters = (
    settings[section_name]
    for section_name in ('DATA', 'TRAINING', 'NETWORK', 'MISC')
)

In [None]:
# Module-level learning-rate range-test constants: lower bound, upper bound
# and number of steps.
# NOTE(review): Solver.train assigns its own local `base_lr`, `max_lr` and
# `steps`, so these globals are not read by the code in this notebook.
base_lr, max_lr, steps = 1e-6, 10, 10500

In [None]:
def load_data_dynamically(data_parameters):
    """Build the training and validation datasets and report their sizes.

    Args:
        data_parameters: The 'DATA' settings section, forwarded unchanged to
            ``get_datasets_dynamically``.

    Returns:
        tuple: ``(train_data, validation_data)`` exactly as produced by
        ``get_datasets_dynamically``.
    """
    print("Data is loading...")
    train_data, validation_data = get_datasets_dynamically(data_parameters)
    print("Data has loaded!")

    # One size report per split, in training -> validation order.
    size_reports = (
        ("Training dataset size is {}", train_data),
        ("Validation dataset size is {}", validation_data),
    )
    for message_template, dataset in size_reports:
        print(message_template.format(len(dataset)))

    return train_data, validation_data

def train(data_parameters, training_parameters, network_parameters, misc_parameters):
    """Assemble optimiser, loss, data loaders and network, then run the Solver.

    Args:
        data_parameters: 'DATA' settings section (dataset options, ``num_workers``).
        training_parameters: 'TRAINING' settings section (optimiser, loss,
            batch sizes and the values forwarded to ``Solver``).
        network_parameters: 'NETWORK' settings section (``network_name``,
            ``number_of_classes``).
        misc_parameters: 'MISC' settings section (output directories).

    Returns:
        None. All results are produced as side effects of ``Solver.train``.
    """

    # ----- optimiser class (anything unrecognised falls back to Adam) -----
    optimiser_name = training_parameters['optimiser']
    optimizer = {
        'adamW': torch.optim.AdamW,
        'adam': torch.optim.Adam,
        'sgd': torch.optim.SGD,
    }.get(optimiser_name, torch.optim.Adam)

    # ----- optimiser keyword arguments -----
    # SGD takes momentum-style options; every other choice takes Adam-style ones.
    optimizer_arguments = {'lr': 1e-6}
    if optimiser_name == 'sgd':
        optimizer_arguments['momentum'] = training_parameters['optimizer_sgd_momentum']
        optimizer_arguments['dampening'] = training_parameters['optimizer_sgd_dampening']
        optimizer_arguments['weight_decay'] = training_parameters['optimizer_weigth_decay']
        optimizer_arguments['nesterov'] = training_parameters['optimizer_sgd_nesterov']
    else:
        optimizer_arguments['betas'] = training_parameters['optimizer_beta']
        optimizer_arguments['eps'] = training_parameters['optimizer_epsilon']
        optimizer_arguments['weight_decay'] = training_parameters['optimizer_weigth_decay']

    # ----- loss function -----
    # Each entry maps a settings value to (factory, message printed after creation).
    loss_catalogue = {
        'mse': (
            lambda: torch.nn.MSELoss(),
            None,
        ),
        'kld': (
            lambda: torch.nn.KLDivLoss(),
            "Loss will return the KLD where the losses are averaged for each minibatch over observations as well as over dimensions!",
        ),
        'kld_batch': (
            lambda: torch.nn.KLDivLoss(reduction='batchmean'),
            "Loss will return the correct KL divergence where losses are averaged over batch dimension only!",
        ),
        'kld_batch_custom': (
            lambda: my_KLDivLoss,
            "Loss will return the CUSTOM correct KL divergence where losses are averaged over batch dimension only!",
        ),
    }
    loss_name = training_parameters['loss_function']
    if loss_name in loss_catalogue:
        make_loss, loss_message = loss_catalogue[loss_name]
        loss_function = make_loss()
        if loss_message is not None:
            print(loss_message)
    else:
        print("Loss function not valid. Defaulting to KLD with batchmean reduction!")
        loss_function = torch.nn.KLDivLoss(reduction='batchmean')

    # ----- data -----
    train_data, validation_data = load_data_dynamically(data_parameters)
    train_loader = data.DataLoader(
        dataset=train_data,
        batch_size=training_parameters['training_batch_size'],
        shuffle=True,
        pin_memory=True,
        num_workers=data_parameters['num_workers'],
    )
    validation_loader = data.DataLoader(
        dataset=validation_data,
        batch_size=training_parameters['validation_batch_size'],
        shuffle=False,
        pin_memory=True,
        num_workers=data_parameters['num_workers'],
    )

    # ----- network -----
    # AgeMapper_N1 .. AgeMapper_N12 are looked up by name on the AgeMapper
    # module; any other name selects the base AgeMapper class.
    requested_network = network_parameters['network_name']
    numbered_variants = ['AgeMapper_N{}'.format(index) for index in range(1, 13)]
    if requested_network in numbered_variants:
        import AgeMapper as agemapper_module
        AgeMapperModel = getattr(agemapper_module, requested_network)()
    else:
        AgeMapperModel = AgeMapper()

    # ----- solver -----
    # Settings whose key is identical to the Solver keyword they feed.
    same_name_training_keys = (
        'loss_log_period',
        'learning_rate_scheduler_step_size',
        'learning_rate_scheduler_gamma',
        'use_last_checkpoint',
        'learning_rate_validation_scheduler',
        'learning_rate_cyclical',
        'learning_rate_scheduler_patience',
        'learning_rate_scheduler_threshold',
        'learning_rate_scheduler_min_value',
        'learning_rate_scheduler_max_value',
        'learning_rate_scheduler_step_number',
        'early_stopping_patience',
        'early_stopping_min_delta',
        'age_prediction_loss_flag',
    )
    same_name_misc_keys = (
        'logs_directory',
        'checkpoint_directory',
        'best_checkpoint_directory',
        'save_model_directory',
    )
    solver_options = {key: training_parameters[key] for key in same_name_training_keys}
    solver_options.update({key: misc_parameters[key] for key in same_name_misc_keys})

    solver = Solver(
        model=AgeMapperModel,
        number_of_classes=network_parameters['number_of_classes'],
        experiment_name=training_parameters['experiment_name'],
        optimizer=optimizer,
        optimizer_arguments=optimizer_arguments,
        loss_function=loss_function,
        model_name=training_parameters['experiment_name'],
        number_epochs=training_parameters['number_of_epochs'],
        experiment_directory=misc_parameters['experiments_directory'],
        **solver_options
    )

    solver.train(train_loader, validation_loader)

    # Drop the heavy objects before asking CUDA to release cached memory.
    del train_data, validation_data, train_loader, validation_loader, AgeMapperModel, solver, optimizer
    torch.cuda.empty_cache()

In [None]:
class Solver():
    """Learning-rate range-test driver.

    The solver builds an optimiser for ``model``, sweeps the learning rate from
    1e-6 up to 1 (see ``_build_lr_schedule``), applies one optimiser step per
    training batch and records the loss, the age error and the learning rate of
    every iteration to ``lr_range_tests/lr_range_test_<experiment_name>_4.csv``.
    No validation pass is run and no checkpoints are written by this class.
    """

    # Shape of the sweep: number of learning-rate decades and points per decade.
    LR_RANGE_DECADES = 6
    LR_RANGE_STEPS_PER_DECADE = 1000

    def __init__(self,
                 model,
                 number_of_classes,
                 experiment_name,
                 optimizer,
                 optimizer_arguments={},
                 loss_function=torch.nn.MSELoss(),
                 model_name='BrainMapper',
                 number_epochs=10,
                 loss_log_period=5,
                 learning_rate_scheduler_step_size=5,
                 learning_rate_scheduler_gamma=0.5,
                 use_last_checkpoint=True,
                 experiment_directory='experiments',
                 logs_directory='logs',
                 checkpoint_directory='checkpoints',
                 best_checkpoint_directory = 'best_checkpoint_directory',
                 save_model_directory='saved_models',
                 learning_rate_validation_scheduler = False,
                 learning_rate_cyclical = False,
                 learning_rate_scheduler_patience=5,
                 learning_rate_scheduler_threshold=1e-6,
                 learning_rate_scheduler_min_value=5e-6,
                 learning_rate_scheduler_max_value=5e-5,
                 learning_rate_scheduler_step_number=13200,
                 early_stopping_patience=10,
                 early_stopping_min_delta=0,
                 age_prediction_loss_flag=False
                 ):
        """Create the optimiser, output folders, log writer and early stopper.

        Args:
            model: Network to run the range test on.
            number_of_classes: Forwarded to ``LogWriter``.
            experiment_name: Names the experiment folder, logs and CSV output.
            optimizer: Optimiser *class* (e.g. ``torch.optim.Adam``).
            optimizer_arguments: Keyword arguments for ``optimizer``. When
                ``weight_decay`` is present and non-zero, PReLU parameters are
                placed in a separate group with ``weight_decay=0.0``.
            loss_function: Callable ``loss(prediction, target)``.
            model_name: Name printed when training starts.
            number_epochs: Upper bound on the number of epochs.
            loss_log_period: Stored on the instance; not read by ``train``.
            use_last_checkpoint: Forwarded to ``LogWriter``. A checkpoint is
                only loaded when the instance provides ``load_checkpoint``.
            experiment_directory, checkpoint_directory,
            best_checkpoint_directory: Folders created under the experiment.
            logs_directory: Forwarded to ``LogWriter``.
            save_model_directory: Stored on the instance.
            early_stopping_patience, early_stopping_min_delta: Forwarded to
                ``EarlyStopping``.
            age_prediction_loss_flag: When ``True`` the loss is computed
                directly against the age target instead of the soft labels.
            learning_rate_scheduler_* / learning_rate_validation_scheduler /
            learning_rate_cyclical: Accepted for interface compatibility; this
                range-test solver sets the learning rate itself and ignores them.
        """

        self.age_prediction_loss_flag = age_prediction_loss_flag

        self.model = model
        self.parallelism = False

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        # A torch.device is never identical to a str, so compare its type.
        if self.device.type == "cpu":
            print("WARNING: Default device is CPU, not GPU!")
        elif torch.cuda.device_count()>1:
            self.parallelism = True
            print("ATTENTION! Multiple GPUs detected. {} GPUs will be used for training".format(torch.cuda.device_count()))
        else:
            print("A single GPU detected")

        # `.get` keeps the declared default (`{}`) usable instead of raising KeyError.
        if optimizer_arguments.get('weight_decay', 0) != 0:
            # Exclude PReLU parameters from weight decay by giving them their own group.
            prelus = {name for name, module in model.named_modules() if isinstance(module, torch.nn.PReLU)}
            prelu_parameter_names = {name for name, _ in model.named_parameters() if name.rsplit('.', 1)[0] in prelus}
            parameters = [
                {'params': [parameter for parameter_name, parameter in model.named_parameters() if parameter_name not in prelu_parameter_names]},
                {'params': [parameter for parameter_name, parameter in model.named_parameters() if parameter_name in prelu_parameter_names], 'weight_decay': 0.0}
            ]
        else:
            parameters = model.parameters()
        self.optimizer = optimizer(parameters, **optimizer_arguments)

        # Module-style losses are moved to the GPU; plain callables are used as they are.
        if torch.cuda.is_available() and hasattr(loss_function, 'to'):
            self.loss_function = loss_function.to(self.device)
        else:
            self.loss_function = loss_function

        self.model_name = model_name
        self.number_epochs = number_epochs
        self.loss_log_period = loss_log_period

        self.use_last_checkpoint = use_last_checkpoint

        experiment_directory_path = os.path.join(experiment_directory, experiment_name)
        self.experiment_directory_path = experiment_directory_path

        self.checkpoint_directory = checkpoint_directory
        self.best_checkpoint_directory = best_checkpoint_directory

        create_folder(experiment_directory)
        create_folder(experiment_directory_path)
        create_folder(os.path.join(experiment_directory_path, self.checkpoint_directory))
        create_folder(os.path.join(experiment_directory_path, self.best_checkpoint_directory))

        self.start_epoch = 1
        self.start_iteration = 1

        self.LogWriter = LogWriter(number_of_classes=number_of_classes,
                                   logs_directory=logs_directory,
                                   experiment_name=experiment_name,
                                   use_last_checkpoint=use_last_checkpoint
                                   )

        self.early_stop = False

        self.save_model_directory = save_model_directory
        self.final_model_output_file = experiment_name + ".pth.tar"

        self.best_score_early_stop = None
        self.counter_early_stop = 0
        self.previous_loss = None
        self.valid_epoch = None
        self.previous_age_deltas = None

        # This class defines no `load_checkpoint`; only call it when one is
        # available (e.g. added by a subclass) instead of raising AttributeError.
        checkpoint_loader = getattr(self, 'load_checkpoint', None)
        if use_last_checkpoint and callable(checkpoint_loader):
            checkpoint_loader()
            self.EarlyStopping = EarlyStopping(patience=early_stopping_patience, min_delta=early_stopping_min_delta, best_score=self.best_score_early_stop, counter=self.counter_early_stop)
        else:
            if use_last_checkpoint:
                print("WARNING: use_last_checkpoint is set but no load_checkpoint() is available. Starting from scratch!")
            self.EarlyStopping = EarlyStopping(patience=early_stopping_patience, min_delta=early_stopping_min_delta)

        self.bin_centers = np.load("datasets/bin_centers.npy")

        self.experiment_name = experiment_name

    @staticmethod
    def _build_lr_schedule(number_of_steps, steps):
        """Return the ascending learning-rate sweep used by the range test.

        Each decade ``[10**-(i+1), 10**-i]`` for ``i`` in
        ``range(number_of_steps)`` is sampled with ``steps`` linearly spaced
        points; the boundary shared by two neighbouring decades is kept once.

        Args:
            number_of_steps: Number of decades to cover (at least 1).
            steps: Number of linearly spaced points per decade.

        Returns:
            numpy.ndarray: 1-D array running from ``10**-number_of_steps`` to 1.

        Raises:
            ValueError: If ``number_of_steps`` is smaller than 1.
        """
        if number_of_steps < 1:
            raise ValueError("number_of_steps must be at least 1")

        segments = []
        for i in range(number_of_steps):
            segment = np.linspace(10.0 ** -i, 10.0 ** -(i + 1), steps)
            if i != (number_of_steps - 1):
                # The end point is the start point of the next decade.
                segment = segment[:-1]
            segments.append(segment)

        # Segments were generated from large to small; flip to sweep upwards.
        return np.flip(np.concatenate(segments))

    @staticmethod
    def _set_learning_rate(optimizer, learning_rate):
        """Apply ``learning_rate`` to every parameter group of ``optimizer``.

        All groups are updated so that the separate no-weight-decay PReLU group
        follows the sweep together with the main group.
        """
        for param_group in optimizer.param_groups:
            param_group['lr'] = learning_rate

    def train(self, train_loader, validation_loader):
        """Run the learning-rate range test over ``train_loader``.

        Args:
            train_loader: Iterable of batches ``(X, y, y_age, ...)``.
            validation_loader: Accepted for interface compatibility; not used.

        Side effects:
            Updates the model weights, logs loss and learning rate through
            ``LogWriter`` and writes the per-iteration statistics CSV.
        """

        model, optimizer = self.model, self.optimizer

        output_statistics = {}
        output_statistics_name = "lr_range_test_" + self.experiment_name + '_4' + ".csv"
        create_folder("lr_range_tests")
        output_statistics_path = os.path.join("lr_range_tests", output_statistics_name)

        number_of_steps = self.LR_RANGE_DECADES
        steps = self.LR_RANGE_STEPS_PER_DECADE
        self.max_iterations = number_of_steps * steps

        lrs = self._build_lr_schedule(number_of_steps, steps)
        lrs_counter = 0

        self._set_learning_rate(optimizer, lrs[lrs_counter])
        print('LEARNING RATE 0=', optimizer.param_groups[0]['lr'])

        if torch.cuda.is_available():
            torch.cuda.empty_cache()  # clear memory
            model.to(self.device)  # Moving the model to GPU

        print('****************************************************************')
        print('TRAINING IS STARTING!')
        print('=====================')
        print('Model Name: {}'.format(self.model_name))
        if torch.cuda.is_available():
            print('Device Type: {}'.format(
                torch.cuda.get_device_name(self.device)))
        else:
            print('Device Type: {}'.format(self.device))
        start_time = datetime.now()
        print('Started At: {}'.format(start_time))
        print('----------------------------------------')

        iteration = self.start_iteration

        for epoch in range(self.start_epoch, self.number_epochs+1):

            print("Epoch {}/{}".format(epoch, self.number_epochs))
            print('-> Phase: {}'.format('train'))

            model.train()

            for batch_index, sampled_batch in enumerate(train_loader):
                X = sampled_batch[0].type(torch.FloatTensor)
                y = sampled_batch[1].type(torch.FloatTensor)
                y += 1e-16 # to prevent log(0) problem
                y_age = sampled_batch[2]

                # We add an extra dimension (~ number of channels) for the 3D convolutions.
                if len(X.size())<5:
                    X = torch.unsqueeze(X, dim=1)

                if torch.cuda.is_available():
                    X = X.cuda(self.device, non_blocking=True)
                    y = y.cuda(self.device, non_blocking=True)

                y_hat = model(X)   # Forward pass

                if self.age_prediction_loss_flag == True:
                    # Cast and reshape on every device so that the target
                    # matches the prediction on CPU as well as on GPU.
                    y_age = y_age.type(torch.FloatTensor)
                    y_age = y_age.reshape(-1,1)
                    if torch.cuda.is_available():
                        y_age = y_age.cuda(self.device, non_blocking=True)

                    loss = self.loss_function(y_hat, y_age)

                    y_hat_age = np.float32(y_hat.detach().cpu().numpy())
                    y_age = np.float32(y_age.detach().cpu().numpy())
                    age_delta = mae(y_hat_age, y_age)
                else:
                    y_hat = torch.squeeze(y_hat)
                    loss = self.loss_function(y_hat, y)
                    y_hat = np.float32(y_hat.detach().cpu().numpy())
                    # Expected age: exp(output) weighted by the bin centres.
                    y_hat_age = np.matmul(np.exp(y_hat), self.bin_centers)
                    y_age = np.float32(y_age.detach().cpu().numpy())
                    age_delta = mae(y_hat_age, y_age)

                optimizer.zero_grad()  # Zero the parameter gradients
                loss.backward()  # Backward propagation
                optimizer.step()

                # Advance the sweep; once exhausted the last rate is kept.
                if lrs_counter<(len(lrs)-1):
                    lrs_counter+=1
                    self._set_learning_rate(optimizer, lrs[lrs_counter])
                    print('----> LEARNING RATE = ', optimizer.param_groups[0]['lr'])

                self.LogWriter.loss_per_iteration(loss.item(), batch_index, iteration)
                self.LogWriter.learning_rate_per_iteration(optimizer.param_groups[0]['lr'], batch_index, iteration)

                output_statistics[iteration] = [iteration, loss.item(), age_delta, optimizer.param_groups[0]['lr']]

                iteration += 1

                # Clear the memory
                del X, y, y_hat, loss, y_hat_age, y_age
                torch.cuda.empty_cache()

                # `>=` also stops a run whose start iteration is already past the limit.
                if iteration >= self.max_iterations:
                    break

            if iteration >= self.max_iterations:
                break

            print("Epoch {}/{} DONE!".format(epoch, self.number_epochs))

        output_statistics_df = pd.DataFrame.from_dict(output_statistics, orient='index', columns=['iteration', 'kldloss', 'agedelta', 'lr'])
        output_statistics_df.to_csv(output_statistics_path)

        self.LogWriter.close()

        print('----------------------------------------')
        print('NO TRAINING DONE TO PREVENT OVERFITTING!')
        print('=====================')
        end_time = datetime.now()
        print('Completed At: {}'.format(end_time))
        print('Training Duration: {}'.format(end_time - start_time))
        print('****************************************************************')
        
# Launch the learning-rate range test with the four settings sections.
# NOTE(review): the following cell repeats this exact call; running both
# executes the test twice and the second run rewrites the same CSV file.
train(
    data_parameters=data_parameters,
    training_parameters=training_parameters,
    network_parameters=network_parameters,
    misc_parameters=misc_parameters,
)

In [None]:
# Run the learning-rate range test with the four settings sections.
# NOTE(review): the same call already appears at the end of the previous cell;
# keep only one of the two to avoid running the sweep twice.
train(
    data_parameters=data_parameters,
    training_parameters=training_parameters,
    network_parameters=network_parameters,
    misc_parameters=misc_parameters,
)