I want to construct a model which satisfies as many sanity checks as I can think of to allow me to be sure that my progress is solid.

The main problem here is that I found weird values in the validation set in 3.19: predicting directly on the validation set gives me very different results compared to the values from the logs, and it should be the same. I don't know if this is because of the (wrong) data augmentation I implemented in the validation set, the mutliGPU training or some other error along the way.

Also, I can see many instances where the dice score for class 1 is zero when predicting on the validation data set

First I will run the sanity checks with 1 GPU only and then see how they behave with multiple GPUs

1. Run model for 1 epoch using one patch for training and one patch for validation. Check:
    - The logged validation loss corresponds to the validation loss I get by predicting the validation dataset using the trained model
    - Same for the train loss
    - Check if there are values with dice score for class 1 equal to zero in training or validation sets

In [1]:
from tomoSegmentPipeline import dataloader as dl
from tomoSegmentPipeline.utils import setup
from tomoSegmentPipeline.dataloader import to_categorical, transpose_to_channels_first, tomoSegment_dummyDataset, tomoSegment_dataset
from tomoSegmentPipeline.training import Train
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

import os

PARENT_PATH = setup.PARENT_PATH

import torch
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from glob import glob
from tomoSegmentPipeline.showcaseResults import (predict_fullTomogram, load_model, load_tomoData, Tversky_index, Tversky_loss, Tversky1_loss,
                                        fullTomogram_modelComparison, make_comparison_plot, write_comparison_gif, save_classPred)

from tensorboard.backend.event_processing.event_accumulator import EventAccumulator


val_tomos = ['tomo32', 'tomo10']
concat_val_ids = sorted([s.replace('tomo', '') for s in val_tomos])
concat_val_ids = '-'.join(concat_val_ids)

test_tomos = ['tomo38', 'tomo04']
concat_test_ids = sorted([s.replace('tomo', '') for s in test_tomos])
concat_test_ids = '-'.join(concat_test_ids)


input_type = 'cryoCARE+isoNET'

train_tomos = ['tomo02', 'tomo03', 'tomo17']

concat_train_ids = sorted([s.replace('tomo', '') for s in train_tomos])
concat_train_ids = '-'.join(concat_train_ids)

%matplotlib inline
%config Completer.use_jedi = False
%load_ext autoreload
%autoreload 2

# Single GPU

In [None]:
chkpnt = None

tb_logdir = os.path.join(PARENT_PATH, 'data/model_logs/sanity_checks/logs/1EpochBaseline/%s/' %input_type)
model_name = 'Baseline'

epochs = 1
Ncl = 2
dim_in = 84
lr = 1e-4
weight_decay = 0
Lrnd = 18
augment_data = True
batch_size = 22
pretrained_model = None

trainer = Train(Ncl=Ncl, dim_in=dim_in, lr=lr, weight_decay=weight_decay, Lrnd=Lrnd, tensorboard_logdir=tb_logdir,
model_name=model_name, augment_data=augment_data, 
batch_size=batch_size, epochs=epochs, pretrained_model=pretrained_model)

early_stop_callback = EarlyStopping(
        monitor='hp/val_loss',
        min_delta=1e-4,
        patience=100,
        verbose=True,
        mode='min'
        )

trainer = trainer.launch(train_tomos, val_tomos, input_type=input_type, num_gpus=1, accelerator=None,
                         num_workers=1, resume_from_checkpoint=chkpnt, train_callbacks=[early_stop_callback])

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]
Missing logger folder: /home/haicu/jeronimo.carvajal/Thesis/data/model_logs/sanity_checks/logs/1EpochBaseline/cryoCARE+isoNET/
Set SLURM handle signals.

  | Name    | Type          | Params
------------------------------------------
0 | loss_fn | Tversky1_loss | 0     
1 | layer1  | Sequential    | 28.6 K
2 | layer2  | Sequential    | 103 K 
3 | layer3  | Sequential    | 558 K 
4 | layer4  | Sequential    | 288 K 
5 | layer5  | Sequential    | 96.9 K
------------------------------------------
1.1 M     Trainable params
0         Non-trainable params
1.1 M     Total params
4.304     Total estimated model params size (MB)


"Ncl":           2
"loss_fn":       Tversky1_loss()
"lr":            0.0001
"pretrain_type": None
"weight_decay":  0


Metric hp/val_loss improved. New best score: 0.770


Saving model at:  /home/haicu/jeronimo.carvajal/Thesis/data/model_logs/sanity_checks/logs/1EpochBaseline/cryoCARE+isoNET/version_0/Baseline_ep1_in84_lr0.000100_0.model


In [2]:
logdir_path = '/home/haicu/jeronimo.carvajal/Thesis/data/model_logs/sanity_checks/logs/1EpochBaseline/cryoCARE+isoNET/version_0/'

model_file = PARENT_PATH+'data/model_logs/sanity_checks/logs/1EpochBaseline/cryoCARE+isoNET/version_0/Baseline_ep1_in84_lr0.000100_0.model'

model_file_split = model_file.split('/')

model_info = []

input_type = model_file_split[-3]

name, epochs, patch_size, lr, version = model_file_split[-1].split('_')
epochs = int(epochs.replace('ep', ''))
version = 'v'+version.replace('.model', '')

events_path = glob(os.path.join(logdir_path, 'events.*'))[0]
event_acc = EventAccumulator(events_path)
event_acc.Reload()

_, step_nums, values_valLoss = zip(*event_acc.Scalars('hp/val_loss'))
best_val_loss_epoch = np.min(values_valLoss)
best_val_loss_epoch_idx = np.argmin(values_valLoss) #index starts count at 0

effective_epochs = len(values_valLoss)

_, _, values_dice = zip(*event_acc.Scalars('hp/val_dice'))
_, _, values_trainLoss = zip(*event_acc.Scalars('hp/train_loss_epoch'))

associated_val_class1_dice = float(values_dice[best_val_loss_epoch_idx])
associated_train_loss_epoch = float(values_trainLoss[best_val_loss_epoch_idx])

epochs_str = "%i out of %i" %(effective_epochs, len(values_trainLoss))

model_info.append([name, model_file, input_type, epochs_str, patch_size, lr, version, best_val_loss_epoch, associated_val_class1_dice])

df_model = pd.DataFrame(model_info, columns=['name', 'model_file', 'input_type', 'epochs', 'patch_size', 'lr', 'version', 'best_val_loss_epoch',
                                         'associated_val_class1_dice'])

df_model['batch_size'] = 22

print(df_model.shape)
df_model.head()

(1, 10)


Unnamed: 0,name,model_file,input_type,epochs,patch_size,lr,version,best_val_loss_epoch,associated_val_class1_dice,batch_size
0,Baseline,/home/haicu/jeronimo.carvajal/Thesis/data/mode...,cryoCARE+isoNET,1 out of 1,in84,lr0.000100,v0,0.76955,0.23045,22


# Predict using validation DataLoader yields same results as logs: YES

In [3]:
loss_fn = Tversky1_loss()

nCenters_dict = {'in56':4, 'in84':2}

dice1_test_bestModels = []

for i, row in df_model.iterrows():
    model_fileList = [row['model_file']]
    n_centers_list = [nCenters_dict[row['patch_size']]]
    
    input_type = row['input_type']
    batch_size = row['batch_size']
    dim_in = row['patch_size'].replace('in', '')
    dim_in = int(dim_in)
    
    model = load_model(row['model_file'], Nclass=2)
    
    paths_valData, paths_valTarget = setup.get_paths(val_tomos, input_type)
    
    my_dataset = dl.tomoSegment_dataset(paths_valData, paths_valTarget, dim_in=dim_in, Ncl=3, Lrnd=0, augment_data=False)
    val_loader = DataLoader(my_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=1)

    dice1_avg = []
    val_loss_avg = []
 
    print('Predicting for %s using Validation DataLoader...\n' %input_type)
    for patch, target in val_loader:

        model.eval()
        with torch.no_grad():
            y_pred = model(patch)

        y_true = target.to('cuda')

        dice_by_class = Tversky_index(y_pred, y_true).to('cpu')
        dice1_avg.append(float(dice_by_class[1]))

        val_loss = loss_fn(y_pred, y_true).to('cpu')
        val_loss_avg.append(float(val_loss))

    dice1_avg = np.mean(dice1_avg)
    avg_loss = np.mean(val_loss_avg)
    dice1_test_bestModels.append(np.mean(dice1_avg))

Predicting for cryoCARE+isoNET using Validation DataLoader...



In [5]:
df_model['loss_val_patches_dataLoader'] = avg_loss
df_model['dice1_val_patches_dataLoader'] = dice1_test_bestModels
df_model

Unnamed: 0,name,model_file,input_type,epochs,patch_size,lr,version,best_val_loss_epoch,associated_val_class1_dice,batch_size,loss_val_patches_dataLoader,dice1_val_patches_dataLoader
0,Baseline,/home/haicu/jeronimo.carvajal/Thesis/data/mode...,cryoCARE+isoNET,1 out of 1,in84,lr0.000100,v0,0.76955,0.23045,22,0.770444,0.229556


# Check how much difference there is between predicting using my method: batch_size=1

In [20]:
Nclass_data = 3
Nclass_model = 2
z, y, x = 3*[dim_in]

loss_fn = Tversky1_loss()

dice1_average = []
val_loss_avg = []

for instance in my_dataset:
    patch, target = instance
    patch = torch.as_tensor(patch).unsqueeze(0).to("cuda")
    with torch.no_grad():
        y_pred = model(patch)
        
    y_true = torch.zeros((1, Nclass_data, z, y, x))
    y_true[0, :, :, :, :] = target
    y_true = y_true.to('cuda')
    
    dice_by_class = Tversky_index(y_pred, y_true).to('cpu')
    dice1_average.append(float(dice_by_class[1]))

    val_loss = loss_fn(y_pred, y_true).to('cpu')
    val_loss_avg.append(float(val_loss))

# print(val_loss_avg)
dice1_average = np.mean(dice1_average)
val_loss_avg = np.mean(val_loss_avg)

In [15]:
df_model['loss_val_patches'] = np.mean(val_loss_avg)
df_model['dice1_val_patches'] = dice_average
df_model

Unnamed: 0,name,model_file,input_type,epochs,patch_size,lr,version,best_val_loss_epoch,associated_val_class1_dice,batch_size,loss_val_patches_dataLoader,dice1_val_patches_dataLoader,loss_val_patches,dice1_val_patches,loss_val_patches_dataLoader_bs1,dice1_val_patches_dataLoader_bs1
0,Baseline,/home/haicu/jeronimo.carvajal/Thesis/data/mode...,cryoCARE+isoNET,1 out of 1,in84,lr0.000100,v0,0.76955,0.23045,22,0.770444,0.229556,0.795742,1.204258,0.795742,0.204258


In [8]:
loss_fn = Tversky1_loss()

nCenters_dict = {'in56':4, 'in84':2}

dice1_test_bestModels = []

for i, row in df_model.iterrows():
    model_fileList = [row['model_file']]
    n_centers_list = [nCenters_dict[row['patch_size']]]
    
    input_type = row['input_type']
    batch_size = row['batch_size']
    dim_in = row['patch_size'].replace('in', '')
    dim_in = int(dim_in)
    
    model = load_model(row['model_file'], Nclass=2)
    
    paths_valData, paths_valTarget = setup.get_paths(val_tomos, input_type)
    
    my_dataset = dl.tomoSegment_dataset(paths_valData, paths_valTarget, dim_in=dim_in, Ncl=3, Lrnd=0, augment_data=False)
    val_loader = DataLoader(my_dataset, batch_size=1, shuffle=True, pin_memory=True, num_workers=1)

    dice1_avg = []
    val_loss_avg = []
 
    print('Predicting for %s using Validation DataLoader...\n' %input_type)
    for patch, target in val_loader:

        model.eval()
        with torch.no_grad():
            y_pred = model(patch)

        y_true = target.to('cuda')

        dice_by_class = Tversky_index(y_pred, y_true).to('cpu')
        dice1_avg.append(float(dice_by_class[1]))

        val_loss = loss_fn(y_pred, y_true).to('cpu')
        val_loss_avg.append(float(val_loss))

    dice1_avg = np.mean(dice1_avg)
    avg_loss = np.mean(val_loss_avg)
    dice1_test_bestModels.append(np.mean(dice1_avg))

Predicting for cryoCARE+isoNET using Validation DataLoader...



In [None]:
df_model['loss_val_patches_dataLoader_bs1'] = avg_loss
df_model['dice1_val_patches_dataLoader_bs1'] = dice1_test_bestModels
df_model

Unnamed: 0,name,model_file,input_type,epochs,patch_size,lr,version,best_val_loss_epoch,associated_val_class1_dice,batch_size,loss_val_patches_dataLoader,dice1_val_patches_dataLoader,loss_val_patches,dice1_val_patches,loss_val_patches_dataLoader_bs1,dice1_val_patches_dataLoader_bs1
0,Baseline,/home/haicu/jeronimo.carvajal/Thesis/data/mode...,cryoCARE+isoNET,1 out of 1,in84,lr0.000100,v0,0.76955,0.23045,22,0.770444,0.229556,0.795742,1.204258,0.795742,0.204258


# Multiple GPU: models_2 folder (not a dummy model)

In [4]:
logdir_path = '/home/haicu/jeronimo.carvajal/Thesis/data/model_logs/models_2/logs/BaselineModel/cryoCARE+isoNET/train02-03-17/version_0/'

model_file = PARENT_PATH+'data/model_logs/models_2/logs/BaselineModel/cryoCARE+isoNET/train02-03-17/version_0/Baseline_ep1000_in84_lr0.000100_0.model'

model_file_split = model_file.split('/')

model_info = []

input_type = model_file_split[-4]

name, epochs, patch_size, lr, version = model_file_split[-1].split('_')
epochs = int(epochs.replace('ep', ''))
version = 'v'+version.replace('.model', '')

events_path = glob(os.path.join(logdir_path, 'events.*'))[0]
event_acc = EventAccumulator(events_path)
event_acc.Reload()

_, step_nums, values_valLoss = zip(*event_acc.Scalars('hp/val_loss'))
best_val_loss_epoch = np.min(values_valLoss)
best_val_loss_epoch_idx = np.argmin(values_valLoss) #index starts count at 0

effective_epochs = len(values_valLoss)

_, _, values_dice = zip(*event_acc.Scalars('hp/val_dice'))
_, _, values_trainLoss = zip(*event_acc.Scalars('hp/train_loss_epoch'))

associated_val_class1_dice = float(values_dice[best_val_loss_epoch_idx])
associated_train_loss_epoch = float(values_trainLoss[best_val_loss_epoch_idx])

epochs_str = "%i out of %i" %(effective_epochs, 1000)

model_info.append([name, model_file, input_type, epochs_str, patch_size, lr, version, best_val_loss_epoch, associated_val_class1_dice])

df_model = pd.DataFrame(model_info, columns=['name', 'model_file', 'input_type', 'epochs', 'patch_size', 'lr', 'version', 'best_val_loss_epoch',
                                         'associated_val_class1_dice'])

df_model['batch_size'] = 22

print(df_model.shape)
df_model.head()

(1, 10)


Unnamed: 0,name,model_file,input_type,epochs,patch_size,lr,version,best_val_loss_epoch,associated_val_class1_dice,batch_size
0,Baseline,/home/haicu/jeronimo.carvajal/Thesis/data/mode...,cryoCARE+isoNET,431 out of 1000,in84,lr0.000100,v0,0.086578,0.913422,22


# Predict using validation DataLoader yields same results as logs: YES

In [5]:
loss_fn = Tversky1_loss()

nCenters_dict = {'in56':4, 'in84':2}

dice1_test_bestModels = []

for i, row in df_model.iterrows():
    model_fileList = [row['model_file']]
    n_centers_list = [nCenters_dict[row['patch_size']]]
    
    input_type = row['input_type']
    batch_size = row['batch_size']
    dim_in = row['patch_size'].replace('in', '')
    dim_in = int(dim_in)
    
    model = load_model(row['model_file'], Nclass=2)
    
    paths_valData, paths_valTarget = setup.get_paths(val_tomos, input_type)
    
    my_dataset = dl.tomoSegment_dataset(paths_valData, paths_valTarget, dim_in=dim_in, Ncl=3, Lrnd=0, augment_data=False)
    val_loader = DataLoader(my_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=1)

    dice1_avg = []
    val_loss_avg = []
 
    print('Predicting for %s using Validation DataLoader...\n' %input_type)
    for patch, target in val_loader:

        model.eval()
        with torch.no_grad():
            y_pred = model(patch)

        y_true = target.to('cuda')

        dice_by_class = Tversky_index(y_pred, y_true).to('cpu')
        dice1_avg.append(float(dice_by_class[1]))

        val_loss = loss_fn(y_pred, y_true).to('cpu')
        val_loss_avg.append(float(val_loss))

    dice1_avg = np.mean(dice1_avg)
    avg_loss = np.mean(val_loss_avg)
    dice1_test_bestModels.append(np.mean(dice1_avg))

Predicting for cryoCARE+isoNET using Validation DataLoader...



In [6]:
df_model['loss_val_patches_dataLoader'] = avg_loss
df_model['dice1_val_patches_dataLoader'] = dice1_test_bestModels
df_model

Unnamed: 0,name,model_file,input_type,epochs,patch_size,lr,version,best_val_loss_epoch,associated_val_class1_dice,batch_size,loss_val_patches_dataLoader,dice1_val_patches_dataLoader
0,Baseline,/home/haicu/jeronimo.carvajal/Thesis/data/mode...,cryoCARE+isoNET,431 out of 1000,in84,lr0.000100,v0,0.086578,0.913422,22,0.085344,0.914656
