In [19]:
from __future__ import print_function

import pdb
import os
import math
from random import Random

# internal imports
from utils.file_utils import save_pkl, load_pkl
from utils.utils import *
from utils.core_utils import train
from datasets.dataset_generic import Generic_WSI_Classification_Dataset, Generic_MIL_Dataset

# pytorch imports
import torch
from torch.utils.data import DataLoader, sampler
import torch.nn as nn
import torch.nn.functional as F

import pandas as pd
import numpy as np

In [20]:
# create splits

seed = 10
patch_dir = "image_sets/patches/"
dest_dir = "image_sets/splits/"
    
for folder in [dest_dir, dest_dir+"train", dest_dir+"test", dest_dir+"val"]:
    if not os.path.exists(folder):
        os.makedirs(folder)

# [train, test, val]
split_ratios = [0.76, 0.12, 0.12]

In [21]:
patches_list = sorted(os.listdir(patch_dir))
# print(patches_list)
Random(seed).shuffle(patches_list)
# print(patches_list)

patches_list_length = len(patches_list)
train_length = math.ceil(split_ratios[0] * patches_list_length)
test_length = math.ceil(split_ratios[1] * patches_list_length)
val_length = patches_list_length - train_length - test_length
print(patches_list_length, train_length, test_length, val_length)

train_dataset = patches_list[:train_length]
test_dataset = patches_list[train_length:train_length + test_length]
val_dataset = patches_list[train_length + test_length:]
# print(len(train), len(test), len(val))

141 108 17 16


In [22]:
# Generic training settings
# Configurations for WSI Training

data_root_dir = "image_sets/patches/"
max_epochs = 200
lr = 1e-4
label_frac = 1.0
reg = 1e-5
seed = 1
k = 10
k_start = -1
k_end = -1
results_dir = "image_sets/results"
split_dir = "fungal_vs_nonfungal_100"
log_data = False
testing = False
early_stopping = False
opt = 'adam'
drop_out = False
bag_loss = 'ce'
model_type = 'clam_sb'
weighted_sample = False
model_size = 'small'
task = 'task_fungal_vs_nonfungal'
### CLAM specific options
no_inst_cluster = False
inst_loss = None
subtyping = False
bag_weight = 0.7
B = 8

exp_code = "exp_0"

device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [23]:
def seed_torch(seed=7):
    import random
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if device.type == 'cuda':
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


seed_torch(seed)

encoding_size = 1024
settings = {'num_splits': k,
            'k_start': k_start,
            'k_end': k_end,
            'task': task,
            'max_epochs': max_epochs,
            'results_dir': results_dir,
            'lr': lr,
            'experiment': exp_code,
            'reg': reg,
            'label_frac': label_frac,
            'bag_loss': bag_loss,
            'seed': seed,
            'model_type': model_type,
            'model_size': model_size,
            "use_drop_out": drop_out,
            'weighted_sample': weighted_sample,
            'opt': opt,
            'data_root_dir': None,
            'label_frac': 1.0,
            'k': 10,
            'split_dir': None,
            'log_data': False,
            'testing': False,
            'early_stopping': False,
            'dropout': False,
            'no_inst_cluster': False,
            'inst_loss': None,
            'subtyping': False,
            'bag_weight': 0.7,
            'B': 8
            }

if model_type in ['clam_sb', 'clam_mb']:
    settings.update({'bag_weight': bag_weight,
                     'inst_loss': inst_loss,
                     'B': B})

print('\nLoad Dataset')


if task == 'task_fungal_vs_nonfungal':
    n_classes = 2
    dataset = Generic_MIL_Dataset(csv_path='dataset_csv/fungal_vs_nonfungal.csv',
                                  data_dir=os.path.join(
                                      data_root_dir, 'fungal_vs_nonfungal_resnet_features'),
                                  shuffle=False,
                                  seed=seed,
                                  print_info=True,
                                  label_dict={'nonfungal': 0, 'fungal': 1},
                                  patient_strat=False,
                                  ignore=[])

elif task == 'task_1_tumor_vs_normal':
    n_classes = 2
    dataset = Generic_MIL_Dataset(csv_path='dataset_csv/tumor_vs_normal_dummy_clean.csv',
                                  data_dir=os.path.join(
                                      data_root_dir, 'tumor_vs_normal_resnet_features'),
                                  shuffle=False,
                                  seed=seed,
                                  print_info=True,
                                  label_dict={'normal_tissue': 0,
                                              'tumor_tissue': 1},
                                  patient_strat=False,
                                  ignore=[])

elif task == 'task_2_tumor_subtyping':
    n_classes = 3
    dataset = Generic_MIL_Dataset(csv_path='dataset_csv/tumor_subtyping_dummy_clean.csv',
                                  data_dir=os.path.join(
                                      data_root_dir, 'tumor_subtyping_resnet_features'),
                                  shuffle=False,
                                  seed=seed,
                                  print_info=True,
                                  label_dict={'subtype_1': 0,
                                              'subtype_2': 1, 'subtype_3': 2},
                                  patient_strat=False,
                                  ignore=[])

    if model_type in ['clam_sb', 'clam_mb']:
        assert subtyping

else:
    raise NotImplementedError

if not os.path.isdir(results_dir):
    os.mkdir(results_dir)

results_dir = os.path.join(results_dir, str(exp_code) + '_s{}'.format(seed))
if not os.path.isdir(results_dir):
    os.mkdir(results_dir)

if split_dir is None:
    split_dir = os.path.join('splits', task+'_{}'.format(int(label_frac*100)))
else:
    split_dir = os.path.join('splits', split_dir)

# print('split_dir: ', split_dir)
# assert os.path.isdir(split_dir)

# settings.update({'split_dir': split_dir})


with open(results_dir + '/experiment_{}.txt'.format(exp_code), 'w') as f:
    print(settings, file=f)
f.close()

print("################# Settings ###################")
for key, val in settings.items():
    print("{}:  {}".format(key, val))



Load Dataset
    case_id   slide_id label
0    case_0    slide_0     1
1    case_0    slide_1     0
2    case_0    slide_2     0
3    case_0    slide_3     0
4    case_0    slide_4     1
..      ...        ...   ...
135  case_9  slide_135     1
136  case_9  slide_136     1
137  case_9  slide_137     1
138  case_9  slide_138     1
139  case_9  slide_139     1

[140 rows x 3 columns]
label column: label
label dictionary: {'nonfungal': 0, 'fungal': 1}
number of classes: 2
slide-level counts:  
 0      3
1    137
Name: label, dtype: int64
Patient-LVL; Number of samples registered in class 0: 0
Slide-LVL; Number of samples registered in class 0: 3
Patient-LVL; Number of samples registered in class 1: 10
Slide-LVL; Number of samples registered in class 1: 137
################# Settings ###################
num_splits:  10
k_start:  -1
k_end:  -1
task:  task_fungal_vs_nonfungal
max_epochs:  200
results_dir:  image_sets/results
lr:  0.0001
experiment:  exp_0
reg:  1e-05
label_frac:  1.0
bag_lo

In [24]:
# main

# create results directory if necessary
if not os.path.isdir(results_dir):
    os.mkdir(results_dir)

if k_start == -1:
    start = 0
else:
    start = k_start
if k_end == -1:
    end = k
else:
    end = k_end

all_test_auc = []
all_val_auc = []
all_test_acc = []
all_val_acc = []
folds = np.arange(start, end)
for i in folds:
    seed_torch(seed)
    train_dataset, val_dataset, test_dataset = dataset.return_splits(from_id=False, 
            csv_path='{}/splits_{}.csv'.format(split_dir, i))
    
#     datasets = (train_dataset, val_dataset, test_dataset)
    
    results, test_auc, val_auc, test_acc, val_acc  = train(datasets, i, settings)
    all_test_auc.append(test_auc)
    all_val_auc.append(val_auc)
    all_test_acc.append(test_acc)
    all_val_acc.append(val_acc)
    #write results to pkl
    filename = os.path.join(results_dir, 'split_{}_results.pkl'.format(i))
    save_pkl(filename, results)

final_df = pd.DataFrame({'folds': folds, 'test_auc': all_test_auc, 
    'val_auc': all_val_auc, 'test_acc': all_test_acc, 'val_acc' : all_val_acc})

if len(folds) != k:
    save_name = 'summary_partial_{}_{}.csv'.format(start, end)
else:
    save_name = 'summary.csv'
final_df.to_csv(os.path.join(results_dir, save_name))

Settings: {'num_splits': 10, 'k_start': -1, 'k_end': -1, 'task': 'task_fungal_vs_nonfungal', 'max_epochs': 200, 'results_dir': 'image_sets/results', 'lr': 0.0001, 'experiment': 'exp_0', 'reg': 1e-05, 'label_frac': 1.0, 'bag_loss': 'ce', 'seed': 1, 'model_type': 'clam_sb', 'model_size': 'small', 'use_drop_out': False, 'weighted_sample': False, 'opt': 'adam', 'data_root_dir': None, 'k': 10, 'split_dir': None, 'log_data': False, 'testing': False, 'early_stopping': False, 'dropout': False, 'no_inst_cluster': False, 'inst_loss': None, 'subtyping': False, 'bag_weight': 0.7, 'B': 8}

Training Fold 0!

Init train/val/test splits... (['F007a13', 'F030a15', 'F010a01', 'F033a03', 'F033a14', 'F034a09', 'F007a11', 'F009a02', 'F007a16', 'F030a01', 'F007a06', 'F007a03', 'F018a02', 'F013a04', 'F033a11', 'F030a04', 'F013a13', 'F012a03', 'F030a09', 'F007a08', 'F013a08', 'F009a01', 'F018a11', 'F006a02', 'F012a01', 'F033a26', 'F033a09', 'F006a01', 'F009a04', 'F018a09', 'F033a21', 'F007a04', 'F018a05', 'F0

AttributeError: 'list' object has no attribute 'slide_data'