In [1]:
import os
import pandas as pd

import torch
from torch.utils.data import DataLoader

from gliopath.train.task.cat import seed_torch, train, EmbeddingDataset, TaskHead
from gliopath.utils.proces import split_dataset

# Work from the project root so the relative data/output paths used below resolve.
# The location can be overridden with the GLIOPATH_WORKDIR environment variable;
# the default is the original author's local checkout.
os.chdir(os.environ.get('GLIOPATH_WORKDIR', 'F:/workspace/pathology/gigapath'))

In [2]:
# ---- Experiment configuration ----
# Random seed for the run
seed = 42
# Metadata table (tab-separated), read relative to the current working directory.
# Forward slashes keep the path valid on Windows and POSIX alike.
dataset_df = pd.read_table('data/metadata.tbl', sep='\t')
# File with the precomputed embeddings handed to EmbeddingDataset
embed_path = 'output/all_slides_embeds.pt'
# Forwarded to EmbeddingDataset as its `z_score` option
z_score = False
# Column holding the class label; the number of classes is its distinct-value count
type_col = 'tumour_type'
num_classes = dataset_df[type_col].nunique()
batch_size = 4   # should be set as much larger number if not at test phase
num_workers = 4  # set for the cores of cpu
# First argument of TaskHead — presumably the embedding width; confirm against the model
embed_dim = 1536

# Names of the data splits and the metadata columns that identify them
splits = ['train', 'val', 'test']
split_col = 'split_col'
id_col = 'id'
# Keyword arguments forwarded to `train`
params = {
    'lr': 0.02,
    'min_lr': 0.0,
    'train_iters': 4000,
    'eval_interval': 100,
    'optim': 'sgd',
    'output_dir': 'output/models/classification',
    'weight_decay': 0.01,
}

In [3]:
# Seed the random number generators with the configured `seed` (it was previously
# ignored in favour of a literal 0). Pick the device the same way the manual
# loop below does, so the notebook also runs on a CPU-only machine.
# NOTE(review): assumes seed_torch accepts a CPU device — confirm in gliopath.
seed_torch(torch.device('cuda' if torch.cuda.is_available() else 'cpu'), seed)

# Assign every row of the metadata to a train/val/test split, using the column
# names from the configuration cell instead of repeating them as literals.
dataset_df = split_dataset(
    dataset_df,
    id_col=id_col,
    type_col=type_col,
    val_split=0.2,
    test_split=0.1,
    in_df=True,
    split_col=split_col,
)

# Build one dataset per split, in the order given by `splits`
train_dataset, val_dataset, test_dataset = [
    EmbeddingDataset(
        dataset_df,
        embed_path,
        split_col=split_col,
        split=split,
        id_col=id_col,
        type_col=type_col,
        z_score=z_score,
    )
    for split in splits
]

# Report how many samples ended up in each split
print(f'Sample size:\nTrain: {len(train_dataset)}\tVal: {len(val_dataset)}\tTest: {len(test_dataset)}')

Sample size:
Train: 68	Val: 20	Test: 12


  collated_dict = torch.load(self.embed_path)


In [4]:
# Options common to all three loaders
_loader_kwargs = dict(batch_size=batch_size, num_workers=num_workers, pin_memory=True)

# Training indices are drawn at random *with replacement*. The sampler is not
# endless by itself: with no `num_samples` given, one pass yields
# len(train_dataset) indices.
train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset, replacement=True)
train_loader = DataLoader(train_dataset, sampler=train_sampler, **_loader_kwargs)

# Validation and test data are read in a fixed order
val_loader, test_loader = (
    DataLoader(eval_dataset, shuffle=False, **_loader_kwargs)
    for eval_dataset in (val_dataset, test_dataset)
)

# Instantiate the task head from the configured embedding size and class count
model = TaskHead(embed_dim, num_classes)

In [None]:
# Train the model with the project's `train` helper: it receives the model, the
# three data loaders, and every entry of `params` as a keyword argument.
# NOTE(review): this cell has no execution count in the saved notebook, so it
# was apparently not run in the recorded session.
train(model, train_loader, val_loader, test_loader, **params)

In [5]:
# Rebind `train_loader` for the manual loop below. Unlike the loader built
# earlier, this one has no sampler, no pinned memory, and num_workers=0 (data is
# loaded in the main process), so batches come in the dataset's own order.
# NOTE(review): this overwrites the earlier `train_loader`; re-run the loader
# cell before calling `train` again.
train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=0)

In [6]:
import tensorboard
# Hyperparameters for the manual loop below, read from `params` so they cannot
# drift from the values handed to `train`.
train_iters = params['train_iters']
min_lr = params['min_lr']
lr = params['lr']
weight_decay = params['weight_decay']
# `params` has no momentum entry; use plain SGD without momentum
momentum = 0

In [7]:
import torch.nn as nn
import itertools

In [8]:
# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Set the optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)

# Set the learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=train_iters, eta_min=min_lr)

# Set the loss function
criterion = nn.CrossEntropyLoss()


def _repeat_loader(loader):
    """Yield batches from `loader` indefinitely, restarting it after every pass.

    `itertools.cycle` keeps a copy of every item from the first pass and replays
    those copies afterwards, so all batches would stay in memory and later
    passes would repeat the first one exactly. Re-iterating the loader avoids
    both. An empty loader ends the generator instead of spinning forever.
    """
    while True:
        produced_any = False
        for batch in loader:
            produced_any = True
            yield batch
        if not produced_any:
            return


# Set the infinite train loader
infinite_train_loader = _repeat_loader(train_loader)

best_f1 = 0
# Debugging probe of the training loop: it runs the forward pass on the first
# three batches, computes the loss on the third, and stops. There is no
# backward pass or optimizer step, so the model is not trained here. `embed`,
# `category`, `output` and `loss` remain available for inspection afterwards.
print('Start training')
for idx, (embed, category) in enumerate(infinite_train_loader):
    if idx >= train_iters:
        break

    embed, category = embed.to(device), category.to(device)

    # Forward pass
    output = model(embed)
    # Skip the loss for the first two batches
    if idx < 2:
        continue
    loss = criterion(output, category)

    # Stop after the first loss computation
    break


Start training


In [9]:
# Shape of the first sample in the batch left over from the loop above.
# `squeeze(0)` only removes dimension 0 when its size is 1.
embed[0].squeeze(0).shape

torch.Size([1536])

In [10]:
# Shape of the whole batch left over from the loop above
embed.shape

torch.Size([4, 1536])