## this sample script takes continuous variable as outcome, as example

In [1]:
import os
import pandas as pd

import torch
from torch.utils.data import DataLoader

os.chdir('/mnt/f/workspace/pathology/gigapath/code/transfer')
from gliopath.train.task.long import seed_torch, train, EmbeddingDataset, TaskHead, collate_fn_with_padding, SeriesHead
from gliopath.utils.proces import split_dataset
from gliopath.train.gadget import get_sampler
from gliopath.models.load import giga_slide_enc

os.chdir('/mnt/f/workspace/pathology/gigapath')



In [8]:
seed = 42
dataset_df = pd.read_table('data/metadata.tbl', sep='\t')
test_df = pd.read_table('data/metadata_test.tbl', sep='\t')
embed_path = 'output/tiles/rand_embed/'
z_score = False
outcome_col = ['IDH1','TP53','ATRX','PTEN','EGFR','TERT']
num_classes = len(outcome_col)
batch_size = 4
num_workers = 0
embed_dim = 1536
weighted_sampler = True
feat_layers=[0,1,2]
num_epochs = 10

splits = ['train', 'val', 'test']
split_col = 'split_col'
id_col = 'id'
params = {
    'lr': 0.001,
    'min_lr': 0.0,
    'num_epochs': num_epochs,
    'eval_interval': 10,
    'output_dir': 'output/models/life',
    'optim': 'sgd',
    'weight_decay': 0.01,
    'outcome_type': 'gene',
    'gc_step': 10,
    'freeze_longnet': True,
}

In [9]:
# set the random seed
seed_torch(torch.device('cuda'), 0)
# read the metadata
dataset_df = split_dataset(dataset_df, id_col='id', type_col='tumour_type', val_split=0.2, test_split=0, in_df=True, split_col='split_col')
test_df['split_col'] = 'test'
dataset_df = pd.concat([dataset_df, test_df], ignore_index=True)

# load the dataset
train_dataset, val_dataset, test_dataset = [EmbeddingDataset(dataset_df, embed_path, feat_layer=feat_layers, split_col=split_col, split=split, id_col=id_col, type_col=outcome_col, outcome_type='gene', z_score=z_score) for split in splits]

# set num_classes
print(f'Sample size:\nTrain: {len(train_dataset)}\tVal: {len(val_dataset)}\tTest: {len(test_dataset)}')

Sample size:
Train: 158	Val: 42	Test: 50


In [10]:
# infinite sampler for training
# not sure if cha nge shuffle to TRUE? (*)
# train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset, replacement=True)
train_sampler = get_sampler(train_dataset)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, collate_fn=collate_fn_with_padding, sampler=train_sampler, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, collate_fn=collate_fn_with_padding, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, collate_fn=collate_fn_with_padding, pin_memory=True)

In [5]:
model_longnet = giga_slide_enc(path='model/pub/slide_encoder.pth', global_pool=True)
model = SeriesHead(LongNetModel=model_longnet,
                   TaskHead=TaskHead(768 * len(feat_layers), num_classes),
                   feat_layers=feat_layers)

/mnt/f/workspace/pathology/gigapath
dilated_ratio:  [1, 2, 4, 8, 16]
segment_length:  [1024, 5792, 32768, 185363, 1048576]
Number of trainable LongNet parameters:  85148160
Global Pooling: True
[92m Successfully Loaded Pretrained GigaPath model from model/pub/slide_encoder.pth [00m
Slide encoder param # 86330880


In [11]:
# Train the model
pred_gather, target_gather = train(model, train_loader, val_loader, test_loader, **params)

LongNet encoder has been frozen
Set the optimizer as sgd
Start training for 10 epochs with gradient accumulation steps: 10


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 0:   0%|          | 0/40 [00:00<?, ?it/s]

Epoch [1/10]	Train Loss: 0.6926	LR: 0.001000


Epoch 1:   0%|          | 0/40 [00:00<?, ?it/s]

Epoch [2/10]	Train Loss: 0.6924	LR: 0.000976


Epoch 2:   0%|          | 0/40 [00:00<?, ?it/s]

Epoch [3/10]	Train Loss: 0.6894	LR: 0.000905


Epoch 3:   0%|          | 0/40 [00:00<?, ?it/s]

Epoch [4/10]	Train Loss: 0.6948	LR: 0.000794


Epoch 4:   0%|          | 0/40 [00:00<?, ?it/s]

Epoch [5/10]	Train Loss: 0.6878	LR: 0.000655


Epoch 5:   0%|          | 0/40 [00:00<?, ?it/s]

Epoch [6/10]	Train Loss: 0.6939	LR: 0.000500


Epoch 6:   0%|          | 0/40 [00:00<?, ?it/s]

Epoch [7/10]	Train Loss: 0.6858	LR: 0.000345


Epoch 7:   0%|          | 0/40 [00:00<?, ?it/s]

Epoch [8/10]	Train Loss: 0.6885	LR: 0.000206


Epoch 8:   0%|          | 0/40 [00:00<?, ?it/s]

Epoch [9/10]	Train Loss: 0.6900	LR: 0.000095


Epoch 9:   0%|          | 0/40 [00:00<?, ?it/s]

Epoch [10/10]	Train Loss: 0.6941	LR: 0.000024
Evaluating at epoch 10...
Val Epoch [10/10] Acc: 0.488 F1: 0.000 Prec: 0.000 Rec: 0.000 AUROC: 0.523 AUPRC: 0.549
Loading best model with f1: 0.000
Evaluating on test set...
Test Acc: 0.513 F1: 0.054 Prec: 0.100 Rec: 0.037 AUROC: 0.459 AUPRC: 0.484


In [9]:
device = torch.device('cuda')
# state_dict = torch.load('output/models/life/last_model.pth', map_location=device)
# model.load_state_dict(state_dict, strict=False)
# model.to(device)
model.eval()
for batch in test_loader:
    embed, coords, category = batch['tile_embeds'].to(device), batch['coords'].to(device), batch[
                'categories'].to(device)
    with torch.cuda.amp.autocast():
        test_output = model(embed, coords)

In [10]:
test_output

tensor([[-0.1185,  0.0707,  0.0075, -0.3877, -0.2227,  0.1137],
        [ 0.0099,  0.0759, -0.3223, -0.3796,  0.1345,  0.2452]],
       device='cuda:0', dtype=torch.float16, grad_fn=<AddmmBackward0>)