In [1]:
%pylab inline
import pandas as pd
import numpy as np
import fastai
import torch
from pathlib import Path
import PIL
import tqdm
import os
import json
from fastai.conv_learner import resnet34, resnet152, transforms_top_down, CropType, \
    tfms_from_model, ConvLearner, optim, T
from fastai.dataset import ImageClassifierData, Denormalize, folder_source
from fastai.metrics import accuracy, f1
from fastai.sgdr import TrainingPhase, DecayType
from lifelines.utils import concordance_index
from collections import defaultdict
from sklearn.metrics import f1_score

# NOTE(review): presumably a workaround that switches off tqdm's background
# monitor thread (interval of 0) — confirm it is still needed with the
# installed tqdm version.
tqdm.monitor_interval = 0

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Index of the CUDA device every later cell runs on.
GPU_INDEX = 1

# Enable cuDNN's benchmark mode, then pin this kernel to the chosen device.
torch.backends.cudnn.benchmark = True
torch.cuda.set_device(GPU_INDEX)

# Echo the active device index as the cell output to confirm the selection.
torch.cuda.current_device()

1

In [3]:
# Filesystem layout for this experiment:
#   LIVER_PATH      - dataset root (also holds slides.csv)
#   LIVER_SAMPLES   - per-slide tile directories
#   EXP_PATH        - root for this experiment's train/valid/test folders
#   EXP_MODEL_PATH  - "models" directory under the experiment root
LIVER_PATH = Path('/DATA/BIO/GDC/liver')
LIVER_SAMPLES = LIVER_PATH/"samples"
EXP_PATH = LIVER_PATH/"exp3"
EXP_MODEL_PATH = EXP_PATH/"models"

# EXP_PATH is listed before its child EXP_MODEL_PATH, so no `parents=True` is
# needed; a missing dataset root therefore still fails loudly here instead of
# being silently created. `exist_ok=True` makes the cell idempotent without a
# separate exists() check that could race with another process.
for d in [EXP_PATH, EXP_MODEL_PATH]:
    d.mkdir(exist_ok=True)

In [4]:
%%time
# Slide manifest, restricted to the two sample types handled below
# (sample_type_id 1 is mapped to 'tumor' and 11 to 'normal').
slides = pd.read_csv(LIVER_PATH/'slides.csv')
slides = slides.loc[slides.sample_type_id.isin([1,11])]

# Sub-directory of each slide's sample folder that the tiles are read from.
slide_level = 'level_1'
# Base number of tiles drawn from each slide.
samples_per_slide = 20
# Fraction of each sample type's slides used for train+valid; the rest is test.
split = 0.7
# Fraction of the train+valid slides that goes to train; the rest is valid.
val_split = 0.7

# The tile selection below is random. Seed numpy's global RNG so that re-running
# the cell draws the same tiles (for a given directory listing order).
SEED = 42
np.random.seed(SEED)

# 400 tumor, 90 normal - so we'll draw 4.44x as many tiles from each normal
# slide to roughly balance the two classes.
num_tiles_by_category = {
    'tumor': samples_per_slide,
    'normal': int(4.44 * samples_per_slide),
    'test': int(4 * samples_per_slide)
}

# slide_info[<split name>][<category>] -> list of sampled tile paths.
slide_info = defaultdict(dict)

def pull_tiles(slides, num_tiles, slide_level, samples_root=None, rng=None):
    """Randomly pick up to ``num_tiles`` tile files for every slide.

    For each value in ``slides.slide_file_name`` the tiles are listed from
    ``<samples_root>/<slide_file_name upper-cased>/<slide_level>`` and sampled
    without replacement.

    Parameters
    ----------
    slides : pandas.DataFrame
        Must provide a ``slide_file_name`` column.
    num_tiles : int
        Maximum number of tiles taken per slide. A slide with fewer files
        contributes all of them.
    slide_level : str
        Name of the sub-directory inside each slide folder to sample from.
    samples_root : path-like, optional
        Directory holding the per-slide folders. Defaults to the module-level
        ``LIVER_SAMPLES``.
    rng : optional
        Object exposing a numpy-style ``choice(a, size=, replace=)`` method,
        e.g. ``np.random.RandomState(seed)``. Defaults to the global
        ``np.random`` module.

    Returns
    -------
    list of pathlib.Path
        The selected tile paths, grouped by slide in iteration order.

    Raises
    ------
    OSError
        Propagated from ``iterdir()`` if a slide's tile directory is missing.
    """
    if samples_root is None:
        samples_root = LIVER_SAMPLES
    if rng is None:
        rng = np.random

    tiles = []
    for slide_file_name in slides.slide_file_name:
        sfp = Path(samples_root)/slide_file_name.upper()/slide_level
        # iterdir() yields entries in arbitrary order; sorting makes the draw
        # depend only on the RNG state, so a seeded run is repeatable.
        sample_files = sorted(sfp.iterdir())
        num_picked = min(num_tiles, len(sample_files))
        if num_picked <= 0:
            # Nothing to draw. Skipping also avoids handing an empty population
            # to choice(), which some numpy versions reject.
            continue
        # Sample indices rather than the Path objects themselves so numpy never
        # has to coerce paths into an array.
        picked = rng.choice(len(sample_files), size=num_picked, replace=False)
        tiles.extend(sample_files[idx] for idx in picked)
    return tiles
        
# sample_type_id -> class folder name.
CATEGORY_BY_SAMPLE_TYPE = {1: 'tumor', 11: 'normal'}

for sample_type_id, type_slides in slides.groupby('sample_type_id'):
    category = CATEGORY_BY_SAMPLE_TYPE[sample_type_id]

    # Cut points over this type's slides, taken in manifest order:
    #   [0, valid_start)          -> train
    #   [valid_start, test_start) -> valid
    #   [test_start, end)         -> test
    test_start = int(split * len(type_slides))
    valid_start = int(val_split * test_start)

    # Train and valid use the per-category tile count; test uses its own count.
    partitions = [
        ('train', type_slides.iloc[0:valid_start], num_tiles_by_category[category]),
        ('valid', type_slides.iloc[valid_start:test_start], num_tiles_by_category[category]),
        ('test', type_slides.iloc[test_start:], num_tiles_by_category['test']),
    ]
    for split_name, split_slides, tiles_per_slide in partitions:
        slide_info[split_name][category] = pull_tiles(split_slides, tiles_per_slide, slide_level)
    
# Materialise every split as EXP_PATH/<split>/<category>/ filled with symlinks
# to the sampled tiles.
# NOTE(review): a split is skipped as soon as its folder exists, so a run that
# was interrupted half-way leaves a partial split that is never completed.
for folder in slide_info:
    if (EXP_PATH/folder).exists():
        print("skip: %s - already exists" % folder)
        continue

    for catname, tile_fns in slide_info[folder].items():
        if not tile_fns:
            # No tiles sampled for this category: no folder is created.
            continue

        dest_path = EXP_PATH/folder/catname
        dest_path.mkdir(parents=True, exist_ok=True)

        for tile_fn in tile_fns:
            # Link name: first two dot-separated parts of the slide directory
            # name joined by '_', then '-', then the tile's name up to its
            # first dot.
            slide_id = '_'.join(tile_fn.parents[1].name.split('.')[0:2])
            img_id = tile_fn.name.split('.')[0]
            full_path = dest_path/('-'.join([slide_id, img_id]) + '.tiff')

            if not full_path.exists():
                os.symlink(tile_fn, full_path)


skip: train - already exists
skip: valid - already exists
skip: test - already exists
CPU times: user 752 ms, sys: 92 ms, total: 844 ms
Wall time: 844 ms


In [5]:
# Backbone architecture, input image size and batch size for the experiment.
f_model = resnet152
sz=128
bs=16
# Transforms derived from the backbone, with top-down augmentations and a
# centre crop.
tfms = tfms_from_model(f_model, sz, aug_tfms=transforms_top_down, crop_type=CropType.CENTER)
# `bs` used to be defined above but was never handed to the data object, so the
# loaders silently ran with from_paths' own default batch size. Pass it
# explicitly so the configured value is the one actually used.
md = ImageClassifierData.from_paths(EXP_PATH, bs=bs, tfms=tfms, test_name='test', test_with_labels=True)

In [6]:
# Build the learner from the chosen backbone and the image data, then make
# Adam the optimizer used by subsequent fit calls.
learn = ConvLearner.pretrained(f_model, md)
learn.opt_fn = optim.Adam

In [None]:
# Learning-rate range test between 1e-06 and 100. The current weights are
# checkpointed as 'tmp_wt' first and reloaded at the end, so any weight changes
# made by the search are discarded before real training starts.
learn.save('tmp_wt')
lrf=learn.lr_find(start_lr=1e-06, end_lr=100)
# NOTE(review): the 0 is presumably the number of leading points to skip when
# plotting — confirm against the fastai version in use.
learn.sched.plot(0)
learn.load('tmp_wt')

In [None]:
# Stage 1: one cycle (cycle_len=10) at a single learning rate, tracking accuracy.
# NOTE(review): no unfreeze/freeze_to call precedes this fit, so presumably only
# the layers ConvLearner.pretrained leaves trainable are updated — confirm.
# `lr` is reused by the following cell to derive per-group learning rates.
lr = 0.05
learn.fit(lr, 1, cycle_len=10, use_clr=(32,10), best_save_name='liver_cat_best_1', metrics=[accuracy])
# Weights as they are at the end of the fit, saved separately from the
# 'liver_cat_best_1' checkpoint named in best_save_name.
learn.save('liver_cat_1')

In [None]:
# Stage 2: restart from the stage-1 'best' checkpoint and train after
# freeze_to(-2), using three learning rates (lr/100, lr/10, lr).
# Depends on `lr` from the stage-1 cell; `lrs` is reused by the later stages.
learn.load('liver_cat_best_1')
lrs = np.array([lr/100, lr/10, lr])
# NOTE(review): presumably leaves only the last two layer groups trainable — confirm.
learn.freeze_to(-2)
learn.fit(lrs, 1, cycle_len=10, use_clr=(32,10), best_save_name='liver_cat_best_2', metrics=[accuracy])
learn.save('liver_cat_2')

In [None]:
# Stage 3: restart from the stage-2 'best' checkpoint, call unfreeze(), and
# train one cycle with cycle_len=20 using the `lrs` array defined in the
# stage-2 cell.
learn.load('liver_cat_best_2')
learn.unfreeze()
learn.fit(lrs, 1, cycle_len=20, use_clr=(32,10), best_save_name='liver_cat_best_3', metrics=[accuracy])
learn.save('liver_cat_3')

In [None]:
# Stage 4: same recipe as stage 3 but with cycle_len=50, starting from the
# stage-3 'best' checkpoint. Uses `lrs` from the stage-2 cell.
learn.load('liver_cat_best_3')
# NOTE(review): unfreeze() is repeated here, presumably so the cell does not
# rely on the freeze state left behind by the previous cell — confirm.
learn.unfreeze()
learn.fit(lrs, 1, cycle_len=50, use_clr=(32,10), best_save_name='liver_cat_best_4', metrics=[accuracy])
learn.save('liver_cat_4')
# Leave the learner holding the 'best' stage-4 checkpoint rather than the
# end-of-fit weights just saved.
learn.load('liver_cat_best_4')

In [7]:
# Test-time augmentation with the stage-4 'best' weights.
# NOTE(review): presumably evaluated on the validation set, since the next cell
# is the one passing is_test=True — confirm.
learn.load('liver_cat_best_4')
multi_preds, y = learn.TTA()
# Average over the leading axis of the TTA output to get one prediction row
# per sample.
preds = np.mean(multi_preds, 0)

                                             

In [8]:
# Same TTA procedure with is_test=True; the test data was loaded with
# test_with_labels=True, and the returned labels are kept in test_y.
multi_test_preds, test_y = learn.TTA(is_test=True)
# Average over the leading axis of the TTA output to get one prediction row
# per sample.
test_preds = np.mean(multi_test_preds, 0)

                                             

In [9]:
# For the averaged TTA predictions in `preds`: array shape, accuracy, and F1
# score of the arg-max class against `y`.
preds.shape, accuracy(T(preds), T(y)), f1_score(y, np.argmax(preds, 1))

((3285, 2), tensor(0.9924, device='cuda:1'), 0.9925395404356908)

In [10]:
# For the averaged TTA predictions in `test_preds`: array shape, accuracy, and
# F1 score of the arg-max class against `test_y`.
test_preds.shape, accuracy(T(test_preds), T(test_y)), f1_score(test_y, np.argmax(test_preds, 1))

((11605, 2), tensor(0.9870, device='cuda:1'), 0.9920211360634082)

In [11]:
# Mean of the label arrays; with two classes encoded 0/1 this is the share of
# samples labelled 1 in each set.
# NOTE(review): which category is encoded as 1 is not visible here — confirm.
y.mean(), test_y.mean()

(0.5092846270928463, 0.8199913830245584)

In [12]:
# Mean of the arg-max predicted class index for each set, for comparison with
# the label means computed in the previous cell.
np.argmax(preds,1).mean(), np.argmax(test_preds,1).mean(), 

(0.5108066971080669, 0.810771219302025)