In [1]:
%pylab inline
import pandas as pd
import numpy as np
import fastai
import torch
from pathlib import Path
import PIL
import tqdm
import os
import json
tqdm.monitor_interval = 0

Populating the interactive namespace from numpy and matplotlib


In [2]:
from fastai.conv_learner import resnet34, resnet152, transforms_top_down, CropType, \
    tfms_from_model, ConvLearner, optim, T
from fastai.dataset import ImageClassifierData, Denormalize
from fastai.metrics import accuracy, f1
from fastai.sgdr import TrainingPhase, DecayType
from lifelines.utils import concordance_index
from torch.nn.modules.loss import PoissonNLLLoss


In [3]:
def cindex_metric(preds, targs):
    """Concordance index between predicted and true survival times.

    Both arguments are taken to be on the log scale (the training target is
    ``log(days + 1)``). They are exponentiated and truncated to integers
    before being passed to ``concordance_index`` as
    ``(event_times, predicted_scores)``.

    Parameters
    ----------
    preds : tensor, numpy array or pandas Series
        Model outputs (log scale).
    targs : tensor, numpy array or pandas Series
        Ground-truth targets (log scale).

    Returns
    -------
    float
        The concordance index, or ``0.0`` when ``concordance_index`` raises
        ``ZeroDivisionError`` (no admissible pair, e.g. a batch holding a
        single sample or identical event times).

    Any other exception propagates: swallowing everything used to turn
    genuine mistakes (such as passing a pandas Series, which has no
    ``.cpu()``) into a silent score of 0.0.
    """
    def _to_int_days(values):
        # Tensors expose .cpu(); arrays and Series are converted directly.
        if hasattr(values, 'cpu'):
            values = values.detach().cpu().numpy()
        return np.exp(np.asarray(values)).astype(int)

    try:
        return concordance_index(_to_int_days(targs), _to_int_days(preds))
    except ZeroDivisionError:
        return 0.0

In [4]:
# Enable cuDNN's benchmark mode.
torch.backends.cudnn.benchmark=True
# Make GPU index 1 the current CUDA device (hard-coded; adjust for other machines).
torch.cuda.set_device(1)
# Echo the active device index as the cell output.
torch.cuda.current_device()


1

In [5]:
# Dataset root and the folder holding the per-slide tile samples.
LIVER_PATH = Path('/DATA/BIO/GDC/liver')
LIVER_SAMPLES = LIVER_PATH.joinpath("samples")

# Everything this experiment writes lives below EXP_PATH.
EXP_PATH = LIVER_PATH.joinpath("exp_poisson")
EXP_MODEL_PATH = EXP_PATH.joinpath("models")
EXP_DATA = EXP_PATH.joinpath("data")
EXP_TRAIN_DATA = EXP_DATA.joinpath("train")
EXP_TEST_DATA = EXP_DATA.joinpath("test")

# Persisted patient split and the tile-level CSVs.
PATIENT_JSON = EXP_PATH.joinpath('patient_split.json')
TRAIN_CSV = EXP_PATH.joinpath('level_1_train.csv')
TRAIN_CSV_FULL = EXP_PATH.joinpath('level_1_train_full.csv')
TEST_CSV = EXP_PATH.joinpath('level_1_test.csv')
TEST_CSV_FULL = EXP_PATH.joinpath('level_1_test_FULL.csv')

# mkdir() is not recursive here, so parents are listed before their children.
for directory in (EXP_PATH, EXP_DATA, EXP_TRAIN_DATA, EXP_TEST_DATA, EXP_MODEL_PATH):
    if directory.exists():
        continue
    directory.mkdir()

In [6]:
# Load the slide table and derive the survival target used for training.
slides = pd.read_csv(LIVER_PATH/'slides.csv')

# Keep only sample type 1 (presumably the GDC "Primary Solid Tumor" code — confirm).
slides = slides.loc[slides.sample_type_id == 1].copy()

# Survival time: days to death when recorded, otherwise days to last follow-up.
slides['days_proxy'] = slides.days_to_death.fillna(slides.days_to_last_follow_up).astype(float)

# Rows with neither value cannot be used as a target.
slides = slides.loc[slides.days_proxy.notnull()].copy()

# The event is observed exactly when days_proxy was taken from days_to_death.
# Flagging rows by "has a follow-up date" instead would mark patients that
# carry both fields as censored even though their death time is the one used.
slides['event_observed'] = slides.days_to_death.notnull()


In [7]:
# Build, or reload from disk, the patient-level split and the tile-level CSVs.
#
# The cached artefacts are reused only when all of them exist. Any other
# failure (corrupt JSON, missing column, interrupt, typo) now surfaces as an
# error instead of silently re-drawing the split and overwriting the files.
cache_files = [PATIENT_JSON, TRAIN_CSV_FULL, TEST_CSV_FULL]

if all(cache_file.exists() for cache_file in cache_files):
    with open(PATIENT_JSON, 'r') as fobj:
        patients = json.load(fobj)
    train_patients = patients['train']
    val_patients = patients['val']
    test_patients = patients['test']

    train_slides = slides.loc[slides.submitter_id.isin(train_patients)]
    test_slides = slides.loc[slides.submitter_id.isin(test_patients)]

    train_csv_df = pd.read_csv(TRAIN_CSV_FULL)
    # Later cells use test_csv_df, so load it on the cached path as well.
    test_csv_df = pd.read_csv(TEST_CSV_FULL)
    val_idx = train_csv_df.loc[train_csv_df.val_patient].index
else:
    # Shuffle the unique patient ids; .tolist() yields plain str so the
    # types match what the cached branch reads back from JSON.
    patients = np.random.permutation(list(set(slides.submitter_id))).tolist()

    # 70% of patients go to train, the rest to test. The validation patients
    # are the last 30% *of the train split* (a subset of train_patients).
    split     = int(0.7 * len(patients))
    val_split = int(0.7 * split)
    train_patients = patients[:split]
    val_patients   = patients[val_split:split]
    test_patients  = patients[split:]

    patient_split = {
        'train': list(train_patients),
        'val': list(val_patients),
        'test': list(test_patients)
    }
    with open(PATIENT_JSON, 'w') as fobj:
        json.dump(patient_split, fobj)

    print(len(train_patients), len(val_patients), len(test_patients))

    train_slides = slides.loc[slides.submitter_id.isin(train_patients)]
    test_slides  = slides.loc[slides.submitter_id.isin(test_patients)]

    # Sets give O(1) membership tests inside the per-slide loop.
    train_patient_set = set(train_patients)
    val_patient_set = set(val_patients)

    train_items = []
    test_items = []
    num_slides = []
    slide_level = 'level_1'
    samples_per_slide = 20

    for ix, patient in tqdm.tqdm_notebook(slides.iterrows(), total=len(slides)):
        sfp = LIVER_SAMPLES/patient.slide_file_name.upper()/slide_level
        sample_files = list(sfp.iterdir())
        num_samples = len(sample_files)
        num_slides.append(num_samples)

        # Everything below is constant for the slide, so compute it once
        # instead of once per sampled tile.
        is_train = patient.submitter_id in train_patient_set
        is_val = patient.submitter_id in val_patient_set
        dest_path = EXP_TRAIN_DATA if is_train else EXP_TEST_DATA
        slide_id = os.path.basename(patient.slide_file_name).split('.')[0]
        log_days = np.log(patient.days_proxy+1)

        # Draw up to samples_per_slide tiles without replacement.
        for fn in np.random.choice(sample_files, size=min(samples_per_slide,num_samples), replace=False):
            img_id = os.path.basename(fn).split('.')[0]
            new_fn_base = '-'.join([slide_id, img_id]) + '.tiff'
            full_path = dest_path/new_fn_base

            # Tiles are symlinked into the experiment folder, not copied.
            if not full_path.exists():
                os.symlink(fn, full_path)

            if is_train:
                train_items.append({
                    'fn': str(new_fn_base),
                    'val': log_days,
                    'val_patient': is_val,
                    'patient_id': patient.submitter_id,
                    'slide_file':patient.slide_file_name
                })
            else:
                test_items.append({
                    'fn': str(new_fn_base),
                    'val': log_days,
                    'patient_id': patient.submitter_id,
                    'slide_file':patient.slide_file_name
                })

    # fast.ai will sort on filenames, so sort here to keep val_idx aligned.
    # (Shuffling the rows beforehand had no effect and was dropped.)
    train_csv_df = pd.DataFrame(train_items).sort_values('fn').reset_index(drop=True)

    train_csv_df[['fn','val']].to_csv(TRAIN_CSV, index=False)
    train_csv_df.to_csv(TRAIN_CSV_FULL, index=False)
    val_idx = train_csv_df.loc[train_csv_df.val_patient].index

    test_csv_df = pd.DataFrame(test_items)
    test_csv_df = test_csv_df.sort_values('fn').reset_index(drop=True)

    test_csv_df[['fn','val']].to_csv(TEST_CSV, index=False)
    test_csv_df.to_csv(TEST_CSV_FULL, index=False)
    

263 79 113





In [11]:
# Backbone architecture, input tile size and batch size.
f_model = resnet152
sz, bs = 256, 32

# Transforms derived from the backbone, with centre cropping and the
# transforms_top_down augmentation set.
tfms = tfms_from_model(
    f_model,
    sz,
    crop_type=CropType.CENTER,
    aug_tfms=transforms_top_down,
)

# continuous=True: the CSV label is a real-valued target, not a class.
# The validation rows are the ones flagged val_patient in the full train CSV.
md = ImageClassifierData.from_csv(
    EXP_PATH,
    "data/train",
    TRAIN_CSV,
    bs=bs,
    tfms=tfms,
    continuous=True,
    val_idxs=val_idx.values,
)


In [12]:
# Build a learner on the pretrained backbone for the dataset defined above.
learn = ConvLearner.pretrained(f_model, md)
# Train with Adam.
learn.opt_fn = optim.Adam
# Poisson NLL criterion is disabled; the learner keeps its default loss.
#learn.crit = PoissonNLLLoss(full=True)

In [13]:
# Show the last three layers of the model.
learn.children[-3:]

[BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
 Dropout(p=0.5),
 Linear(in_features=512, out_features=1, bias=True)]

In [14]:
# set to true to try learning rate finder
# Optional learning-rate finder. The weights are saved to 'tmp' beforehand and
# reloaded afterwards; note the model is left unfrozen after the search.
if False:
    learn.save('tmp')
    learn.unfreeze()
    lrf=learn.lr_find()
    learn.sched.plot(0)
    learn.load('tmp')

In [15]:
# Learning rate passed to learn.fit below.
lr = 1e-2
print(lr)

0.01


In [None]:
# Fine-tune all layers: one cycle of 100 epochs, reporting cindex_metric on
# the validation set. The best weights go to 'liver_class_best_1' and the
# final weights to 'liver_class_1'.
# NOTE(review): use_clr=(100,10) is presumably fastai's cyclical-LR setting — confirm its meaning.
learn.unfreeze()
learn.fit(lr, 1, cycle_len=100, use_clr=(100,10), best_save_name='liver_class_best_1', metrics=[cindex_metric])
learn.save('liver_class_1')

epoch      trn_loss   val_loss   cindex_metric              
    0      4.681112   1.89927    0.494228  
    1      1.600315   0.992847   0.457846                   
    2      1.213423   4.976754   0.476419                   
    3      1.157554   6.679834   0.503424                   
    4      1.118454   1.282096   0.507104                   
 40%|████      | 50/125 [00:37<00:55,  1.35it/s, loss=1.14]

In [None]:
# Restore the weights saved under best_save_name during training.
learn.load('liver_class_best_1')

In [None]:
# Save the current weights under a separate name.
# The load below is disabled; the previous cell already loads the same weights.
#learn.load('liver_class_best_1')
learn.save('liver_saveme')


In [None]:
# Predictions and matching targets (presumably for the validation set — confirm).
y_pred, targ = learn.predict_with_targs()

In [None]:
# Tile-level concordance index, with targets and predictions exponentiated back from log scale.
concordance_index(np.exp(targ), np.exp(y_pred))

In [None]:
# Scatter of exponentiated predictions against targets.
# NOTE(review): x is exponentiated but targ is not, so the axes are on
# different scales — confirm this is intended.
scatter(np.exp(y_pred), targ)

In [None]:
# Histogram of the exponentiated predictions (first output column).
pd.Series(np.exp(y_pred[:,0])).hist()

In [None]:
# Index of the largest value of poisson.pmf over the counts 0..max_count-1.
# NOTE(review): `poisson`, `max_count` and `_lambda` are not defined anywhere
# in this notebook as shown; `poisson` is presumably scipy.stats.poisson.
# This cell fails on a fresh kernel — confirm.
np.argmax(poisson.pmf(np.arange(max_count), _lambda))

In [None]:
# Preview the validation frame.
# NOTE(review): `val_df` is not created anywhere in this notebook as shown;
# presumably the validation rows of train_csv_df — confirm.
val_df.head()

In [None]:
# Attach the predictions to the validation frame.
# NOTE(review): `y_pred_val` is not defined anywhere in this notebook as shown — confirm its source.
val_df.loc[:,'y_pred']=y_pred_val.copy()

In [None]:
# Predictions (negative values clipped to 0) against the targets.
plt.scatter(np.clip(val_df.y_pred,0,np.inf),val_df.val)

In [None]:
# Tile-level concordance on the validation frame.
# concordance_index takes (event_times, predicted_scores), so the true target
# goes first, matching how it is called elsewhere in this notebook.
print(concordance_index(val_df.val, val_df.y_pred))
# cindex_metric takes (preds, targs) and calls .cpu().numpy() on both, so it
# needs tensors; a pandas Series would fail inside its try block and score 0.0.
print(cindex_metric(torch.from_numpy(np.asarray(val_df.y_pred, dtype=float)),
                    torch.from_numpy(np.asarray(val_df.val, dtype=float))))


### TEST

In [None]:
# Predict on the test tiles and attach the predictions to a copy of the test table.
# NOTE(review): `md_test` is not defined anywhere in this notebook as shown — confirm how it is built.
y_pred = learn.predict_dl(md_test.val_dl)
test_df = test_csv_df.copy()
test_df['y_pred'] = y_pred

# Shapes of the predictions and the test table, shown as the cell output.
y_pred.shape, test_df.shape

In [None]:
# Patient-level evaluation: take the minimum tile prediction per patient and
# compare it with the patient's mean target.
yp_pred = test_df.groupby('patient_id').y_pred.min()
yp_targ = test_df.groupby('patient_id').val.mean()
# concordance_index takes (event_times, predicted_scores): true target first.
concordance_index(yp_targ, yp_pred)

In [None]:
# Patient-level target (x) against patient-level prediction (y).
scatter(yp_targ, yp_pred)

In [None]:
# Distribution of the targets in the test table.
test_df.val.hist()

In [None]:
# Distribution of the targets in the train table.
train_csv_df.val.hist()

In [None]:
# Interactive IPython introspection (source view); leftover from development.
torch.nn.functional.poisson_nll_loss??

In [None]:
# Interactive IPython introspection (docstring view); leftover from development.
concordance_index?