In [1]:
import sys
import copy

import torch
import torchvision
from torch.utils.data import RandomSampler
from tqdm import trange, tqdm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append('../')
from src.dataset import EncodeDataset
from src.dataset import LargeRandomSampler, encode_worker_init_fn
# from train.train_encode_dataset import TrainEncodeDatasetModel
from src.deepct_model import DeepCT, criterion

import selene_sdk
from data.utils import interval_from_line
from src.utils import expand_dims
from src.transforms import PermuteSequenceChannels, RandomReverseStrand, MaskTracks, MaskFeatures

%load_ext autoreload
%autoreload 2

In [3]:
# m = torch.load('/home/msindeeva/DeepCT/DeepCT_outputs/boix/ct_v2/all/multi_ct/AdamW_COS_LR0001/2021-05-31-20-28-42/best_model.pth.tar')

In [7]:
from notebook_utils import load_datasets, get_loader

In [2]:
boix_folder = '/mnt/datasets/DeepCT/dataset_data/Biox_et_al'

track_mask = np.load('/mnt/datasets/DeepCT/drop_tracks756.npy')
train_transform = torchvision.transforms.Compose([
    PermuteSequenceChannels(),
    RandomReverseStrand(p=0.5),
    MaskTracks(track_mask)
])

feature_mask = np.load('/mnt/datasets/DeepCT/dataset_data/Biox_et_al/not_top4_feature_mask.npy')
val_transform = torchvision.transforms.Compose([
    PermuteSequenceChannels(),
    RandomReverseStrand(p=0.5),
    MaskFeatures(feature_mask)
])

In [4]:
boix_train = load_datasets(boix_folder, train=True, val=False, 
                           test=False, train_transform=train_transform)[0]
boix_train_loader = get_loader(boix_train, batch_size=64, shuffle=1447)

In [8]:
boix_val = load_datasets(boix_folder, train=False, val=True, test=False, 
                         val_transform=val_transform)[0]
boix_val_loader = get_loader(boix_val, batch_size=64, shuffle=False)

In [10]:
boix_val[0][0].shape

(4, 1000)

In [32]:
len(boix_val) // 

2310326

In [12]:
boix_test = load_datasets(boix_folder, train=False, val=False, test=True)[0]
boix_test_loader = get_loader(boix_test, batch_size=64, shuffle=False)

In [18]:
boix_cell_types = boix_val._cell_types
boix_target_features = boix_val.target_features
boix_track_matrix = boix_val._feature_indices_by_cell_type_index

In [19]:
n_cell_types = len(boix_cell_types)
n_features = len(boix_target_features)

In [20]:
np.random.seed(14)

gts = []
mean_preds = []
masks = []
for sample in tqdm(boix_val_loader):
    batch = copy.deepcopy(sample)
    del sample
    seq_mask = batch[3] # * ~track_mask # mask for mean positional value computation
    mean_seq_val = (batch[2] * seq_mask).sum(axis=1) / seq_mask.sum(axis=1)
    
    mean_batch_pred = torch.repeat_interleave(mean_seq_val, n_cell_types, dim=0)
    mean_batch_pred = mean_batch_pred.view(-1, n_features)
    batch_gt = batch[2].view(-1, n_features)
    #batch_mask = (batch[3] * track_mask).view(-1, n_features).astype(np.bool)
    batch_mask = (batch[3]).view(-1, n_features)
    
    # mask of samples to save for evaluation
    save_mask = np.random.choice(mean_batch_pred.shape[0], mean_batch_pred.shape[0] // 16, replace=False)
    
    gts.append(batch_gt.data.numpy()[save_mask])
    mean_preds.append(mean_batch_pred.data.numpy()[save_mask])
    # masks for metric computation
    masks.append(batch_mask.data.numpy()[save_mask])
    del batch

gts = expand_dims(np.concatenate(gts))
np.save('gts_val_4.npy', gts)
del gts
mean_preds = expand_dims(np.concatenate(mean_preds))
np.save('mean_preds_val_top4.npy', mean_preds)
del mean_preds
masks = expand_dims(np.concatenate(masks))
np.save('masks_val_top4.npy', masks)

100%|██████████| 36099/36099 [37:44<00:00, 15.94it/s]


OSError: Not enough free space to write 19822596960 bytes

In [None]:
m1 = np.load('mean_preds_val_all.npy')
m2 = np.load('mean_preds.npy')

In [None]:
m1[0]

In [None]:
(m1 != m2).sum()

In [22]:
gts = np.load('gts.npy')

In [25]:
~feature_mask

array([False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False,  True, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False])

In [27]:
gts_top4 = gts[:, ~feature_mask]

In [30]:
mean_preds = expand_dims(np.concatenate(mean_preds)) # calculated on top4 only

In [37]:
mean_preds = mean_preds[:, ~feature_mask]

In [45]:
masks_old = np.load('masks_val_all.npy')

In [40]:
masks = expand_dims(np.concatenate(masks))

In [44]:
masks = masks[:, ~feature_mask]

In [50]:
gts = np.load('gts_test.npy')
mean_preds = np.load('mean_preds_test.npy')
masks = np.load('masks_test.npy')

In [50]:
from src.metrics import jaccard_score, threshold_wrapper
from sklearn.metrics import average_precision_score
from selene_sdk.utils import compute_score

In [51]:
del masks
masks25 = np.load('masks25_test.npy')

In [51]:
map_val, ap_val = compute_score(mean_preds, gts_top4, average_precision_score, target_mask=masks.astype(np.bool))

In [52]:
np.save('mean_pos_val_ap_top4.npy', ap_val)
np.save('mean_pos_val_map_top4.npy', map_val)

In [45]:
ap_val[ap_val < 1].mean()

0.27422211294336285

In [53]:
map_val

0.3883023778788334

In [55]:
ap_val

array([0.54387984, 0.46001893, 0.34335172, 0.20595902])

In [54]:
jaccard_vals = []
jaccard_mean_vals = []
thresholds = [i / 10 for i in range(1, 10, 2)]
print(thresholds)
for threshold in tqdm(thresholds):
    jaccard_fun = threshold_wrapper(jaccard_score, threshold=threshold)
    mean_jacc_val, jacc_val = compute_score(mean_preds, gts_top4, jaccard_fun, target_mask=masks.astype(np.bool))
    jaccard_vals.append(jacc_val)
    jaccard_mean_vals.append(mean_jacc_val)
    print(f"At threshold {threshold} mean jaccard: {mean_jacc_val}")
jaccard_vals = np.array(jaccard_vals)
np.save('mean_pos_val_jaccard_top4.npy', jaccard_vals)
del jaccard_vals
jaccard_mean_vals = np.array(jaccard_mean_vals)
np.save('mean_pos_val_mean_jaccard_top4.npy', jaccard_mean_vals)

  0%|          | 0/5 [00:00<?, ?it/s]

[0.1, 0.3, 0.5, 0.7, 0.9]


 20%|██        | 1/5 [01:26<05:47, 86.94s/it]

At threshold 0.1 mean jaccard: 0.22299540024901707


 40%|████      | 2/5 [02:51<04:18, 86.28s/it]

At threshold 0.3 mean jaccard: 0.23621724922196574


 60%|██████    | 3/5 [04:15<02:50, 85.43s/it]

At threshold 0.5 mean jaccard: 0.15360814177916104


 80%|████████  | 4/5 [05:44<01:26, 86.69s/it]

At threshold 0.7 mean jaccard: 0.09537979971216563


100%|██████████| 5/5 [07:42<00:00, 92.42s/it]

At threshold 0.9 mean jaccard: 0.04342675998418352





In [1]:
jaccard_mean_vals

NameError: name 'jaccard_mean_vals' is not defined

In [53]:
del mean_preds
del gts
del masks

NameError: name 'masks' is not defined

In [54]:
del masks25

In [33]:
jaccard_vals[1][jaccard_vals[1] < 1].mean()

0.24142992402118915

In [29]:
jaccard_vals = np.load('mean_pos_test_jaccard.npy')

In [96]:
jaccard_vals[1][~np.isnan(jaccard_vals[1])].mean()

0.2052215914338065

In [85]:
jaccard_mean_vals

array([0.18550453, 0.20522159, 0.15125646, 0.11031006, 0.05832995])

In [13]:
jaccard_vals = np.load('jaccard_vals.npy')

In [17]:
# 100% tracks
jaccard_vals[1][jaccard_vals[1] < 1].mean()

0.2477876758770974

In [20]:
list(map(lambda x: x.split('\t')[1:], """DNase-seq	0.4998	0.3221
H3K27ac	0.3220	0.2081
H3K36me3	0.1651	0.0815
H3K4me1	0.1796	0.0909""".split('\n')))

[['0.4998', '0.3221'],
 ['0.3220', '0.2081'],
 ['0.1651', '0.0815'],
 ['0.1796', '0.0909']]

In [6]:
top4_on_all = [['0.4681', '0.2999'],
 ['0.2996', '0.1965'],
 ['0.1361', '0.0409'],
 ['0.1650', '0.0721']]

In [16]:
top4_on_all = np.array(top4_on_all).astype(np.float)

In [19]:
top4_on_all.mean(axis=0)

array([0.2672 , 0.15235])

In [21]:
top4_on_top4 = [['0.4998', '0.3221'],
 ['0.3220', '0.2081'],
 ['0.1651', '0.0815'],
 ['0.1796', '0.0909']]
top4_on_top4 = np.array(top4_on_top4).astype(np.float)

In [22]:
top4_on_top4.mean(axis=0)

array([0.291625, 0.17565 ])

In [None]:
# top4 map
0.388
# top4 IoU@3
0.236