## TCGA Analysis
- training 1 post-inference.
- Analyze `0107_pred_TCGA.csv`.
- Generate `0107_pred_TCGA_2.csv`.
- Analysis tile level with soft assignment.

#### Statistical significante of ROC
- DeLong test for statistical significance of ROCAUC [link](https://www.jstor.org/stable/2531595?seq=1)
- DeLong test in Python [link](https://biasedml.com/roc-comparison/)
- Bootstrapping ROC for estimating error [StackOverflow](https://stackoverflow.com/questions/19124239/scikit-learn-roc-curve-with-confidence-intervals).

In [2]:
import pandas as pd
import numpy as np
import os
import glob

import re
from tommy_library import *

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import entropy
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from tqdm import tqdm

***
## PP prediction and compute entropy/uncertainty/mode

In [3]:
'Load metadata'
data_fld = '../data'
sub_fld = os.path.join(data_fld, 'TCGA_data')
path = os.path.join(sub_fld, 'tcga_wsi_meta.csv')
df_wsi = pd.read_csv(path, index_col=0)
path = os.path.join(sub_fld, 'tcga_tile_meta.csv')
df_tile = pd.read_csv(path)

In [4]:
len(df_wsi)

475

In [5]:
'Load predictions post-inference'
path = os.path.join(sub_fld, '0107_pred_TCGA.csv')
df_tst = pd.read_csv(path, index_col=0)
assert len(df_tile) == len(df_tst)
# df_tst.head(2)

In [6]:
#Newly Added
df_tst = df_tst[df_tst.primary_tumor_type == 'NON-ACRAL CUTANEOUS']

In [7]:
'Remove low quality WSI'
low_quality_wsi = ['TCGA-WE-AAA3-06Z-00-DX1-WSI',
 'TCGA-GF-A2C7-01Z-00-DX1-WSI',
 'TCGA-FS-A1ZU-06Z-00-DX3-WSI',
 'TCGA-FS-A1ZU-06Z-00-DX2-WSI',
 'TCGA-FS-A1ZN-01Z-00-DX9-WSI',
 'TCGA-FS-A1ZE-06Z-00-DX1-WSI',
 'TCGA-FR-A728-01Z-00-DX1-WSI',
 'TCGA-FR-A3YN-01Z-00-DX1-WSI',
 'TCGA-FR-A3R1-01Z-00-DX1-WSI',
 'TCGA-ER-A3EV-01Z-00-DX1-WSI',
 'TCGA-DA-A95Y-01Z-00-DX1-WSI',
 'TCGA-DA-A95W-01Z-00-DX1-WSI',
 'TCGA-DA-A3F8-01Z-00-DX1-WSI',
 'TCGA-DA-A3F5-01Z-00-DX1-WSI',
 'TCGA-DA-A1IA-01Z-00-DX1-WSI',
 'TCGA-DA-A1I5-01Z-00-DX1-WSI',
 'TCGA-D3-A2JH-06Z-00-DX1-WSI',]
low_quality_wsi = [name.split('-WSI')[0]+'.svs' for name in low_quality_wsi]
print(df_tst.wsi_id.isin(low_quality_wsi).sum())
df_tst = df_tst[~df_tst.wsi_id.isin(low_quality_wsi)]

39


In [8]:
'Process prediction output to legible format'
CBT_pb = lambda x: parse_str(x, 0)
CBTA_pb = lambda x: parse_str(x, 1)
CBTP_pb = lambda x: parse_str(x, 2)
CBT3_pb = lambda x: parse_str(x, 3)
CBTPA_pb = lambda x: parse_str(x, 4)
CBTP3_pb = lambda x: parse_str(x, 5)

In [9]:
'Compute prediction probabilities for each class'
df_tst['CBT'] = df_tst.pred_prob.map(CBT_pb)
df_tst['CBTA'] = df_tst.pred_prob.map(CBTA_pb)
df_tst['CBTP'] = df_tst.pred_prob.map(CBTP_pb)
df_tst['CBT3'] = df_tst.pred_prob.map(CBT3_pb)
df_tst['CBTPA'] = df_tst.pred_prob.map(CBTPA_pb)
df_tst['CBTP3'] = df_tst.pred_prob.map(CBTP3_pb)

In [10]:
# check probabilities are  normalized
df_tst['CBT'] + df_tst['CBTA'] + df_tst['CBTP'] + df_tst['CBT3'] + df_tst['CBTPA'] + df_tst['CBTP3']

2        1.0
4        1.0
5        1.0
7        1.0
8        1.0
        ... 
21989    1.0
21990    1.0
21991    1.0
21992    1.0
21994    1.0
Length: 19921, dtype: float64

In [11]:
'Assing most likely prediction to each tile'
classes = ['CBT', 'CBTA', 'CBTP', 'CBT3', 'CBTPA', 'CBTP3']
preds = np.argmax(df_tst[classes].values, axis=1)
ix2cl = {ix: cl for ix, cl in enumerate(classes)}
df_tst['prediction'] = [ix2cl[p] for p in preds]

In [12]:
import seaborn as sns
sns.set_style('darkgrid')

In [13]:
'Compute entropy'
df_tst['tile_entropy'] = entropy(df_tst[classes], axis=1, base=6)
plt.title('Entropy histogram (TCGA; tile level)')
plt.xlabel('Entropy')
df_tst.tile_entropy.hist(bins=50);
plt.savefig('../results/entropy_histogram_tcga_tile.pdf')

In [14]:
# 'Assign uncertain predictions'
# mask = df_tst.tile_entropy > .3
# df_tst.loc[mask, 'prediction'] = 'Uncertain'
# (df_tst.prediction == 'Uncertain').sum() 

'Assign uncertain predictions'
mask = df_tst.tile_entropy > .2
df_tst['uncertain'] = mask

In [15]:
'Fraction of uncertain tile'
df_tst.uncertain.sum()  / len(df_tst)

0.6415842578183826

In [16]:
df_tst

Unnamed: 0,wsi_id,x_tile_coord,y_tile_coord,clinical_donor_id,wsi_name,clinical_sample_id,primary_tumor_type,CNA_data,ABSOLUTE_purity,rna_subtype,...,pred_prob,CBT,CBTA,CBTP,CBT3,CBTPA,CBTP3,prediction,tile_entropy,uncertain
2,TCGA-EE-A3JD-01Z-00-DX1.svs,1,8,TCGA-EE-A3JD,TCGA-EE-A3JD-01Z-00-DX1.D4E5B644-C7EF-442D-91F...,TCGA-EE-A3JD-06,NON-ACRAL CUTANEOUS,True,0.26,MITF-Low,...,[1.0015298e-21 1.0000000e+00 0.0000000e+00 2.7...,1.001530e-21,1.000000e+00,0.000000e+00,2.721224e-37,1.858032e-14,0.000000e+00,CBTA,3.382706e-13,False
4,TCGA-D3-A3CB-06Z-00-DX1.svs,0,5,TCGA-D3-A3CB,TCGA-D3-A3CB-06Z-00-DX1.9862D604-C9E7-44BE-99E...,TCGA-D3-A3CB-06,NON-ACRAL CUTANEOUS,True,0.37,Common,...,[8.64833027e-16 8.75000238e-01 3.34407762e-23 ...,8.648330e-16,8.750002e-01,3.344078e-23,2.762047e-24,1.249998e-01,1.542755e-16,CBTA,2.102792e-01,True
5,TCGA-EE-A2MK-01Z-00-DX1.svs,8,6,TCGA-EE-A2MK,TCGA-EE-A2MK-01Z-00-DX1.3A8F8407-BA89-46E6-959...,TCGA-EE-A2MK-06,NON-ACRAL CUTANEOUS,True,0.76,Common,...,[0.0000000e+00 3.3936431e-16 1.4012985e-45 5.8...,0.000000e+00,3.393643e-16,1.401298e-45,5.889657e-42,1.000000e+00,2.828387e-31,CBTPA,6.994277e-15,False
7,TCGA-ER-A2NF-01Z-00-DX1.svs,4,9,TCGA-ER-A2NF,TCGA-ER-A2NF-01Z-00-DX1.1468DD2D-6AC8-4657-A02...,TCGA-ER-A2NF-01,NON-ACRAL CUTANEOUS,True,0.58,Common,...,[1.4670611e-04 9.5189506e-01 8.6480618e-09 3.4...,1.467061e-04,9.518951e-01,8.648062e-09,3.402360e-06,4.795466e-02,8.639984e-08,CBTA,1.082347e-01,False
8,TCGA-EE-A2MI-01Z-00-DX1.svs,9,6,TCGA-EE-A2MI,TCGA-EE-A2MI-01Z-00-DX1.1C56D0A7-3FA7-49A6-BBC...,TCGA-EE-A2MI-06,NON-ACRAL CUTANEOUS,True,0.66,MITF-Low,...,[2.6823066e-33 1.0000000e+00 0.0000000e+00 0.0...,2.682307e-33,1.000000e+00,0.000000e+00,0.000000e+00,1.546927e-17,0.000000e+00,CBTA,3.341852e-16,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21989,TCGA-D3-A2J9-06Z-00-DX1.svs,1,5,TCGA-D3-A2J9,TCGA-D3-A2J9-06Z-00-DX1.5526CFD6-96AB-49F8-B88...,TCGA-D3-A2J9-06,NON-ACRAL CUTANEOUS,True,0.59,Common,...,[5.4147631e-07 6.3293052e-01 3.4650725e-08 4.8...,5.414763e-07,6.329305e-01,3.465072e-08,4.869677e-08,3.670658e-01,3.058369e-06,CBTA,3.669162e-01,True
21990,TCGA-BF-A5EO-01Z-00-DX1.svs,13,7,TCGA-BF-A5EO,TCGA-BF-A5EO-01Z-00-DX1.1BA74189-485E-4ABF-831...,TCGA-BF-A5EO-01,NON-ACRAL CUTANEOUS,True,0.61,OxPhos,...,[0.03188124 0.13373284 0.01077398 0.0079757 0...,3.188124e-02,1.337328e-01,1.077398e-02,7.975700e-03,6.324995e-01,1.831368e-01,CBTPA,5.954319e-01,True
21991,TCGA-EE-A20C-01Z-00-DX1.svs,7,3,TCGA-EE-A20C,TCGA-EE-A20C-01Z-00-DX1.48BAD79E-DFC8-44A7-92F...,TCGA-EE-A20C-06,NON-ACRAL CUTANEOUS,True,0.89,Common,...,[0.16861549 0.00968766 0.44288546 0.03302999 0...,1.686155e-01,9.687660e-03,4.428855e-01,3.302999e-02,2.296810e-03,3.434846e-01,CBTP,6.694179e-01,True
21992,TCGA-ER-A2NG-01Z-00-DX1.svs,2,4,TCGA-ER-A2NG,TCGA-ER-A2NG-01Z-00-DX1.35B12E55-502A-4B87-A68...,TCGA-ER-A2NG-06,NON-ACRAL CUTANEOUS,True,0.49,Common,...,[0.49730957 0.02897338 0.11772093 0.35428905 0...,4.973096e-01,2.897338e-02,1.177209e-01,3.542890e-01,8.161500e-04,8.909200e-04,CBT,6.036191e-01,True


In [17]:
df_tst.wsi_name.nunique()

406

In [18]:
df_tst

Unnamed: 0,wsi_id,x_tile_coord,y_tile_coord,clinical_donor_id,wsi_name,clinical_sample_id,primary_tumor_type,CNA_data,ABSOLUTE_purity,rna_subtype,...,pred_prob,CBT,CBTA,CBTP,CBT3,CBTPA,CBTP3,prediction,tile_entropy,uncertain
2,TCGA-EE-A3JD-01Z-00-DX1.svs,1,8,TCGA-EE-A3JD,TCGA-EE-A3JD-01Z-00-DX1.D4E5B644-C7EF-442D-91F...,TCGA-EE-A3JD-06,NON-ACRAL CUTANEOUS,True,0.26,MITF-Low,...,[1.0015298e-21 1.0000000e+00 0.0000000e+00 2.7...,1.001530e-21,1.000000e+00,0.000000e+00,2.721224e-37,1.858032e-14,0.000000e+00,CBTA,3.382706e-13,False
4,TCGA-D3-A3CB-06Z-00-DX1.svs,0,5,TCGA-D3-A3CB,TCGA-D3-A3CB-06Z-00-DX1.9862D604-C9E7-44BE-99E...,TCGA-D3-A3CB-06,NON-ACRAL CUTANEOUS,True,0.37,Common,...,[8.64833027e-16 8.75000238e-01 3.34407762e-23 ...,8.648330e-16,8.750002e-01,3.344078e-23,2.762047e-24,1.249998e-01,1.542755e-16,CBTA,2.102792e-01,True
5,TCGA-EE-A2MK-01Z-00-DX1.svs,8,6,TCGA-EE-A2MK,TCGA-EE-A2MK-01Z-00-DX1.3A8F8407-BA89-46E6-959...,TCGA-EE-A2MK-06,NON-ACRAL CUTANEOUS,True,0.76,Common,...,[0.0000000e+00 3.3936431e-16 1.4012985e-45 5.8...,0.000000e+00,3.393643e-16,1.401298e-45,5.889657e-42,1.000000e+00,2.828387e-31,CBTPA,6.994277e-15,False
7,TCGA-ER-A2NF-01Z-00-DX1.svs,4,9,TCGA-ER-A2NF,TCGA-ER-A2NF-01Z-00-DX1.1468DD2D-6AC8-4657-A02...,TCGA-ER-A2NF-01,NON-ACRAL CUTANEOUS,True,0.58,Common,...,[1.4670611e-04 9.5189506e-01 8.6480618e-09 3.4...,1.467061e-04,9.518951e-01,8.648062e-09,3.402360e-06,4.795466e-02,8.639984e-08,CBTA,1.082347e-01,False
8,TCGA-EE-A2MI-01Z-00-DX1.svs,9,6,TCGA-EE-A2MI,TCGA-EE-A2MI-01Z-00-DX1.1C56D0A7-3FA7-49A6-BBC...,TCGA-EE-A2MI-06,NON-ACRAL CUTANEOUS,True,0.66,MITF-Low,...,[2.6823066e-33 1.0000000e+00 0.0000000e+00 0.0...,2.682307e-33,1.000000e+00,0.000000e+00,0.000000e+00,1.546927e-17,0.000000e+00,CBTA,3.341852e-16,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21989,TCGA-D3-A2J9-06Z-00-DX1.svs,1,5,TCGA-D3-A2J9,TCGA-D3-A2J9-06Z-00-DX1.5526CFD6-96AB-49F8-B88...,TCGA-D3-A2J9-06,NON-ACRAL CUTANEOUS,True,0.59,Common,...,[5.4147631e-07 6.3293052e-01 3.4650725e-08 4.8...,5.414763e-07,6.329305e-01,3.465072e-08,4.869677e-08,3.670658e-01,3.058369e-06,CBTA,3.669162e-01,True
21990,TCGA-BF-A5EO-01Z-00-DX1.svs,13,7,TCGA-BF-A5EO,TCGA-BF-A5EO-01Z-00-DX1.1BA74189-485E-4ABF-831...,TCGA-BF-A5EO-01,NON-ACRAL CUTANEOUS,True,0.61,OxPhos,...,[0.03188124 0.13373284 0.01077398 0.0079757 0...,3.188124e-02,1.337328e-01,1.077398e-02,7.975700e-03,6.324995e-01,1.831368e-01,CBTPA,5.954319e-01,True
21991,TCGA-EE-A20C-01Z-00-DX1.svs,7,3,TCGA-EE-A20C,TCGA-EE-A20C-01Z-00-DX1.48BAD79E-DFC8-44A7-92F...,TCGA-EE-A20C-06,NON-ACRAL CUTANEOUS,True,0.89,Common,...,[0.16861549 0.00968766 0.44288546 0.03302999 0...,1.686155e-01,9.687660e-03,4.428855e-01,3.302999e-02,2.296810e-03,3.434846e-01,CBTP,6.694179e-01,True
21992,TCGA-ER-A2NG-01Z-00-DX1.svs,2,4,TCGA-ER-A2NG,TCGA-ER-A2NG-01Z-00-DX1.35B12E55-502A-4B87-A68...,TCGA-ER-A2NG-06,NON-ACRAL CUTANEOUS,True,0.49,Common,...,[0.49730957 0.02897338 0.11772093 0.35428905 0...,4.973096e-01,2.897338e-02,1.177209e-01,3.542890e-01,8.161500e-04,8.909200e-04,CBT,6.036191e-01,True


In [20]:
'Compute probabilities of having a KO'
# tile
df_tst['P53_prob'] = (df_tst.CBT3 + df_tst.CBTP3)
df_tst['PTEN_prob'] = (df_tst.CBTPA + df_tst.CBTP3 + df_tst.CBTP)  
df_tst['APC_prob'] = (df_tst.CBTPA + df_tst.CBTA)  

# # WSI
# df_wsi['P53_prob'] = (df_wsi.CBT3 + df_wsi.CBTP3)
# df_wsi['PTEN_prob'] = (df_wsi.CBTPA + df_wsi.CBTP3 + df_wsi.CBTP)  
# df_wsi['APC_prob'] = (df_wsi.CBTPA + df_wsi.CBTA)  

In [21]:
'Add RNA signatures to ground truth labels (tile)'
# tile
ox_mask = df_tst.rna_subtype == 'OxPhos'
mi_mask = df_tst.rna_subtype == 'MITF-Low'
cm_mask = df_tst.rna_subtype == 'Common'

df_tst['APC_BC'] = df_tst.BetaCAT | df_tst.APC
df_tst['OxPhos'] = ox_mask
df_tst['APC_OxPhos'] = df_tst.APC | ox_mask
df_tst['APC_BC_OxPhos'] = df_tst.BetaCAT | df_tst.APC | ox_mask

df_tst['P53_MITF'] = df_tst.P53 | mi_mask
df_tst['P53_Co'] = df_tst.P53 | cm_mask
df_tst['P53_MITF_Co'] = df_tst.P53 | cm_mask | mi_mask
df_tst['MITF_Co'] = cm_mask | mi_mask

In [22]:
df_tst.to_csv('0107_pred_TCGA_2.csv')
df_tst.to_csv('Extended_Data_Table_2.csv')

In [23]:
'Remove uncertain prediction from further analysis'
mask = df_tst.uncertain == False
df_tst = df_tst[mask]

***
## ROCs (tile level)

In [24]:
import seaborn as sns
sns.set_style('darkgrid')

In [25]:
'Compute and show ROC curves on data (APC)'
# APC 
roc_A_x, roc_A_y = get_roc_xy(df_tst[['APC', 'APC_prob']])
auc_A = roc_auc_score(df_tst['APC'], df_tst['APC_prob'])
roc_AB_x, roc_AB_y = get_roc_xy(df_tst[['APC_BC', 'APC_prob']])
auc_AB = roc_auc_score(df_tst['APC_BC'], df_tst['APC_prob'])
roc_AO_x, roc_AO_y = get_roc_xy(df_tst[['APC_OxPhos', 'APC_prob']])
auc_AO = roc_auc_score(df_tst['APC_OxPhos'], df_tst['APC_prob'])
roc_ABO_x, roc_ABO_y = get_roc_xy(df_tst[['APC_BC_OxPhos', 'APC_prob']])
auc_ABO = roc_auc_score(df_tst['APC_BC_OxPhos'], df_tst['APC_prob'])
roc_O_x, roc_O_y = get_roc_xy(df_tst[['OxPhos', 'APC_prob']])
auc_O = roc_auc_score(df_tst['OxPhos'], df_tst['APC_prob'])

In [26]:
'Show ROC curves for KO only'
fig, ax = plt.subplots(1, 1, figsize=(5, 5))

ax.plot([0, 1], [0, 1], 'k--', label='random')
ax.plot(roc_A_x, roc_A_y, label='APC (AUC {:.2f})'.format(auc_A));
ax.plot(roc_AB_x, roc_AB_y, label='APC+BC (AUC {:.2f})'.format(auc_AB));
ax.plot(roc_AO_x, roc_AO_y, label='APC+OxPh (AUC {:.2f})'.format(auc_AO));
ax.plot(roc_ABO_x, roc_ABO_y, label='APC+BC+OxPh (AUC {:.2f})'.format(auc_ABO));

ax.set_aspect(1)
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROCs on TCGA (APC KO; tile level)')
ax.legend(loc='lower right');

plt.savefig('../results/rocs_tcga_apc_tile.pdf')
# # PTEN KO
# roc_P_m, roc_P_e = get_roc_mean_err(df_tst[['PTEN', 'PTEN_prob']])
# roc_Prnd_m, roc_Prnd_e = get_roc_mean_err_rand(df_tst[['PTEN']])

# # P53 KO
# roc_3_m, roc_3_e = get_roc_mean_err(df_tst[['P53', 'P53_prob']])
# roc_3R_m, roc_3R_e = get_roc_mean_err(df_tst[['P53_RNA', 'P53_prob']])
# roc_3rnd_m, roc_3rnd_e = get_roc_mean_err_rand(df_tst[['P53']])
# roc_3Rrnd_m, roc_3Rrnd_e = get_roc_mean_err_rand(df_tst[['P53_RNA']])

In [27]:
'Compute and show ROC curves on data (P53)'
# APC KO
roc_1_x, roc_1_y = get_roc_xy(df_tst[['P53', 'P53_prob']])
auc_1 = roc_auc_score(df_tst['P53'], df_tst['P53_prob'])
roc_2_x, roc_2_y = get_roc_xy(df_tst[['P53_MITF', 'P53_prob']])
auc_2 = roc_auc_score(df_tst['P53_MITF'], df_tst['P53_prob'])
roc_3_x, roc_3_y = get_roc_xy(df_tst[['P53_Co', 'P53_prob']])
auc_3 = roc_auc_score(df_tst['P53_Co'], df_tst['P53_prob'])
roc_4_x, roc_4_y = get_roc_xy(df_tst[['P53_MITF_Co', 'P53_prob']])
auc_4 = roc_auc_score(df_tst['P53_MITF_Co'], df_tst['P53_prob'])

'Show ROC curves for KO only'
fig, ax = plt.subplots(1, 1, figsize=(5, 5))

ax.plot([0, 1], [0, 1], 'k--', label='random')
ax.plot(roc_1_x, roc_1_y, label='P53 (AUC {:.2f})'.format(auc_1));
ax.plot(roc_2_x, roc_2_y, label='P53+MITF (AUC {:.2f})'.format(auc_2));
ax.plot(roc_3_x, roc_3_y, label='P53+Co (AUC {:.2f})'.format(auc_3));
ax.plot(roc_4_x, roc_4_y, label='P53+MITF+Co (AUC {:.2f})'.format(auc_4));

ax.set_aspect(1)
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROCs on TCGA (P53 KO; tile level)')
ax.legend(loc='lower right');

plt.savefig('../results/rocs_tcga_p53_tile.pdf')
# # PTEN KO
# roc_P_m, roc_P_e = get_roc_mean_err(df_tst[['PTEN', 'PTEN_prob']])
# roc_Prnd_m, roc_Prnd_e = get_roc_mean_err_rand(df_tst[['PTEN']])

# # P53 KO
# roc_3_m, roc_3_e = get_roc_mean_err(df_tst[['P53', 'P53_prob']])
# roc_3R_m, roc_3R_e = get_roc_mean_err(df_tst[['P53_RNA', 'P53_prob']])
# roc_3rnd_m, roc_3rnd_e = get_roc_mean_err_rand(df_tst[['P53']])
# roc_3Rrnd_m, roc_3Rrnd_e = get_roc_mean_err_rand(df_tst[['P53_RNA']])

In [28]:
'Compute and show ROC curves on data (PTEN)'
# APC KO
roc_1_x, roc_1_y = get_roc_xy(df_tst[['PTEN', 'PTEN_prob']])
auc_1 = roc_auc_score(df_tst['PTEN'], df_tst['PTEN_prob'])

'Show ROC curves for KO only'
fig, ax = plt.subplots(1, 1, figsize=(5, 5))

ax.plot([0, 1], [0, 1], 'k--', label='random')
ax.plot(roc_1_x, roc_1_y, label='PTEN (AUC {:.2f})'.format(auc_1));

ax.set_aspect(1)
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROCs on TCGA (PTEN KO; tile level)')
ax.legend(loc='lower right');

plt.savefig('../results/rocs_tcga_pten_tile.pdf')
# # PTEN KO
# roc_P_m, roc_P_e = get_roc_mean_err(df_tst[['PTEN', 'PTEN_prob']])
# roc_Prnd_m, roc_Prnd_e = get_roc_mean_err_rand(df_tst[['PTEN']])

# # P53 KO
# roc_3_m, roc_3_e = get_roc_mean_err(df_tst[['P53', 'P53_prob']])
# roc_3R_m, roc_3R_e = get_roc_mean_err(df_tst[['P53_RNA', 'P53_prob']])
# roc_3rnd_m, roc_3rnd_e = get_roc_mean_err_rand(df_tst[['P53']])
# roc_3Rrnd_m, roc_3Rrnd_e = get_roc_mean_err_rand(df_tst[['P53_RNA']])

In [29]:
'Show APC ROC curve (Elena s)'
fig, ax = plt.subplots(1, 1, figsize=(5, 5))

ax.plot([0, 1], [0, 1], 'k--', label='random')
ax.plot(roc_AB_x, roc_AB_y, label='APC+BC (AUC {:.2f})'.format(auc_AB));
ax.plot(roc_O_x, roc_O_y, label='OxPh (AUC {:.2f})'.format(auc_O));
ax.plot(roc_ABO_x, roc_ABO_y, label='APC+BC+OxPh (AUC {:.2f})'.format(auc_ABO));

ax.set_aspect(1)
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROCs on TCGA (APC KO; tile level)')
ax.legend(loc='lower right');

plt.savefig('../results/rocs_tcga_6e_apc_tile.pdf')

In [30]:
'Compute and show ROC curves on data (P53)'
# APC KO
roc_1_x, roc_1_y = get_roc_xy(df_tst[['P53', 'P53_prob']])
auc_1 = roc_auc_score(df_tst['P53'], df_tst['P53_prob'])
roc_2_x, roc_2_y = get_roc_xy(df_tst[['MITF_Co', 'P53_prob']])
auc_2 = roc_auc_score(df_tst['MITF_Co'], df_tst['P53_prob'])
roc_4_x, roc_4_y = get_roc_xy(df_tst[['P53_MITF_Co', 'P53_prob']])
auc_4 = roc_auc_score(df_tst['P53_MITF_Co'], df_tst['P53_prob'])

'Show ROC curves for KO only'
fig, ax = plt.subplots(1, 1, figsize=(5, 5))

ax.plot([0, 1], [0, 1], 'k--', label='random')
ax.plot(roc_1_x, roc_1_y, label='P53 (AUC {:.2f})'.format(auc_1));
ax.plot(roc_2_x, roc_2_y, label='MITF+Co (AUC {:.2f})'.format(auc_2));
ax.plot(roc_4_x, roc_4_y, label='P53+MITF+Co (AUC {:.2f})'.format(auc_4));

ax.set_aspect(1)
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROCs on TCGA (P53 KO; tile level)')
ax.legend(loc='lower right');

plt.savefig('../results/rocs_tcga_6e_p53_tile.pdf')

In [31]:
'Control curve for PTEN'
# APC KO
roc_1_x, roc_1_y = get_roc_xy(df_tst[['APC_BC', 'PTEN_prob']])
auc_1 = roc_auc_score(df_tst['APC_BC'], df_tst['PTEN_prob'])
roc_2_x, roc_2_y = get_roc_xy(df_tst[['OxPhos', 'PTEN_prob']])
auc_2 = roc_auc_score(df_tst['OxPhos'], df_tst['PTEN_prob'])
roc_3_x, roc_3_y = get_roc_xy(df_tst[['APC_BC_OxPhos', 'PTEN_prob']])
auc_3 = roc_auc_score(df_tst['APC_BC_OxPhos'], df_tst['PTEN_prob'])

roc_4_x, roc_4_y = get_roc_xy(df_tst[['P53', 'PTEN_prob']])
auc_4 = roc_auc_score(df_tst['P53'], df_tst['PTEN_prob'])
roc_5_x, roc_5_y = get_roc_xy(df_tst[['MITF_Co', 'PTEN_prob']])
auc_5 = roc_auc_score(df_tst['MITF_Co'], df_tst['PTEN_prob'])
roc_6_x, roc_6_y = get_roc_xy(df_tst[['P53_MITF_Co', 'PTEN_prob']])
auc_6 = roc_auc_score(df_tst['P53_MITF_Co'], df_tst['PTEN_prob'])

'Show ROC curves for KO only'
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

ax.plot([0, 1], [0, 1], 'k--', label='random')
ax.plot(roc_1_x, roc_1_y, label='APC_BC (AUC {:.2f})'.format(auc_1));
ax.plot(roc_2_x, roc_2_y, label='OxPhos (AUC {:.2f})'.format(auc_2));
ax.plot(roc_3_x, roc_3_y, label='APC_BC_OxPhos (AUC {:.2f})'.format(auc_3));
ax.plot(roc_4_x, roc_4_y, label='P53 (AUC {:.2f})'.format(auc_4));
ax.plot(roc_5_x, roc_5_y, label='MITF_Co (AUC {:.2f})'.format(auc_5));
ax.plot(roc_6_x, roc_6_y, label='P53+MITF+Co (AUC {:.2f})'.format(auc_6));

ax.set_aspect(1)
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROCs on TCGA (control; PTEN KO prediction; tile level)')
ax.legend(loc='lower right');

plt.savefig('../results/rocs_tcga_control_PTEN_tile.pdf')
# # PTEN KO
# roc_P_m, roc_P_e = get_roc_mean_err(df_tst[['PTEN', 'PTEN_prob']])
# roc_Prnd_m, roc_Prnd_e = get_roc_mean_err_rand(df_tst[['PTEN']])

# # P53 KO
# roc_3_m, roc_3_e = get_roc_mean_err(df_tst[['P53', 'P53_prob']])
# roc_3R_m, roc_3R_e = get_roc_mean_err(df_tst[['P53_RNA', 'P53_prob']])
# roc_3rnd_m, roc_3rnd_e = get_roc_mean_err_rand(df_tst[['P53']])
# roc_3Rrnd_m, roc_3Rrnd_e = get_roc_mean_err_rand(df_tst[['P53_RNA']])

In [32]:
'Control curve for APC'
# APC KO
roc_1_x, roc_1_y = get_roc_xy(df_tst[['PTEN', 'APC_prob']])
auc_1 = roc_auc_score(df_tst['PTEN'], df_tst['APC_prob'])
roc_2_x, roc_2_y = get_roc_xy(df_tst[['P53', 'APC_prob']])
auc_2 = roc_auc_score(df_tst['P53'], df_tst['APC_prob'])
roc_3_x, roc_3_y = get_roc_xy(df_tst[['MITF_Co', 'APC_prob']])
auc_3 = roc_auc_score(df_tst['MITF_Co'], df_tst['APC_prob'])
roc_4_x, roc_4_y = get_roc_xy(df_tst[['P53_MITF_Co', 'APC_prob']])
auc_4 = roc_auc_score(df_tst['P53_MITF_Co'], df_tst['APC_prob'])

'Show ROC curves for KO only'
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

ax.plot([0, 1], [0, 1], 'k--', label='random')
ax.plot(roc_1_x, roc_1_y, label='PTEN (AUC {:.2f})'.format(auc_1));
ax.plot(roc_2_x, roc_2_y, label='P53 (AUC {:.2f})'.format(auc_2));
ax.plot(roc_3_x, roc_3_y, label='MITF_Co (AUC {:.2f})'.format(auc_3));
ax.plot(roc_4_x, roc_4_y, label='P53_MITF_Co (AUC {:.2f})'.format(auc_4));

ax.set_aspect(1)
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROCs on TCGA (control; APC KO prediction; tile level)')
ax.legend(loc='lower right');

plt.savefig('../results/rocs_tcga_control_APC_tile.pdf')
# # PTEN KO
# roc_P_m, roc_P_e = get_roc_mean_err(df_tst[['PTEN', 'PTEN_prob']])
# roc_Prnd_m, roc_Prnd_e = get_roc_mean_err_rand(df_tst[['PTEN']])

# # P53 KO
# roc_3_m, roc_3_e = get_roc_mean_err(df_tst[['P53', 'P53_prob']])
# roc_3R_m, roc_3R_e = get_roc_mean_err(df_tst[['P53_RNA', 'P53_prob']])
# roc_3rnd_m, roc_3rnd_e = get_roc_mean_err_rand(df_tst[['P53']])
# roc_3Rrnd_m, roc_3Rrnd_e = get_roc_mean_err_rand(df_tst[['P53_RNA']])

In [33]:
'Control curve for P53'
# APC KO
roc_1_x, roc_1_y = get_roc_xy(df_tst[['PTEN', 'P53_prob']])
auc_1 = roc_auc_score(df_tst['PTEN'], df_tst['P53_prob'])
roc_2_x, roc_2_y = get_roc_xy(df_tst[['APC_BC', 'P53_prob']])
auc_2 = roc_auc_score(df_tst['APC_BC'], df_tst['P53_prob'])
roc_3_x, roc_3_y = get_roc_xy(df_tst[['OxPhos', 'P53_prob']])
auc_3 = roc_auc_score(df_tst['OxPhos'], df_tst['P53_prob'])
roc_4_x, roc_4_y = get_roc_xy(df_tst[['APC_BC_OxPhos', 'P53_prob']])
auc_4 = roc_auc_score(df_tst['APC_BC_OxPhos'], df_tst['P53_prob'])

'Show ROC curves for KO only'
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

ax.plot([0, 1], [0, 1], 'k--', label='random')
ax.plot(roc_1_x, roc_1_y, label='PTEN (AUC {:.2f})'.format(auc_1));
ax.plot(roc_2_x, roc_2_y, label='APC_BC (AUC {:.2f})'.format(auc_2));
ax.plot(roc_3_x, roc_3_y, label='OxPhos (AUC {:.2f})'.format(auc_3));
ax.plot(roc_4_x, roc_4_y, label='APC_BC_OxPhos (AUC {:.2f})'.format(auc_4));

ax.set_aspect(1)
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROCs on TCGA (control; P53 KO prediction; tile level)')
ax.legend(loc='lower right');

plt.savefig('../results/rocs_tcga_control_P53_tile.pdf')
# # PTEN KO
# roc_P_m, roc_P_e = get_roc_mean_err(df_tst[['PTEN', 'PTEN_prob']])
# roc_Prnd_m, roc_Prnd_e = get_roc_mean_err_rand(df_tst[['PTEN']])

# # P53 KO
# roc_3_m, roc_3_e = get_roc_mean_err(df_tst[['P53', 'P53_prob']])
# roc_3R_m, roc_3R_e = get_roc_mean_err(df_tst[['P53_RNA', 'P53_prob']])
# roc_3rnd_m, roc_3rnd_e = get_roc_mean_err_rand(df_tst[['P53']])
# roc_3Rrnd_m, roc_3Rrnd_e = get_roc_mean_err_rand(df_tst[['P53_RNA']])

Redo-showing mean curve

- **TODO** BC is not helping. Why? How many examples and why are missclassified.

***
## Soft assignment on WSI
- For each WSI compute probability of mutation.
- Analogous of tiles BUT take out uncertain tiles (I can also take them out in tiles).

In [49]:
df_tst.wsi_id.nunique()

388

In [50]:
df_wsi.columns

Index(['wsi_name', 'wsi_id', 'clinical_sample_id', 'primary_tumor_type',
       'CNA_data', 'ABSOLUTE_purity', 'rna_subtype', 'BetaCAT', 'P53', 'PTEN',
       'APC'],
      dtype='object')

In [51]:
'Aggregate tiles in WSIs'
# compute number of tiles per WSI
num_tiles = df_tst.groupby('wsi_id').count().x_tile_coord
# aggregate by wsi_id and normalize by tile number
df_wsi = df_tst.groupby('wsi_id').sum()
df_wsi['APC_prob_norm'] = df_wsi['APC_prob'] / num_tiles
df_wsi['P53_prob_norm'] = df_wsi['P53_prob'] / num_tiles
df_wsi['PTEN_prob_norm'] = df_wsi['PTEN_prob'] / num_tiles
# cast ground truth to Boolean
df_wsi['APC'] = df_wsi.APC != 0
df_wsi['APC_BC'] = df_wsi.APC_BC != 0
df_wsi['APC_OxPhos'] = df_wsi.APC_OxPhos != 0
df_wsi['APC_BC_OxPhos'] = df_wsi.APC_BC_OxPhos != 0
df_wsi['OxPhos'] = df_wsi.OxPhos != 0

df_wsi['P53'] = df_wsi.P53 != 0
df_wsi['P53_MITF'] = df_wsi.P53_MITF != 0
df_wsi['P53_Co'] = df_wsi.P53_Co != 0
df_wsi['P53_MITF_Co'] = df_wsi.P53_MITF_Co != 0
df_wsi['MITF_Co'] = df_wsi.MITF_Co != 0

df_wsi['PTEN'] = df_wsi.PTEN != 0

In [52]:
df_wsi.APC_prob_norm.max(), df_wsi.PTEN_prob_norm.max(), df_wsi.P53_prob_norm.max()

(1.0000000173710002, 0.9999999747679034, 0.9999924300018137)

In [53]:
'Compute and show ROCs (WSI level, APC)'
roc_A_x, roc_A_y = get_roc_xy(df_wsi[['APC', 'APC_prob_norm']])
auc_A = roc_auc_score(df_wsi['APC'], df_wsi['APC_prob_norm'])
roc_AB_x, roc_AB_y = get_roc_xy(df_wsi[['APC_BC', 'APC_prob_norm']])
auc_AB = roc_auc_score(df_wsi['APC_BC'], df_wsi['APC_prob_norm'])
roc_AO_x, roc_AO_y = get_roc_xy(df_wsi[['APC_OxPhos', 'APC_prob_norm']])
auc_AO = roc_auc_score(df_wsi['APC_OxPhos'], df_wsi['APC_prob_norm'])
roc_ABO_x, roc_ABO_y = get_roc_xy(df_wsi[['APC_BC_OxPhos', 'APC_prob_norm']])
auc_ABO = roc_auc_score(df_wsi['APC_BC_OxPhos'], df_wsi['APC_prob_norm'])

#Newly Added
roc_O_x, roc_O_y = get_roc_xy(df_wsi[['OxPhos', 'APC_prob_norm']])
auc_O = roc_auc_score(df_wsi['OxPhos'], df_wsi['APC_prob_norm'])

fig, ax = plt.subplots(1, 1, figsize=(5, 5))
ax.plot([0, 1], [0, 1], 'k--', label='random')

#ax.plot(roc_A_x, roc_A_y, label='APC (AUC {:.2f})'.format(auc_A), c='b');
ax.plot(roc_AB_x, roc_AB_y, label='APC+BC (AUC {:.2f})'.format(auc_AB), c='r');
#ax.plot(roc_AO_x, roc_AO_y, label='APC+OxPh (AUC {:.2f})'.format(auc_AO), c='g');
#ax.plot(roc_ABO_x, roc_ABO_y, label='APC+BC+OxPh (AUC {:.2f})'.format(auc_ABO), c='brown');

#New Added
ax.plot(roc_O_x, roc_O_y, label='OxPh (AUC {:.2f})'.format(auc_O), c='blue');


ax.set_title('ROCs on TCGA (APC KO; WSI level)')
ax.set_aspect(1)
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.legend(loc='lower right');

plt.savefig('../results/rocs_tcga_apc_wsi.pdf');

In [54]:
'Compute and show ROCs (WSI level, P53)'
roc_1_x, roc_1_y = get_roc_xy(df_wsi[['P53', 'P53_prob_norm']])
auc_1 = roc_auc_score(df_wsi['P53'], df_wsi['P53_prob_norm'])
roc_2_x, roc_2_y = get_roc_xy(df_wsi[['P53_MITF', 'P53_prob_norm']])
auc_2 = roc_auc_score(df_wsi['P53_MITF'], df_wsi['P53_prob_norm'])
roc_3_x, roc_3_y = get_roc_xy(df_wsi[['P53_Co', 'P53_prob_norm']])
auc_3 = roc_auc_score(df_wsi['P53_Co'], df_wsi['P53_prob_norm'])
roc_4_x, roc_4_y = get_roc_xy(df_wsi[['P53_MITF_Co', 'P53_prob_norm']])
auc_4 = roc_auc_score(df_wsi['P53_MITF_Co'], df_wsi['P53_prob_norm'])


fig, ax = plt.subplots(1, 1, figsize=(5, 5))
ax.plot([0, 1], [0, 1], 'k--', label='random')

ax.plot(roc_1_x, roc_1_y, label='P53 (AUC {:.2f})'.format(auc_1), c='b');
ax.plot(roc_2_x, roc_2_y, label='P53+Mi (AUC {:.2f})'.format(auc_2), c='r');
ax.plot(roc_3_x, roc_3_y, label='P53+Co (AUC {:.2f})'.format(auc_3), c='g');
ax.plot(roc_4_x, roc_4_y, label='P53+Mi+Co (AUC {:.2f})'.format(auc_4), c='brown');

ax.set_title('ROCs on TCGA (P53 KO; WSI level)')
ax.set_aspect(1)
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.legend(loc='lower right');
plt.savefig('../results/rocs_tcga_p53_wsi.pdf');

In [55]:
'Compute and show ROCs (WSI level, PTEN)'
roc_1_x, roc_1_y = get_roc_xy(df_wsi[['PTEN', 'PTEN_prob_norm']])
auc_1 = roc_auc_score(df_wsi['PTEN'], df_wsi['PTEN_prob_norm'])

fig, ax = plt.subplots(1, 1, figsize=(5, 5))
ax.plot([0, 1], [0, 1], 'k--', label='random')

ax.plot(roc_1_x, roc_1_y, label='PTEN (AUC {:.2f})'.format(auc_1), c='b');

ax.set_title('ROCs on TCGA (PTEN KO; WSI level)')
ax.set_aspect(1)
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.legend(loc='lower right');
plt.savefig('../results/rocs_tcga_pten_wsi.pdf');

In [56]:
'ROC APC (Elena s)'
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
ax.plot([0, 1], [0, 1], 'k--', label='random')

ax.plot(roc_AB_x, roc_AB_y, label='APC+BC (AUC {:.2f})'.format(auc_AB), c='r');
ax.plot(roc_O_x, roc_O_y, label='OxPh (AUC {:.2f})'.format(auc_O), c='g');
ax.plot(roc_ABO_x, roc_ABO_y, label='APC+BC+OxPh (AUC {:.2f})'.format(auc_ABO), c='brown');

ax.set_title('ROCs on TCGA (APC KO; WSI level)')
ax.set_aspect(1)
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.legend(loc='lower right');

plt.savefig('../results/rocs_tcga_6e_apc_wsi.pdf');

In [57]:
'Compute and show ROCs (WSI level, P53)'
roc_1_x, roc_1_y = get_roc_xy(df_wsi[['P53', 'P53_prob_norm']])
auc_1 = roc_auc_score(df_wsi['P53'], df_wsi['P53_prob_norm'])
roc_2_x, roc_2_y = get_roc_xy(df_wsi[['MITF_Co', 'P53_prob_norm']])
auc_2 = roc_auc_score(df_wsi['MITF_Co'], df_wsi['P53_prob_norm'])
roc_4_x, roc_4_y = get_roc_xy(df_wsi[['P53_MITF_Co', 'P53_prob_norm']])
auc_4 = roc_auc_score(df_wsi['P53_MITF_Co'], df_wsi['P53_prob_norm'])


fig, ax = plt.subplots(1, 1, figsize=(5, 5))
ax.plot([0, 1], [0, 1], 'k--', label='random')

ax.plot(roc_1_x, roc_1_y, label='P53 (AUC {:.2f})'.format(auc_1), c='b');
ax.plot(roc_2_x, roc_2_y, label='Mi+Co (AUC {:.2f})'.format(auc_2), c='r');
ax.plot(roc_4_x, roc_4_y, label='P53+Mi+Co (AUC {:.2f})'.format(auc_4), c='brown');

ax.set_title('ROCs on TCGA (P53 KO; WSI level)')
ax.set_aspect(1)
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.legend(loc='lower right');
plt.savefig('../results/rocs_tcga_6e_p53_wsi.pdf');

In [58]:
'Control curve for P53'
# APC KO
roc_1_x, roc_1_y = get_roc_xy(df_wsi[['PTEN', 'P53_prob']])
auc_1 = roc_auc_score(df_wsi['PTEN'], df_wsi['P53_prob'])
roc_2_x, roc_2_y = get_roc_xy(df_wsi[['APC_BC', 'P53_prob']])
auc_2 = roc_auc_score(df_wsi['APC_BC'], df_wsi['P53_prob'])
roc_3_x, roc_3_y = get_roc_xy(df_wsi[['OxPhos', 'P53_prob']])
auc_3 = roc_auc_score(df_wsi['OxPhos'], df_wsi['P53_prob'])
roc_4_x, roc_4_y = get_roc_xy(df_wsi[['APC_BC_OxPhos', 'P53_prob']])
auc_4 = roc_auc_score(df_wsi['APC_BC_OxPhos'], df_wsi['P53_prob'])

'Show ROC curves for KO only'
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

ax.plot([0, 1], [0, 1], 'k--', label='random')
ax.plot(roc_1_x, roc_1_y, label='PTEN (AUC {:.2f})'.format(auc_1));
ax.plot(roc_2_x, roc_2_y, label='APC_BC (AUC {:.2f})'.format(auc_2));
ax.plot(roc_3_x, roc_3_y, label='OxPhos (AUC {:.2f})'.format(auc_3));
ax.plot(roc_4_x, roc_4_y, label='APC_BC_OxPhos (AUC {:.2f})'.format(auc_4));

ax.set_aspect(1)
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROCs on TCGA (control; P53 KO prediction; wsi level)')
ax.legend(loc='lower right');

plt.savefig('../results/rocs_tcga_control_P53_wsi.pdf')

In [59]:
'Control curve for PTEN'
# APC KO
roc_1_x, roc_1_y = get_roc_xy(df_wsi[['APC_BC', 'PTEN_prob']])
auc_1 = roc_auc_score(df_wsi['APC_BC'], df_wsi['PTEN_prob'])
roc_2_x, roc_2_y = get_roc_xy(df_wsi[['OxPhos', 'PTEN_prob']])
auc_2 = roc_auc_score(df_wsi['OxPhos'], df_wsi['PTEN_prob'])
roc_3_x, roc_3_y = get_roc_xy(df_wsi[['APC_BC_OxPhos', 'PTEN_prob']])
auc_3 = roc_auc_score(df_wsi['APC_BC_OxPhos'], df_wsi['PTEN_prob'])

roc_4_x, roc_4_y = get_roc_xy(df_wsi[['P53', 'PTEN_prob']])
auc_4 = roc_auc_score(df_wsi['P53'], df_wsi['PTEN_prob'])
roc_5_x, roc_5_y = get_roc_xy(df_wsi[['MITF_Co', 'PTEN_prob']])
auc_5 = roc_auc_score(df_wsi['MITF_Co'], df_wsi['PTEN_prob'])
roc_6_x, roc_6_y = get_roc_xy(df_wsi[['P53_MITF_Co', 'PTEN_prob']])
auc_6 = roc_auc_score(df_wsi['P53_MITF_Co'], df_wsi['PTEN_prob'])

'Show ROC curves for KO only'
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

ax.plot([0, 1], [0, 1], 'k--', label='random')
ax.plot(roc_1_x, roc_1_y, label='APC_BC (AUC {:.2f})'.format(auc_1));
ax.plot(roc_2_x, roc_2_y, label='OxPhos (AUC {:.2f})'.format(auc_2));
ax.plot(roc_3_x, roc_3_y, label='APC_BC_OxPhos (AUC {:.2f})'.format(auc_3));
ax.plot(roc_4_x, roc_4_y, label='P53 (AUC {:.2f})'.format(auc_4));
ax.plot(roc_5_x, roc_5_y, label='MITF_Co (AUC {:.2f})'.format(auc_5));
ax.plot(roc_6_x, roc_6_y, label='P53+MITF+Co (AUC {:.2f})'.format(auc_6));

ax.set_aspect(1)
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROCs on TCGA (control; PTEN KO prediction; wsi level)')
ax.legend(loc='lower right');

plt.savefig('../results/rocs_tcga_control_PTEN_wsi.pdf')

In [60]:
'Control curve for APC'
# APC KO
roc_1_x, roc_1_y = get_roc_xy(df_wsi[['PTEN', 'APC_prob']])
auc_1 = roc_auc_score(df_wsi['PTEN'], df_wsi['APC_prob'])
roc_2_x, roc_2_y = get_roc_xy(df_wsi[['P53', 'APC_prob']])
auc_2 = roc_auc_score(df_wsi['P53'], df_wsi['APC_prob'])
roc_3_x, roc_3_y = get_roc_xy(df_wsi[['MITF_Co', 'APC_prob']])
auc_3 = roc_auc_score(df_wsi['MITF_Co'], df_wsi['APC_prob'])
roc_4_x, roc_4_y = get_roc_xy(df_wsi[['P53_MITF_Co', 'APC_prob']])
auc_4 = roc_auc_score(df_wsi['P53_MITF_Co'], df_wsi['APC_prob'])

'Show ROC curves for KO only'
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

ax.plot([0, 1], [0, 1], 'k--', label='random')
ax.plot(roc_1_x, roc_1_y, label='PTEN (AUC {:.2f})'.format(auc_1));
ax.plot(roc_2_x, roc_2_y, label='P53 (AUC {:.2f})'.format(auc_2));
ax.plot(roc_3_x, roc_3_y, label='MITF_Co (AUC {:.2f})'.format(auc_3));
ax.plot(roc_4_x, roc_4_y, label='P53_MITF_Co (AUC {:.2f})'.format(auc_4));

ax.set_aspect(1)
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROCs on TCGA (control; APC KO prediction; wsi level)')
ax.legend(loc='lower right');

plt.savefig('../results/rocs_tcga_control_APC_wsi.pdf')

***
## AUC vs entropy
- restart kernel 

In [67]:
import seaborn as sns
from scipy.stats import entropy
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import re

# 'Load metadata'
#fld = os.path.join(home, 'melanoma_images')
#meta_fld = os.path.join(fld, 'metadata')
data_fld = '../data'
sub_fld = os.path.join(data_fld, 'TCGA_data')
path = os.path.join(sub_fld, 'tcga_wsi_meta.csv')
df_wsi = pd.read_csv(path, index_col=0)
path = os.path.join(sub_fld, 'tcga_tile_meta.csv')
df_tile = pd.read_csv(path)  # no index_col = 0

# 'Load predictions post-inference'
path = os.path.join(sub_fld,'0107_pred_TCGA.csv')
df_tst = pd.read_csv(path, index_col=0)
assert len(df_tile) == len(df_tst)

NameError: name 'home' is not defined

In [None]:
# 'Remove low quality WSI from further analysis'
low_quality_wsi = ['TCGA-WE-AAA3-06Z-00-DX1-WSI',
 'TCGA-GF-A2C7-01Z-00-DX1-WSI',
 'TCGA-FS-A1ZU-06Z-00-DX3-WSI',
 'TCGA-FS-A1ZU-06Z-00-DX2-WSI',
 'TCGA-FS-A1ZN-01Z-00-DX9-WSI',
 'TCGA-FS-A1ZE-06Z-00-DX1-WSI',
 'TCGA-FR-A728-01Z-00-DX1-WSI',
 'TCGA-FR-A3YN-01Z-00-DX1-WSI',
 'TCGA-FR-A3R1-01Z-00-DX1-WSI',
 'TCGA-ER-A3EV-01Z-00-DX1-WSI',
 'TCGA-DA-A95Y-01Z-00-DX1-WSI',
 'TCGA-DA-A95W-01Z-00-DX1-WSI',
 'TCGA-DA-A3F8-01Z-00-DX1-WSI',
 'TCGA-DA-A3F5-01Z-00-DX1-WSI',
 'TCGA-DA-A1IA-01Z-00-DX1-WSI',
 'TCGA-DA-A1I5-01Z-00-DX1-WSI',
 'TCGA-D3-A2JH-06Z-00-DX1-WSI',]
low_quality_wsi = [name.split('-WSI')[0]+'.svs' for name in low_quality_wsi]
df_tst = df_tst[~df_tst.wsi_id.isin(low_quality_wsi)]

In [68]:
# 'Process prediction output to legible format'
def parse_str(string, ix):
    n_pt = '([\.0-9\+e\-]+)'
    pattern = '\['+n_pt+'\s+'+n_pt+'\s+'+n_pt+'\s+'+n_pt+'[.\\n\s]+'+n_pt+'[.\\n\s]+'+n_pt
    m = re.match(pattern, string)
    if m is None:
        print('error', string)
        
    CBT_pb, CBTA_pb, CBTP_pb, CBT3_pb, CBTPA_pb, CBTP3_pb = m.groups()
    CBT_pb = float(CBT_pb)
    CBTA_pb = float(CBTA_pb) 
    CBTP_pb = float(CBTP_pb)
    CBT3_pb = float(CBT3_pb)
    CBTPA_pb = float(CBTPA_pb) 
    CBTP3_pb = float(CBTP3_pb)
    all_pbs = [CBT_pb, CBTA_pb, CBTP_pb, CBT3_pb, CBTPA_pb, CBTP3_pb]
    return all_pbs[ix]
CBT_pb = lambda x: parse_str(x, 0)
CBTA_pb = lambda x: parse_str(x, 1)
CBTP_pb = lambda x: parse_str(x, 2)
CBT3_pb = lambda x: parse_str(x, 3)
CBTPA_pb = lambda x: parse_str(x, 4)
CBTP3_pb = lambda x: parse_str(x, 5)

# 'Compute prediction probabilities for each class'
df_tst['CBT'] = df_tst.pred_prob.map(CBT_pb)
df_tst['CBTA'] = df_tst.pred_prob.map(CBTA_pb)
df_tst['CBTP'] = df_tst.pred_prob.map(CBTP_pb)
df_tst['CBT3'] = df_tst.pred_prob.map(CBT3_pb)
df_tst['CBTPA'] = df_tst.pred_prob.map(CBTPA_pb)
df_tst['CBTP3'] = df_tst.pred_prob.map(CBTP3_pb)

# 'Assing most likely prediction to each tile'
classes = ['CBT', 'CBTA', 'CBTP', 'CBT3', 'CBTPA', 'CBTP3']
preds = np.argmax(df_tst[classes].values, axis=1)
ix2cl = {ix: cl for ix, cl in enumerate(classes)}
df_tst['prediction'] = [ix2cl[p] for p in preds]

# 'Compute entropy'
df_tst['tile_entropy'] = entropy(df_tst[classes], axis=1, base=6)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [69]:
import seaborn as sns
sns.set_style('darkgrid')

In [70]:
wsi_AB_aucs = []
wsi_O_aucs = []
wsi_MC_aucs = []
wsi_3_aucs = []
wsi_P_aucs = []

tile_AB_aucs = []
tile_O_aucs = []
tile_MC_aucs = []
tile_3_aucs = []
tile_P_aucs = []

# wsi_pvals = []
# tile_pvals = []
entropies = np.linspace(0, 1, num=50)

for entropy in tqdm(entropies):
    df_loop = df_tst.copy()
    mask = df_loop.tile_entropy > entropy
    df_loop['uncertain'] = mask
    mask = df_loop.uncertain == False
    df_loop = df_loop[mask]

    df_loop['APC_prob'] = (df_loop.CBTPA + df_loop.CBTA)      
    df_loop['P53_prob'] = (df_loop.CBT3 + df_loop.CBTP3)
    df_loop['PTEN_prob'] = (df_loop.CBTPA + df_loop.CBTP3 + df_loop.CBTP)  

    ox_mask = df_loop.rna_subtype == 'OxPhos'
    mi_mask = df_loop.rna_subtype == 'MITF-Low'
    cm_mask = df_loop.rna_subtype == 'Common'
    
    df_loop['OxPhos'] = ox_mask
    df_loop['MITF_Co'] = cm_mask | mi_mask
    df_loop['APC_BC'] = df_loop.APC | df_loop.BetaCAT
    
    tile_AB_aucs.append(roc_auc_score(df_loop['APC_BC'], df_loop['APC_prob']))
    tile_O_aucs.append(roc_auc_score(df_loop['OxPhos'], df_loop['APC_prob']))
    tile_MC_aucs.append(roc_auc_score(df_loop['MITF_Co'], df_loop['P53_prob']))
    tile_3_aucs.append(roc_auc_score(df_loop['P53'], df_loop['P53_prob']))
    tile_P_aucs.append(roc_auc_score(df_loop['PTEN'], df_loop['PTEN_prob']))
    
#     pred_rnd = np.random.rand(len(df_loop))    
#     p, z = get_delong_pvalue_zscore(pred_rnd, df_loop['APC_prob'], df_loop['APC_BC_OxPhos'])
#     tile_pvals.append(p)
    
    num_tiles = df_loop.groupby('wsi_id').count().x_tile_coord
    df_wsi = df_loop.groupby('wsi_id').sum()
    df_wsi['APC_prob_norm'] = df_wsi['APC_prob'] / num_tiles
    df_wsi['P53_prob_norm'] = df_wsi['P53_prob'] / num_tiles
    df_wsi['PTEN_prob_norm'] = df_wsi['PTEN_prob'] / num_tiles
    
    df_wsi['APC_BC'] = df_wsi.APC_BC != 0
    df_wsi['MITF_Co'] = df_wsi.MITF_Co != 0
    df_wsi['OxPhos'] = df_wsi.OxPhos != 0
    df_wsi['PTEN'] = df_wsi.PTEN != 0
    df_wsi['P53'] = df_wsi.P53 != 0
    
    wsi_AB_aucs.append(roc_auc_score(df_wsi['APC_BC'], df_wsi['APC_prob_norm']))
    wsi_O_aucs.append(roc_auc_score(df_wsi['OxPhos'], df_wsi['APC_prob_norm']))
    wsi_3_aucs.append(roc_auc_score(df_wsi['P53'], df_wsi['P53_prob']))
    wsi_MC_aucs.append(roc_auc_score(df_wsi['MITF_Co'], df_wsi['P53_prob']))
    wsi_P_aucs.append(roc_auc_score(df_wsi['PTEN'], df_wsi['PTEN_prob_norm']))
#     p, z = get_delong_pvalue_zscore(pred_rnd, df_loop['APC_prob'], df_loop['APC_BC_OxPhos'])
#     wsi_pvals.append(p)

NameError: name 'tqdm' is not defined

In [None]:
fig, ax = plt.subplots(ncols=1, figsize=(6, 5))
ax.set_ylabel('AUC')
ax.set_xlabel('Entropy')
ax.scatter(entropies[0:len(tile_AB_aucs)], tile_AB_aucs, c='r', marker='+', label='APC+BC', s=10.);
ax.scatter(entropies[0:len(tile_O_aucs)], tile_O_aucs, c='g', marker='+', label='OxPhos', s=10.);
ax.scatter(entropies[0:len(tile_MC_aucs)], tile_MC_aucs, c='b', marker='+', label='MITF_Co', s=10.);
ax.scatter(entropies[0:len(tile_3_aucs)], tile_3_aucs, c='k', marker='+', label='TP53', s=10.);
ax.scatter(entropies[0:len(tile_P_aucs)], tile_P_aucs, c='y', marker='+', label='PTEN', s=10.);


ax.set_ylim(0.45, .8)
plt.title('tile-level')
ax.legend();
plt.tight_layout()
plt.savefig('../results/auc_vs_entropy_TCGA_tile.pdf');
# axs[1].set_ylabel('p-values')
# axs[1].set_xlabel('Entropy')
# axs[1].scatter(entropies[0:len(wsi_pvals)], wsi_pvals, label='WSI_level');
# axs[1].scatter(entropies[0:len(tile_pvals)], tile_pvals, label='tile_level')

In [71]:
fig, ax = plt.subplots(ncols=1, figsize=(6, 5))
ax.set_ylabel('AUC')
ax.set_xlabel('Entropy')
ax.scatter(entropies[0:len(wsi_AB_aucs)], wsi_AB_aucs, c='r', marker='+', label='APC+BC', s=10.);
ax.scatter(entropies[0:len(wsi_O_aucs)], wsi_O_aucs, c='g', marker='+', label='OxPhos', s=10.);
ax.scatter(entropies[0:len(wsi_MC_aucs)], wsi_MC_aucs, c='b', marker='+', label='MITF_Co', s=10.);
ax.scatter(entropies[0:len(wsi_3_aucs)], wsi_3_aucs, c='k', marker='+', label='TP53', s=10.);
ax.scatter(entropies[0:len(wsi_P_aucs)], wsi_P_aucs, c='y', marker='+', label='PTEN', s=10.);


ax.set_ylim(0.45, .8)
plt.title('wsi-level')
ax.legend();
plt.tight_layout()
plt.savefig('../results/auc_vs_entropy_TCGA_wsi.pdf');
# axs[1].set_ylabel('p-values')
# axs[1].set_xlabel('Entropy')
# axs[1].scatter(entropies[0:len(wsi_pvals)], wsi_pvals, label='WSI_level');
# axs[1].scatter(entropies[0:len(tile_pvals)], tile_pvals, label='tile_level')

***

In [None]:
wsi_A_aucs = []
wsi_3_aucs = []
wsi_P_aucs = []

tile_A_aucs = []
tile_3_aucs = []
tile_P_aucs = []

# wsi_pvals = []
# tile_pvals = []
entropies = np.linspace(0, 1, num=50)

for entropy in tqdm(entropies):
    df_loop = df_tst.copy()
    mask = df_loop.tile_entropy > entropy
    df_loop['uncertain'] = mask
    mask = df_loop.uncertain == False
    df_loop = df_loop[mask]

    df_loop['APC_prob'] = (df_loop.CBTPA + df_loop.CBTA)      
    df_loop['P53_prob'] = (df_loop.CBT3 + df_loop.CBTP3)
    df_loop['PTEN_prob'] = (df_loop.CBTPA + df_loop.CBTP3 + df_loop.CBTP)  

    ox_mask = df_loop.rna_subtype == 'OxPhos'
    mi_mask = df_loop.rna_subtype == 'MITF-Low'
    cm_mask = df_loop.rna_subtype == 'Common'
    
    df_loop['APC_BC_OxPhos'] = df_loop.BetaCAT | df_loop.APC | ox_mask
    df_loop['P53_MITF_Co'] = df_loop.P53 | cm_mask | mi_mask
    df_loop['PTEN'] = df_loop.PTEN
    
    tile_A_aucs.append(roc_auc_score(df_loop['APC_BC_OxPhos'], df_loop['APC_prob']))
    tile_3_aucs.append(roc_auc_score(df_loop['P53_MITF_Co'], df_loop['P53_prob']))
    tile_P_aucs.append(roc_auc_score(df_loop['PTEN'], df_loop['PTEN_prob']))
    
#     pred_rnd = np.random.rand(len(df_loop))    
#     p, z = get_delong_pvalue_zscore(pred_rnd, df_loop['APC_prob'], df_loop['APC_BC_OxPhos'])
#     tile_pvals.append(p)
    
    num_tiles = df_loop.groupby('wsi_id').count().x_tile_coord
    df_wsi = df_loop.groupby('wsi_id').sum()
    df_wsi['APC_prob_norm'] = df_wsi['APC_prob'] / num_tiles
    df_wsi['P53_prob_norm'] = df_wsi['P53_prob'] / num_tiles
    df_wsi['PTEN_prob_norm'] = df_wsi['PTEN_prob'] / num_tiles
    df_wsi['APC_BC_OxPhos'] = df_wsi.APC_BC_OxPhos != 0
    df_wsi['P53_MITF_Co'] = df_wsi.P53_MITF_Co != 0
    df_wsi['PTEN'] = df_wsi.PTEN != 0
    
    wsi_A_aucs.append(roc_auc_score(df_wsi['APC_BC_OxPhos'], df_wsi['APC_prob_norm']))
    wsi_3_aucs.append(roc_auc_score(df_wsi['P53_MITF_Co'], df_wsi['P53_prob']))
    wsi_P_aucs.append(roc_auc_score(df_wsi['PTEN'], df_wsi['PTEN_prob_norm']))
#     p, z = get_delong_pvalue_zscore(pred_rnd, df_loop['APC_prob'], df_loop['APC_BC_OxPhos'])
#     wsi_pvals.append(p)

In [None]:
#Shifted one cell below

fig, axs = plt.subplots(ncols=2, figsize=(6, 5))
axs[0].set_ylabel('AUC')
axs[0].set_xlabel('Entropy')
axs[0].scatter(entropies[0:len(wsi_A_aucs)], wsi_A_aucs, c='r', marker='x', label='APC+BC+OxPhos (WSI-level)', s=10.);
axs[0].scatter(entropies[0:len(tile_A_aucs)], tile_A_aucs, c='r', marker='+', label='APC+BC+OxPhos (tile-level)', s=10.);

ax.scatter(entropies[0:len(wsi_3_aucs)], wsi_3_aucs, c='g', marker='x', label='P53_MITF_Co (WSI-level)', s=10.);
ax.scatter(entropies[0:len(tile_3_aucs)], tile_3_aucs, c='g', marker='+', label='P53_MITF_Co (tile-level)', s=10.);

ax.scatter(entropies[0:len(wsi_P_aucs)], wsi_P_aucs, c='b', marker='x', label='PTEN (WSI-level)', s=10.);
ax.scatter(entropies[0:len(tile_P_aucs)], tile_P_aucs, c='b', marker='+', label='PTEN (tile-level)', s=10.);


ax.set_ylim(0.5, .7)

ax.legend();
plt.tight_layout()
plt.savefig('../results/auc_vs_entropy_TCGA.pdf');
# axs[1].set_ylabel('p-values')
# axs[1].set_xlabel('Entropy')
# axs[1].scatter(entropies[0:len(wsi_pvals)], wsi_pvals, label='WSI_level');
# axs[1].scatter(entropies[0:len(tile_pvals)], tile_pvals, label='tile_level')

In [None]:
fig, ax = plt.subplots(ncols=1, figsize=(6, 5))
ax.set_ylabel('AUC')
ax.set_xlabel('Entropy')
ax.scatter(entropies[0:len(wsi_A_aucs)], wsi_A_aucs, c='r', marker='x', label='APC+BC+OxPhos (WSI-level)', s=10.);
ax.scatter(entropies[0:len(tile_A_aucs)], tile_A_aucs, c='r', marker='+', label='APC+BC+OxPhos (tile-level)', s=10.);

ax.scatter(entropies[0:len(wsi_3_aucs)], wsi_3_aucs, c='g', marker='x', label='P53_MITF_Co (WSI-level)', s=10.);
ax.scatter(entropies[0:len(tile_3_aucs)], tile_3_aucs, c='g', marker='+', label='P53_MITF_Co (tile-level)', s=10.);

ax.scatter(entropies[0:len(wsi_P_aucs)], wsi_P_aucs, c='b', marker='x', label='PTEN (WSI-level)', s=10.);
ax.scatter(entropies[0:len(tile_P_aucs)], tile_P_aucs, c='b', marker='+', label='PTEN (tile-level)', s=10.);


ax.set_ylim(0.5, .7)

ax.legend();
plt.tight_layout()
plt.savefig('../results/auc_vs_entropy_TCGA.pdf');
# axs[1].set_ylabel('p-values')
# axs[1].set_xlabel('Entropy')
# axs[1].scatter(entropies[0:len(wsi_pvals)], wsi_pvals, label='WSI_level');
# axs[1].scatter(entropies[0:len(tile_pvals)], tile_pvals, label='tile_level')