## Notebook to compute the precision and recall metrics for the training and test sets of the models

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, mutual_info_classif
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

import seaborn as sn
import matplotlib.pyplot as plt

### Training set

In [26]:
aux_thesis = pd.read_pickle("../data/cna_brca_train_0.8_threshold_0.6_chrX.pkl")

In [27]:
aux_thesis.dtypes[aux_thesis.dtypes !='float64']

tcga_id             object
Ciriello_subtype    object
dtype: object

In [28]:
aux_thesis.shape

(817, 386)

In [10]:
import ast
res_train = pd.read_csv("../results/cna/CVAE/300_hidden_100_emb/tcga_classifier_dropout_0.2_in_0.4_hidden_rec_loss_binary_crossentropy_classifier_frozen_False_cv_other_metrics.csv")

In [11]:
res_train

Unnamed: 0.1,Unnamed: 0,Fold,accuracy,other_metrics,mean_accuracy,intermediate_dim,latent_dim,batch_size,epochs_cvae,learning_rate,dropout_input,dropout_hidden,dropout_decoder,freeze_weights,classifier_use_z,classifier_loss,reconstruction_loss
0,0,1,0.678788,"{'Basal': {'precision': 0.8, 'recall': 0.85714...",0.635144,300,100,50,100,0.001,0.2,0.4,True,False,False,categorical_crossentropy,binary_crossentropy
1,1,2,0.650307,"{'Basal': {'precision': 0.8571428571428571, 'r...",0.635144,300,100,50,100,0.001,0.2,0.4,True,False,False,categorical_crossentropy,binary_crossentropy
2,2,3,0.515337,"{'Basal': {'precision': 0.0, 'recall': 0.0, 'f...",0.635144,300,100,50,100,0.001,0.2,0.4,True,False,False,categorical_crossentropy,binary_crossentropy
3,3,4,0.699386,"{'Basal': {'precision': 0.9166666666666666, 'r...",0.635144,300,100,50,100,0.001,0.2,0.4,True,False,False,categorical_crossentropy,binary_crossentropy
4,4,5,0.631902,"{'Basal': {'precision': 0.8, 'recall': 0.59259...",0.635144,300,100,50,100,0.001,0.2,0.4,True,False,False,categorical_crossentropy,binary_crossentropy


In [12]:
res_train["accuracy"]

0    0.678788
1    0.650307
2    0.515337
3    0.699386
4    0.631902
Name: accuracy, dtype: float64

In [13]:
res_train["mean_accuracy"]

0    0.635144
1    0.635144
2    0.635144
3    0.635144
4    0.635144
Name: mean_accuracy, dtype: float64

In [14]:
res_train["other_metrics"]

0    {'Basal': {'precision': 0.8, 'recall': 0.85714...
1    {'Basal': {'precision': 0.8571428571428571, 'r...
2    {'Basal': {'precision': 0.0, 'recall': 0.0, 'f...
3    {'Basal': {'precision': 0.9166666666666666, 'r...
4    {'Basal': {'precision': 0.8, 'recall': 0.59259...
Name: other_metrics, dtype: object

In [15]:
from statistics import stdev
subtypes = ["Basal", "Her2", "LumA", "LumB", "Normal"]
#weights_train=[135,65,415,176,25] # Ones for all things miRNA
weights_train=[136,65,415,176,25] # Ones for rna and cna
mean_precisions = []
mean_recalls = []

for i in range(0,5):
    dict_aux = ast.literal_eval(res_train["other_metrics"].values[i])
    arr_pre = []
    arr_rec = []
    for sub in subtypes:
        arr_pre.append(dict_aux[sub]['precision'])
        arr_rec.append(dict_aux[sub]['recall'])
    mean_precisions.append(np.average(arr_pre, weights=weights_train))
    mean_recalls.append(np.average(arr_rec, weights=weights_train))
    
print(mean_precisions)
print('{}+-{}'.format(np.mean(mean_precisions), stdev(mean_precisions)))
print("----------------")
print(mean_recalls)
print('{}+-{}'.format(np.mean(mean_recalls), stdev(mean_recalls)))

[0.6265185356778015, 0.607793376642344, 0.3896611561284725, 0.7288574852256076, 0.6618817545700236]
0.6029424616488498+-0.1278639035652289
----------------
[0.6798779895475122, 0.64984554409279, 0.5141808008393076, 0.698881570852195, 0.6312901115838693]
0.6348152033831348+-0.07232936925745442


In [16]:
print("Average accuracy: {}+-{}".format(np.mean(res_train["accuracy"].values), stdev(res_train["accuracy"].values)))

Average accuracy: 0.6351440787315369+-0.07180864359000959


In [1]:
# no miRNA
(43*1+16*0.982+131*0.942+32*0.966+14*0.982)/(43+16+131+32+14)

0.9609067796610168

In [23]:
# miRNA
(36*0.9945+15*0.985+128*0.923+26*0.953+14*1)/(36+15+128+26+14)

0.9474840182648402

### Test set

In [125]:
res_test = pd.read_csv("../results/miRNA/CVAE/100_hidden_20_emb/tcga_classifier_dropout_0.8_in_0.2_hidden_rec_loss_binary_crossentropy_classifier_frozen_False_cv_other_metrics.csv")

In [126]:
res_test['other_metrics']

0    {'Basal': {'precision': 0.896551724137931, 're...
1    {'Basal': {'precision': 0.9629629629629629, 'r...
2    {'Basal': {'precision': 0.8620689655172413, 'r...
3    {'Basal': {'precision': 0.96, 'recall': 0.8888...
4    {'Basal': {'precision': 0.7878787878787878, 'r...
Name: other_metrics, dtype: object

In [127]:
res_test

Unnamed: 0.1,Unnamed: 0,Fold,accuracy,other_metrics,mean_accuracy,intermediate_dim,latent_dim,batch_size,epochs_cvae,learning_rate,dropout_input,dropout_hidden,dropout_decoder,freeze_weights,classifier_use_z,classifier_loss,reconstruction_loss
0,0,1,0.810976,"{'Basal': {'precision': 0.896551724137931, 're...",0.769557,100,20,200,100,0.01,0.8,0.2,True,False,False,categorical_crossentropy,binary_crossentropy
1,1,2,0.779141,"{'Basal': {'precision': 0.9629629629629629, 'r...",0.769557,100,20,200,100,0.01,0.8,0.2,True,False,False,categorical_crossentropy,binary_crossentropy
2,2,3,0.711656,"{'Basal': {'precision': 0.8620689655172413, 'r...",0.769557,100,20,200,100,0.01,0.8,0.2,True,False,False,categorical_crossentropy,binary_crossentropy
3,3,4,0.779141,"{'Basal': {'precision': 0.96, 'recall': 0.8888...",0.769557,100,20,200,100,0.01,0.8,0.2,True,False,False,categorical_crossentropy,binary_crossentropy
4,4,5,0.766871,"{'Basal': {'precision': 0.7878787878787878, 'r...",0.769557,100,20,200,100,0.01,0.8,0.2,True,False,False,categorical_crossentropy,binary_crossentropy


In [128]:
subtypes = ["Basal", "Her2", "LumA", "LumB", "Normal"]
weights_test=[36,15,128,26,14]
mean_precisions = []
mean_recalls = []

dict_aux = ast.literal_eval(res_test['other_metrics'][0])

print("Accuracy {}".format(res_test["accuracy"].values))
print("Precision {}".format(dict_aux['weighted avg']['precision']))
print("Recall {}".format(dict_aux['weighted avg']['recall']))

Accuracy [0.81097561 0.77914113 0.71165645 0.77914113 0.76687115]
Precision 0.7956924118972507
Recall 0.8109756097560976


In [129]:
print("Average accuracy: {}".format(np.mean(res_test["accuracy"].values)))

Average accuracy: 0.7695570945739746
