# Test 1 : compare pre-graphs
In this notebook, we compare the possible pre-graphs for a GCN model using the Euclidean distance: fully connected, tumour type, KMeans, spectral clustering, and hierarchical clustering. We use the whole dataset, including the mutational signatures, with all features normalized.

## Table of contents

1. [Fully connected pre-graph](#1-fully-connected)
2. [Tumour type pre-graph](#2-pre-graph)
3. [KMeans pre-graph](#3-kmeans)
4. [Evaluate performances for each pre-graph](#4-performances)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from evaluation.ClassificationMetrics import ClassificationMetrics

from manage.GCNClassifierTrainTestManager import GCNClassifierTrainTestManager

from models.GCNClassifier import GCNClassifier

from sklearn.cluster import KMeans

from utils.preProcessing import preProcessing

## 0. Reading dataset and preprocessing

In [2]:
# Load the raw table from the Excel file
df = pd.read_excel('data/table-s1-sigmut-indel.xlsx')

# Keep only the patients whose cohort is "Naive"
df = df[df["Cohort"] == "Naive"]

# Remove NaN values (delegated to the project helper)
df = preProcessing.delete_nan_values(df)

# Remove censored patients (OS). The median overall survival is computed on
# the remaining patients and handed to the helper as its last argument.
t = np.median(df["Overall survival (days)"].to_numpy())
df = preProcessing.drop_censored_patients(
    df,
    "Alive_0",
    "Overall survival (days)",
    t,
)

# Columns used as model inputs, and the column holding the label
features_name = [
    "Age at advanced disease diagnosis",
    "CD8+ T cell score",
    "Genome mut per mb",
    "Exome mut per mb",
    "CD274 expression",
    "M1M2 expression",
    "SBS1", "SBS2", "SBS3", "SBS4", "SBS5",
    "SBS7a", "SBS7b", "SBS7c", "SBS7d",
    "SBS8",
    "SBS10a", "SBS10b", "SBS10c",
    "SBS13", "SBS15",
    "SBS17a", "SBS17b",
    "SBS18", "SBS31", "SBS35", "SBS36", "SBS37", "SBS38", "SBS40",
    "SBS44", "SBS4426",
    "ID1", "ID2", "ID3", "ID4", "ID5", "ID6", "ID7", "ID8", "ID9",
    "ID10", "ID11", "ID12", "ID13", "ID14", "ID15", "ID16", "ID17", "ID18",
]
label_name = "Clinical benefit"

# Feature matrix, and binary label: 1 where the label column equals "DCB", 0 elsewhere
X = df[features_name].to_numpy()
y = np.where(df[label_name] == "DCB", 1, 0)

# Normalize features (delegated to the project helper)
X = preProcessing.normalize_data(X)

1 patients censored deleted


## 1. Fully connected pre-graph

In [3]:
# Fully connected pre-graph: one entry per row of df, all set to group 0
group_fully_connected = np.array([0] * len(df))

In [4]:
# Number of epochs, shared by every cross-validation run of this notebook
n_epochs = 50

# Instantiate the GCN classifier (built from the number of features),
# then wrap it in its train/test manager
gcn_clf_fc = GCNClassifier(len(features_name))
gcn_manager_fc = GCNClassifierTrainTestManager(gcn_clf_fc)

In [5]:
# Leave-one-out cross-validation with the fully connected pre-graph.
# The trailing positional arguments ("euclidean", 3) are forwarded as-is;
# NOTE(review): the meaning of 3 is defined by the manager - confirm there.
(
    scores_fc,
    classes_fc,
    train_loss_fc,
    val_loss_fc,
    std_train_loss_fc,
    std_val_loss_fc,
    params_conv_fc,
    params_fc_fc,
) = gcn_manager_fc.leave_one_out_cross_validation(
    X, y, group_fully_connected, n_epochs, "euclidean", 3
)

## 2. Tumour type pre-graph

In [None]:
# Tumour type pre-graph: the group of each patient is its "Tumour type" value
group_tumour_type = df.loc[:, "Tumour type"].to_numpy()

In [None]:
# Instantiate a fresh GCN classifier (built from the number of features)
# and the train/test manager that drives it
gcn_clf_tumour_type = GCNClassifier(
    len(features_name)
)
gcn_manager_tumour_type = GCNClassifierTrainTestManager(
    gcn_clf_tumour_type
)

In [None]:
# Leave-one-out cross-validation with the tumour type pre-graph.
# The trailing positional arguments ("euclidean", 3) are forwarded as-is;
# NOTE(review): the meaning of 3 is defined by the manager - confirm there.
(
    scores_tumour_type,
    classes_tumour_type,
    train_loss_tumour_type,
    val_loss_tumour_type,
    std_train_loss_tumour_type,
    std_val_loss_tumour_type,
    params_conv_tumour_type,
    params_fc_tumour_type,
) = gcn_manager_tumour_type.leave_one_out_cross_validation(
    X, y, group_tumour_type, n_epochs, "euclidean", 3
)

## 3. KMeans pre-graph

In [None]:
# Build group: cluster the feature matrix X with KMeans (10 clusters) and use
# each patient's cluster index as its group.
# KMeans initialisation is random: random_state is fixed so that the clusters,
# and therefore the pre-graph and every metric computed from it, are the same
# on each Restart & Run All.
group_kmeans = KMeans(n_clusters=10, n_init=10, random_state=0).fit(X).labels_

In [None]:
# Instantiate a fresh GCN classifier (built from the number of features)
# and the train/test manager that drives it
gcn_clf_kmeans = GCNClassifier(
    len(features_name)
)
gcn_manager_kmeans = GCNClassifierTrainTestManager(
    gcn_clf_kmeans
)

In [None]:
# Leave-one-out cross-validation with the KMeans pre-graph.
# The trailing positional arguments ("euclidean", 3) are forwarded as-is;
# NOTE(review): the meaning of 3 is defined by the manager - confirm there.
(
    scores_kmeans,
    classes_kmeans,
    train_loss_kmeans,
    val_loss_kmeans,
    std_train_loss_kmeans,
    std_val_loss_kmeans,
    params_conv_kmeans,
    params_fc_kmeans,
) = gcn_manager_kmeans.leave_one_out_cross_validation(
    X, y, group_kmeans, n_epochs, "euclidean", 3
)

## 4. Evaluate performances for each pre-graph

### Precision, recall, F1-score

In [None]:
# For each pre-graph, compare the true labels y with the predicted classes.
# The helper's result is unpacked as three values, stored here as
# (precision, recall, F1-score).

# Fully connected
pre_fc, rec_fc, f1_fc = ClassificationMetrics.eval_metrics_from_conf_matrix(
    y, classes_fc
)

# Tumour type
(
    pre_tumour_type,
    rec_tumour_type,
    f1_tumour_type,
) = ClassificationMetrics.eval_metrics_from_conf_matrix(y, classes_tumour_type)

# KMeans
pre_kmeans, rec_kmeans, f1_kmeans = ClassificationMetrics.eval_metrics_from_conf_matrix(
    y, classes_kmeans
)

In [None]:
# One horizontal bar chart (precision / recall / F1-score) per pre-graph.
metric_names = ["Precision", "Recall", "F1-score"]
metric_panels = [
    ("Fully connected pre-graph", [pre_fc, rec_fc, f1_fc]),
    ("Tumour type pre-graph", [pre_tumour_type, rec_tumour_type, f1_tumour_type]),
    ("KMeans pre-graph", [pre_kmeans, rec_kmeans, f1_kmeans]),
]

fig, ax = plt.subplots(2, 2, figsize=(15, 7))
panel_axes = ax.ravel()

for panel_ax, (panel_title, panel_values) in zip(panel_axes, metric_panels):
    panel_ax.barh(metric_names, panel_values)
    panel_ax.set_xlabel("Score")
    panel_ax.set_ylabel("Metrics")
    panel_ax.set_title(panel_title)

# The 2x2 grid has one more slot than there are pre-graphs: hide the unused
# axes instead of leaving an empty frame in the figure.
for unused_ax in panel_axes[len(metric_panels):]:
    unused_ax.set_visible(False)

# Keep the titles of the bottom row from overlapping the x-labels of the top row
fig.tight_layout()

### ROC curve and AUC

In [None]:
# For each pre-graph, build the ROC curve (unpacked as FPR, TPR, thresholds)
# and the AUC from the true labels y and the scores returned by the
# cross-validation.

# Fully connected
fpr_fc, tpr_fc, thresholds_fc = ClassificationMetrics.compute_roc_curve(
    y, scores_fc
)
auc_fc = ClassificationMetrics.compute_auc(y, scores_fc)

# Tumour type
(
    fpr_tumour_type,
    tpr_tumour_type,
    thresholds_tumour_type,
) = ClassificationMetrics.compute_roc_curve(y, scores_tumour_type)
auc_tumour_type = ClassificationMetrics.compute_auc(y, scores_tumour_type)

# KMeans
fpr_kmeans, tpr_kmeans, thresholds_kmeans = ClassificationMetrics.compute_roc_curve(
    y, scores_kmeans
)
auc_kmeans = ClassificationMetrics.compute_auc(y, scores_kmeans)

In [None]:
# ROC curves of the three pre-graphs on a single axes, with the y=x diagonal
# as the chance-level reference and the AUC of each pre-graph as annotation.
fig, ax = plt.subplots(figsize=(10, 7))
x = np.linspace(0, 1, 100)

ax.plot(fpr_fc, tpr_fc, label='Fully connected')
ax.plot(fpr_tumour_type, tpr_tumour_type, label='Tumour type')
ax.plot(fpr_kmeans, tpr_kmeans, label='KMeans')
ax.plot(x, x, linestyle='--', color='black', label='y=x')

ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title('ROC curve')
ax.legend()

# AUC annotations, rounded to 3 decimals for readability.
# The KMeans label previously read "AUC KMeans type" (copy-paste of the
# tumour-type line); it now names the pre-graph correctly.
ax.text(0, 0.8, f"AUC Fully connected : {float(auc_fc):.3f}")
ax.text(0, 0.7, f"AUC Tumour type : {float(auc_tumour_type):.3f}")
ax.text(0, 0.6, f"AUC KMeans : {float(auc_kmeans):.3f}")

### Train loss and validation loss

In [None]:
def _plot_loss_panel(panel_ax, epochs, train_loss, std_train_loss, val_loss, std_val_loss, name):
    """Draw the train and validation loss curves of one pre-graph on panel_ax.

    Each curve is plotted against epochs with a shaded band of +/- its own
    standard deviation, and the panel gets its own legend.
    """
    panel_ax.plot(epochs, train_loss, label='train loss')
    panel_ax.fill_between(epochs, train_loss - std_train_loss, train_loss + std_train_loss, alpha=0.4)
    panel_ax.plot(epochs, val_loss, label='val loss')
    panel_ax.fill_between(epochs, val_loss - std_val_loss, val_loss + std_val_loss, alpha=0.4)
    panel_ax.set_xlabel('epochs')
    panel_ax.set_ylabel('loss')
    panel_ax.set_title(f'Train and validation loss with std - {name}')
    # Legend on the panel itself: a single plt.legend() at the end of the cell
    # only targets the current axes, which is the empty fourth slot of the grid.
    panel_ax.legend()


epochs = list(range(n_epochs))

fig, ax = plt.subplots(2, 2, figsize=(15, 7))

# FC :
_plot_loss_panel(
    ax[0, 0], epochs,
    train_loss_fc, std_train_loss_fc,
    val_loss_fc, std_val_loss_fc,
    'FC',
)

# Tumour type :
# The validation band uses std_val_loss_tumour_type (it previously used the
# fully connected std, std_val_loss_fc, by copy-paste).
_plot_loss_panel(
    ax[0, 1], epochs,
    train_loss_tumour_type, std_train_loss_tumour_type,
    val_loss_tumour_type, std_val_loss_tumour_type,
    'Tumour type',
)

# KMeans :
_plot_loss_panel(
    ax[1, 0], epochs,
    train_loss_kmeans, std_train_loss_kmeans,
    val_loss_kmeans, std_val_loss_kmeans,
    'KMeans',
)

# Only three pre-graphs for a 2x2 grid: hide the unused fourth axes
ax[1, 1].set_visible(False)

# Keep the titles of the bottom row from overlapping the x-labels of the top row
fig.tight_layout()

### Plot weights