# Grandline tutorial

1. Load synthetic linear Data (200 samples, 5000 genes/nodes, 20 clusters)
    - `cv_data_dict[i]['X_train']` (160 sample x 5000 node)
    - `cv_data_dict[i]['X_test']` (40 sample x 5000 node)
    - `cv_data_dict[i]['y_train']` (160 sample x 1)
    - `cv_data_dict[i]['y_test']` (40 sample x 1)
    - `i=0,1,...,9` (10 shuffles)
2. Create adjacency matrix A 
3. Set GCN hyperparameters :
`epoch, learning rate, regularization, batch_size, number of graph convolutional filters(Fs), polynomial orders(Ks), pooling sizes(Ps), fully connected layers(Ms)`

4. Train model
5. Test prediction
6. GradCAM
    - Importance values for each sample
    - Importance values for each class

In [31]:
import pandas as pd
import numpy as np
import networkx as nx 
import scipy
import pickle, os
import seaborn as sns
import tensorflow as tf

from lib import  graph, coarsening, utils, grandline #models,

In [2]:
# Ask TensorFlow to allocate GPU memory on demand rather than reserving it all up front.
os.environ.update({'TF_FORCE_GPU_ALLOW_GROWTH': 'true'})

## Read data and preprocessing

In [3]:
# Identify the dataset and derive the common path prefix of its files.
disease = 'synthetic'
disease_type = 'X_linear'
input_name = 'RandomPartition_5000_20'
input_prefix = 'data/{}_{}'.format(disease_type, input_name)

# Load the pre-split cross-validation data. The context manager guarantees the
# file handle is closed (the bare `open(...)` passed to pickle.load never was).
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open("{}_cv.pickle".format(input_prefix), "rb") as cv_file:
    cv_data_dict = pickle.load(cv_file)
n_shuffle = 10

In [4]:
# Gene/node names are the column labels of the first shuffle's training matrix.
first_shuffle_features = cv_data_dict[0]['X_train']
gene_list = first_shuffle_features.columns
d = gene_list.shape[0]  # number of genes (graph nodes)

print("Number of genes", d)
print("List of genes", gene_list)

Number of genes 5000
List of genes Index(['N00000', 'N00001', 'N00002', 'N00003', 'N00004', 'N00005', 'N00006',
       'N00007', 'N00008', 'N00009',
       ...
       'N04990', 'N04991', 'N04992', 'N04993', 'N04994', 'N04995', 'N04996',
       'N04997', 'N04998', 'N04999'],
      dtype='object', length=5000)


In [5]:
# Number of classes: count the distinct values in the first (label) column
# of the first shuffle's training targets.
labels_df = cv_data_dict[0]['y_train']
label_column = labels_df.columns[0]
C = len(labels_df.groupby(label_column).size())
print("Number of classes", C)

Number of classes 2


### Change DataFrame to numpy array and reshape

In [6]:
# Choose which shuffle (key of cv_data_dict) the rest of the notebook works on.
current_shuffle = 0

In [7]:
# Convert the selected shuffle's splits to NumPy arrays, in place inside cv_data_dict.
# np.array() accepts both a DataFrame and an ndarray, so re-running this cell is safe;
# the previous `.values` access raised AttributeError on a second run because the
# entries had already been replaced by ndarrays.
shuffle_data = cv_data_dict[current_shuffle]

for name in ['X_train', 'X_test']:
    shuffle_data[name] = np.array(shuffle_data[name], dtype=np.float32)

for name in ['y_train', 'y_test']:
    labels = np.array(shuffle_data[name], dtype=np.uint8)
    # Flatten the (n_samples, 1) label column to a 1-D vector of length n_samples.
    shuffle_data[name] = labels.reshape((labels.shape[0],))

In [8]:
# Pull the four splits of the selected shuffle into short working names.
X_train, y_train, X_test, y_test = (
    cv_data_dict[current_shuffle][split_name]
    for split_name in ('X_train', 'y_train', 'X_test', 'y_test')
)

### Create adjacency matrix A

In [9]:
# Build the adjacency matrix from the edge CSV that matches `input_name`,
# passing the gene list to the helper.
adjacency_csv = 'data/A_{}.csv'.format(input_name)
A = utils.prepare_adjacency(adjacency_csv, gene_list)
print("Created A {}x{}".format(*A.shape))

Created A 5000x5000


#### Laplacian function

The Laplacian L is computed from the adjacency matrix A, excluding its diagonal (no self-connections).

In [10]:
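# NOTE: this helper appears to have been moved into lib.graph (it is called below as
# graph.calculate_laplacian); the commented-out copy is kept for reference only.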
# def calculate_laplacian(A, levels, seed=1):
#     np.random.seed(seed)
#     graphs, perms = coarsening.coarsen(A, levels=levels, self_connections=False)
#     L = [graph.laplacian(A, normalized=True) for A in graphs]
    
#     return L, graphs, perms

## GCN hyperparameters


In [11]:
seed = 8

# Hyperparameters; the whole dict is forwarded to grandline.build_gcn_model(**params).
params = dict(
    num_epochs=15,
    learning_rate=1e-3,
    filter_name='chebyshev',
    Fs=[20, 20],                   # Number of graph convolutional filters.
    Ks=[10, 10],                   # Polynomial orders.
    Ps=[2, 2],                     # Pooling sizes.
    Ms=[C],                        # Output dimensionality of fully connected layers.
    regularization=1e-5,
    batch_size=X_train.shape[0],   # one batch = the whole training set
)


#### Calculate normalized L for each level

In [12]:
# The number of coarsening levels is the sum of log2 of the pooling sizes.
pooling_sizes = params['Ps']
n_level_coarsen = int(np.sum(np.log2(pooling_sizes)))
print("Coarsening level:", n_level_coarsen)

# Coarsen the graph and get one Laplacian per level, plus the node permutations.
Ls, graphs, perms = graph.calculate_laplacian(A, levels=n_level_coarsen)

Coarsening level: 2
Layer 0: M_0 = |V| = 5032 nodes (32 added),|E| = 307436 edges
Layer 1: M_1 = |V| = 2516 nodes (5 added),|E| = 218625 edges
Layer 2: M_2 = |V| = 1258 nodes (0 added),|E| = 123181 edges


#### Arrange features/genes according to permutation (from coarsening)

In [13]:
# Reorder the feature columns with the first permutation returned by coarsening.
# (The output below shows 5032 columns afterwards, i.e. the helper also pads.)
if perms is not None:
    first_level_perm = perms[0]
    X_train, X_test = (
        coarsening.perm_data(features, first_level_perm)
        for features in (X_train, X_test)
    )

In [14]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the labels. The width is pinned to C (the class count computed
# earlier) so train and test always get the same number of columns, matching the
# model's output size (params['Ms'] = [C]), even if a split is missing a class.
y_train = to_categorical(y_train, num_classes=C)
y_test = to_categorical(y_test, num_classes=C)

In [15]:
# Append a trailing axis of length 1: (samples, nodes) -> (samples, nodes, 1).
X_train = X_train[:, :, np.newaxis]
X_test = X_test[:, :, np.newaxis]

X_train.shape, X_test.shape

((160, 5032, 1), (40, 5032, 1))

### Train GCN model

In [16]:
# Drop any Keras/TensorFlow state left over from a previous run of the notebook.
tf.keras.backend.clear_session()

# `seed` is set next to the hyperparameters but was never applied anywhere, so
# weight initialisation and shuffling differed on every run. Seed NumPy and
# TensorFlow here, after the session reset, so the model built next is repeatable.
# (Some GPU kernels may still be non-deterministic.)
np.random.seed(seed)
tf.random.set_seed(seed)

In [17]:
# Build the GCN from the coarsened graphs and their Laplacians; the helper returns
# the model that is trained here and a second model (`model_logit`) used later for GradCAM.
model, model_logit = grandline.build_gcn_model(graphs, Ls, **params)

adam_optimizer = tf.keras.optimizers.Adam(learning_rate=params['learning_rate'], name='Adam')
# from_logits=False: the loss expects probabilities from the model output.
bce_loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)

model.compile(optimizer=adam_optimizer, loss=bce_loss, metrics=['accuracy'])
model.build(input_shape=X_train.shape)

#### Call back functions




In [18]:
# Early stopping on validation accuracy (higher is better); the best weights are
# restored when it triggers. NOTE(review): patience (15) equals
# params['num_epochs'] (15), so it probably never triggers with the current settings.
early_stopping_options = dict(
    monitor='val_accuracy',
    min_delta=0.001,
    patience=15,
    verbose=1,
    mode='max',
    baseline=None,
    restore_best_weights=True,
)
earlystop_callback = tf.keras.callbacks.EarlyStopping(**early_stopping_options)

#### Define class weight

In [19]:
from sklearn.utils import class_weight

# Recover integer class ids from the one-hot training labels.
y_train_class_name = np.argmax(y_train, axis=1)
classes = np.unique(y_train_class_name)

# Keyword arguments: recent scikit-learn releases reject positional
# `classes` / `y` here, while the keyword form works on old and new versions.
balanced_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                     classes=classes,
                                                     y=y_train_class_name)

# Key each weight by its actual class id (not by position) so the mapping stays
# correct even if some class id is absent from the training split.
class_weights = {int(cls): float(weight) for cls, weight in zip(classes, balanced_weights)}
class_weights

{0: 1.0, 1: 1.0}

#### Start fitting

In [20]:
# Train the model. validation_data is passed as an (x, y) tuple, the documented
# form; a list can be interpreted as a list of inputs without targets by some
# TensorFlow versions.
history = model.fit(x=X_train,
                    y=y_train,
                    epochs=params['num_epochs'],
                    validation_data=(X_test, y_test),
                    batch_size=params['batch_size'],
                    class_weight=class_weights,
                    callbacks=[earlystop_callback], # checkpoint_callback
                    verbose=1, shuffle=True)

Train on 160 samples, validate on 40 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


## Make a prediction 

In [21]:
# Wrap model_logit's inputs and outputs in a standalone Keras Model; this is the
# model passed to grandline.cal_gradcam in the GradCAM section.
# NOTE(review): presumably model_logit outputs pre-activation logits -- confirm in grandline.build_gcn_model.
logit_model = tf.keras.Model(inputs=model_logit.inputs, outputs=model_logit.outputs)

In [22]:
# Predict on the held-out split and show the most probable class per sample.
predict = model.predict(x=X_test)
predicted_classes = np.argmax(predict, axis=1)
predicted_classes

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [23]:
# True class per test sample (decoded from the one-hot labels), for comparison with the prediction above.
y_test.argmax(axis=1)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

## GradCAM


In [24]:
# Run GradCAM once per training sample, collecting each sample's class label and
# the importance values returned by grandline.cal_gradcam.
num_train = X_train.shape[0]

node_label = []
node_importance = []

for selected_sample_id in range(num_train):
    # Keep a leading batch axis of size 1 for the single-sample input.
    X_input = X_train[selected_sample_id][np.newaxis].astype('float32')
    node_label.append(np.argmax(y_train[selected_sample_id]))
    node_importance.append(grandline.cal_gradcam(selected_sample_id, X_input, logit_model))

### Importance values for each sample

In [25]:
# For nodes in the last conv layer: one row per training sample, holding its class
# label and its GradCAM importance values.
# Each importance entry is stored as a single object cell. The former
# np.array([node_label, node_importance]) mixed scalars with arrays, which NumPy
# >= 1.24 rejects as an inhomogeneous (ragged) array.
importance_per_sample = np.empty(len(node_importance), dtype=object)
for row, sample_importance in enumerate(node_importance):
    importance_per_sample[row] = sample_importance

importance_allnode_df = pd.DataFrame({'label': node_label, 'important': importance_per_sample})

  


In [26]:
# For nodes in the graph: map each sample's absolute importance values back to
# graph nodes and collect them as columns train_0 ... train_{num_train-1}.
# Frames are gathered in a list and concatenated once; concatenating inside the
# loop re-copied the growing table on every iteration (quadratic cost).
per_sample_frames = []
for sample_idx in range(num_train):
    sample_importance = np.abs(importance_allnode_df['important'][sample_idx])
    sample_df = utils.get_node_importance_df(perms, sample_importance, d)
    # Only the first frame contributes the shared 'node' column.
    kept_columns = ['node', 'important'] if sample_idx == 0 else ['important']
    per_sample_frames.append(
        sample_df[kept_columns].rename(columns={'important': 'train_{}'.format(sample_idx)})
    )

ipt_df = pd.concat(per_sample_frames, axis=1)

# Index the rows by gene name.
ipt_df.loc[:, 'Id'] = gene_list
ipt_df = ipt_df.set_index('Id')
ipt_df.head()

Unnamed: 0_level_0,node,train_0,train_1,train_2,train_3,train_4,train_5,train_6,train_7,train_8,...,train_150,train_151,train_152,train_153,train_154,train_155,train_156,train_157,train_158,train_159
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N00000,0.0,0.000213,0.003232,0.000543,0.000522,0.002714,0.002878,0.004862,0.0042,0.005491,...,0.000943,0.00222,0.001092,0.001653,0.001697,0.000617,0.000501,5.5e-05,0.001606,0.001673
N00001,1.0,0.001194,0.002905,0.001239,0.002255,0.000218,0.001857,0.002158,0.001485,0.001985,...,0.004047,0.002359,0.004383,0.001917,0.004368,0.001851,0.003362,0.003968,0.004877,0.004391
N00002,2.0,0.005489,0.002887,0.001718,0.002107,0.003082,0.003859,0.006862,0.000435,0.001886,...,0.000447,0.000724,0.001299,0.001686,0.001146,0.001637,0.003153,0.001022,0.002315,0.001346
N00003,3.0,0.001857,0.005526,0.002899,0.006016,0.004776,0.005589,0.00664,0.005094,0.005745,...,0.000585,0.001389,0.000712,0.00174,0.000578,0.001012,2.7e-05,0.000147,0.001668,0.000145
N00004,4.0,0.005242,0.005293,0.004814,0.005816,0.006061,0.003648,0.00338,0.006086,0.006143,...,0.001608,0.000435,0.001387,0.000877,0.000852,0.002105,0.001764,0.000553,0.001297,0.000452


### Sum of importance values over all samples, split by class 0/1
(signed sum / sum of absolute values)

In [27]:
# For nodes in the last conv layer: add up the per-sample importance values
# separately for class 0 and class 1, both signed and as absolute values.
is_label0 = importance_allnode_df['label'] == 0
is_label1 = importance_allnode_df['label'] == 1

importance_label0 = importance_allnode_df[is_label0]['important']
importance_label1 = importance_allnode_df[is_label1]['important']

important_label0 = importance_label0.sum()
important_label0_abs = np.abs(importance_label0).sum()

important_label1 = importance_label1.sum()
important_label1_abs = np.abs(importance_label1).sum()

In [28]:
# For nodes in the graph: map the per-class importance sums back to graph nodes.
imp_label0_df = utils.get_node_importance_df(perms, important_label0, d)
imp_label0_abs_df = utils.get_node_importance_df(perms, important_label0_abs, d)

imp_label1_df = utils.get_node_importance_df(perms, important_label1, d)
imp_label1_abs_df = utils.get_node_importance_df(perms, important_label1_abs, d)


# Start from the class-0 frame (same object, as before) and add the other columns.
node_df = imp_label0_df
# Plain column assignment replaces the column together with its dtype.
# `df.loc[:, col] = values` writes into the existing column in place on
# pandas >= 2.0, which would silently keep a float dtype after astype(int).
node_df['node'] = node_df['node'].astype(int)
node_df['cluster'] = node_df['cluster'].astype(int)

# Index the rows by gene name and label the class-0 column.
node_df['Id'] = gene_list
node_df = node_df.set_index('Id')
node_df = node_df.rename(columns={'important':'label_0'})

# `.values` drops the source index so the columns are assigned positionally.
node_df['label_1'] = imp_label1_df['important'].values
node_df['label_0_abs'] = imp_label0_abs_df['important'].values
node_df['label_1_abs'] = imp_label1_abs_df['important'].values

node_df.head()

Unnamed: 0_level_0,label_0,node,cluster,label_1,label_0_abs,label_1_abs
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
N00000,0.202143,0,2240,-0.084943,0.227807,0.110702
N00001,0.154467,1,1004,-0.2502,0.154467,0.2502
N00002,0.298304,2,2017,0.118025,0.298304,0.124068
N00003,0.429635,3,286,-0.096839,0.429635,0.114606
N00004,0.411817,4,1584,-0.080547,0.411817,0.091072
