# GNN Modeling

## Data and Setup

In [1]:
import numpy as np
import pandas as pd

import torch
import pytorch_lightning as pl

from torch_geometric.data import Data

from tqdm import tqdm

In [2]:
# load edge list
edge_list_path = 'data/edge_list.npy'
# Transpose into the [2, n_edges] layout expected by pytorch geometric and cast straight to
# int64. (The legacy torch.Tensor(...) constructor would first convert the indices to float32,
# which is only exact up to 2**24 and makes an extra full-size copy.)
edge_array = np.ascontiguousarray(np.load(edge_list_path).T, dtype=np.int64)
edge_list = torch.from_numpy(edge_array)

# load protein-ID dictionary (need new ID system starting at index 0 for pytorch geometric)
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects -- only load this file from a trusted source.
protein_id_dict = np.load('data/protein_ids_dict.npy', allow_pickle=True).item() # maps my custom ID system to Ensembl IDs
# Inverse mapping; assumes every Ensembl ID appears once (duplicates would silently collapse to one entry).
protein_id_dict_inv = {Ensembl: id_ for id_, Ensembl in protein_id_dict.items()} # maps Ensembl IDs to my custom ID system

In [3]:
# FIXME: currently dataset has no negative labels
data_path = 'data/HPAnode_PPInetwork_labels_tempv2.csv'
node_dataset = pd.read_csv(data_path, index_col=0)

# Translate the CSV's index through protein_id_dict_inv to obtain each row's custom ID,
# then demote the original index to an ordinary column and index the frame by 'myID' instead.
myID = node_dataset.index.map(protein_id_dict_inv).rename('myID')
node_dataset = node_dataset.reset_index().set_index(myID)

In [4]:
# Order rows by myID (expected to be a no-op) and verify that the index is exactly
# 0, 1, ..., n-1, i.e. that each myID equals its row position.
node_dataset.sort_index(inplace=True)
expected_ids = np.arange(len(node_dataset))
assert np.array_equal(node_dataset.index.to_numpy(), expected_ids)

In [5]:
# FIXME: this is a temporary label. need to look into getting positive and negative labels.
# label: 1 if NIH_pos, 0 if not NIH_Cancer, NaN otherwise
label_name = 'my_label'
def my_labeler(NIH_pos, NIH_cancer):
    """Derive the temporary label of one node from its two NIH flags.

    Returns 1 when ``NIH_pos`` is truthy, 0 when ``NIH_pos`` is falsy and
    ``NIH_cancer`` is falsy, and ``np.nan`` when ``NIH_pos`` is falsy but
    ``NIH_cancer`` is truthy.

    Raises:
        ValueError: if ``NIH_pos`` is truthy while ``NIH_cancer`` is falsy
            (the row would count as both positive and negative).
    """
    if NIH_pos:
        # a positive row must also carry the cancer flag, otherwise it would qualify as negative too
        if not NIH_cancer:
            raise ValueError('Data inconsistent. Found row with NIH label both positive and negative')
        return 1

    return 0 if not NIH_cancer else np.nan

# Label every node. Iterating the two flag columns side by side avoids DataFrame.iterrows(),
# which materialises a full Series for every row just to read two fields.
my_labels = pd.array(
    [my_labeler(nih_pos, nih_cancer)
     for nih_pos, nih_cancer in zip(node_dataset['NIH_pos'], node_dataset['NIH_Cancer'])],
    dtype='Int32',  # nullable integer dtype, so the np.nan returned for unlabeled nodes is kept as a missing value
)

node_dataset[label_name] = my_labels

print('Distribution of labels')
node_dataset[label_name].value_counts()

Distribution of labels


1    521
0    135
Name: my_label, dtype: Int64

In [6]:
label_col = label_name
node_dataset[label_col] = node_dataset[label_col].astype('Int32')

# TODO: decide whether or not to include network embedding features...
num_node_feats = 100
node_feat_cols = ['Tissue RNA - lung [NX]', 'Single Cell Type RNA - Mucus-secreting cells [NX]'] + [f'node_{i}' for i in range(num_node_feats)]

# get subset of node features + labels
node_data = node_dataset[node_feat_cols + [label_col]]

# Node feature matrix, one row per node (all nodes, labeled or not), stored as float32.
X = torch.tensor(node_data[node_feat_cols].to_numpy(), dtype=torch.float32)

# Label vector for all nodes. Missing labels are filled with -1 so the column can be converted
# to a tensor. The tensor is built directly as int64 from a NumPy array instead of going through
# the legacy float32 torch.Tensor(...) constructor.
y = torch.tensor(node_data[label_col].fillna(-1).astype('int64').to_numpy(), dtype=torch.int64)

# restrict to data with labels
node_data_labeled = node_data[node_data[label_col].notna()]
node_data_labeled

Unnamed: 0_level_0,Tissue RNA - lung [NX],Single Cell Type RNA - Mucus-secreting cells [NX],node_0,node_1,node_2,node_3,node_4,node_5,node_6,node_7,...,node_91,node_92,node_93,node_94,node_95,node_96,node_97,node_98,node_99,my_label
myID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23,0.996506,0.038446,1.537459,0.429451,-0.302780,0.258347,-0.950399,1.365928,-0.064216,0.228324,...,-0.115296,-0.234800,0.049897,0.382442,0.168788,-0.033863,0.065672,0.088566,0.006333,1
80,0.150101,-0.069827,2.622879,0.092524,1.558535,-1.148822,0.606971,0.573626,0.106728,-0.357630,...,-0.327284,-0.087676,0.254183,-0.066311,-0.014220,-0.059492,0.095315,0.159288,-0.186821,1
116,0.046881,-0.103977,1.976907,-1.347319,1.559400,-0.076801,0.088430,0.544722,0.046640,-0.046761,...,0.037351,-0.026250,-0.042694,0.015399,-0.069292,-0.039175,-0.089943,0.059104,-0.008584,1
146,-0.211169,-0.105079,1.302629,0.359702,0.487322,-0.169744,-0.610400,0.668143,-0.180892,-0.021217,...,0.075845,-0.315101,0.056129,0.535637,0.085542,-0.076495,-0.089464,0.086111,0.017891,1
149,-0.655015,-0.105079,1.480835,0.846645,0.805566,0.485862,-0.194441,0.923558,1.438937,-0.492804,...,-0.031383,-0.038140,-0.029342,-0.007463,-0.029591,-0.034695,-0.038255,0.000011,0.000772,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14547,-0.634371,-0.105079,0.700987,-0.087903,-0.319591,0.006010,-0.224073,-0.241506,-0.419347,0.136663,...,-0.038222,-0.073042,0.162214,-0.065836,0.078069,0.020572,-0.251188,-0.065744,0.097520,0
14548,-0.603405,0.835389,2.009425,1.142514,0.587500,-1.108236,-0.523388,0.751758,0.338092,0.156393,...,-0.067621,0.055089,-0.035550,-0.016057,0.030357,0.086635,-0.116914,-0.061450,-0.064007,0
14549,-0.618888,-0.105079,1.292300,0.347360,-0.751088,-0.469017,0.258115,0.182103,0.151207,0.596934,...,-0.028228,-0.029454,-0.024224,-0.004768,-0.009325,-0.014907,-0.000052,-0.002426,-0.016972,0
14550,-0.422770,-0.092489,0.942309,-0.071680,-0.648776,0.007953,-0.479971,0.004475,0.273699,0.507728,...,-0.001730,-0.000357,0.013551,-0.001779,-0.008815,0.000180,0.003282,-0.002532,-0.006610,0


In [7]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/val/test partition (and every metric computed on it) is the same
# after each Restart & Run All.
SPLIT_SEED = 0

X_myIDs = node_data_labeled.index.to_numpy() # myIDs for nodes with labels for training/testing
labels = node_data_labeled[label_col].to_numpy() # for stratification

test_size = 0.2
# 10% of all labeled nodes, expressed as a fraction of the remaining train+val pool
val_size = 0.1 * (1/(1-test_size))

myIDs_train_val, myIDs_test = train_test_split(
    X_myIDs, test_size=test_size, shuffle=True, stratify=labels, random_state=SPLIT_SEED)

labels_train_val = node_data_labeled.loc[myIDs_train_val, label_col].to_numpy()
myIDs_train, myIDs_val = train_test_split(
    myIDs_train_val, test_size=val_size, shuffle=True, stratify=labels_train_val, random_state=SPLIT_SEED)

# NOTE: train-val-test split is shuffled and stratified
# TODO: look into any special consideration necessary for train-test splits on graph-based models


def _ids_to_mask(node_ids, n_nodes):
    """Return a boolean torch tensor of length ``n_nodes`` that is True exactly at ``node_ids``."""
    mask = np.zeros(n_nodes, dtype=bool)
    mask[node_ids] = True  # myIDs are used directly as positions
    return torch.from_numpy(mask)


# create masks (over ALL nodes; unlabeled nodes are False in every mask)
n_nodes = len(node_data)
train_mask = _ids_to_mask(myIDs_train, n_nodes)
val_mask = _ids_to_mask(myIDs_val, n_nodes)
test_mask = _ids_to_mask(myIDs_test, n_nodes)

# the three splits must not share any node
assert not (train_mask & val_mask).any()
assert not (train_mask & test_mask).any()
assert not (val_mask & test_mask).any()

In [8]:
# Problem dimensions: binary classification over the columns of the feature matrix
num_classes = 2
num_features = X.shape[1]

# One graph object bundling node features, labels, connectivity and the three split masks
data = Data(
    x=X,
    edge_index=edge_list,
    y=y,
    train_mask=train_mask,
    val_mask=val_mask,
    test_mask=test_mask,
)

## Graph Convolutional Neural Network

In [9]:
from torch_geometric.nn import GCNConv
import torch.nn.functional as F

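# # define GCN architecture (earlier variant kept for reference: the last GCNConv layer emits the class logits directly; superseded by the class below)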
# class GCN(torch.nn.Module):
#     def __init__(self, hidden_channels, num_layers, dropout_rate=0):
#         super(GCN, self).__init__()
#         self.convs = []
#         self.convs.append(GCNConv(num_features, hidden_channels)) # first GCNConv layer

#         for _ in range(num_layers - 2): # middle layers
#             self.convs.append(GCNConv(hidden_channels, hidden_channels))

#         self.convs.append(GCNConv(hidden_channels, num_classes)) # last GCNConv layer
#         self.convs = torch.nn.ModuleList(self.convs)

#         self.dropout_rate = dropout_rate

#     def forward(self, x, edge_index):
#         for conv in self.convs:
#             x = conv(x, edge_index)
#             x = x.relu()
#             x = F.dropout(x, p=self.dropout_rate, training=self.training)

#         return x

# define GCN architecture
class GCN(torch.nn.Module):
    """Stack of GCNConv layers followed by a two-layer dense classification head.

    Args:
        hidden_channels: width of every GCNConv layer and of the hidden dense layer.
        num_layers: number of GCNConv layers; at least one is always created.
        dropout_rate: dropout probability applied after every ReLU (only while training).
        in_channels: number of input node features. Defaults to the notebook-level
            ``num_features`` when omitted.
        out_channels: number of output logits per node. Defaults to the notebook-level
            ``num_classes`` when omitted.
    """

    def __init__(self, hidden_channels, num_layers, dropout_rate=0, in_channels=None, out_channels=None):
        super().__init__()
        # Fall back to the notebook globals so existing GCN(hidden_channels, num_layers) calls keep working.
        if in_channels is None:
            in_channels = num_features
        if out_channels is None:
            out_channels = num_classes

        # Layer widths: in_channels -> hidden -> ... -> hidden, with max(num_layers, 1) conv layers in total.
        conv_dims = [in_channels] + [hidden_channels] * max(num_layers, 1)
        self.convs = torch.nn.ModuleList(
            [GCNConv(dim_in, dim_out) for dim_in, dim_out in zip(conv_dims[:-1], conv_dims[1:])]
        )

        # Dense head applied to every node independently
        self.dense1 = torch.nn.Linear(hidden_channels, hidden_channels)
        self.dense_out = torch.nn.Linear(hidden_channels, out_channels)

        self.dropout_rate = dropout_rate

    def forward(self, x, edge_index):
        """Return per-node logits for node features ``x`` on the graph given by ``edge_index``."""
        for conv in self.convs:
            x = conv(x, edge_index)
            x = x.relu()
            x = F.dropout(x, p=self.dropout_rate, training=self.training)
        x = self.dense1(x)
        x = x.relu()
        x = F.dropout(x, p=self.dropout_rate, training=self.training)
        x = self.dense_out(x)  # raw logits, no activation

        return x

# Build a 10-layer GCN and print its module structure
gcn_hparams = dict(hidden_channels=16, num_layers=10, dropout_rate=0)
model = GCN(**gcn_hparams)
print(model)

GCN(
  (convs): ModuleList(
    (0): GCNConv(102, 16)
    (1): GCNConv(16, 16)
    (2): GCNConv(16, 16)
    (3): GCNConv(16, 16)
    (4): GCNConv(16, 16)
    (5): GCNConv(16, 16)
    (6): GCNConv(16, 16)
    (7): GCNConv(16, 16)
    (8): GCNConv(16, 16)
    (9): GCNConv(16, 16)
  )
  (dense1): Linear(in_features=16, out_features=16, bias=True)
  (dense_out): Linear(in_features=16, out_features=2, bias=True)
)


In [10]:
import pytorch_lightning as pl

# define Pytorch Lightning model
class LitGCN(pl.LightningModule):
    """PyTorch Lightning wrapper around ``GCN`` for masked node classification on a single graph.

    Args:
        model_name: identifier stored on the module as ``self.model_name``.
        **model_kwargs: forwarded unchanged to ``GCN(...)``.
    """

    _MODES = ("train", "val", "test")

    def __init__(self, model_name, **model_kwargs):
        super().__init__()
        # Saving hyperparameters
        self.save_hyperparameters()

        self.model_name = model_name
        self.model = GCN(**model_kwargs)
        self.loss_module = torch.nn.CrossEntropyLoss()

        # notebook-level graph object, used by Lightning as the example input
        self.example_input_array = data

    def forward(self, data, mode="train"):
        """Run the GCN on ``data`` and score it on the nodes selected by ``mode``.

        Args:
            data: graph object exposing ``x``, ``edge_index``, ``y`` and ``<mode>_mask``.
            mode: one of "train", "val" or "test"; selects which node mask is used.

        Returns:
            Tuple ``(logits, loss, acc)``: logits for all nodes, plus cross-entropy loss
            and accuracy computed only on the masked nodes.

        Raises:
            AssertionError: if ``mode`` is not one of the three known modes.
        """
        if mode not in self._MODES:
            # explicit raise (rather than `assert False`) so the check survives `python -O`
            raise AssertionError("Unknown forward mode: %s" % mode)

        x, edge_index = data.x, data.edge_index
        x = self.model(x, edge_index)

        # Only calculate the loss and acc on the nodes corresponding to the mask
        mask = getattr(data, "%s_mask" % mode)

        #TODO: add other metrics like recall, precision, f1, etc...
        loss = self.loss_module(x[mask], data.y[mask])
        acc = (x[mask].argmax(dim=-1) == data.y[mask]).sum().float() / mask.sum()
        return x, loss, acc

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with default settings."""
        optimizer = torch.optim.Adam(self.parameters())#SGD(self.parameters(), lr=0.1, momentum=0.9, weight_decay=2e-3)
        return optimizer

    def training_step(self, batch, batch_idx):
        """Log training loss/accuracy and return the loss for backpropagation."""
        x, loss, acc = self.forward(batch, mode="train")
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log("train_acc", acc, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log validation accuracy and return the logits for ``validation_epoch_end``."""
        logits, _, acc = self.forward(batch, mode="val")
        self.log("val_acc", acc, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return logits

    def validation_epoch_end(self, validation_step_outputs):
        """Log a histogram of all validation-step logits to wandb (skipped for other loggers)."""
        # NOTE: can't save non-standard GNN model like this
        # TODO: look into how to save torch geometric models
        # dummy_input = data
        # model_filename = f'{self.model_name}_{str(self.global_step).zfill(5)}.onnx'
        # torch.onnx.export(self, dummy_input, model_filename)
        # wandb.save(model_filename)

        # Imported here so the class does not depend on a later notebook cell having imported them.
        from pytorch_lightning.loggers import WandbLogger
        if not isinstance(self.logger, WandbLogger):
            # `experiment.log` and `wandb.Histogram` are wandb-specific; nothing to do for other loggers
            return
        import wandb

        flattened_logits = torch.flatten(torch.cat(validation_step_outputs))
        self.logger.experiment.log({'val_logits': wandb.Histogram(flattened_logits.detach().to('cpu')),
                                    'global_step': self.global_step})

    def test_step(self, batch, batch_idx):
        """Log accuracy on the test mask (epoch-level only)."""
        x, _, acc = self.forward(batch, mode="test")
        self.log("test_acc", acc, on_step=False, on_epoch=True, prog_bar=True, logger=True)

    # def test_epoch_end(self, test_step_outputs):
    #     # save model as onnx format
    #     pass

In [11]:
import os
# Publish this notebook's file name through the WANDB_NOTEBOOK_NAME environment variable
notebook_name = 'modeling_gnn.ipynb'
os.environ.update(WANDB_NOTEBOOK_NAME=notebook_name)

In [12]:
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
import wandb
import torch_geometric.loader

model_name = 'my_gcn_test2'

# Seed the Python, NumPy and torch RNGs so weight initialisation is repeatable between runs
TRAIN_SEED = 0
pl.seed_everything(TRAIN_SEED)

# logger = TensorBoardLogger("tb_logs", name=model_name)#, log_graph=True)
logger = WandbLogger(name=model_name, project="Project X", log_model="all")#, version=...)


AVAIL_GPUS = min(1, torch.cuda.device_count())
# AVAIL_GPUS = 0 # use when running out VRAM

model = LitGCN(model_name, hidden_channels=16, num_layers=2)#hidden_channels=64, num_layers=10, dropout_rate=0)

# The loader holds a single graph; it is passed as both the train and the validation loader
# because the split is applied through the node masks, not through separate datasets.
data_loader = torch_geometric.loader.DataLoader([data])#, batch_size=1, num_workers=2)


trainer = pl.Trainer(
        # keep the checkpoint with the highest validation accuracy
        callbacks=[ModelCheckpoint(save_weights_only=False, mode="max", monitor="val_acc")],
        gpus=AVAIL_GPUS,
        max_epochs=50,
        logger=logger,
        # progress_bar_refresh_rate=0,  # 0 because epoch size is 1
    )

trainer.fit(model, data_loader, data_loader)
# reload the best checkpoint (by val_acc) for evaluation
model = LitGCN.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[34m[1mwandb[0m: Currently logged in as: [33mawni00[0m (use `wandb login --relogin` to force relogin)



  | Name        | Type             | Params | In sizes                     | Out sizes 
---------------------------------------------------------------------------------------------
0 | model       | GCN              | 2.2 K  | [[14552, 102], [2, 4214097]] | [14552, 2]
1 | loss_module | CrossEntropyLoss | 0      | [[458, 2], [458]]            | ?         
---------------------------------------------------------------------------------------------
2.2 K     Trainable params
0         Non-trainable params
2.2 K     Total params
0.009     Total estimated model params size (MB)


Validation sanity check:   0%|          | 0/1 [00:00<?, ?it/s]

  rank_zero_warn(


                                                                      

  rank_zero_warn(
  rank_zero_warn(


Epoch 49: 100%|██████████| 2/2 [00:00<00:00,  5.34it/s, loss=0.568, v_num=2kjm, train_loss_step=0.508, train_acc_step=0.795, val_acc_step=0.788, val_acc_epoch=0.788, train_loss_epoch=0.515, train_acc_epoch=0.795]


In [13]:
# evaluate

from sklearn.metrics import classification_report
# Inference only: eval mode disables dropout, and no_grad avoids building an autograd graph
# over the whole network for a forward pass that is never backpropagated.
model.eval()
with torch.no_grad():
    logits, _, _ = model(data.to(device='cpu'))

# predicted class = arg-max logit, restricted to the nodes of each split
preds_train = logits[data.train_mask].argmax(dim=-1)
preds_test = logits[data.test_mask].argmax(dim=-1)

y_train = data.y[data.train_mask]
y_test = data.y[data.test_mask]

train_report = classification_report(y_train, preds_train, labels=[0,1], target_names=['negative', 'positive'])
test_report = classification_report(y_test, preds_test, labels=[0,1], target_names=['negative', 'positive'])

print('training metrics')
print(train_report)
print()
print('testing metrics')
print(test_report)

training metrics
              precision    recall  f1-score   support

    negative       0.73      0.43      0.54        94
    positive       0.87      0.96      0.91       364

    accuracy                           0.85       458
   macro avg       0.80      0.69      0.72       458
weighted avg       0.84      0.85      0.83       458


testing metrics
              precision    recall  f1-score   support

    negative       0.83      0.56      0.67        27
    positive       0.89      0.97      0.93       105

    accuracy                           0.89       132
   macro avg       0.86      0.76      0.80       132
weighted avg       0.88      0.89      0.88       132

