In [None]:
# Install required packages.
# The installed torch version string is exported as the TORCH environment
# variable so the shell `pip` lines below can expand it as ${TORCH} when
# building the URL of the wheel index to install from.
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

# torch-scatter and torch-sparse are taken from the data.pyg.org wheel index
# selected by ${TORCH}; torch_geometric is installed from its GitHub repository.
# NOTE(review): none of these installs is version-pinned, so a later run may
# pick up different package versions.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

2.0.1+cu118
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m61.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone


In [None]:
# Mount Google Drive at /content/drive; the CSV inputs read by this notebook
# live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Single place to repoint the notebook when the CSV files live somewhere else.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/data"


def _read_data_csv(file_name):
    """Read the CSV called ``file_name`` from ``DATA_DIR`` into a DataFrame."""
    return pd.read_csv(f"{DATA_DIR}/{file_name}")


# Edge lists, one file per graph. The preparation cell reads the "source",
# "target" and "weight" columns of each of these frames.
br_edges = _read_data_csv("totaledge_edge_breast-cancer.csv")
c_edges = _read_data_csv("totaledge_edge_cancer.csv")
col_edges = _read_data_csv("totaledge_edge_colorectal-cancer.csv")
end_edges = _read_data_csv("totaledge_edge_endometrial-cancer.csv")
gli_edges = _read_data_csv("totaledge_edge_glioma.csv")
ren_edges = _read_data_csv("totaledge_edge_renal-cell-carcinoma.csv")

# Sample table: every column except the last three is later used as a node
# feature; the "path" and "cancer_type_detailed" columns are read by name.
nodes = _read_data_csv("FULL.csv")

In [None]:
# nodes = nodes.drop(nodes[nodes["cancer_type"] != "Breast Cancer"].index)
# Drop every sample that has a missing value, then renumber the rows 0..N-1.
nodes = nodes.dropna(axis=0).reset_index(drop=True)

y_label = "cancer_type_detailed" # path -> classifying only cancer types among 6 classes, cancer_type_detailed -> classifying detailed cancer type

print(nodes[y_label].value_counts())

edge_dict = {'breast-cancer':br_edges, 'cancer':c_edges, 'colorectal-cancer':col_edges, 'endometrial-cancer': end_edges, 'glioma':gli_edges, 'renal-cell-carcinoma': ren_edges}

# Column name -> column position, built once. Each edge endpoint is a column
# name of `nodes`; its position is used as the node id in the graph.
column_position = {name: pos for pos, name in enumerate(nodes.columns)}

# graph_dict[key] == [source_ids, target_ids] for the edge list stored under `key`.
graph_dict = {}
for cancer_type, edges in edge_dict.items():
    sources, targets = [], []
    for _, row in edges.iterrows():
        src = column_position.get(row["source"])
        dst = column_position.get(row["target"])
        if src is None or dst is None:
            # Endpoint is not a column of `nodes`: report it and skip the edge.
            print("something wrong")
            continue
        if row["weight"]:  # keep only edges whose weight is non-zero
            sources.append(src)
            targets.append(dst)
    graph_dict[cancer_type] = [sources, targets]

# Class name -> integer class id, in order of first appearance.
type_dict = {t: i for i, t in enumerate(nodes[y_label].unique())}

# Every sample uses the graph named by its "path" value; all edges get the
# constant attribute [1].
edge_index = [graph_dict[p] for p in nodes["path"]]
edge_attr = [[[1] for _ in graph_dict[p][0]] for p in nodes["path"]]

# Replace each label by its one-element list [class_id] with a single column
# assignment (no chained indexing, hence no SettingWithCopyWarning).
# The label column is overwritten, so re-run the loading cell before running
# this cell a second time.
nodes[y_label] = nodes[y_label].map(lambda t: [type_dict[t]])

# Features: all columns except the last three, one scalar feature per node,
# giving a tensor of shape [num_samples, num_nodes, 1].
node_feat = torch.tensor(nodes.iloc[:, :-3].values.tolist()).unsqueeze(-1)
num_nodes = [nodes.shape[1] - 3] * len(nodes)
# Shape [num_samples, 1]: one class id per sample.
y = torch.tensor(nodes[y_label].tolist())

Breast Invasive Ductal Carcinoma     2896
Breast Invasive Lobular Carcinoma     740
Renal Clear Cell Carcinoma            545
Colon Adenocarcinoma                  239
Invasive Breast Carcinoma             178
                                     ... 
Mature T and NK Neoplasms               1
Lung Carcinoid                          1
Gallbladder Adenocarcinoma, NOS         1
Papillary Thyroid Cancer                1
Brenner Tumor                           1
Name: cancer_type_detailed, Length: 141, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nodes["cancer_type_detailed"][i] = [type_dict[nodes["cancer_type_detailed"][i]]]


In [None]:
import torch
from torch_geometric.data import Data

# One Data object per row of `nodes`, assembled from the per-sample features,
# edge lists, edge attributes and label prepared in the previous cell.
custom_dataset = [
    Data(
        x=node_feat[sample],
        edge_index=torch.tensor(edge_index[sample]),
        edge_attr=torch.tensor(edge_attr[sample]),
        y=y[sample],
    )
    for sample in range(len(nodes))
]

# Inspect the first graph as a sanity check.
data = custom_dataset[0]

print()
print(data)
print('=============================================================')

# Summary statistics of that first graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')


Data(x=[176, 1], edge_index=[2, 54], edge_attr=[54, 1], y=[1])
Number of nodes: 176
Number of edges: 54
Average node degree: 0.31
Has isolated nodes: True
Has self-loops: False
Is undirected: False


In [None]:
# Sanity check: number of graphs built (one per row of `nodes`).
print(len(custom_dataset))

6857


In [None]:
import random

# Seed for the train/test shuffle, so the split is the same on every fresh run.
SPLIT_SEED = 42
# Number of graphs kept for training; the remainder becomes the test set.
NUM_TRAIN_GRAPHS = 6000

# A dedicated RNG keeps the split reproducible without reseeding the global
# `random` state. The shuffle is in place, so re-running this cell without
# rebuilding `custom_dataset` shuffles the already-shuffled list again.
random.Random(SPLIT_SEED).shuffle(custom_dataset)

train_dataset = custom_dataset[:NUM_TRAIN_GRAPHS]
test_dataset = custom_dataset[NUM_TRAIN_GRAPHS:]

print(f'Number of training graphs: {len(train_dataset)}')
print(f'Number of test graphs: {len(test_dataset)}')

Number of training graphs: 6000
Number of test graphs: 857


In [None]:
from torch_geometric.loader import DataLoader

# Mini-batch loaders over the two splits: training batches are reshuffled,
# test batches keep their order.
train_loader = DataLoader(
    train_dataset,
    batch_size=256,
    shuffle=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=256,
    shuffle=False,
)

In [None]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool


class GCN(torch.nn.Module):
    """Graph classifier: three GCNConv layers, mean-pool readout, linear head.

    Args:
        hidden_channels: Output width of every GCNConv layer, and therefore
            of the pooled graph embedding.
        in_channels: Number of input features per node. Defaults to 1.
        num_classes: Number of output logits. When ``None`` (the default) it
            is taken from ``len(type_dict)`` at construction time.
        dropout: Dropout probability applied to the pooled embedding while
            the module is in training mode. Defaults to 0.7.
    """

    def __init__(self, hidden_channels, in_channels=1, num_classes=None, dropout=0.7):
        super().__init__()
        if num_classes is None:
            # Default kept for existing callers: one logit per known class.
            num_classes = len(type_dict)
        self.dropout = dropout
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, num_classes)

    def forward(self, x, edge_index, batch):
        """Return one row of class logits per graph in the batch.

        Args:
            x: Node features for all graphs in the batch.
            edge_index: Edge list passed to each GCNConv layer.
            batch: Graph assignment of each node, used by the mean-pool readout.
        """
        # 1. Obtain node embeddings (no activation after the last convolution).
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier; dropout is active only in training mode.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.lin(x)

        return x

# Build an instance only to print the layer layout; the training cell
# creates its own model with a different hidden size.
model = GCN(hidden_channels=64)
print(model)

GCN(
  (conv1): GCNConv(1, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=141, bias=True)
)


In [None]:
from IPython.display import Javascript
# Limit the height of this cell's output frame to 300 (uses the google.colab
# output API) so the per-epoch log does not fill the page.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook still runs on a runtime without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = GCN(hidden_channels=256).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0025)
criterion = torch.nn.CrossEntropyLoss()

def train():
    """Run one optimisation pass over every batch of ``train_loader``.

    Uses the module-level ``model``, ``criterion``, ``optimizer`` and
    ``device``. Returns nothing.
    """
    model.train()

    for batch in train_loader:
        # The return value is not used: this relies on .to() moving the
        # batch's tensors in place.
        batch.to(device)
        logits = model(batch.x, batch.edge_index, batch.batch)
        batch_loss = criterion(logits, batch.y)
        batch_loss.backward()
        optimizer.step()
        # Clear the gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()

def test(loader):
    """Return the fraction of graphs in ``loader`` that ``model`` classifies correctly.

    Args:
        loader: Batch iterable that also exposes ``.dataset``; the number of
            correct predictions is divided by ``len(loader.dataset)``.

    Returns:
        Accuracy as a float in [0, 1].
    """
    model.eval()

    correct = 0
    # Evaluation needs no gradients: disabling autograd avoids building a
    # computation graph for every batch and saves memory and time.
    with torch.no_grad():
        for data in loader:  # Iterate in batches over the training/test dataset.
            data.to(device)
            out = model(data.x, data.edge_index, data.batch)
            pred = out.argmax(dim=1)  # Use the class with the highest score.
            correct += int((pred == data.y).sum())  # Check against ground-truth labels.
    return correct / len(loader.dataset)  # Derive ratio of correct predictions.

# Running maxima of the per-epoch accuracies.
best_test_acc = 0
best_train_acc = 0

for epoch in range(1, 1201):
    train()
    train_acc = test(train_loader)
    test_acc = test(test_loader)
    best_test_acc = max(best_test_acc, test_acc)
    # Track the best *train* accuracy against its own previous best, not
    # against the best test accuracy.
    best_train_acc = max(best_train_acc, train_acc)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

print(f'Best Train Acc: {best_train_acc:.4f}, Best Test Acc: {best_test_acc:.4f}')

<IPython.core.display.Javascript object>

Epoch: 001, Train Acc: 0.4237, Test Acc: 0.4131
Epoch: 002, Train Acc: 0.4237, Test Acc: 0.4131
Epoch: 003, Train Acc: 0.4237, Test Acc: 0.4131
Epoch: 004, Train Acc: 0.4237, Test Acc: 0.4131
Epoch: 005, Train Acc: 0.4237, Test Acc: 0.4131
Epoch: 006, Train Acc: 0.4237, Test Acc: 0.4131
Epoch: 007, Train Acc: 0.4237, Test Acc: 0.4131
Epoch: 008, Train Acc: 0.4237, Test Acc: 0.4131
Epoch: 009, Train Acc: 0.4237, Test Acc: 0.4131
Epoch: 010, Train Acc: 0.4237, Test Acc: 0.4131
Epoch: 011, Train Acc: 0.4237, Test Acc: 0.4131
Epoch: 012, Train Acc: 0.4237, Test Acc: 0.4131
Epoch: 013, Train Acc: 0.4243, Test Acc: 0.4131
Epoch: 014, Train Acc: 0.4277, Test Acc: 0.4154
Epoch: 015, Train Acc: 0.4313, Test Acc: 0.4189
Epoch: 016, Train Acc: 0.4292, Test Acc: 0.4177
Epoch: 017, Train Acc: 0.4292, Test Acc: 0.4154
Epoch: 018, Train Acc: 0.4323, Test Acc: 0.4166
Epoch: 019, Train Acc: 0.4295, Test Acc: 0.4177
Epoch: 020, Train Acc: 0.4322, Test Acc: 0.4201
Epoch: 021, Train Acc: 0.4355, Test Acc: