In [None]:
# Install the PyTorch Geometric stack, matching the extension wheels to the
# torch build that is already installed in this runtime.
import os
import torch
# Export the torch version as an environment variable so the shell commands
# below can expand ${TORCH} inside the wheel-index URL.
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

# torch-scatter / torch-sparse are looked up in the wheel index for this torch version.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
# NOTE(review): torch_geometric is installed straight from the git repository with
# no tag or commit pinned, so a rerun may pick up different code — consider pinning.
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

2.0.1+cu118
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone


In [None]:
# Mount Google Drive at /content/drive; the CSV paths read later in this
# notebook live under that mount point. The google.colab module is only
# importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Input files on the mounted Drive.
# `edges` is later read through its "source", "target" and "weight" columns;
# `nodes` through "cancer_type", "cancer_type_detailed" and its column names.
EDGES_CSV = "/content/drive/MyDrive/Colab Notebooks/data/totaledge_edge_breast-cancer.csv"
NODES_CSV = "/content/drive/MyDrive/Colab Notebooks/data/totalnode_brca_tcga.csv"

edges = pd.read_csv(EDGES_CSV)
nodes = pd.read_csv(NODES_CSV)

In [None]:
# Keep only the rows whose cancer_type is exactly "Breast Cancer" and renumber
# them 0..n-1 so later positional lookups line up with the row labels.
is_breast_cancer = nodes["cancer_type"] == "Breast Cancer"
nodes = nodes.loc[is_breast_cancer].reset_index(drop=True)

# Show how many samples fall into each detailed subtype.
nodes["cancer_type_detailed"].value_counts()

Breast Invasive Ductal Carcinoma             812
Breast Invasive Lobular Carcinoma            206
Breast Mixed Ductal and Lobular Carcinoma     28
Breast Invasive Mixed Mucinous Carcinoma      16
Metaplastic Breast Cancer                     14
Invasive Breast Carcinoma                      6
Paget Disease of the Nipple                    3
Adenoid Cystic Breast Cancer                   2
Solid Papillary Carcinoma of the Breast        2
Breast Invasive Carcinoma, NOS                 1
Name: cancer_type_detailed, dtype: int64

In [None]:
# Build the edge list shared by every sample, in COO layout:
# graph[0] holds source node indices and graph[1] the matching target indices.
# A node's index is the position of its name among the columns of `nodes`.
# NOTE(review): positions are taken over *all* columns of `nodes`, while the
# feature tensor built later drops the last three columns — an edge naming one
# of those trailing columns would yield an out-of-range node index. Confirm the
# edge file only references feature columns.
graph = [[], []]
cnt = 0  # edges skipped because an endpoint is not a column of `nodes`

# Column name -> position, computed once instead of once per edge.
col_index = {name: pos for pos, name in enumerate(nodes.columns)}

for src, dst, weight in zip(edges["source"], edges["target"], edges["weight"]):
  if src not in col_index or dst not in col_index:
    print("something wrong")
    cnt += 1
    continue
  # Zero-weight edges are dropped. NOTE(review): a NaN weight is truthy and is
  # therefore kept — confirm that is intended.
  if weight:
    graph[0].append(col_index[src])
    graph[1].append(col_index[dst])

In [None]:
# Assign an integer class id to every distinct detailed cancer type, numbering
# them in the order pandas' unique() returns them.
detailed_types = nodes["cancer_type_detailed"].unique()
type_dict = dict(zip(detailed_types, range(len(detailed_types))))
type_dict

{'Breast Invasive Ductal Carcinoma': 0,
 'Breast Mixed Ductal and Lobular Carcinoma': 1,
 'Breast Invasive Mixed Mucinous Carcinoma': 2,
 'Breast Invasive Lobular Carcinoma': 3,
 'Paget Disease of the Nipple': 4,
 'Adenoid Cystic Breast Cancer': 5,
 'Invasive Breast Carcinoma': 6,
 'Metaplastic Breast Cancer': 7,
 'Solid Papillary Carcinoma of the Breast': 8,
 'Breast Invasive Carcinoma, NOS': 9}

In [None]:
# Every sample shares the same topology, so the single edge list is replicated
# once per row of `nodes`.
#   edge_index: [num_graphs, 2, num_edges], int64
#   edge_attr : [num_graphs, num_edges, 1], int64, all ones
# The tensors are built directly rather than by first materialising
# num_graphs copies of the edge list as nested Python lists.
num_graphs = len(nodes)
num_edges = len(graph[0])

edge_index = torch.tensor(graph, dtype=torch.long).unsqueeze(0).repeat(num_graphs, 1, 1)
edge_attr = torch.ones((num_graphs, num_edges, 1), dtype=torch.long)

In [None]:
# Replace each detailed cancer type name with its class id from `type_dict`,
# wrapped in a one-element list (the label tensor built later relies on that
# per-row list to get one trailing dimension).
#
# The column is assigned in one step. The previous element-wise
# `nodes[col][i] = ...` was chained indexing: it raises SettingWithCopyWarning
# and, with pandas Copy-on-Write enabled, leaves `nodes` unchanged.
#
# NOTE: this cell is not idempotent — running it a second time fails because
# the column then already holds lists rather than type names.
nodes["cancer_type_detailed"] = pd.Series(
    [[type_dict[name]] for name in nodes["cancer_type_detailed"]],
    index=nodes.index,
    dtype=object,
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nodes["cancer_type_detailed"][i] = [type_dict[nodes["cancer_type_detailed"][i]]]


In [None]:
# Per-sample node features: every column except the last three is treated as
# one graph node carrying a single scalar feature.
# NOTE(review): assumes the last three columns are metadata/labels and all
# remaining columns are numeric — confirm against the CSV schema.
feature_frame = nodes.iloc[:, :-3]

# [num_graphs, num_nodes, 1]; float32 is requested explicitly so the dtype does
# not depend on how the CSV columns happened to be parsed.
node_feat = torch.tensor(feature_frame.to_numpy(dtype="float32")).unsqueeze(-1)

# Node count of each graph (identical for all samples).
num_nodes = [nodes.shape[1] - 3] * len(nodes)

# Class ids, one single-element row per sample -> shape [num_graphs, 1], int64.
# Expects the label column to already hold one-element lists of class ids.
y = torch.tensor(nodes["cancer_type_detailed"].tolist(), dtype=torch.long)

In [None]:
import torch
from torch_geometric.datasets import TUDataset
from torch_geometric.data import Data

# NOTE(review): the MUTAG benchmark is downloaded here and every statistic
# printed in this cell describes MUTAG, not the breast-cancer graphs built
# below. It is kept because a later cell still reads `dataset`.
dataset = TUDataset(root='data/TUDataset', name='MUTAG')

print()
print(f'Dataset: {dataset}:')
print('====================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

# One Data object per row of `nodes`, assembled from the tensors prepared in
# the earlier cells; this is the dataset that is actually trained on.
custom_dataset = [
    Data(
        x=node_feat[idx],
        edge_index=edge_index[idx],
        edge_attr=edge_attr[idx],
        y=y[idx],
    )
    for idx in range(len(nodes))
]

data = dataset[0]  # First MUTAG graph (not one of the custom graphs).

print()
print(data)
print('=============================================================')

# Summary statistics of that first MUTAG graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

Downloading https://www.chrsmrrs.com/graphkerneldatasets/MUTAG.zip
Extracting data/TUDataset/MUTAG/MUTAG.zip
Processing...



Dataset: MUTAG(188):
Number of graphs: 188
Number of features: 7
Number of classes: 2

Data(edge_index=[2, 38], x=[17, 7], edge_attr=[38, 4], y=[1])
Number of nodes: 17
Number of edges: 38
Average node degree: 2.24
Has isolated nodes: False
Has self-loops: False
Is undirected: True


Done!


In [None]:
import random

# Split configuration. The seed makes the train/test partition reproducible
# after "Restart & Run All"; previously the shuffle was unseeded, so every run
# trained and evaluated on a different split.
SPLIT_SEED = 0
NUM_TRAIN = 800  # graphs used for training; the remainder form the test set

# NOTE(review): leftover from the MUTAG tutorial — `dataset` is not used for
# training below. Kept so that any later cell reading it sees the same object.
dataset = dataset.shuffle()

# Shuffle in place with a dedicated RNG so the global `random` state is untouched.
# Re-running this cell alone reshuffles the already-shuffled list.
random.Random(SPLIT_SEED).shuffle(custom_dataset)

train_dataset = custom_dataset[:NUM_TRAIN]
test_dataset = custom_dataset[NUM_TRAIN:]

print(f'Number of training graphs: {len(train_dataset)}')
print(f'Number of test graphs: {len(test_dataset)}')

Number of training graphs: 800
Number of test graphs: 290


In [None]:
from torch_geometric.loader import DataLoader

# Mini-batch loaders: training batches are reshuffled on every pass, test
# batches keep a fixed order.
BATCH_SIZE = 256

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool


class GCN(torch.nn.Module):
    """Three-layer GCN graph classifier with mean-pool readout.

    Args:
        hidden_channels: Width of every graph-convolution layer.
        in_channels: Number of input features per node (default 1).
        num_classes: Number of output logits (default 10).
        dropout: Dropout probability applied to the pooled graph embedding
            before the final linear layer (default 0.7).
    """

    def __init__(self, hidden_channels, in_channels=1, num_classes=10, dropout=0.7):
        super().__init__()
        self.dropout = dropout
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, num_classes)

    def forward(self, x, edge_index, batch):
        """Return one row of class logits per graph in the batch.

        Args:
            x: Node feature matrix for all graphs in the batch.
            edge_index: Edge list for the batched graph.
            batch: Vector assigning each node to its graph, used by the readout.
        """
        # 1. Node embeddings: ReLU after the first two convolutions only.
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout: average node embeddings per graph.
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Classifier head; dropout is active only in training mode.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.lin(x)

        return x

# Instantiate the model once just to print its layer layout; the training cell
# below creates a fresh instance and rebinds `model`.
model = GCN(hidden_channels=64)
print(model)

GCN(
  (conv1): GCNConv(1, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=10, bias=True)
)


In [None]:
from IPython.display import Javascript
# Colab-only: cap the height of this cell's output area so the epoch log scrolls.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

# Use the GPU when one is present; otherwise fall back to the CPU instead of
# failing on `.to('cuda')` in a CPU-only runtime.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fresh model, optimizer and loss for this training run.
model = GCN(hidden_channels=64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

def train():
    """Run one pass over ``train_loader``, updating the global ``model``.

    Uses the module-level ``model``, ``optimizer``, ``criterion`` and ``device``.
    Returns nothing.
    """
    model.train()

    for batch in train_loader:
        # Return value unused, as before; presumably the batch is moved in
        # place — confirm against the PyG Data.to() contract.
        batch.to(device)
        logits = model(batch.x, batch.edge_index, batch.batch)  # forward pass
        loss = criterion(logits, batch.y)
        loss.backward()        # accumulate gradients
        optimizer.step()       # apply the update
        optimizer.zero_grad()  # reset gradients for the next batch

def test(loader):
    """Return the accuracy of the global ``model`` on every graph in ``loader``.

    Args:
        loader: Data loader yielding batches with ``x``, ``edge_index``,
            ``batch`` and ``y``; ``loader.dataset`` must support ``len``.

    Returns:
        Fraction of graphs whose arg-max prediction equals the label.
    """
    model.eval()

    correct = 0
    # Evaluation needs no gradients; disabling autograd avoids building a
    # computation graph for every forward pass.
    with torch.no_grad():
        for data in loader:  # Iterate in batches over the training/test dataset.
            data = data.to(device)
            out = model(data.x, data.edge_index, data.batch)
            pred = out.argmax(dim=1)  # Class with the highest logit.
            correct += int((pred == data.y).sum())  # Count matches with the labels.
    return correct / len(loader.dataset)  # Ratio of correct predictions.


# Train for a fixed number of epochs, reporting accuracy on both splits after
# each one.
NUM_EPOCHS = 150

for epoch in range(1, NUM_EPOCHS + 1):
    train()
    train_acc, test_acc = test(train_loader), test(test_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

<IPython.core.display.Javascript object>

Epoch: 001, Train Acc: 0.7388, Test Acc: 0.7621
Epoch: 002, Train Acc: 0.7388, Test Acc: 0.7621
Epoch: 003, Train Acc: 0.7388, Test Acc: 0.7621
Epoch: 004, Train Acc: 0.7388, Test Acc: 0.7621
Epoch: 005, Train Acc: 0.7388, Test Acc: 0.7621
Epoch: 006, Train Acc: 0.7388, Test Acc: 0.7621
Epoch: 007, Train Acc: 0.7388, Test Acc: 0.7621
Epoch: 008, Train Acc: 0.7388, Test Acc: 0.7621
Epoch: 009, Train Acc: 0.7388, Test Acc: 0.7621
Epoch: 010, Train Acc: 0.7388, Test Acc: 0.7621
Epoch: 011, Train Acc: 0.7388, Test Acc: 0.7621
Epoch: 012, Train Acc: 0.7388, Test Acc: 0.7621
Epoch: 013, Train Acc: 0.7388, Test Acc: 0.7621
Epoch: 014, Train Acc: 0.7388, Test Acc: 0.7621
Epoch: 015, Train Acc: 0.7388, Test Acc: 0.7621
Epoch: 016, Train Acc: 0.7388, Test Acc: 0.7621
Epoch: 017, Train Acc: 0.7388, Test Acc: 0.7621
Epoch: 018, Train Acc: 0.7388, Test Acc: 0.7621
Epoch: 019, Train Acc: 0.7388, Test Acc: 0.7621
Epoch: 020, Train Acc: 0.7388, Test Acc: 0.7621
Epoch: 021, Train Acc: 0.7388, Test Acc: