In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import numpy as np
from graphgym.custom_graphgym.loader.protein_batch import ProteinBatchDataset
import matplotlib.pyplot as plt
import os

In [2]:
# General imports
import os
import json
import collections

# Data science imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import scipy.sparse as sp

# Import Weights & Biases for Experiment Tracking
import wandb

# Graph imports
import torch
from torch import Tensor
import torch.nn.functional as F

import torch_geometric
from torch_geometric.nn import GCNConv
from torch_geometric.utils import to_networkx

import networkx as nx
from networkx.algorithms import community

from tqdm.auto import trange

In [3]:
# Move to the project root so the relative data paths used in later cells
# (e.g. "data/protein_graphs") resolve. The location can be overridden with
# the GRAPE_PI_ROOT environment variable; the original hard-coded path is
# kept as the fallback so existing setups behave exactly as before.
os.chdir(os.environ.get('GRAPE_PI_ROOT', '/Users/cgu3/Documents/Grape-Pi'))

In [4]:
# Location of the protein graphs, relative to the current working directory.
dataset_root = "data/protein_graphs"
# rebuild=True is forwarded to the dataset class; NOTE(review): presumably this
# forces regeneration of the processed data on every run -- confirm against
# ProteinBatchDataset before relying on it.
protein_dataset = ProteinBatchDataset(dataset_root, rebuild=True)

Rebuilding...
Done!


In [57]:
# Display the dataset-level edge index (shown as a two-row integer tensor).
protein_dataset.edge_index

tensor([[ 2075,  2075,  2075,  ..., 64290, 63454, 66337],
        [ 1616,  2839,  1909,  ..., 65646, 65646, 65646]])

In [5]:
from torch_geometric.loader import DataLoader
# Split by position: the first eight graphs are used for training and
# everything after them is held out for testing.
train_graphs = protein_dataset[0:8]
test_graphs = protein_dataset[8:]

# Training batches hold two graphs and are reshuffled on every pass;
# test graphs are served one at a time in a fixed order.
train_loader = DataLoader(train_graphs, batch_size=2, shuffle=True)
test_loader = DataLoader(test_graphs, batch_size=1, shuffle=False)

# Sanity check: walk through one pass of the training loader and show
# what each mini-batch looks like.
for step, data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)
    print()

Step 1:
Number of graphs in the current batch: 2
DataBatch(x=[13786, 1], edge_index=[2, 6770568], y=[2], train_mask=[2], val_mask=[2], test_mask=[2], batch=[13786], ptr=[3])

Step 2:
Number of graphs in the current batch: 2
DataBatch(x=[13786, 1], edge_index=[2, 6770568], y=[2], train_mask=[2], val_mask=[2], test_mask=[2], batch=[13786], ptr=[3])

Step 3:
Number of graphs in the current batch: 2
DataBatch(x=[13786, 1], edge_index=[2, 6770568], y=[2], train_mask=[2], val_mask=[2], test_mask=[2], batch=[13786], ptr=[3])

Step 4:
Number of graphs in the current batch: 2
DataBatch(x=[13786, 1], edge_index=[2, 6770568], y=[2], train_mask=[2], val_mask=[2], test_mask=[2], batch=[13786], ptr=[3])


In [6]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool


class GCN(torch.nn.Module):
    """Three-layer graph convolutional network for graph-level classification.

    Node features go through three ``GCNConv`` layers (ReLU after the first
    two), are averaged per graph with ``global_mean_pool``, pass through
    dropout and are finally mapped to class logits by a linear layer.

    Args:
        hidden_channels: Width of every hidden GCN layer.
        in_channels: Number of input features per node. Defaults to 1, the
            value that was previously hard-coded.
        num_classes: Number of output logits per graph. Defaults to 7, the
            value that was previously hard-coded.
        dropout: Dropout probability applied to the pooled graph embedding.
            Defaults to 0.5, the value that was previously hard-coded.
    """

    def __init__(self, hidden_channels, in_channels=1, num_classes=7, dropout=0.5):
        super().__init__()
        # Seed before the layers are created so their initial weights are the
        # same on every instantiation. Note that this reseeds torch's global
        # RNG as a side effect each time a GCN is constructed.
        torch.manual_seed(12345)
        self.dropout = dropout
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, num_classes)

    def forward(self, x, edge_index, batch):
        """Compute per-graph class logits.

        Args:
            x: Node feature matrix (assumed ``[num_nodes, in_channels]`` --
                TODO confirm against the dataset).
            edge_index: Graph connectivity passed straight to each GCNConv.
            batch: Vector assigning every node to its graph in the batch,
                used by the mean-pool readout.

        Returns:
            Unnormalised class scores, one row per graph.
        """
        # 1. Obtain node embeddings (no activation after the last conv layer)
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier; dropout is only active in training mode
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.lin(x)

        return x

# Build the network with 64 hidden channels and print its layer summary.
hidden_dim = 64
model = GCN(hidden_channels=hidden_dim)
print(model)

GCN(
  (conv1): GCNConv(1, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=7, bias=True)
)


In [7]:
from IPython.display import Javascript
# Cap the height of this cell's output area. NOTE(review): the script calls
# into `google.colab`, so it presumably has no effect outside Colab -- confirm.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

# Start from a freshly initialised model for the training run.
model = GCN(hidden_channels=64)

# Cross-entropy over the class logits, optimised with Adam.
learning_rate = 0.001
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

def train():
    """Run one optimisation pass over ``train_loader``.

    Uses the notebook-level ``model``, ``criterion`` and ``optimizer``. For
    each mini-batch the loss is back-propagated, the optimizer takes a step,
    and the gradients are cleared afterwards. Returns nothing.
    """
    model.train()

    for batch in train_loader:
        # Forward pass on the batched graphs.
        logits = model(batch.x, batch.edge_index, batch.batch)
        batch_loss = criterion(logits, batch.y)
        # Backward pass and parameter update.
        batch_loss.backward()
        optimizer.step()
        # Reset gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()

<IPython.core.display.Javascript object>

In [8]:
def test(loader, create_table=False):
     model.eval()
     correct = 0
     loss_ = 0
     for data in loader:  # Iterate in batches over the training/test dataset.
         out = model(data.x, data.edge_index, data.batch)  
         loss = criterion(out, data.y)
         loss_ += loss.item()
         pred = out.argmax(dim=1)  # Use the class with highest probability.
         
         correct += int((pred == data.y).sum())  # Check against ground-truth labels.
     return correct / len(loader.dataset), loss_ / len(loader.dataset)  # Derive ratio of correct predictions.

In [9]:
# Train for ten epochs (numbered from 1), re-evaluating on both splits after
# every epoch. The losses are returned as well, but only the accuracies are
# reported here.
num_epochs = 10
for epoch in trange(1, num_epochs + 1):
    train()
    train_acc, train_loss = test(train_loader)
    test_acc, test_loss = test(test_loader, create_table=True)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch: 001, Train Acc: 0.1250, Test Acc: 0.0000
Epoch: 002, Train Acc: 0.1250, Test Acc: 0.0000
Epoch: 003, Train Acc: 0.2500, Test Acc: 0.0000
Epoch: 004, Train Acc: 0.2500, Test Acc: 0.0000
Epoch: 005, Train Acc: 0.3750, Test Acc: 0.5000
Epoch: 006, Train Acc: 0.3750, Test Acc: 0.5000
Epoch: 007, Train Acc: 0.3750, Test Acc: 0.5000
Epoch: 008, Train Acc: 0.3750, Test Acc: 0.5000
Epoch: 009, Train Acc: 0.3750, Test Acc: 0.5000
Epoch: 010, Train Acc: 0.3750, Test Acc: 0.5000
