In [56]:
import torch.optim as optim
from torch_geometric.datasets import Planetoid
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.transforms import NormalizeFeatures
from torch_geometric.data import DataLoader
import random


torch.set_printoptions(edgeitems=500)

# seed for reproducibility
seed = 1
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

### Dataset info:
class 0: without autism associations
class 1: autism genes

In [57]:
import read_data

data = read_data.read()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  autism_df['label'][autism_df['confidence'] == 0.5] = 0


In [58]:
data

Data(edge_index=[2, 811236], num_classes=20, test_mask=[23472], train_mask=[23472], val_mask=[23472], x=[23472, 23472], y=[23472])

In [59]:
print(f'Contains isolated nodes: {data.contains_isolated_nodes()}')
print(f'Contains self-loops: {data.contains_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
# dataset = data
# data.train_mask = data.y >= 0

Contains isolated nodes: False
Contains self-loops: False
Is undirected: True
Average node degree: 34.56


In [60]:
# total nodes
data.train_mask.shape

torch.Size([23472])

In [61]:
# number of training samples
data.train_mask.sum()

tensor(300)

In [62]:
# number of testing samples
data.test_mask.sum()
# data.test_mask = data.y >= 0

tensor(735)

In [63]:
# number of validation samples
data.val_mask.sum()

tensor(800)

In [64]:
data.y[data.train_mask].shape

torch.Size([300])

In [65]:
data.y[data.test_mask].shape

torch.Size([735])

In [66]:
# from torch_geometric.data import DataLoader
#
# loader = DataLoader(data, batch_size=32, shuffle=True)


In [67]:
# data_list = [data]
#

In [68]:
# dataset = DataLoader(data_list)
#

In [69]:
# dataset.num_node_features = data.num_node_features
# dataset.num_classes = data.num_classes


### Visualization Model Using Tensorboard Command
commandline run tensorboard
```
cd src
tensorboard --logdir log
```

In [70]:
# build model
from GCN import GCNStack

model = GCNStack(data.num_node_features, hidden_dim1=128, hidden_dim2=64, hidden_dim3=32, output_dim=data.num_classes, dropout=0.5)
print(model)


GCNStack(
  (convs): ModuleList(
    (0): GCNConv(23472, 128)
    (1): GCNConv(128, 64)
    (2): GCNConv(64, 32)
  )
  (lns): ModuleList(
    (0): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    (1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
  )
  (post_mp): Sequential(
    (0): Linear(in_features=32, out_features=32, bias=True)
    (1): Linear(in_features=32, out_features=20, bias=True)
  )
)


In [71]:
# Running on GPU or CPU
use_GPU = True
device = torch.device('cuda' if torch.cuda.is_available() and use_GPU else 'cpu')
model, data = model.to(device), data.to(device)

In [72]:
device


device(type='cuda')

In [73]:
# torch.cuda.empty_cache()

In [74]:
def model_test(loader, model, is_validation=False, is_training=False):
    ''' Testing Code of the Model '''
    model.eval()

    correct = 0
    for data in loader:
        with torch.no_grad():
            emb, pred = model(data.x, data.edge_index)
            pred = pred.argmax(dim=1)
            label = data.y

        if is_training:
            mask = data.val_mask if is_validation else data.train_mask
        else: # testing
            mask = data.val_mask if is_validation else data.test_mask
        # node classification: only evaluate on nodes in test set
        pred = pred[mask]
        label = data.y[mask]
        # testing code
        # training_status = 'Training' if is_training else 'Testing'
        # print(training_status, '$$pred', pred)
        # print(training_status, '%%label', label)
        correct += pred.eq(label).sum().item()
    total = 0
    for data in loader.dataset:
        if is_training:
            total += torch.sum(data.train_mask).item()
        else:
            total += torch.sum(data.test_mask).item()
    return correct / total, pred, label

def model_train(dataset, writer, model, epoch_num, lr, weight_decay, momentum):
    ''' Training code of the model '''
    test_loader = loader = DataLoader(dataset, shuffle=False)

    # Optimizer
    # opt = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay, momentum=momentum)
    opt = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    # visualize the model architecture in tensorboard
    # writer.add_graph(model, ( data.x, data.edge_index ))

    # Training:
    for epoch in range(epoch_num + 1):
        total_loss = 0
        model.train()
        for batch in loader:
            #print(batch.train_mask, '----')
            opt.zero_grad()
            embedding, pred = model(batch.x, batch.edge_index)
            label = batch.y
            pred = pred[batch.train_mask]
            label = label[batch.train_mask]
            loss = model.loss(pred, label)
            loss.backward()
            opt.step()
            total_loss += loss.item() * batch.num_graphs
        total_loss /= len(loader.dataset)
        writer.add_scalar("loss", total_loss, epoch)

        # if epoch % 5 == 0:
        train_acc, _, _ = model_test(test_loader, model, is_training=True)
        validation_acc, _, _ = model_test(test_loader, model, is_training=False, is_validation=True)
        print("Epoch {}. Loss: {:.4f}. Train accuracy: {:.4f}. Validation accuracy: {:.4f}".format(
            epoch, total_loss, train_acc, validation_acc))
        writer.add_scalar("test accuracy", validation_acc, epoch)

        if epoch % 20 == 0:
            name = 'epoch' + str(epoch)
            writer.add_embedding(embedding, global_step=epoch, tag=name, metadata=batch.y)

    return model

from datetime import datetime
from tensorboardX import SummaryWriter

writer = SummaryWriter("./log/" + datetime.now().strftime("%Y%m%d-%H%M%S"))

model_trained = model_train([data], writer, model, epoch_num=25, lr=0.001, weight_decay=0.00001, momentum=0.9)

Epoch 0. Loss: 3.0630. Train accuracy: 0.1833. Validation accuracy: 0.1469
Epoch 1. Loss: 2.5601. Train accuracy: 0.1967. Validation accuracy: 0.1524
Epoch 2. Loss: 2.2642. Train accuracy: 0.7667. Validation accuracy: 0.8327
Epoch 3. Loss: 2.0424. Train accuracy: 0.8333. Validation accuracy: 0.9374
Epoch 4. Loss: 1.8799. Train accuracy: 0.8267. Validation accuracy: 0.9401
Epoch 5. Loss: 1.7492. Train accuracy: 0.8233. Validation accuracy: 0.9401
Epoch 6. Loss: 1.6341. Train accuracy: 0.8233. Validation accuracy: 0.9401
Epoch 7. Loss: 1.5282. Train accuracy: 0.8233. Validation accuracy: 0.9401
Epoch 8. Loss: 1.4301. Train accuracy: 0.8233. Validation accuracy: 0.9401
Epoch 9. Loss: 1.3394. Train accuracy: 0.8233. Validation accuracy: 0.9401
Epoch 10. Loss: 1.2551. Train accuracy: 0.8233. Validation accuracy: 0.9401
Epoch 11. Loss: 1.1766. Train accuracy: 0.8233. Validation accuracy: 0.9401
Epoch 12. Loss: 1.1036. Train accuracy: 0.8233. Validation accuracy: 0.9401
Epoch 13. Loss: 1.0352

In [75]:
test_acc, pred, label = model_test( DataLoader([data], shuffle=False), model_trained, is_training=False, is_validation=False)

In [76]:
test_acc

0.8870748299319728

In [77]:
pred_np = pred.cpu().detach().numpy()
label_np = label.cpu().detach().numpy()

In [78]:
from sklearn.metrics import f1_score,  recall_score, precision_score

In [79]:
f1_score(pred_np, label_np, average='weighted')

0.9338372171973481

In [80]:
precision_score(pred_np, label_np, average='weighted')

0.9872169860310536

In [81]:
recall_score(pred_np, label_np, average='weighted')

0.8870748299319728