In [None]:
# Create the data directory; -p keeps the cell idempotent when it is re-run.
!mkdir -p data

In [None]:
# Install PyTorch Geometric and its compiled extensions.
# %pip (instead of !pip) installs into the environment of the running kernel.
# NOTE: the wheel index below targets torch 1.9.0 + CUDA 10.2; adjust the URL
# if the installed torch build differs.
%pip install -q torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
%pip install -q torch-sparse -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
%pip install -q torch-geometric

In [None]:
import torch
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn

from torch_geometric.data import Data
from torch_geometric.nn import GCNConv

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, f1_score, accuracy_score, precision_score

import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
# Build the edge index: read the edge list from CSV, then transpose it so the
# CSV columns become the rows of the tensor.
# NOTE(review): assumes the CSV has exactly two columns (source, target) of
# integer node ids, giving an edge_index of shape (2, num_edges) - confirm.
edge_list = pd.read_csv('data/edges_1.5.csv').to_numpy()
edge_index = torch.tensor(edge_list, dtype=torch.long).t().to(device)
print('edge_index', edge_index.shape)

In [None]:
# Load the per-node table; it holds an 'id' column, the 'researcher' label
# column, and the remaining columns which are used as node features.
df = pd.read_csv('data/features_1.5_std_all.csv')

# Labels: the 'researcher' column as a 1-D long tensor on the target device.
y = torch.tensor(df[['researcher']].to_numpy(), dtype=torch.long).squeeze().to(device)
print('y', y.shape)

# Features: every column except 'id' and the label, as a float tensor.
x = torch.tensor(
    df.drop(columns=['id', 'researcher']).to_numpy(),
    dtype=torch.float,
).to(device)
print('x', x.shape)

In [None]:
# Bundle node features, connectivity and labels into one PyG Data object.
data = Data(x=x, edge_index=edge_index, y=y)
data

In [None]:
def plot_data(data):
    """Draw the graph stored in ``data`` with nodes colored by their label.

    Args:
        data: object exposing ``edge_index`` (tensor whose two rows hold the
            edge endpoints), ``y`` (one label per node) and ``num_nodes``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    edges_raw = data.edge_index.cpu().numpy()
    labels = data.y.cpu().numpy()

    G = nx.Graph()
    # Register every node id up front and in order, so isolated nodes are kept
    # and the i-th entry of `labels` colors node i.
    G.add_nodes_from(range(data.num_nodes))
    G.add_edges_from(zip(edges_raw[0, :], edges_raw[1, :]))

    plt.subplot(111)
    options = {
        'node_size': 30,
        'width': 0.2,
    }
    # nx.draw takes node positions through `pos`; it has no `layout` keyword.
    pos = nx.spring_layout(G, k=0.15, iterations=20)
    nx.draw(G, pos=pos, with_labels=False, node_color=labels, cmap=plt.cm.tab10, font_weight='bold', **options)
    plt.show()

In [None]:
# Visualize the input graph; nodes are colored by their label.
plot_data(data)

In [None]:
# GNN model
class GNNStack(torch.nn.Module):
    """Stack of GCN layers followed by an MLP classification head.

    Args:
        input_dim: size of the input node features.
        hidden_dim: width of every GCN layer and of the MLP hidden layers.
        output_dim: number of output classes.
        num_layers: number of GCN layers applied in ``forward`` (expected >= 1).
        dropout: dropout probability used after every GCN layer and inside
            the MLP head.
        weights: optional per-class weight tensor passed to the NLL loss.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2, dropout=0.5, weights=None):
        super(GNNStack, self).__init__()
        self.num_layers = num_layers
        self.dropout = dropout
        self.weights = weights

        # First layer maps input features to the hidden size; the remaining
        # layers keep the hidden size.
        self.conv_layers = nn.ModuleList()
        self.conv_layers.append(GCNConv(input_dim, hidden_dim))
        for _ in range(num_layers - 1):
            self.conv_layers.append(GCNConv(hidden_dim, hidden_dim))

        # Post-message-passing head: two hidden layers, then the class logits.
        self.post_mp = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x, edge_index):
        """Return ``(embedding, log_probs)`` for every node.

        ``embedding`` is the output of the last GCN layer (after ReLU and
        dropout); ``log_probs`` is the log-softmax over classes from the head.
        """
        for i in range(self.num_layers):
            x = self.conv_layers[i](x, edge_index)
            x = F.relu(x)
            # Use the configured rate instead of a hard-coded 0.5 so the
            # `dropout` constructor argument also controls the GCN layers.
            x = F.dropout(x, p=self.dropout, training=self.training)
        out = self.post_mp(x)
        return x, F.log_softmax(out, dim=1)

    def loss(self, pred, labels):
        """Negative log-likelihood of ``labels`` under log-probabilities ``pred``."""
        return F.nll_loss(pred, labels, weight=self.weights)

In [None]:
# Instantiate a one-layer GCN binary classifier and place it on the same
# device as the data (moving to the CPU device is a no-op).
model = GNNStack(
    input_dim=data.num_node_features,
    hidden_dim=64,
    output_dim=2,
    num_layers=1,
    dropout=0.5,
    weights=None,
).to(device)
model

In [None]:
# Adam with L2 regularization (weight decay) over all model parameters.
lr, weight_decay = 0.01, 5e-4
optimizer = optim.Adam(params=model.parameters(), lr=lr, weight_decay=weight_decay)
optimizer

In [None]:
# set train and test masks
# Randomly split the node indices 80/20 and encode the split as boolean node
# masks on the Data object.
SPLIT_SEED = 42  # fixed seed so the train/test partition is reproducible
idx = np.arange(data.num_nodes)
train_idx, test_idx = train_test_split(idx, test_size=.2, random_state=SPLIT_SEED)

train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[train_idx] = True
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask[test_idx] = True

# Keep the masks on the same device as the tensors they index.
data.train_mask = train_mask.to(data.x.device)
data.test_mask = test_mask.to(data.x.device)
train_idx.shape, test_idx.shape

In [None]:
def train(model, data, optimizer):
    """Run one full-graph optimization step and return the training loss.

    Args:
        model: module whose forward returns ``(embedding, log_probs)`` and
            which exposes ``loss(pred, labels)``.
        data: graph object with ``x``, ``edge_index``, ``y`` and ``train_mask``.
        optimizer: optimizer over the model parameters.

    Returns:
        Scalar loss tensor computed on the training nodes, detached from the
        autograd graph.
    """
    model.train()
    optimizer.zero_grad()
    _, out = model(data.x, data.edge_index)
    # Only the nodes selected by the train mask contribute to the loss.
    loss = model.loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    # Detach so callers that store or plot the value do not hold a tensor that
    # still requires grad (which cannot be converted to NumPy).
    return loss.detach()

In [None]:
@torch.no_grad()
def test(model, data):
    """Evaluate the model on the nodes selected by ``data.test_mask``.

    Runs a forward pass over the whole graph in eval mode with gradients
    disabled, then scores only the test nodes.

    Returns:
        Tuple ``(emb, pred, acc, precision, recall, f1, loss, pred_test,
        y_test)`` where ``emb`` and ``pred`` cover all nodes, the metrics and
        ``loss`` are computed on the test nodes, and ``pred_test`` /
        ``y_test`` are the test predictions and labels moved to the CPU.
    """
    model.eval()
    emb, log_probs = model(data.x, data.edge_index)
    mask = data.test_mask

    # Predicted class for every node; the loss uses the test nodes only.
    pred = log_probs.argmax(dim=1)
    loss = model.loss(log_probs[mask], data.y[mask])

    # The scikit-learn metrics operate on CPU data.
    pred_test = pred[mask].cpu()
    y_test = data.y[mask].cpu()

    acc = accuracy_score(y_test, pred_test)
    recall = recall_score(y_test, pred_test)
    precision = precision_score(y_test, pred_test)
    f1 = f1_score(y_test, pred_test)
    return emb, pred, acc, precision, recall, f1, loss, pred_test, y_test

In [None]:
# Train for a fixed number of epochs, recording the train loss and the loss on
# the test mask after every epoch.
train_loss = []
test_loss = []
for epoch in range(1, 200):  # note: epochs 1..199, i.e. 199 optimization steps
    t_loss = train(model, data, optimizer)
    _, pred, acc, precision, recall, f1, v_loss, _, _ = test(model, data)
    # Store plain Python floats: matplotlib cannot plot tensors that require
    # grad or that live on the GPU.
    train_loss.append(float(t_loss))
    test_loss.append(float(v_loss))
    print(f'Epoch: {epoch:03d}, Loss: {t_loss:.4f}')

# Loss curves; the "val" curve is the loss on the test mask.
plt.figure(figsize=(10,5))
plt.title("Training and Validation Loss")
plt.plot(test_loss,label="val")
plt.plot(train_loss,label="train")
plt.xlabel("Iterations")
plt.ylabel("Loss")
plt.legend()
plt.show()

In [None]:
# Final evaluation of the trained model on the test mask; also keeps the node
# embeddings and the test predictions/labels for later inspection.
emb, pred, acc, precision, recall, f1, loss, p_test, y_test = test(model, data)
print(f'Test Accuracy: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f},')