# Graph Attention Network using PyG and mini-batching

## imports

In [1]:
import json
import numpy as np
import networkx as nx

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

from transformers import AlbertTokenizer, AlbertModel

from sklearn.metrics import pairwise_distances

## data load

### 1) Sentence data

In [3]:
%%time

# Training set in JSON-lines format: one JSON document per line.
data_path = '../../data/summary/data/train.json'
with open(data_path, 'r', encoding='utf8') as f:
    # Skip blank lines (e.g. a trailing empty line) instead of letting
    # json.loads raise JSONDecodeError on them.
    data = [json.loads(line) for line in f if line.strip()]

Wall time: 4.23 s


In [4]:
# data

In [5]:
# First example document: one sentence per line in 'doc', one integer label
# per line in 'labels'.
sample1 = data[0]

text1, summary1 = sample1['doc'], sample1['summaries']
sentences1 = text1.split('\n')
labels1 = list(map(int, sample1['labels'].split('\n')))

In [6]:
# Second example document, parsed the same way: newline-separated sentences
# and newline-separated integer labels.
sample2 = data[111]

text2, summary2 = sample2['doc'], sample2['summaries']
sentences2 = text2.split('\n')
labels2 = list(map(int, sample2['labels'].split('\n')))

### 2) node-feature matrix

In [6]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint.
pretrained_name = 'albert-base-v2'
tokenizer = AlbertTokenizer.from_pretrained(pretrained_name)
model = AlbertModel.from_pretrained(pretrained_name)

In [7]:
def _encode_sentences(sentences, max_length=64):
    """Tokenize every sentence to a fixed-length row of ids and stack the rows.

    Args:
        sentences: iterable of sentence strings.
        max_length: length each encoded sentence is padded to.

    Returns:
        A tensor with one row of ``max_length`` token ids per sentence.
    """
    return torch.cat(
        [
            tokenizer.encode(
                sentence,
                add_special_tokens=True,
                max_length=max_length,
                # NOTE(review): newer transformers releases deprecate this
                # flag in favour of padding='max_length' (plus
                # truncation=True) -- confirm against the installed version
                # before switching.
                pad_to_max_length=True,
                return_tensors='pt',
            )
            for sentence in sentences
        ]
    )


inputs1 = _encode_sentences(sentences1)
inputs2 = _encode_sentences(sentences2)

In [8]:
# (num_sentences, max_length) for each encoded document.
inputs1.shape, inputs2.shape

(torch.Size([21, 64]), torch.Size([27, 64]))

In [9]:
# Sentences per document, read from the encoded tensors rather than
# hard-coded, so later slicing by doc_lens stays correct if the samples change.
doc_lens = [inputs1.size(0), inputs2.size(0)]
# All sentences of both documents stacked into one batch for the encoder.
inputs = torch.cat([inputs1, inputs2])

In [10]:
# Every row is padded to a fixed length, so tell the encoder which positions
# are padding; otherwise real tokens attend to the pad tokens.
attention_mask = (inputs != tokenizer.pad_token_id).long()

# The optimizer used in this notebook only updates the GAT classifier, so the
# encoder acts as a fixed feature extractor: skip building its autograd graph.
with torch.no_grad():
    outputs = model(inputs, attention_mask=attention_mask)

# First element of the model output: per-token hidden states.
last_hidden_states = outputs[0]

In [11]:
# Split the stacked hidden states back into the two documents.
doc1 = last_hidden_states[:doc_lens[0]]
doc2 = last_hidden_states[doc_lens[0]:]

# Averaging over the token axis already yields one vector per sentence.
# The previous trailing .squeeze() was a no-op for multi-sentence documents
# but collapsed a one-sentence document to a 1-D tensor, so it is dropped.
# NOTE(review): padding positions are included in this mean -- confirm
# whether a mask-weighted mean is wanted.
features1 = doc1.mean(dim=1)  # sentence embeddings for doc1
features2 = doc2.mean(dim=1)  # sentence embeddings for doc2

In [12]:
# (num_sentences, hidden_size) for each document's sentence embeddings.
features1.shape, features2.shape

(torch.Size([21, 768]), torch.Size([27, 768]))

### 3) adjacency matrix

In [13]:
# Two sentences are connected when their cosine similarity exceeds this value.
threshold = 0.2

# pairwise_distances returns cosine *distance*; 1 - distance is the similarity.
# Document 1
cosine_matrix = 1 - pairwise_distances(
    features1.detach().numpy(),
    metric="cosine",
)
adj_matrix1 = (cosine_matrix > threshold).astype(int)

# Document 2
cosine_matrix = 1 - pairwise_distances(
    features2.detach().numpy(),
    metric="cosine",
)
adj_matrix2 = (cosine_matrix > threshold).astype(int)

In [14]:
# adj_matrix

### 4) create Graph using Networkx

In [15]:
# nx.from_numpy_matrix was removed in networkx 3.0; from_numpy_array is the
# equivalent constructor for a plain ndarray adjacency matrix and is also
# available in networkx 2.x.
G1 = nx.from_numpy_array(adj_matrix1)
G2 = nx.from_numpy_array(adj_matrix2)

In [16]:
def _edge_index_from_graph(graph):
    """Build a ``(2, num_edges)`` COO edge index from an undirected graph.

    networkx reports every undirected edge once, as ``(u, v)``. A PyG
    ``edge_index`` is directed, so each edge is emitted in both directions;
    otherwise messages would only flow from ``u`` to ``v``.

    Args:
        graph: object whose ``edges`` iterates over ``(u, v)`` node pairs.

    Returns:
        A ``torch.long`` tensor of shape ``(2, num_directed_edges)``.
    """
    pairs = torch.tensor(list(graph.edges), dtype=torch.long).reshape(-1, 2)
    if pairs.numel() == 0:
        return pairs.t().contiguous()
    both_directions = torch.cat([pairs, pairs.flip(1)], dim=0)
    # unique() removes the duplicates that mirroring creates for self-loops.
    return torch.unique(both_directions, dim=0).t().contiguous()


edge_index1 = _edge_index_from_graph(G1)
edge_index2 = _edge_index_from_graph(G2)

## GAT architecture

In [17]:
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GATConv

In [18]:
class GATClassifier(nn.Module):
    """Two GAT layers followed by an LSTM and a per-node linear head.

    Node features pass through two ``GATConv`` layers. The resulting node
    embeddings are then treated as a single sequence (in node order), fed
    through a unidirectional LSTM, and projected to ``num_classes`` logits
    per node.

    Args:
        in_dim: size of the input node features.
        hidden_dim: per-head output size of the first GAT layer.
        out_dim: output size of the second GAT layer (LSTM input size).
        num_heads: number of attention heads in the first GAT layer.
        num_classes: number of logits produced per node.
        lstm_hidden_dim: hidden size of the LSTM (previously fixed at 32).
        dropout: dropout probability used on the features and inside the
            GAT layers (previously fixed at 0.6).
    """

    def __init__(self, in_dim, hidden_dim, out_dim, num_heads, num_classes=1,
                 lstm_hidden_dim=32, dropout=0.6):
        super().__init__()

        self.out_head = 1
        self.out_dim = out_dim
        self.dropout_p = dropout

        self.conv1 = GATConv(in_dim, hidden_dim, heads=num_heads, dropout=dropout)
        # conv1 concatenates its heads, hence hidden_dim * num_heads inputs.
        self.conv2 = GATConv(hidden_dim * num_heads, out_dim, concat=False,
                             heads=self.out_head, dropout=dropout)

        self.lstm = nn.LSTM(out_dim, lstm_hidden_dim, 1, batch_first=True,
                            bidirectional=False)
        self.fc = nn.Linear(lstm_hidden_dim, num_classes)

    def init_hidden(self, batch_size, device=None, dtype=None):
        """Return zero-initialised ``(hidden, cell)`` states for the LSTM.

        Args:
            batch_size: number of sequences in the LSTM batch.
            device: device for the states; torch default when ``None``.
            dtype: dtype for the states; torch default when ``None``.

        Returns:
            Two tensors of shape
            ``(num_layers * num_directions, batch_size, hidden_size)``.
        """
        num_directions = 2 if self.lstm.bidirectional else 1
        shape = (self.lstm.num_layers * num_directions, batch_size,
                 self.lstm.hidden_size)
        hidden = torch.zeros(shape, device=device, dtype=dtype)
        cell = torch.zeros(shape, device=device, dtype=dtype)
        return hidden, cell

    def forward(self, data):
        """Compute per-node logits for a graph or a mini-batch of graphs.

        Args:
            data: object exposing ``x`` (node features) and ``edge_index``.

        Returns:
            Tensor of shape ``(1, num_nodes, num_classes)``.
        """
        x, edge_index = data.x, data.edge_index

        # Dropout must follow the module's train/eval mode; it used to be
        # hard-wired to training=True and therefore stayed active in eval().
        x = F.dropout(x, p=self.dropout_p, training=self.training)
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout_p, training=self.training)
        x = self.conv2(x, edge_index)

        # All nodes form one sequence: (1, num_nodes, out_dim).
        # NOTE(review): with a mini-batch of several graphs this sequence
        # runs across graph boundaries -- confirm that is intended.
        x = x.unsqueeze(0)

        # Create the initial state on the same device/dtype as the input.
        h_0, cell = self.init_hidden(x.size(0), device=x.device, dtype=x.dtype)

        output, _ = self.lstm(x, (h_0, cell))

        # many-to-many: one prediction per sequence position (node)
        return self.fc(output)

In [19]:
# 768-dim sentence embeddings in, one logit per sentence out.
net = GATClassifier(
    in_dim=768, hidden_dim=128, out_dim=64,
    num_heads=2, num_classes=1,
)

In [20]:
# Float targets, as required by BCE-style losses.
labels1, labels2 = (
    torch.tensor(values, dtype=torch.float) for values in (labels1, labels2)
)

In [21]:
# One PyG graph per document: sentence embeddings as node features,
# similarity edges, and per-sentence labels.
graph1 = Data(x=features1, edge_index=edge_index1, y=labels1)
graph2 = Data(x=features2, edge_index=edge_index2, y=labels2)

data_list = [graph1, graph2]
# batch_size=2 puts both graphs into a single mini-batch.
loader = DataLoader(data_list, batch_size=2)

In [22]:
# for data in loader:
#     sample = data
    
# sample.num_graphs

In [23]:
# sample.y

In [24]:
# `sample` was only ever assigned in a commented-out cell, so a top-to-bottom
# run raised NameError here. Take the first mini-batch explicitly
# (the loader yields exactly one batch holding both graphs).
sample = next(iter(loader))
output = net(sample)

In [31]:
# Shape of the network output currently bound to `output`.
output.shape

torch.Size([1, 48])

In [28]:
# Adam over the classifier's parameters only.
optimizer = optim.Adam(net.parameters(), lr=1e-3)
# The network emits raw logits, so use the loss that applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()

In [32]:
# Reshape the batch targets so dim 1 matches dim 1 of the network output.
labels = sample.y.view(-1, output.size(1))

In [33]:
# One optimisation step on the single mini-batch.
# Bring the logits to the same 2-D layout as `labels`.
output = output.view(-1, output.size(1))

optimizer.zero_grad()  # clear gradients left over from any earlier step
loss = criterion(output, labels)
loss.backward()
optimizer.step()

In [34]:
# Display the loss tensor produced by the optimisation step above.
loss

tensor(0.6727, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)