# Extractive Summarization with Batch

## imports

In [1]:
import json
import numpy as np
import networkx as nx

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

from transformers import AlbertTokenizer, AlbertModel

from sklearn.metrics import pairwise_distances

## data load

### 1) Sentence data

In [2]:
%%time

data_path = '../../data/summary/data/train.json'
with open(data_path, 'r') as f:
    data = [json.loads(line) for line in f]

CPU times: user 3.05 s, sys: 288 ms, total: 3.34 s
Wall time: 3.34 s


In [3]:
# data

In [4]:
sample1 = data[123]

text1 = sample1['doc']
summary1 = sample1['summaries']
labels1 = sample1['labels']
labels1 = labels1.split('\n')
labels1 = [int(label) for label in labels1]

sentences1 = text1.split('\n')

In [5]:
sample2 = data[234]

text2 = sample2['doc']
summary2 = sample2['summaries']
labels2 = sample2['labels']
labels2 = labels2.split('\n')
labels2 = [int(label) for label in labels2]

sentences2 = text2.split('\n')

### 2) node-feature matrix

In [6]:
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertModel.from_pretrained('albert-base-v2')

In [135]:
def get_tokenize(docs):
    sent_tokens = [
        torch.cat(
            [tokenizer.encode(
                    sentences[i],
                    add_special_tokens=True,
                    max_length=64,
                    pad_to_max_length=True,
                    return_tensors='pt'
             ) for i in range(len(sentences))]
        ) for sentences in docs
    ]
    
    sent_tokens = torch.cat([*sent_tokens])
    return sent_tokens

In [136]:
docs = [sentences1, sentences2]

sent_tokens = get_tokenize(docs)

In [138]:
offsets = [0] + [inputs1.size()[0], inputs2.size()[0]]
# inputs = torch.cat([inputs1, inputs2])

In [139]:
outputs = model(tokens_list)
word_vecs = outputs[0]

In [54]:
def get_sentence_embedding(word_vecs, offsets):
    docs = []
    for idx in range(len(offsets) - 1):
        docs.append(word_vecs[ offsets[idx]: offsets[idx]+offsets[idx+1] ])
    
    features = [torch.mean(doc, dim=1).squeeze() for doc in docs]
    return features

In [67]:
features_list = get_sentence_embedding(word_vecs, offsets)

In [68]:
features_list[1].size()

torch.Size([20, 768])

### 3) edge_index

In [64]:
# threshold = 0.2

# cosine_matrix = 1 - pairwise_distances(features1.detach().numpy(), metric="cosine")
# adj_matrix1 = (cosine_matrix > threshold) * 1

# cosine_matrix = 1 - pairwise_distances(features2.detach().numpy(), metric="cosine")
# adj_matrix2 = (cosine_matrix > threshold) * 1

In [69]:
def build_graph(features_list, threshold=0.2):
    '''get edge_index for GATLayer'''
    
    edge_index_list = []
    for features in features_list:
        cosine_matrix = 1 - pairwise_distances(features.detach().numpy(), metric="cosine")
        adj_matrix = (cosine_matrix > threshold) * 1

        G = nx.from_numpy_matrix(adj_matrix)

        e1_list = [e1 for e1, _ in list(G.edges)]
        e2_list = [e2 for _, e2 in list(G.edges)]
        edge_index = [e1_list, e2_list]
        edge_index = torch.tensor(edge_index)
        edge_index_list.append(edge_index)

    return edge_index_list

In [70]:
edge_index_list = build_graph(features_list)

### 4) DataLoader for GATLayer

In [73]:
import torch_geometric

In [76]:
labels1 = torch.tensor(labels1, dtype=torch.float)
labels2 = torch.tensor(labels2, dtype=torch.float)

labels_list = [labels1, labels2]

In [77]:
def gat_dataloader(features_list, edge_index_list, labels_list, batch_size):
    data_list = [
        torch_geometric.data.Data(features, edge_index, y=labels)
            for features, edge_index, labels in zip(features_list, edge_index_list, labels_list)
    ]
    
    gat_loader = torch_geometric.data.DataLoader(data_list, batch_size=batch_size)
    return gat_loader

In [78]:
gat_loader = gat_dataloader(features_list, edge_index_list, labels_list, batch_size=2)

In [85]:
# next(iter(gat_loader))

In [82]:
# for data in gat_loader:
#     sample = data
    
# sample.num_graphs

## Extractive Summarization architecture

### 1) GAT Classifier

In [6]:
import torch_geometric

# from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GATConv

In [7]:
class GATClassifier(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim, num_heads, num_classes=1):
        super().__init__()
        
        self.out_head = 1
        self.out_dim = out_dim
        
        self.conv1 = GATConv(in_dim, hidden_dim, heads=num_heads, dropout=0.6)
        self.conv2 = GATConv(hidden_dim * num_heads, out_dim, concat=False,
                             heads=self.out_head, dropout=0.6)
        
        self.lstm = nn.LSTM(out_dim, 32, 1, batch_first=True, bidirectional=False)
        self.fc = nn.Linear(32, num_classes)
        
    
    def init_hidden(self, batch_size):
        # (num_layers * num_directions, batch_size, hidden_size)
        hidden = Variable(torch.zeros(1, batch_size, 32))
        cell = Variable(torch.zeros(1, batch_size, 32))
        return hidden, cell
    

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        x = F.dropout(x, p=0.6, training=True)
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=0.6, training=True)
        x = self.conv2(x, edge_index)
        x = x.view(-1, x.size(0), self.out_dim)
        
        h_0, cell = self.init_hidden(x.size(0))  # initial h_0
        
        output, h_n = self.lstm(x, (h_0, cell))
        
        # many-to-many
        output = self.fc(output)
        
        return output

### 2) Summarizer

In [12]:
class Summarizer(nn.Module):
    
    def __init__(self, 
                 in_dim, 
                 hidden_dim, 
                 out_dim, 
                 num_heads, 
                 num_classes=2):
        super(Summarizer, self).__init__()
        
        self.tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        self.embedder = AlbertModel.from_pretrained('albert-base-v2')
        self.gat_classifier = GATClassifier(in_dim, hidden_dim, out_dim, num_heads, num_classes)

        
    def get_tokenize(self, docs):
        sent_tokens = [
            torch.cat(
                [self.tokenizer.encode(
                        sentences[i],
                        add_special_tokens=True,
                        max_length=64,
                        pad_to_max_length=True,
                        return_tensors='pt'
                 ) for i in range(len(sentences))]
            ) for sentences in docs
        ]

        sent_tokens = torch.cat([*sent_tokens])
        return sent_tokens
    
    def get_sentence_embedding(self, word_vecs, offsets):
        '''get node-featrues(setences embedding)'''
        docs = []
        for idx in range(len(offsets) - 1):
            docs.append(word_vecs[ offsets[idx]: offsets[idx]+offsets[idx+1] ])
        
        features = [torch.mean(doc, dim=1).squeeze() for doc in docs]
        return features
    
    def build_graph(self, features_list, threshold=0.2):
        '''get edge_index for GATLayer'''
        edge_index_list = []
        for features in features_list:
            cosine_matrix = 1 - pairwise_distances(features.detach().numpy(), metric="cosine")
            adj_matrix = (cosine_matrix > threshold) * 1

            G = nx.from_numpy_matrix(adj_matrix)

            e1_list = [e1 for e1, _ in list(G.edges)]
            e2_list = [e2 for _, e2 in list(G.edges)]
            edge_index = [e1_list, e2_list]
            edge_index = torch.tensor(edge_index)
            edge_index_list.append(edge_index)

        return edge_index_list
    
    def get_gat_dataloader(self, features_list, edge_index_list, labels_list, batch_size):
        data_list = [
            torch_geometric.data.Data(features, edge_index, y=labels)
                for features, edge_index, labels in zip(features_list, edge_index_list, labels_list)
        ]

        gat_loader = torch_geometric.data.DataLoader(data_list, batch_size=batch_size, shuffle=False)
        return gat_loader
    

    def forward(self, 
                docs, 
                offsets, 
                labels_list, 
                threshold=0.2, 
                batch_size=32):
        
        sent_tokens = self.get_tokenize(docs)
        word_vecs = self.embedder(sent_tokens)[0]
        features_list = self.get_sentence_embedding(word_vecs, offsets)
        edge_index_list = self.build_graph(features_list, threshold)
        
        # dataloader for GATLayer
        gat_dataloader = self.get_gat_dataloader(features_list, edge_index_list, labels_list, batch_size)
        
        output = self.gat_classifier(next(iter(gat_dataloader)))
        return output

In [13]:
net = Summarizer(in_dim=768,
                 hidden_dim=128,
                 out_dim=64,
                 num_heads=2,
                 num_classes=1)

In [19]:
docs = [sentences1, sentences2]
offsets = [0] + [len(sentences1), len(sentences2)]

labels1 = torch.tensor(labels1, dtype=torch.float)
labels2 = torch.tensor(labels2, dtype=torch.float)

labels_list = [labels1, labels2]

In [15]:
output = net(docs, offsets, labels_list)

In [16]:
# output

In [17]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

In [18]:
labels = sample.y
labels = labels.view(-1, output.size()[1])

NameError: name 'sample' is not defined

In [147]:
optimizer.zero_grad()

output = output.view(-1, output.size()[1])
loss = criterion(output, labels)
loss.backward()
optimizer.step()

In [148]:
loss

tensor(0.6993, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)

In [20]:
pytorch_total_params = sum(p.numel() for p in net.parameters() if p.requires_grad)

In [21]:
pytorch_total_params

11910113