# Extractive Summarization using LSTM

## imports

In [1]:
import json
import numpy as np
import networkx as nx

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, random_split

import torch_geometric
from torch_geometric.nn import GATConv

from transformers import AlbertTokenizer, AlbertModel, AlbertConfig

from sklearn.metrics import pairwise_distances

In [2]:
# Pick the compute device once: GPU when CUDA is available, otherwise CPU.
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda") if USE_CUDA else torch.device("cpu")

In [3]:
# Echo the selected device so the cell output shows whether CUDA was picked up.
DEVICE

device(type='cuda')

## Extractive Summarization architecture

### 1) LSTM Sentence Encoder

In [4]:
class LSTM(nn.Module):
    """Encode padded token-id sequences into one fixed-size vector each.

    Pipeline: embedding lookup -> (bi)LSTM -> mean over the sequence
    dimension -> linear projection down to ``hidden_dim``.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size; also the size of the returned vectors.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout probability applied between stacked LSTM layers.
            It only takes effect when ``num_layers > 1``.
    """

    def __init__(self,
                 vocab_size,
                 embed_dim=256,
                 hidden_dim=128,
                 num_layers=2,
                 bidirectional=True,
                 dropout=0.2):
        super(LSTM, self).__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.num_directs = 2 if bidirectional else 1
        self.dropout = dropout

        self.embed = nn.Embedding(vocab_size, embed_dim)
        # The dropout argument used to be stored but never applied; pass it
        # to nn.LSTM. A single-layer LSTM has no inter-layer connection to
        # drop, so use 0.0 there to avoid the PyTorch warning.
        self.bilstm = nn.LSTM(embed_dim, hidden_dim,
                              num_layers=num_layers,
                              batch_first=True,
                              bidirectional=bidirectional,
                              dropout=dropout if num_layers > 1 else 0.0)
        self.linear = nn.Linear(self.num_directs * hidden_dim, hidden_dim)

    def init_hidden(self, batch_size, device=None, dtype=None):
        """Return zero-filled initial ``(hidden, cell)`` states.

        Args:
            batch_size: Number of sequences in the batch.
            device: Device to allocate the states on (default: PyTorch's
                default device, as before).
            dtype: Dtype of the states (default: PyTorch's default dtype).

        Returns:
            Two tensors of shape
            ``(num_layers * num_directions, batch_size, hidden_dim)``.
        """
        shape = (self.num_layers * self.num_directs, batch_size, self.hidden_dim)
        hidden = torch.zeros(shape, device=device, dtype=dtype)
        cell = torch.zeros(shape, device=device, dtype=dtype)
        return hidden, cell

    def forward(self, sents):
        """Embed and encode a batch of token-id sequences.

        Args:
            sents: Integer tensor of token ids, indexed as (batch, seq).

        Returns:
            Float tensor of shape ``(batch, hidden_dim)``.
        """
        x = self.embed(sents)

        # Allocate the initial states on the same device as the input rather
        # than on a module-level global, so the encoder works wherever the
        # model has been moved.
        h_0, c_0 = self.init_hidden(x.size(0), device=x.device, dtype=x.dtype)

        # batch_first=True: output is (batch, seq, num_directions * hidden_dim)
        output, _ = self.bilstm(x, (h_0, c_0))
        output = torch.mean(output, dim=1)
        return self.linear(output)

### 2) Summarizer

In [33]:
class NaiveSummarizer(nn.Module):
    """Score every sentence of every document for extractive summarisation.

    Sentences are tokenised with the ALBERT tokenizer, encoded into one
    vector each by the ``LSTM`` embedder, regrouped per document (left-padded
    with zero rows to the longest document in the batch), run through a
    document-level LSTM, and projected to ``num_classes`` logits per
    sentence position.

    Args:
        num_classes: Number of logits produced per sentence position.
    """

    def __init__(self,
                 num_classes=1):
        super(NaiveSummarizer, self).__init__()

        self.tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        self.tokenizer.padding_side = 'left'

        self.embedder = LSTM(self.tokenizer.vocab_size)
        self.lstm = nn.LSTM(128, 64, 1, batch_first=True, bidirectional=False)
        self.fc = nn.Linear(64, num_classes)

    def get_tokenize(self, docs):
        """Tokenise every sentence of every document into one stacked tensor.

        Args:
            docs: List of documents, each a list of sentence strings.

        Returns:
            The per-sentence token-id tensors concatenated along dim 0, in
            document order then sentence order.
        """
        sent_tokens = [
            self.tokenizer.encode(
                sentence,
                add_special_tokens=True,
                max_length=64,
                pad_to_max_length=True,
                return_tensors='pt'
            )
            for sentences in docs
            for sentence in sentences
        ]
        return torch.cat(sent_tokens)

    def get_sentence_embedding(self, word_vecs, offsets):
        """Regroup flat sentence vectors into a left-padded per-document batch.

        Args:
            word_vecs: Tensor with one row per sentence, all documents
                concatenated in order.
            offsets: ``[0, len(doc_0), len(doc_1), ...]`` as produced by
                ``generate_batch``: a leading 0 followed by the sentence
                count of each document (lengths, not cumulative offsets).

        Returns:
            Tensor of shape ``(num_docs, max_doc_len, feature_dim)`` on the
            same device as ``word_vecs``. Shorter documents are padded with
            zero rows at the front.
        """
        lengths = list(offsets[1:])
        feat_dim = word_vecs.size(-1)
        if not lengths:
            return word_vecs.new_zeros((0, 0, feat_dim))

        maxlen = max(lengths)
        padded = []
        start = 0
        for length in lengths:
            # offsets holds lengths, so the start of each document is the
            # running sum of the preceding lengths.
            doc_vecs = word_vecs[start:start + length]
            start += length
            # Pad with torch ops (no numpy round trip) so gradients still
            # reach the sentence embedder. F.pad takes pairs from the last
            # dim backwards: (0, 0) leaves features alone, (pad, 0) adds
            # zero rows in front.
            padded.append(F.pad(doc_vecs, (0, 0, maxlen - length, 0)))

        return torch.stack(padded)

    def init_hidden(self, batch_size, device=None):
        """Return zero-filled initial ``(hidden, cell)`` states for ``self.lstm``.

        Args:
            batch_size: Number of documents in the batch.
            device: Device to allocate on (default: PyTorch's default device).

        Returns:
            Two tensors of shape
            ``(num_layers * num_directions, batch_size, hidden_size)``.
        """
        num_directs = 2 if self.lstm.bidirectional else 1
        shape = (self.lstm.num_layers * num_directs, batch_size, self.lstm.hidden_size)
        hidden = torch.zeros(shape, device=device)
        cell = torch.zeros(shape, device=device)
        return hidden, cell

    def forward(self,
                docs,
                offsets,
                labels_list,
                threshold=0.2,
                batch_size=32):
        """Compute per-sentence logits for a batch of documents.

        Args:
            docs: List of documents, each a list of sentence strings.
            offsets: ``[0, len(doc_0), len(doc_1), ...]`` from ``generate_batch``.
            labels_list: Unused here; kept so existing callers still work.
            threshold: Unused here; kept so existing callers still work.
            batch_size: Unused here; kept so existing callers still work.

        Returns:
            Tensor of shape ``(num_docs, max_doc_len, num_classes)`` holding
            raw logits. Padded positions are included.
        """
        # Follow the module's own parameters instead of a global device.
        device = self.fc.weight.device

        sent_tokens = self.get_tokenize(docs).to(device)
        word_vecs = self.embedder(sent_tokens)
        features = self.get_sentence_embedding(word_vecs, offsets)

        h_0, cell = self.init_hidden(features.size(0), device=features.device)

        output, _ = self.lstm(features, (h_0, cell))
        return self.fc(output)

In [34]:
# Build the summarizer (one logit per sentence) and move it to the chosen device.
net = NaiveSummarizer(num_classes=1)
net = net.to(DEVICE)

## dataloader

### 1) Summary Dataset

In [37]:
# Scratch check: list repetition builds the run of zeros that generate_batch
# prepends to short label lists.
[0] * 8

[0, 0, 0, 0, 0, 0, 0, 0]

In [38]:
def generate_batch(batch):
    """Collate ``(sentences, labels)`` pairs into one batch.

    Args:
        batch: Sequence of entries whose first item is a document (list of
            sentences) and whose second item is that document's label list.

    Returns:
        A ``(docs, offsets, pad_labels)`` tuple where ``offsets`` is
        ``[0, len(doc_0), len(doc_1), ...]`` and ``pad_labels`` holds each
        label list left-padded with zeros to the largest value in ``offsets``.
    """
    docs, labels = [], []
    for entry in batch:
        docs.append(entry[0])
        labels.append(entry[1])

    offsets = [0]
    offsets.extend(len(doc) for doc in docs)

    longest = max(offsets)
    # Zeros go in front, matching the left-padding used for the features.
    pad_labels = [[0] * (longest - len(label)) + label for label in labels]

    return docs, offsets, pad_labels

In [39]:
class SummaryDataset(Dataset):
    """Dataset over a JSON-lines file of documents and per-sentence labels.

    Each line of the file is a JSON object with a ``'doc'`` string and a
    ``'labels'`` string. Both are newline-separated, and the label entries
    are parsed as integers.
    """

    def __init__(self, path):
        """Read and parse every line of the file at ``path``."""
        records = []
        with open(path, 'r', encoding='utf8') as handle:
            for raw_line in handle:
                records.append(json.loads(raw_line))
        self.data = records

    def __len__(self):
        """Return the number of records loaded."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(sentences, labels)`` for the record at ``idx``."""
        record = self.data[idx]
        sentences = record['doc'].split('\n')
        labels = list(map(int, record['labels'].split('\n')))
        return sentences, labels

### 2) dataset split (train, valid, test)

In [40]:
# Load the full JSON-lines corpus; it is split into train/valid/test below.
data_path = '../../data/summary/data/train.json'
dataset = SummaryDataset(path=data_path)

In [41]:
# 60% train / 20% validation; the test split takes whatever remains, so the
# three sizes always add up to len(dataset).
train_size, valid_size = (int(frac * len(dataset)) for frac in (0.6, 0.2))
test_size = len(dataset) - train_size - valid_size

train_dataset, valid_dataset, test_dataset = random_split(
    dataset, [train_size, valid_size, test_size]
)

### 3) Dataloader for Summarizer

In [42]:
# All three loaders share one configuration: 16 documents per batch, no
# shuffling, and generate_batch to collate variable-length documents.
train_dataloader, valid_dataloader, test_dataloader = (
    DataLoader(split, batch_size=16, shuffle=False, collate_fn=generate_batch)
    for split in (train_dataset, valid_dataset, test_dataset)
)

In [43]:
# Pull the first collated batch from the training loader for a smoke test.
batch = next(iter(train_dataloader))

In [44]:
# generate_batch returns (docs, offsets, pad_labels); unpack in that order.
docs, offsets, labels_list = batch

In [45]:
# Number of documents in the batch, then the longest document's sentence count.
print(len(docs), max(offsets), sep='\n')

16
49


In [46]:
# Forward pass on the sample batch. labels_list is accepted by forward() but
# not used inside it.
output = net(docs, offsets, labels_list)

In [47]:
# Shape check: (documents in batch, padded sentences per document, num_classes).
output.size()

torch.Size([16, 49, 1])

In [48]:
# Binary cross-entropy on raw logits (the model applies no sigmoid), optimised
# with Adam over every parameter of the summarizer.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=net.parameters(), lr=1e-3)

In [49]:
# Turn the zero-padded label lists into one float tensor laid out as
# (documents, padded sentences), on the same device as the model output.
labels_list = [torch.tensor(doc_labels, dtype=torch.float) for doc_labels in labels_list]

labels = torch.cat(labels_list).view(-1, output.size(1)).to(DEVICE)

In [50]:
# One optimisation step on the sample batch.
# Clear gradients left over from any earlier backward pass.
optimizer.zero_grad()

# Collapse (docs, sentences, 1) to (docs, sentences) so it lines up with
# `labels`; this reshape relies on the last dimension being 1.
output = output.view(-1, output.size()[1])
# NOTE(review): no mask is applied, so zero-padded positions also contribute
# to the loss (as negatives) - confirm this is intended.
loss = criterion(output, labels)
loss.backward()
optimizer.step()

In [51]:
# Display the loss of the single step above.
loss

tensor(0.7017, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)