# ROUGE Metric Implementation

## imports

In [1]:
import json
from collections import OrderedDict

import numpy as np

import torch
import torch.optim as optim
import torch.nn.functional as F

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

from torch.utils.data import Dataset, DataLoader, random_split

from models import BasicSummarizer
from types_ import *

INFO:transformers.file_utils:PyTorch version 1.4.0 available.


In [5]:
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")

In [6]:
DEVICE

device(type='cuda')

## data load

### 1) Sentence data

In [2]:
%%time

data_path = '../../data/summary/data/train.json'
with open(data_path, 'r', encoding='utf8') as f:
    data = [json.loads(line) for line in f]

Wall time: 4.32 s


In [3]:
sample1 = data[0]

text1 = sample1['doc']
summary1 = sample1['summaries']
labels1 = sample1['labels']
labels1 = labels1.split('\n')
labels1 = [int(label) for label in labels1]

sentences1 = text1.split('\n')

In [6]:
# sentences1

## dataloader

### 1) Summary Dataset

In [8]:
def generate_batch(batch):
    docs = [entry[0] for entry in batch]
    labels = [entry[1] for entry in batch]
    
    offsets = [0] + [len(doc) for doc in docs]
        
    return docs, offsets, labels

In [9]:
class SummaryDataset(Dataset):
    
    def __init__(self, path):
        
        with open(path, 'r', encoding='utf8') as f:
            self.data = [json.loads(line) for line in f]
        
    def __len__(self):
        """Returns the number of data."""
        return len(self.data)
    
    def __getitem__(self, idx):
        sentences = self.data[idx]['doc'].split('\n')
        labels = self.data[idx]['labels'].split('\n')
        labels = [int(label) for label in labels]
        
        return sentences, labels

### 2) dataset split (train, valid, test)

In [10]:
data_path = '../../data/summary/data/train.json'

dataset = SummaryDataset(data_path)

In [11]:
train_size = int(0.6 * len(dataset))
valid_size = int(0.2 * len(dataset))
test_size = len(dataset) - (train_size + valid_size)

train_dataset, valid_dataset, test_dataset \
    = random_split(dataset, [train_size, valid_size, test_size])

### 3) Dataloader for Summarizer

In [322]:
train_dataloader = DataLoader(train_dataset, 
                              batch_size=32, 
                              shuffle=False, 
                              collate_fn=generate_batch)

valid_dataloader = DataLoader(valid_dataset, 
                              batch_size=32, 
                              shuffle=False,
                              collate_fn=generate_batch)

test_dataloader = DataLoader(test_dataset, 
                             batch_size=32, 
                             shuffle=False,
                             collate_fn=generate_batch)

In [323]:
batch = next(iter(train_dataloader))

In [324]:
docs, offsets, labels_list = batch

## loads trained model

In [325]:
save_path = '../04_train_summarizer/save_weights/basicsumarizer_epoch=18_val_acc=0.62_lr005.pth'

model = BasicSummarizer(in_dim=128,
                        hidden_dim=64,
                        out_dim=32,
                        num_heads=2,
                        num_classes=1).to(DEVICE)

model.load_state_dict(torch.load(save_path))

INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model from cache at C:\Users\korea\.cache\torch\transformers\dd1588b85b6fdce1320e224d29ad062e97588e17326b9d05a0b29ee84b8f5f93.c81d4deb77aec08ce575b7a39a989a79dd54f321bfb82c2b54dd35f52f8182cf


<All keys matched successfully>

## Output

In [326]:
model.eval()
with torch.no_grad():
    preds = model(docs, offsets, labels_list)
    preds = torch.sigmoid(preds)

In [327]:
preds = preds.cpu().numpy()
preds = (preds > 0.555).astype(int)
preds = preds.reshape((preds.shape[0], -1))
preds_list = np.split(preds, offsets[1:-1], axis=1)
preds_list = [row.reshape(-1).tolist() for row in preds_list]

## ROUGE Metric Implementation

### 1) save summary results

In [328]:
import os

In [329]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

In [330]:
summary_path = './results/summ'
gold_path = './results/gold'

if not os.path.exists(summary_path):
    os.mkdir(summary_path)

if not os.path.exists(gold_path):
    gold_exist = False
    os.mkdir(gold_path)
else:
    gold_exist = True

In [331]:
r_idx = 0
rouge_scores = []
for doc, preds, labels in zip(docs, preds_list, labels_list):
    summ, gold = [], []
    for sent, pred, label in zip(doc, preds, labels):
        if pred == 1:
            summ.append(sent)
        elif label == 1:
            gold.append(sent)
    
    summ = ' '.join(summ)
    gold = ' '.join(gold)
    
    # write files
    with open(f'{summary_path}/test_summ_{r_idx}.txt', 'w', encoding='utf8') as f:
        f.write(summ)
    with open(f'{gold_path}/gold_summ_{r_idx}.txt', 'w', encoding='utf8') as f:
        f.write(gold)
    
    # rouge score append
    rouge_scores.append(scorer.score(gold, summ))
    
    r_idx += 1

In [332]:
summ

''

In [333]:
gold

"by tahira yaqoob published : 06:21 est , 22 november 2012 updated : 02:52 est , 23 november 2012 a british woman has been sentenced to three months in a dubai jail followed by deportation for having drunken sex in the back of a taxi in the city rebecca blake , 29 , had been arrested earlier this year after partially stripping off in the back of the cab after a 10 - hour drinking binge irishman conor mcredmond , 28 , with whom she was caught , was also convicted and received the same sentence , despite dna tests to find evidence of intercourse coming back negative recruitment consultant blake , from dorking in surrey , was arrested on may 4 after first meeting conor mcredmond just hours earlier their taxi driver alerted police after spotting blake in his rearview mirror with her top off , straddling mcredmond and â\x80\x98 making the sounds of a woman having sex â\x80\x99 jailed : ms rebecca blake ( second right ) and mr conor mcredmond ( left ) chat with their lawyer shaker al shammar

In [334]:
rouge_scores

[{'rouge1': Score(precision=0.42168674698795183, recall=0.1674641148325359, fmeasure=0.2397260273972603),
  'rouge2': Score(precision=0.17073170731707318, recall=0.0673076923076923, fmeasure=0.09655172413793102),
  'rougeL': Score(precision=0.26506024096385544, recall=0.10526315789473684, fmeasure=0.1506849315068493)},
 {'rouge1': Score(precision=0.0, recall=0.0, fmeasure=0.0),
  'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0),
  'rougeL': Score(precision=0, recall=0, fmeasure=0)},
 {'rouge1': Score(precision=0.0, recall=0.0, fmeasure=0.0),
  'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0),
  'rougeL': Score(precision=0, recall=0, fmeasure=0)},
 {'rouge1': Score(precision=0.4533333333333333, recall=0.06938775510204082, fmeasure=0.12035398230088495),
  'rouge2': Score(precision=0.06756756756756757, recall=0.010224948875255624, fmeasure=0.017761989342806393),
  'rougeL': Score(precision=0.25333333333333335, recall=0.03877551020408163, fmeasure=0.06725663716814159)},
 {'