<a href="https://colab.research.google.com/github/EFRA-DH/sgs/blob/main/scoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Prepare Resources:

## Imports:

In [None]:
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from tqdm.autonotebook import tqdm

from typing import List, Callable, Union, Dict

In [2]:
%%capture
# import rouge score
!pip install rouge_score
from rouge_score import rouge_scorer



In [3]:
%%capture
# import LongDocFactScore
!pip install longdocfactscore
from longdocfactscore.ldfacts import LongDocFACTScore



In [4]:
%%capture
# install nltk:
!pip install nltk

# download the 'punkt' resources:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

# import actual requirements:
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [5]:
%%capture
# import BART score:
from urllib.request import urlretrieve
urlretrieve("https://raw.githubusercontent.com/neulab/BARTScore/main/bart_score.py", "bart_score.py")
from bart_score import BARTScorer

## Constants:

In [6]:
# sentence tokenizer shared by the chunking code (Punkt with its default parameters):
SENT_TOKENIZER = PunktSentenceTokenizer()
# use the GPU when torch reports one as available, otherwise fall back to the CPU:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Definitions:

In [97]:
from html.parser import HTMLParser

class HTMLSplitter(HTMLParser):
    """
    Convert HTML into lightly marked-up plain text and split that text into chunks of whole sentences.

    While parsing, bold/italic tags are rendered as `**`/`*`, list items as `  - `, headings as bold
    lines that are preceded by an empty line, and table tags are kept verbatim. Calling the instance
    parses a document and returns the chunks (see `__call__`).
    """

    # tags that are copied verbatim into the extracted text:
    _TABLE_TAGS = ('table', 'tr', 'th', 'td')

    # heading tags (matching on a leading 'h' would also catch <html>, <head>, <header> and <hr>):
    _HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.text = ''

    def __call__(self, html:str, window:int, tokenize:Callable[[str], List]=word_tokenize):
        """
        Parse `html` and split the extracted text into chunks of consecutive sentences.

        A new chunk is started at the beginning of every paragraph (paragraphs are separated by an
        empty line in the extracted text) and whenever the token budget of the current chunk is used
        up. Sentences are never split across chunks.

        :param html: HTML document to be split.
        :param window: Token budget per chunk; also the maximum number of tokens kept per sentence.
        :param tokenize: Function that splits a sentence into tokens (default: nltk's word_tokenize).

        :return: Dictionary of parallel lists with the keys 'part' (chunk index as string),
                 'paragraph' (paragraph index as string), 'tokens' and 'texts'.
        """
        parts, paragraphs, tokens, texts = [], [], [], []

        # start from a clean state so that an instance can be reused for several documents:
        self.reset()
        self.text = ''

        self.feed(html)
        # flush text that the parser is still buffering (e.g. trailing text with a dangling '&'):
        self.close()
        html = self.get_data()

        # split text in paragraphs:
        part = 0
        cursor = 0
        remaining_tokens = -1
        for paragraph, txt in enumerate(html.split('\n\n')):
            # span_tokenize yields end-exclusive offsets, i.e. the sentence is txt[i:j]:
            for i, j in SENT_TOKENIZER.span_tokenize(txt):
                sentence = tokenize(txt[i:j])
                remaining_tokens -= len(sentence) - 1

                if remaining_tokens <= 0:
                    # budget used up (or first sentence of a paragraph) -> open a new chunk:
                    parts.append(str(part))
                    paragraphs.append(str(paragraph))
                    tokens.append(sentence[:window])
                    texts.append(txt[i:j])
                    remaining_tokens = window - len(sentence) + 1

                    part += 1
                    cursor = i

                else:
                    # sentence still fits -> extend the current chunk up to the end of this sentence:
                    tokens[-1].extend(sentence[:window])
                    texts[-1] = txt[cursor:j]

            # force a new chunk at the start of the next paragraph:
            remaining_tokens = -1

        return {'part':parts, 'paragraph':paragraphs, 'tokens':tokens, 'texts':texts}

    @property
    def ends_with_space(self):
        """True if the text collected so far is empty or ends with whitespace or a verbatim tag ('>')."""
        if len(self.text) == 0: return True
        else: return self.text[-1] in (' ', '\n', '\r', '\t', '>')

    @property
    def ends_with_newline(self):
        """True if the text collected so far is empty or ends with a line break, a list bullet or a verbatim tag ('>')."""
        if self.text.endswith('\n  - '): return True
        elif len(self.text) == 0: return True
        else: return self.text[-1] in ('\n', '\r', '>')

    def handle_starttag(self, tag, attrs):
        """Append the plain-text representation of an opening tag to the collected text."""
        if tag in self._TABLE_TAGS:
            self.text += f'<{tag}>' if self.ends_with_newline else f'\n<{tag}>'

        elif tag == 'b' or tag == 'strong':
            self.text += '**' if self.ends_with_space else ' **'

        elif tag == 'i':
            # always emit the opening marker, since handle_endtag always emits the closing one:
            self.text += '*' if self.ends_with_space else ' *'

        elif tag == 'li':
            self.text += '  - ' if self.ends_with_newline else '\n  - '

        elif tag == 'p':
            if not self.ends_with_newline:
                self.text += '\n'

        elif tag in self._HEADING_TAGS:
            # the empty line in front of a heading makes it start a new paragraph in __call__:
            self.text += '\n**' if self.ends_with_newline else '\n\n**'

        elif not self.ends_with_space:
            # any other tag only acts as a word separator:
            self.text += ' '

    def handle_endtag(self, tag):
        """Append the plain-text representation of a closing tag to the collected text."""
        if tag in self._TABLE_TAGS:
            self.text += f'</{tag}>'

        elif tag == 'b' or tag == 'strong':
            self.text += '** '

        elif tag == 'i':
            self.text += '* '

        elif tag == 'p':
            if not self.ends_with_space:
                self.text += '\n'

        elif tag in self._HEADING_TAGS:
            self.text += '**\n'

        elif not self.ends_with_space:
            self.text += ' '

    def handle_data(self, data):
        """Append text content with line breaks removed and surrounding whitespace stripped."""
        self.text += data.replace('\n', '').replace('\r', '').strip()

    def get_data(self):
        """Return the text collected so far without leading or trailing whitespace."""
        return self.text.strip()


# 2. Compute Metrics:

In [26]:
# download the data file from Google Drive and load it; missing values become empty strings:
!gdown 15go23FfbJqJ2kEe-WlWXoB6n96BtbErQ
data = pd.read_csv('manual_and_llama70b_summ.csv').fillna('')

data.head()

Downloading...
From (original): https://drive.google.com/uc?id=15go23FfbJqJ2kEe-WlWXoB6n96BtbErQ
From (redirected): https://drive.google.com/uc?id=15go23FfbJqJ2kEe-WlWXoB6n96BtbErQ&confirm=t&uuid=c300a393-9e01-4201-8972-0ad1643e545c
To: /content/manual_and_llama70b_summ.csv
100% 1.30G/1.30G [00:11<00:00, 115MB/s]


Unnamed: 0,dataset,english_content_url,english_summary,english_title,original_content_url,original_summary,original_title,post_id,source_id,source_name,topics,types,html,cleaned_html,llama70b_title,llama70b_summary
0,manual_summary,gs://c-labs1-efra/english/7472062c-1335-488e-b...,Meat processors are in talks with the governme...,Carbon dioxide 'threatens food security' says ...,gs://c-labs1-efra/original/7472062c-1335-488e-...,Meat processors are in talks with the governme...,Carbon dioxide 'threatens food security' says ...,7472062c-1335-488e-b319-4f2e73da19b5,004bbb24-2d5e-4215-b76c-c947f277771f,BBC,[{'id': '57097913-1379-4424-a4fb-d619cf64e535'...,[{'id': '6a7393f6-21c9-4622-8996-70361417c714'...,"<div data-digico-rtf-version=""1.0""><figure><im...",Getty Images \nPoultry producers said the shor...,UK Meat Industry Faces Carbon Dioxide Shortage...,The UK meat industry is facing a crisis due to...
1,manual_summary,gs://c-labs1-efra/english/c5c108fa-9b42-4fc1-a...,A new type of recyclable meat packaging tray d...,Plastic pollution: New meat tray 'could save t...,gs://c-labs1-efra/original/c5c108fa-9b42-4fc1-...,A new type of recyclable meat packaging tray d...,Plastic pollution: New meat tray 'could save t...,c5c108fa-9b42-4fc1-aa2b-aedbafb0f513,004bbb24-2d5e-4215-b76c-c947f277771f,BBC,[{'id': '57097913-1379-4424-a4fb-d619cf64e535'...,[{'id': '6a7393f6-21c9-4622-8996-70361417c714'...,"<div data-digico-rtf-version=""1.0""><figure></f...",**A new type of recyclable meat packaging tray...,Revolutionary Recyclable Meat Packaging Tray R...,A Swansea University student has designed a 10...
2,manual_summary,gs://c-labs1-efra/english/1b5fd7b6-f92b-447a-8...,Racehorses in Great Britain will not be allowe...,New rules aimed at ensuring racehorses do not ...,gs://c-labs1-efra/original/1b5fd7b6-f92b-447a-...,Racehorses in Great Britain will not be allowe...,New rules aimed at ensuring racehorses do not ...,1b5fd7b6-f92b-447a-8d8d-877af6b7171b,004bbb24-2d5e-4215-b76c-c947f277771f,BBC,[{'id': '0e3de27e-3394-4477-86de-898fde8e5566'...,[{'id': '6a7393f6-21c9-4622-8996-70361417c714'...,"<div data-digico-rtf-version=""1.0""><figure><im...","Image caption, The new rule change applies to ...",UK Racehorses to be Signed Out of Food Chain t...,The British Horseracing Authority (BHA) has in...
3,manual_summary,gs://c-labs1-efra/english/92ea9206-0817-4045-9...,The end of the Government's deal with CF ferti...,Concerns over food shortages as CO2 deal ends,gs://c-labs1-efra/original/92ea9206-0817-4045-...,The end of the Government's deal with CF ferti...,Concerns over food shortages as CO2 deal ends,92ea9206-0817-4045-944f-d8c2e70d5238,004bbb24-2d5e-4215-b76c-c947f277771f,BBC,[{'id': '0e3de27e-3394-4477-86de-898fde8e5566'...,[{'id': '6a7393f6-21c9-4622-8996-70361417c714'...,"<div data-digico-rtf-version=""1.0""><figure><im...",**UK food and drink firms say they remain conc...,UK Food and Drink Firms Concerned About Supply...,UK food and drink firms are worried about pote...
4,manual_summary,gs://c-labs1-efra/english/ab4fcc57-e029-4c7a-a...,"The new agreement, called the Windsor Framewor...",EU and UK strike new deal over post-Brexit tra...,gs://c-labs1-efra/original/ab4fcc57-e029-4c7a-...,"The new agreement, called the Windsor Framewor...",EU and UK strike new deal over post-Brexit tra...,ab4fcc57-e029-4c7a-a7bd-f3a675bb1d0c,724d0920-41e4-4b5e-bf2a-717df4e75359,CNN,[{'id': '0e3de27e-3394-4477-86de-898fde8e5566'...,[{'id': '6a7393f6-21c9-4622-8996-70361417c714'...,"<div data-digico-rtf-version=""1.0""><figure><pi...",London CNN —\nBritain and the European Union h...,UK and EU Reach Agreement on New Trade Rules f...,"The UK and EU have reached a deal, known as th..."


## Overall Summary Metrics:

In [17]:
def score(originals:List[str], summaries:List[str], rouge_types:List[str]=['rouge1', 'rougeL'], device:Union[str, torch.device, int]=DEVICE) -> pd.DataFrame:
    """
    Score every summary against its source text with ROUGE and LongDocFACTScore.

    :param originals: List of original texts.
    :param summaries: List of summaries, aligned with `originals`.
    :param rouge_types: List of the names of specific ROUGE-score types (default: ['rouge1', 'rougeL']).
    :param device: Torch device handed to LongDocFACTScore.

    :return: Pandas DataFrame with one row per text/summary pair, one column per ROUGE type
             (F-measure) and a column 'ldfacts'.
    """
    # both inputs must be aligned one-to-one:
    assert len(originals) == len(summaries)

    # ROUGE F-measures, one list per requested type:
    rouge = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
    results = {name: [] for name in rouge_types}
    pairs = zip(originals, summaries)
    for original, summary in tqdm(pairs, total=len(originals), desc='Computing ROUGE'):
        per_type = rouge.score(original, summary)
        for name, value in per_type.items():
            results[name].append(value.fmeasure)

    # LongDocFACTScore is computed on the complete lists at once:
    fact_scorer = LongDocFACTScore(device=device)
    results['ldfacts'] = fact_scorer.score_src_hyp_long(originals, summaries)

    return pd.DataFrame(results)

In [18]:
# overall scores of the 'english_summary' column against the 'cleaned_html' column:
scores_manual = score(data['cleaned_html'], data['english_summary'])
# keep a copy of the scores on disk:
scores_manual.to_csv('scores_manual.csv')

scores_manual.head()

Computing ROUGE:   0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,rouge1,rougeL,ldfacts
0,0.151762,0.081301,-3.342417
1,0.462025,0.408228,-2.116693
2,0.136662,0.120894,-2.470614
3,0.127527,0.07776,-4.562559
4,0.153998,0.13228,-1.74545


In [19]:
# overall scores of the 'llama70b_summary' column against the 'cleaned_html' column:
scores_llama70b = score(data['cleaned_html'], data['llama70b_summary'])
# keep a copy of the scores on disk:
scores_llama70b.to_csv('scores_llama70b.csv')

scores_llama70b.head()

Computing ROUGE:   0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,rouge1,rougeL,ldfacts
0,0.23822,0.157068,-3.681248
1,0.257143,0.157143,-3.223632
2,0.191677,0.131148,-2.346488
3,0.209408,0.169954,-2.512157
4,0.091837,0.073469,-3.759611


## Paragraph-Based Metrics:

In [138]:
class ParagraphScorer:
    """
    Chunk based scorer: splits every original document into chunks, scores each chunk against the
    summary and reports how concentrated the score mass is on a few chunks.
    """

    def __init__(self, score:str='rougeL', max_paragraph_size:Union[int, float, None]=None, device:Union[str, torch.device, int]=DEVICE):
        """
        Create a new paragraph based scorer.

        :param score: Base score; either the name of a ROUGE type (e.g. 'rougeL') or 'bart'.
        :param max_paragraph_size: Maximum size of the chunks in tokens. A float <= 1 is interpreted as
                                   a fraction of the tokens of the original text. `None` uses the length
                                   of the summary for ROUGE and 1024 for BART.
        :param device: Torch device to be used with BART-score.

        :raises ValueError: If `score` is neither a ROUGE type nor 'bart'.
        """

        if score.startswith('rouge'):
            self.max_paragraph_size = max_paragraph_size
            scorer                  = rouge_scorer.RougeScorer([score], use_stemmer=True)
            self.scoring_fcn        = lambda originals, summaries: [scorer.score(o, s)[score].fmeasure for o, s in zip(originals, summaries)]

        elif score == 'bart':
            self.max_paragraph_size = max_paragraph_size if max_paragraph_size is not None else 1024

            # BART needs an integer token limit: use the resolved chunk size (never `None`) and fall
            # back to 1024 when the chunk size is only known relative to the document length:
            max_length = self.max_paragraph_size
            if isinstance(max_length, float):
                max_length = 1024 if max_length <= 1. else int(max_length)

            scorer                  = BARTScorer(device=device, max_length=max_length)
            self.scoring_fcn        = lambda originals, summaries: np.exp(scorer.score(originals, summaries))

        else: raise ValueError(f'Unknown score "{score}"')

    def __call__(self, originals:List[str], summaries:List[str], **kwargs):
        """
        Shorthand for `score`.

        :param originals: List of original texts (HTML).
        :param summaries: List of summaries, aligned with `originals`.
        :param **kwargs:  Optional keyword arguments passed on when calling the HTMLSplitter.

        :return: List with one value per document (see `score`).
        """
        return self.score(originals, summaries, **kwargs)

    def score(self, originals:List[str], summaries:List[str], **kwargs):
        """
        Compute scores per chunk in the original document and report the fraction of chunks needed to get half the total score.

        :param originals: List of original texts (HTML).
        :param summaries: List of summaries, aligned with `originals`.
        :param **kwargs:  Optional keyword arguments passed on when calling the HTMLSplitter.

        :return: List with one value per document: the fraction of chunks (highest scoring first) that
                 is needed to reach half of the summed chunk scores; NaN if a document has no chunks.
        """
        scores = []
        for original, summary in zip(tqdm(originals), summaries):
            splitter = HTMLSplitter()

            # resolve the chunk size for this document:
            n = self.max_paragraph_size
            if n is None: n = len(word_tokenize(summary))
            elif isinstance(n, float):
                if n <= 1.: n = int(n*len(word_tokenize(original)))
                else: n = int(n)  # absolute size given as float; must be an int for slicing

            parts_text = splitter(original, n, **kwargs)
            texts = parts_text['texts']

            # documents without any text have no chunks to score:
            if len(texts) == 0:
                scores.append(np.nan)
                continue

            scores_text = self.scoring_fcn(texts, [summary]*len(texts))

            # The ascending cumulative sum exceeds half the total exactly for the highest scoring
            # chunks that are needed to reach half of the total score:
            scores.append(np.mean(np.cumsum(np.sort(scores_text)) > (.5*np.sum(scores_text))))

        return scores

In [139]:
def score_paragraphs(originals:List[str], summaries:List[str], max_paragraph_size:int=20, rouge_types:List[str]=['rouge1', 'rougeL'], device:Union[str, torch.device, int]=DEVICE) -> pd.DataFrame:
    """
    Compute chunk based metrics for summary evaluation.

    :param originals: List of original texts (HTML).
    :param summaries: List of summaries, aligned with `originals`.
    :param max_paragraph_size: Maximum size of the chunks the original texts are split into (default: 20).
    :param rouge_types: List of the names of specific ROUGE-score types (default: ['rouge1', 'rougeL']).
    :param device: Torch device to be used with BART-score.

    :return: Pandas DataFrame of scores with one column per ROUGE type and a column 'bart'.
    """
    # just to be sure...
    assert len(originals) == len(summaries)

    # compute scores:
    scores = {}

    # compute ROUGE-F1:
    for metric in rouge_types:
        scorer = ParagraphScorer(metric, max_paragraph_size=max_paragraph_size, device=device)
        scores[metric] = scorer.score(originals, summaries)

    # compute BART-Score (forward the requested device instead of silently using the default one):
    scorer = ParagraphScorer('bart', max_paragraph_size=max_paragraph_size, device=device)
    scores['bart'] = scorer.score(originals, summaries)

    return pd.DataFrame(scores)

In [140]:
# chunk based scores of the 'english_summary' column against the raw 'html' column:
scores_density = score_paragraphs(data['html'], data['english_summary'])
# write to a file of its own; 'scores_llama70b.csv' already holds the overall LLaMA scores:
scores_density.to_csv('scores_density.csv')

scores_density.head()

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]