In [1]:
!pip install -q rouge_score transformers bert_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m94.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m70.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [24]:
import torch
from typing import List, Dict
from rouge_score import rouge_scorer, scoring
import bert_score
from transformers import AutoTokenizer


START_POPULATION='<pop>'
END_POPULATION='</pop>'
START_INTERVENTION='<int>'
END_INTERVENTION='</int>'
START_OUTCOME='<out>'
END_OUTCOME='</out>'
START_BACKGROUND = '<background>'
END_BACKGROUND = '</background>'
START_REFERENCE = '<ref>'
END_REFERENCE = '</ref>'
START_EVIDENCE = '<evidence>'
END_EVIDENCE = '</evidence>'
SEP_TOKEN = '<sep>'
EXTRA_TOKENS = [
    START_BACKGROUND,
    END_BACKGROUND,
    START_REFERENCE,
    END_REFERENCE,
    SEP_TOKEN,
    START_POPULATION,
    END_POPULATION,
    START_INTERVENTION,
    END_INTERVENTION,
    START_OUTCOME,
    END_OUTCOME,
    START_EVIDENCE,
    END_EVIDENCE,
]


def rouge_scores(
    preds: List[List[torch.Tensor]], targets: List[List[torch.Tensor]],
    tokenizer, use_stemmer=False, use_aggregator=False
):
    # largely copied from https://github.com/huggingface/nlp/blob/master/metrics/rouge/rouge.py#L84
    # and from https://github.com/allenai/ms2/blob/a03ab009e00c5e412b4c55f6ec4f9b49c2d8a7f6/ms2/models/utils.py
    rouge_types = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
    scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=use_stemmer)
    refs, hyps = [], []
    for p, t in zip(preds, targets):
        assert len(p) == len(t)
        refs.extend(p)
        hyps.extend(t)

    if use_aggregator:
        aggregator = scoring.BootstrapAggregator()
        scores = None
    else:
        aggregator = None
        scores = []

    for ref, pred in zip(refs, hyps):
        if isinstance(ref, torch.Tensor):
            ref = tokenizer.decode(ref).lower()
        if isinstance(pred, torch.Tensor):
            pred = tokenizer.decode(pred).lower()
        score = scorer.score(ref, pred)
        if use_aggregator:
            aggregator.add_scores(score)
        else:
            scores.append(score)

    if use_aggregator:
        result = aggregator.aggregate()
    else:
        result = {}
        for key in scores[0]:
            result[key] = list(score[key] for score in scores)

    return result


def get_tokenizer(tokenizer_type: str):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_type, additional_special_tokens=EXTRA_TOKENS)
    return tokenizer


def calculate_rouge(targets: Dict[str, Dict], generated: Dict[str, str]) -> Dict:
    """
    Calculate ROUGE scores
    :param targets: dict of docid -> {'target': target_text}
    :param generated: dict of docid -> generated_text
    :return: dict of ROUGE scores (rouge1, rouge2, rougeL, rougeLsum)
    """
    # copied from https://github.com/allenai/mslr-shared-task/blob/c2218c1a440cf5172d784065b48af2d6c5c50f9a/evaluator/evaluator.py
    print("Computing ROUGE scores...")
    docids = list(targets.keys())
    target_texts = [[targets[docid]['target']] for docid in docids]
    generated_texts = [[generated.get(docid, '')] for docid in docids]

    # rouge scoring
    tokenizer = get_tokenizer('facebook/bart-base')
    rouge_results = rouge_scores(generated_texts, target_texts, tokenizer, use_aggregator=True)
    return rouge_results


def calculate_mid_rouge(targets: Dict[str, Dict], generated: Dict[str, str]) -> Dict:
    """
    Calculate ROUGE scores but only return mid fmeasure.
    :param targets: dict of docid -> {'target': target_text}
    :param generated: dict of docid -> generated_text
    :return: dict of ROUGE scores (rouge1, rouge2, rougeL, rougeLsum)
    """
    results = calculate_rouge(targets, generated)
    return {
        "rouge": results,
        "rouge1": results["rouge1"].mid.fmeasure,
        "rouge2": results["rouge2"].mid.fmeasure,
        "rougeL": results["rougeL"].mid.fmeasure,
        "rougeLsum": results["rougeLsum"].mid.fmeasure,
    }


def calculate_bertscore(
    targets: Dict[str, Dict], generated: Dict[str, str], model_type="roberta-large"
) -> Dict:
    """
    Calculate BERTscore
    :param targets: dict of docid -> {'target': target_text}
    :param generated: dict of docid -> generated_text
    :param model_type: model type for BERTscore. Choose from
        ['bert-base-uncased', 'bert-large-uncased', 'roberta-base',
        'roberta-large']. Default: 'roberta-large'
    :return: dict of BERTscore results (bs_ps, bs_rs, bs_fs) (precision, recall, f1)
    """
    # copied from https://github.com/allenai/mslr-shared-task/blob/c2218c1a440cf5172d784065b48af2d6c5c50f9a/evaluator/evaluator.py
    # original bert score: https://github.com/Tiiiger/bert_score
    print("Computing BERTscore...")
    docids = list(targets.keys())
    target_texts = [targets[docid]['target'] for docid in docids]
    generated_texts = [generated.get(docid, '') for docid in docids]

    # BERTscore
    bs_ps, bs_rs, bs_fs = bert_score.score(generated_texts, target_texts, model_type=model_type)
    return {
        "bs_ps": bs_ps,
        "bs_rs": bs_rs,
        "bs_fs": bs_fs
    }


def calculate_mean_bertscore(
    targets: Dict[str, Dict], generated: Dict[str, str], model_type="microsoft/deberta-xlarge-mnli"
) -> Dict:
    """
    Calculate mean BERTscore
    :param targets: dict of docid -> {'target': target_text}
    :param generated: dict of docid -> generated_text
    :param model_type: model type for BERTscore. Choose from
        ['bert-base-uncased', 'bert-large-uncased', 'roberta-base',
        'roberta-large']. Default: 'roberta-large'
    :return: dict of mean BERTscore results (bs_ps, bs_rs, bs_fs) (precision, recall, f1)
    """
    individual_results = calculate_bertscore(targets, generated, model_type=model_type)

    results = {
        "bertscore_avg_p": torch.mean(individual_results["bs_ps"]).item(),
        "bertscore_avg_r": torch.mean(individual_results["bs_rs"]).item(),
        "bertscore_avg_f": torch.mean(individual_results["bs_fs"]).item(),
        "bertscore_std_p": torch.std(individual_results["bs_ps"]).item(),
        "bertscore_std_r": torch.std(individual_results["bs_rs"]).item(),
        "bertscore_std_f": torch.std(individual_results["bs_fs"]).item(),
    }
    return results

In [25]:
# Test with some toy examples

targets = {
    "doc1": {"target": "The quick brown fox jumps over the lazy dog"},
    "doc2": {"target": "fruit flies like a banana"},
    "doc3": {"target": "fruit flies like a banana"},
    "doc4": {"target": "fruit flies like a banana"},
    "doc5": {"target": "everything is chaotic"},
    "doc6": {"target": "The quick brown fox jumps over the lazy dog"},
}

generated = {
    "doc1": "A lazy dog is under a hopping speedy fox",  # synonym
    "doc2": "some insects are attracted to a yellow fruit",  # one interpretation
    "doc3": "most fruits have the aerodynamic properties of a banana",  # another interpretation
    "doc4": "nothing makes sense",  # completely irrelevant
    "doc5": "nothing makes sense",  # synonym
    "doc6": "The quick brown fox jumps over the lazy dog",  # perfect/identical
}

rouge_results = calculate_rouge(targets, generated)
display(rouge_results)

bertscore_results = calculate_bertscore(targets, generated, "microsoft/deberta-xlarge-mnli")
display(bertscore_results)

Computing ROUGE scores...


{'rouge1': AggregateScore(low=Score(precision=0.12222222222222223, recall=0.0787037037037037, fmeasure=0.0989010989010989), mid=Score(precision=0.35555555555555557, recall=0.2962962962962963, fmeasure=0.32051282051282054), high=Score(precision=0.6333333333333333, recall=0.5972222222222222, fmeasure=0.6068376068376068)),
 'rouge2': AggregateScore(low=Score(precision=0.020833333333333332, recall=0.020833333333333332, fmeasure=0.020833333333333332), mid=Score(precision=0.20833333333333334, recall=0.20833333333333334, fmeasure=0.20833333333333334), high=Score(precision=0.5625, recall=0.5416666666666666, fmeasure=0.548611111111111)),
 'rougeL': AggregateScore(low=Score(precision=0.07037037037037037, recall=0.05787037037037037, fmeasure=0.06267806267806268), mid=Score(precision=0.2740740740740741, recall=0.24074074074074078, fmeasure=0.254985754985755), high=Score(precision=0.6037962962962963, recall=0.5740740740740741, fmeasure=0.5884920634920633)),
 'rougeLsum': AggregateScore(low=Score(pr

Computing BERTscore...


{'bs_ps': tensor([0.7381, 0.5301, 0.5480, 0.3825, 0.6740, 1.0000]),
 'bs_rs': tensor([0.7276, 0.5883, 0.6598, 0.4499, 0.6900, 1.0000]),
 'bs_fs': tensor([0.7328, 0.5577, 0.5987, 0.4135, 0.6819, 1.0000])}

In [26]:
# Test with some toy examples

targets = {
    "28514886": {"target": "Current evidence from systematic review and meta- analysis revealed that probiotics are the most promising intervention in reduction of the incidence of NEC in VLBW neonates . As per the evidence , prebiotics modulate the composition of human intestine microflora to the benefit of the host by suppression of colonization of harmful microorganism and /or the stimulation of bifidobacterial growth , decreased stool viscosity , reduced gastrointestinal transit time , and better feed tolerance ."},
    "18842808": {"target": "The use of glucomannan did not appear to significantly alter any other study endpoints . Pediatric patients , patients receiving dietary modification , and patients with impaired glucose metabolism did not benefit from glucomannan to the same degree . Glucomannan appears to beneficially affect total cholesterol , LDL cholesterol , triglycerides , body weight , and FBG , but not HDL cholesterol or BP"},
}

generated = {
    "28514886": "Retrieve concise conclusion without background: BACKGROUND : Necrotizing enterocolitis ( NEC ) is one of the most destructive diseases associated with the intestine. We aim to determine the effect of a preterm formula containing partially hydrolyzed whey protein, modified vegetable oil with a high & bgr;-palmitic acid content, on the intestinal flora. We hypothesized that enteral supplementation of a prebiotic mixture consisting of neutral oligosaccharides ( (SC)GOS/(LC)FOS ) and acidic oligosaccharides ( AOS ) on intestinal permeability. In a double-blind trial 20 preterm infants ( gestational age 27 ( 24 - 31 ) weeks, postnatal age 42 ( 11 - 84 ) days ), and weight at study entry 1570 ( 1080 - 2300 ) g were enrolled. The infants were randomized to receive either a formula with 8 g/L of either GOS/LCFOS ( 1.5 or 3.0 g/kg ). The stool specimens were quantitatively cultured weekly for the number of bifidobacteria, gastric residue, bowel habits, and feeding tolerance.  Clinical examination including anthropometric measurements, microbiological analysis of fecal sample s, and blood leukocyte population analysis were performed at birth and 6 and 10 weeks... The results showed that the incidence of NEC, the group fed the oligosaccharide supplemented formula increased to the upper range of infant growth, and the placebo group. The incidence of > or = 1 serious infection, as measured by extrusion force ( P=0.006 ), was not significantly different in the supplemented group ( P = 0.056 )..  The intestinal microbiota of infants who received a st and ard formula seems to resemble a more mature gut flora, while the 0.8 g/dL group, 9.7 - 14 % of these neonates.. Conclusion : Neonatal enteric NEC. The intestinal flora of preterm neonates was not different between the 2 groups. and/.}).",
    "18842808": "Retrieve concise conclusion without background: BACKGROUND The purpose of this study was to evaluate the effectiveness of the hydrosoluble fiber glucomannan to a Step-One-Diet in mildly hypercholesterolemic type II diabetic and non-diabetic subjects and to compare the response of these two subject groups to the treatments. MATERIAL / METHODS One hundred and seventy six men and women were included to receive either active fiber substance or placebo in r and omized placebo-controlled studies. The subjects were encouraged not to change their ordinary diets or general lifestyle during the investigation. RESULTS : After a three-days food recall, a balanced diet with adequate caloric intake was provided to all obese children. In all patients before and 2 - 4 months after the intervention, the plasma lipids ( weight, height, weight excess ) and laboratory data ( serum levels of cholesterol, HDL, triglycerides, glucose, fructosamine, glycosylated hemoglobin, RBC, WBC, hemoglobin, iron, calcium, Cu and Zn ) have been determined. Excess weight and triglycerids levels were significantly decreased in treated obese patients than in obese controls 4 months later. Both groups experienced decreases in ( P < 0.01 ) body weight, percent body fat, systolic blood pressure, waist circumference, and plasma glucose levels. After 12 weeks, HDL-C and TAG improved significantly in the fiber ( 10 % and -34 % ) and placebo ( 14 %, -43 % ) groups. The results of lipid profiles did not differ between subject groups. Overall plasma lathosterol concentrations, as well as FBG, and other lipids were lowered ( P<0.05 ). The study to perform a meta- analysis of r, omized controlled trials of glucarannan on plasma lipid, FBG.. and).-..",
}

rouge_results = calculate_mid_rouge(targets, generated)
display(rouge_results)

bertscore_results = calculate_mean_bertscore(targets, generated, "microsoft/deberta-xlarge-mnli")
display(bertscore_results)

Computing ROUGE scores...


{'rouge': {'rouge1': AggregateScore(low=Score(precision=0.4727272727272727, recall=0.09961685823754789, fmeasure=0.16455696202531644), mid=Score(precision=0.4754940711462451, recall=0.10959103781442611, fmeasure=0.1779306549257017), high=Score(precision=0.4782608695652174, recall=0.11956521739130435, fmeasure=0.19130434782608696)),
  'rouge2': AggregateScore(low=Score(precision=0.08823529411764706, recall=0.019230769230769232, fmeasure=0.03184713375796178), mid=Score(precision=0.09041394335511982, recall=0.020524475524475526, fmeasure=0.033416278249243286), high=Score(precision=0.09259259259259259, recall=0.02181818181818182, fmeasure=0.03498542274052478)),
  'rougeL': AggregateScore(low=Score(precision=0.2727272727272727, recall=0.05747126436781609, fmeasure=0.0949367088607595), mid=Score(precision=0.2812911725955204, recall=0.06496751624187906, fmeasure=0.10543936892313338), high=Score(precision=0.2898550724637681, recall=0.07246376811594203, fmeasure=0.11594202898550726)),
  'rougeL

Computing BERTscore...


{'bertscore_avg_p': 0.47608551383018494,
 'bertscore_avg_r': 0.6032038927078247,
 'bertscore_avg_f': 0.5320615768432617,
 'bertscore_std_p': 0.006571157369762659,
 'bertscore_std_r': 0.014886519871652126,
 'bertscore_std_f': 0.0016888242680579424}