## Downloading dependencies

In [None]:
# Run these commands in the parent directory of this file, then cd into frank/
# and run the rest of the code in the notebook from there.

# ! git clone https://github.com/artidoro/frank.git
# ! pip install -r frank/requirements.txt

In [None]:
# Install the notebook's runtime dependencies.
# `%pip` is used instead of a mix of `! pip3` / `! pip` so that every package is
# installed into the environment of the running kernel, not into whichever
# interpreter happens to be first on the shell's PATH.
%pip install transformers
%pip install sentencepiece
%pip install protobuf==3.20.0
%pip install torch==1.12.0 --extra-index-url https://download.pytorch.org/whl/cu113
# NOTE(review): nvidia-pyindex is kept ahead of nvidia-cudnn as in the original
# order; presumably the latter relies on the index it configures -- confirm.
%pip install nvidia-pyindex
%pip install nvidia-cudnn
%pip install evaluate
%pip install nltk

In [29]:
import nltk
# Fetch the Punkt sentence-tokenizer data; the scoring loop further down splits
# articles and summaries into sentences with nltk's sent_tokenize.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/alkobakalova/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Benchmarking models for consistency with the input

In [6]:
import json

# Load the benchmark items from disk; each item is later read as a dict with
# "article", "summary", "hash", "reference", "model_name" and "split" keys.
# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying on
# the platform's default locale encoding.
with open("data/benchmark_data.json", "r", encoding="utf-8") as file:
    benchmark_data = json.load(file)

Choose one of the two models below for benchmarking. Experimentally, RoBERTa produces better results.

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hugging Face checkpoint used for NLI scoring. The id lives in one place so the
# tokenizer and the model are always loaded from the same checkpoint.
# To benchmark the XLNet variant instead, use:
# MODEL_NAME = "ynie/xlnet-large-cased-snli_mnli_fever_anli_R1_R2_R3-nli"
MODEL_NAME = "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Accumulates one scored record per benchmark item. The scoring loop resumes
# from len(results), so re-running this cell restarts scoring from scratch.
results = []
# Meaning of each class index in the NLI model's output.
label_to_explanation = {0: "entailment", 1: "neutral", 2 : "contradiction"}
# Prefer the first GPU, but fall back to the CPU so the notebook still runs
# (slowly) on machines without CUDA instead of crashing on model.to(device).
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [7]:
from nltk.tokenize import sent_tokenize

# Score every (article sentence, summary sentence) pair with the NLI model.
# Slicing by len(results) lets an interrupted run pick up where it stopped
# instead of re-scoring items that are already stored in `results`.
for item in benchmark_data[len(results):]:
    article_sents = sent_tokenize(item["article"])
    summary_sents = sent_tokenize(item["summary"])
    # Each matrix is indexed [summary sentence][article sentence].
    entailment_scores = []
    neutral_scores = []
    contradiction_scores = []
    for s_sent in summary_sents:
        entailment_row = []
        neutral_row = []
        contradiction_row = []
        for a_sent in article_sents:
            # The pair is encoded with the article sentence first and the
            # summary sentence second, truncated to 256 tokens.
            tokenized_input_seq_pair = tokenizer.encode_plus(a_sent, s_sent,
                                                     max_length=256,
                                                     return_token_type_ids=True, truncation=True)
            # Build integer tensors directly on the target device. Wrapping the
            # id list in an outer list adds the batch dimension (batch size 1).
            # This avoids the float32 round trip of torch.Tensor(...).long()
            # and the extra host-to-device copy.
            input_ids = torch.tensor([tokenized_input_seq_pair['input_ids']],
                                     dtype=torch.long, device=device)
            token_type_ids = torch.tensor([tokenized_input_seq_pair['token_type_ids']],
                                          dtype=torch.long, device=device)
            attention_mask = torch.tensor([tokenized_input_seq_pair['attention_mask']],
                                          dtype=torch.long, device=device)

            # Inference only: no gradients are needed.
            with torch.no_grad():
                outputs = model(input_ids,
                                attention_mask=attention_mask,
                                token_type_ids=token_type_ids,
                                labels=None)
            # Softmax over the class dimension of the first model output; the
            # class order is the one listed in label_to_explanation.
            scores = torch.softmax(outputs[0], dim=1)[0].tolist()
            entailment_row.append(scores[0])
            neutral_row.append(scores[1])
            contradiction_row.append(scores[2])
        entailment_scores.append(entailment_row)
        neutral_scores.append(neutral_row)
        contradiction_scores.append(contradiction_row)
    # Keep the item's metadata next to its score matrices.
    results.append({
        "article" : item["article"],
        "summary": item["summary"],
        "hash" : item["hash"],
        "reference" : item["reference"],
        "model_name" : item["model_name"],
        "split" : item["split"],
        "entailment_scores" : entailment_scores,
        "neutral_scores": neutral_scores,
        "contradiction_scores": contradiction_scores
    })

In [10]:
# A score of 1 indicates that there were no such errors in the summary; a score of 0 indicates that every sentence contained such an error.
# contradiction_scores layout: one inner list per summary sentence, holding one score per article sentence.

import numpy as np

def agg_max(res):
    """Set ``res["score"]`` to the largest ``1 - contradiction`` over all sentence pairs.

    ``res["contradiction_scores"]`` is read as a list of lists of numbers.
    The dict is modified in place and the same object is returned.
    """
    complements = []
    for row in res["contradiction_scores"]:
        complements.extend(1 - value for value in row)
    res["score"] = max(complements)
    return res
                
def agg_min(res):
    """Set ``res["score"]`` to the smallest ``1 - contradiction`` over all sentence pairs.

    ``res["contradiction_scores"]`` is read as a list of lists of numbers.
    The dict is modified in place and the same object is returned.
    """
    complements = []
    for row in res["contradiction_scores"]:
        for value in row:
            complements.append(1 - value)
    res["score"] = min(complements)
    return res

def agg_avg(res):
    """Set ``res["score"]`` to the mean of ``1 - contradiction`` over all sentence pairs.

    ``res["contradiction_scores"]`` is read as a list of lists of numbers.
    The dict is modified in place and the same object is returned.
    """
    complements = []
    for row in res["contradiction_scores"]:
        for value in row:
            complements.append(1 - value)
    pair_count = len(complements)
    res["score"] = sum(complements) / pair_count
    return res

def true_agg(res):
    """Average, over the rows of the contradiction matrix, of ``1 - max(row)``.

    Each inner list of ``res["contradiction_scores"]`` is one row. The result
    is stored in ``res["score"]``; the dict is modified in place and returned.
    """
    per_row = []
    for row in res["contradiction_scores"]:
        worst = max(list(row))
        per_row.append(1 - worst)
    res["score"] = sum(per_row) / len(per_row)
    return res

def true_agg_r(res):
    """Average, over the columns of the contradiction matrix, of ``1 - max(column)``.

    ``res["contradiction_scores"]`` is converted to a NumPy array and
    transposed, so the maximum is taken along the other axis than in
    ``true_agg``. The result is stored in ``res["score"]``; the dict is
    modified in place and returned.
    """
    transposed = np.transpose(np.array(res["contradiction_scores"]))
    per_column = []
    for column in transposed:
        per_column.append(1 - max(list(column)))
    res["score"] = sum(per_column) / len(per_column)
    return res

def true_agg_2(res):
    """Average, over the rows of the contradiction matrix, of ``min(1 - value)``.

    Each inner list of ``res["contradiction_scores"]`` is one row. The result
    is stored in ``res["score"]``; the dict is modified in place and returned.
    """
    per_row = []
    for row in res["contradiction_scores"]:
        complements = [1 - value for value in row]
        per_row.append(min(complements))
    res["score"] = sum(per_row) / len(per_row)
    return res

def true_agg_2_r(res):
    """Average, over the columns of the contradiction matrix, of ``min(1 - value)``.

    ``res["contradiction_scores"]`` is converted to a NumPy array and
    transposed, so the minimum is taken along the other axis than in
    ``true_agg_2``. The result is stored in ``res["score"]``; the dict is
    modified in place and returned.
    """
    transposed = np.transpose(np.array(res["contradiction_scores"]))
    per_column = []
    for column in transposed:
        complements = [1 - value for value in column]
        per_column.append(min(complements))
    res["score"] = sum(per_column) / len(per_column)
    return res

def agg_contradiction_max(res):
    """Set ``res["score"]`` to ``1`` minus the largest contradiction score of any sentence pair.

    ``res["contradiction_scores"]`` is read as a list of lists of numbers.
    The dict is modified in place and the same object is returned.
    """
    flat = []
    for row in res["contradiction_scores"]:
        flat.extend(row)
    highest = max(flat)
    res["score"] = 1 - highest
    return res
                
def agg_contradiction_min(res):
    """Set ``res["score"]`` to ``1`` minus the smallest contradiction score of any sentence pair.

    ``res["contradiction_scores"]`` is read as a list of lists of numbers.
    The dict is modified in place and the same object is returned.
    """
    flat = []
    for row in res["contradiction_scores"]:
        for value in row:
            flat.append(value)
    lowest = min(flat)
    res["score"] = 1 - lowest
    return res

def agg_contradiction_avg(res):
    """Set ``res["score"]`` to ``1`` minus the mean contradiction score over all sentence pairs.

    ``res["contradiction_scores"]`` is read as a list of lists of numbers.
    The dict is modified in place and the same object is returned.
    """
    flat = []
    for row in res["contradiction_scores"]:
        flat.extend(row)
    mean_contradiction = sum(flat) / len(flat)
    res["score"] = 1 - mean_contradiction
    return res

In [11]:
import json

# Aggregation functions to compare; each one condenses a record's
# sentence-level contradiction scores into a single "score" entry.
agg_foos = [agg_avg, agg_min, agg_max, agg_contradiction_max, agg_contradiction_min, agg_contradiction_avg, true_agg,
            true_agg_2, true_agg_r, true_agg_2_r]
# Derive the labels from the functions themselves so the list of names can
# never drift out of step with the list of functions.
agg_foos_names = [foo.__name__ for foo in agg_foos]

agg_results = {}
for agg_foo, name in zip(agg_foos, agg_foos_names):
    # Every aggregator writes "score" into the dict it receives and returns
    # that same dict. Give it a shallow copy of each record: otherwise all
    # lists in agg_results would alias the dicts in `results`, each entry would
    # end up showing the score of whichever aggregator ran last, and `results`
    # itself would be mutated. A shallow copy suffices because the aggregators
    # only assign the top-level "score" key.
    agg_results[name] = [agg_foo(dict(res)) for res in results]
    # One output file per aggregation, in the format passed to evaluation/evaluate.py.
    with open(f"results/roberta-{name}-results.json", "w") as file:
        json.dump(agg_results[name], file)

In [19]:
# Run the evaluation script on the scores written for the true_agg_2_r aggregation (cnndm dataset).
! python evaluation/evaluate.py --metrics_outputs "results/roberta-true_agg_2_r-results.json" --dataset cnndm

  from scipy.stats.stats import pearsonr, spearmanr
  from scipy.stats.stats import pearsonr, spearmanr
Info: metric Bleu used 375 summaries to calculate correlation.
Info: metric Meteor used 375 summaries to calculate correlation.
Info: metric Rouge 1 used 375 summaries to calculate correlation.
Info: metric Rouge 2 used 375 summaries to calculate correlation.
Info: metric Rouge L used 375 summaries to calculate correlation.
Info: metric BertScore P Art used 375 summaries to calculate correlation.
Info: metric FEQA used 375 summaries to calculate correlation.
Info: metric QAGS used 375 summaries to calculate correlation.
Info: metric Dep Entail used 339 summaries to calculate correlation.
Info: metric FactCC used 375 summaries to calculate correlation.
Info: metric score used 375 summaries to calculate correlation.
                  pearson  pearson p-value  spearman  spearman p-value
Bleu             0.115828     2.489224e-02  0.059618      2.494614e-01
Meteor           0.145437     

In [20]:
# Run the evaluation script on the scores written for the true_agg_r aggregation (cnndm dataset).
! python evaluation/evaluate.py --metrics_outputs "results/roberta-true_agg_r-results.json" --dataset cnndm

  from scipy.stats.stats import pearsonr, spearmanr
  from scipy.stats.stats import pearsonr, spearmanr
Info: metric Bleu used 375 summaries to calculate correlation.
Info: metric Meteor used 375 summaries to calculate correlation.
Info: metric Rouge 1 used 375 summaries to calculate correlation.
Info: metric Rouge 2 used 375 summaries to calculate correlation.
Info: metric Rouge L used 375 summaries to calculate correlation.
Info: metric BertScore P Art used 375 summaries to calculate correlation.
Info: metric FEQA used 375 summaries to calculate correlation.
Info: metric QAGS used 375 summaries to calculate correlation.
Info: metric Dep Entail used 339 summaries to calculate correlation.
Info: metric FactCC used 375 summaries to calculate correlation.
Info: metric score used 375 summaries to calculate correlation.
                  pearson  pearson p-value  spearman  spearman p-value
Bleu             0.115828     2.489224e-02  0.059618      2.494614e-01
Meteor           0.145437     

In [24]:
# Run the evaluation script on a set of XLNet scores (cnndm dataset).
# NOTE(review): this notebook only writes results/roberta-<name>-results.json, so
# "xlnet-max-results.json" is presumably left over from an earlier run -- verify the path.
! python evaluation/evaluate.py --metrics_outputs "xlnet-max-results.json" --dataset cnndm

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Info: metric Bleu used 375 summaries to calculate correlation.
Info: metric Meteor used 375 summaries to calculate correlation.
Info: metric Rouge 1 used 375 summaries to calculate correlation.
Info: metric Rouge 2 used 375 summaries to calculate correlation.
Info: metric Rouge L used 375 summaries to calculate correlation.
Info: metric BertScore P Art used 375 summaries to calculate correlation.
Info: metric FEQA used 375 summaries to calculate correlation.
Info: metric QAGS used 375 summaries to calculate correlation.
Info: metric Dep Entail used 339 summaries to calculate correlation.
Info: metric FactCC used 375 summaries to calculate correlation.
Info: metric score used 375 summaries to calculate correl