<a href="https://colab.research.google.com/github/ErikHersmann/quantifying-inconsistencies-in-abstractive-summarization/blob/main/Bachelorarbeit_(1_16).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

All metrics used in this notebook are credited in the thesis itself; please refer to its references appendix.

In [1]:
#@title # prerequisites
!pip install datasets --quiet
!pip install rouge_score --quiet
!pip install evaluate --quiet
!pip install bert_score --quiet
!pip install SentencePiece
!pip install transformers --quiet

import torch
import numpy as np
import pandas as pd
import datasets
import evaluate
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from scipy import stats
from time import time_ns

# SummEval benchmark; only its 'test' split is read in this notebook.
# Columns of 'test': 'machine_summaries', 'human_summaries', 'relevance', 'coherence',
# 'fluency', 'consistency', 'text', 'id'
summeval = load_dataset("mteb/summeval")

# NOTE: despite its name this holds the 'text' column, i.e. the documents that the
# metric cells pass as sources; the human-written summaries live in 'human_summaries'.
summeval_references = summeval['test']['text']
# Every machine summary of each example (one list per example).
summeval_summaries = summeval['test']['machine_summaries']
# Only the first machine summary of each example.
summeval_summaries_1 = [candidates[0] for candidates in summeval['test']['machine_summaries']]

# xsum = load_dataset("xsum")
# cnnDaily = load_dataset("cnn_dailymail", '3.0.0')

# Metric label -> elapsed time in nanoseconds, filled in by the metric cells.
runtimes = dict()

def correlations(score):
  """Print Pearson correlations between `score` and the SummEval human ratings.

  For 'relevance', 'fluency', 'coherence' and 'consistency' (in that order),
  `score` is correlated with the first rating (index 0) of every test example
  and the complete scipy result (statistic and p-value) is printed. A last line
  prints the unweighted mean of the four correlation coefficients.

  Reads the global `summeval` dataset. Returns None.
  """
  labelled_dimensions = (
      ("relevance:   ", 'relevance'),
      ("fluency:     ", 'fluency'),
      ("coherence:   ", 'coherence'),
      ("consistency: ", 'consistency'),
  )

  coefficients = []
  for label, dimension in labelled_dimensions:
    first_ratings = [ratings[0] for ratings in summeval['test'][dimension]]
    result = stats.pearsonr(score, first_ratings)
    print(label, result)
    coefficients.append(result[0])  # index 0 of the result is the coefficient

  print("average:     ", np.mean(coefficients))



In [9]:
#@title NER-Overlap_OWN


# "Davlan/distilbert-base-multilingual-cased-ner-hrl"
# Checkpoint used for named-entity recognition; tokenizer and model are loaded
# from the same name so they always match.
NER_CHECKPOINT = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT)
# Token-classification pipeline that `entities` calls for every text.
classifier_2 = pipeline("ner", model=model, tokenizer=tokenizer)




def entities(summaries, references):
  """Run the global NER pipeline `classifier_2` on paired summaries and references.

  Returns a 3-tuple of lists:
    1. the pipeline output for every summary, in input order,
    2. the pipeline output for every reference, in input order,
    3. len(summary output) / len(reference output) for each pair whose reference
       output is non-empty. Pairs with an empty reference output are skipped,
       so this list can be shorter than the first two.
  """
  summary_hits, reference_hits, ratios = [], [], []
  for summary, reference in zip(summaries, references):
    found_in_summary = classifier_2(summary)
    summary_hits.append(found_in_summary)

    found_in_reference = classifier_2(reference)
    reference_hits.append(found_in_reference)

    # Guard against division by zero when the reference yields nothing.
    if len(found_in_reference) > 0:
      ratios.append(len(found_in_summary) / len(found_in_reference))
  return summary_hits, reference_hits, ratios

  # testing
def _count_unmatched_words(summary_entities, reference_entities):
  """Return (unmatched, n_summary, n_reference) for one summary/reference pair.

  `unmatched` is the number of entries in `summary_entities` whose 'word' value
  does not occur among the 'word' values of `reference_entities`. A word that
  appears several times in the summary is counted every time.
  """
  reference_words = [entity['word'] for entity in reference_entities]
  known_words = set(reference_words)  # set lookup instead of scanning the list per word
  summary_words = [entity['word'] for entity in summary_entities]
  unmatched = sum(1 for word in summary_words if word not in known_words)
  return unmatched, len(summary_words), len(reference_words)


def hallucination_absolute(summaries, references):
  """Count, per pair, the summary entries whose 'word' is absent from the reference.

  Args:
    summaries: one list of dicts (each with a 'word' key) per summary.
    references: one list of dicts (each with a 'word' key) per reference.

  Returns:
    A list of ints, one per (summary, reference) pair.
  """
  return [_count_unmatched_words(summary, reference)[0]
          for summary, reference in zip(summaries, references)]


def hallucination_relative_d(summaries, references):
  """Unmatched summary entries divided by the number of reference entries.

  Takes the same inputs as `hallucination_absolute`. A pair scores 0 when no
  summary entry is unmatched, and also when the reference has no entries at all
  (even if the summary then contains unmatched entries).

  Returns:
    A list with one value per pair: a float ratio, or the int 0.
  """
  counter = []
  for summary, reference in zip(summaries, references):
    unmatched, _, n_reference = _count_unmatched_words(summary, reference)
    if n_reference > 0 and unmatched > 0:
      counter.append(unmatched / n_reference)
    else:
      counter.append(0)
  return counter


def hallucination_relative_s(summaries, references):
  """Unmatched summary entries divided by the number of summary entries.

  Takes the same inputs as `hallucination_absolute`. A pair scores 0 when no
  summary entry is unmatched (which includes an empty summary).

  Returns:
    A list with one value per pair: a float ratio, or the int 0.
  """
  counter = []
  for summary, reference in zip(summaries, references):
    unmatched, n_summary, _ = _count_unmatched_words(summary, reference)
    if n_summary > 0 and unmatched > 0:
      counter.append(unmatched / n_summary)
    else:
      counter.append(0)
  return counter


def ner_overlap_d(references):
  """Score the global `summeval_summaries_1` against `references` with `hallucination_relative_d`.

  Runs `entities` on the summary/reference pairs and returns one score per pair;
  the ratio list returned by `entities` is not used.
  """
  summary_entities, reference_entities, _ = entities(summeval_summaries_1, references)
  scores = hallucination_relative_d(summary_entities, reference_entities)
  return scores

def ner_overlap_s(references):
  """Score the global `summeval_summaries_1` against `references` with `hallucination_relative_s`.

  Runs `entities` on the summary/reference pairs and returns one score per pair;
  the ratio list returned by `entities` is not used.
  """
  summary_entities, reference_entities, _ = entities(summeval_summaries_1, references)
  scores = hallucination_relative_s(summary_entities, reference_entities)
  return scores

# Each variant is timed on its own; both calls repeat the full NER pass because
# `ner_overlap_d` and `ner_overlap_s` each call `entities` themselves.
timer = time_ns()
result_ner_d = ner_overlap_d(summeval_references)
runtimes['D-NER_Overlap'] = time_ns() - timer

timer = time_ns()
result_ner_s = ner_overlap_s(summeval_references)
runtimes['S-NER_Overlap'] = time_ns() - timer

# Report the correlations with the human ratings for both variants.
for heading, ner_scores in (("D-NER-Overlap results: ", result_ner_d),
                            ("S-NER-Overlap results: ", result_ner_s)):
  print(heading)
  correlations(ner_scores)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


D-NER-Overlap results: 
relevance:    PearsonRResult(statistic=-0.008419430376599809, pvalue=0.9337424191194839)
fluency:      PearsonRResult(statistic=0.011639458199696636, pvalue=0.9084965694121638)
coherence:    PearsonRResult(statistic=-0.07052562190271128, pvalue=0.4856411492045646)
consistency:  PearsonRResult(statistic=-0.04801485457762249, pvalue=0.6352249234249837)
average:      -0.028830112164309234
S-NER-Overlap results: 
relevance:    PearsonRResult(statistic=-0.12713969923454807, pvalue=0.20747651258130387)
fluency:      PearsonRResult(statistic=0.051954883848731696, pvalue=0.6076969000416053)
coherence:    PearsonRResult(statistic=-0.09631660661924771, pvalue=0.34045060132179017)
consistency:  PearsonRResult(statistic=-0.058378682750706616, pvalue=0.5639802806213454)
average:      -0.057470026188942674


In [10]:
#@title maxword2vec
# "https://github.com/RaRe-Technologies/gensim-data"
import gensim.downloader

info = gensim.downloader.info()  # show info about available models/datasets

# NOTE: this rebinds the global name `model`, which the NER cell also assigns.
model = gensim.downloader.load("glove-wiki-gigaword-300")
#########

timer = time_ns()

scores = []

# One score per (document, first machine summary) pair. Iterating over the data
# itself (instead of a fixed range(100)) keeps `scores` aligned with the dataset.
for document, summary in zip(summeval_references, summeval_summaries_1):
  document_words = document.split(" ")
  summary_words = summary.split(" ")

  # One entry per document word; stays 0 when no distance could be computed.
  closeness = [0]*len(document_words)
  for position, word1 in enumerate(document_words):
    temporary_distances = []
    for word2 in summary_words:
      try:
        temporary_distances.append(model.distance(word1, word2))
      except KeyError:
        # word1 or word2 is not in the embedding vocabulary. Only KeyError is
        # caught so that real errors and KeyboardInterrupt are not swallowed.
        continue
    if temporary_distances:
      # NOTE(review): `distance` grows as words get less similar, so max() keeps
      # the least similar summary word - confirm this is the intended choice.
      closeness[position] = max(temporary_distances) # might change this later
  # mean over all document words is the final score of the pair
  scores.append(np.mean(closeness))

runtimes["word2vec"] = time_ns() - timer
correlations(scores)

relevance:    PearsonRResult(statistic=0.30152480380806806, pvalue=0.0022989133451447276)
fluency:      PearsonRResult(statistic=0.022097537008995867, pvalue=0.8272547454740174)
coherence:    PearsonRResult(statistic=0.26009647685734494, pvalue=0.008964453054205167)
consistency:  PearsonRResult(statistic=0.1847519574656273, pvalue=0.06574373368431406)
average:      0.19211769378500904


In [11]:
#@title # bartscore
!git clone https://github.com/neulab/BARTScore.git
%cd BARTScore/
!pip install -r requirements.txt
# To use the CNNDM version BARTScore
from bart_score import BARTScorer
bart_scorer = BARTScorer(device='cuda:0', checkpoint='facebook/bart-large-cnn')
timer = time_ns()
bartscore = bart_scorer.multi_ref_score(summeval['test']['text'],summeval['test']['machine_summaries'], batch_size=4) # generation scores from the first list of texts to the second list of texts.
runtimes['BART'] = time_ns() - timer
correlations(bartscore)
%cd /content/


Cloning into 'BARTScore'...
remote: Enumerating objects: 220, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 220 (delta 18), reused 14 (delta 14), pack-reused 194[K
Receiving objects: 100% (220/220), 101.98 MiB | 11.15 MiB/s, done.
Resolving deltas: 100% (47/47), done.
Updating files: 100% (192/192), done.
/content/BARTScore
Collecting absl-py==0.12.0 (from -r requirements.txt (line 1))
  Downloading absl_py-0.12.0-py3-none-any.whl (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting antlr4-python3-runtime==4.8 (from -r requirements.txt (line 2))
  Downloading antlr4-python3-runtime-4.8.tar.gz (112 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.4/112.4 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score==0.3.9 (from -r 

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

relevance:    PearsonRResult(statistic=0.5403278413272568, pvalue=6.540305863585185e-09)
fluency:      PearsonRResult(statistic=0.17742649776049846, pvalue=0.07739617314529704)
coherence:    PearsonRResult(statistic=0.49068296841913145, pvalue=2.1874994339522356e-07)
consistency:  PearsonRResult(statistic=0.44299555814415487, pvalue=3.922807102046324e-06)
average:      0.4128582164127604
/content


In [12]:
#@title # Rouge
from rouge_score import rouge_scorer
import evaluate

rouge = evaluate.load('rouge')

timer = time_ns()
# use_aggregator=False keeps one score per example, which `correlations` needs.
results_rouge = rouge.compute(
    predictions=[candidates[0] for candidates in summeval_summaries],
    references=summeval_references,
    rouge_types=['rouge2', 'rouge3', 'rougeL'],
    use_aggregator=False,
)
runtimes['Rouge'] = time_ns() - timer

# Report the correlations separately for every returned ROUGE variant.
for rouge_type, per_example_scores in results_rouge.items():
  print(rouge_type, ":")
  correlations(per_example_scores)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

rouge2 :
relevance:    PearsonRResult(statistic=0.3427431635378553, pvalue=0.0004817606543045116)
fluency:      PearsonRResult(statistic=0.2159989341578081, pvalue=0.030898229808621343)
coherence:    PearsonRResult(statistic=0.2672503244338714, pvalue=0.007188562855552697)
consistency:  PearsonRResult(statistic=0.25670188757550805, pvalue=0.009934537124156607)
average:      0.27067357742626075
rouge3 :
relevance:    PearsonRResult(statistic=0.39388389417794856, pvalue=5.024363087441451e-05)
fluency:      PearsonRResult(statistic=0.25591019173914925, pvalue=0.010173602029370631)
coherence:    PearsonRResult(statistic=0.34987076256089966, pvalue=0.00035945872113391354)
consistency:  PearsonRResult(statistic=0.31715435311805695, pvalue=0.0013040006503918528)
average:      0.3292048003990136
rougeL :
relevance:    PearsonRResult(statistic=0.37991232050719226, pvalue=9.675623804402001e-05)
fluency:      PearsonRResult(statistic=0.15322745355395095, pvalue=0.12800578190612744)
coherence:    

In [13]:
#@title # UniEval

# Fetch UniEval and install its requirements. The two imports below are resolved
# relative to the current directory, so they only work after the `%cd UniEval`.
!git clone https://github.com/maszhongming/UniEval.git
%cd UniEval
!pip install -r requirements.txt

from utils import convert_to_json
from metric.evaluator import get_evaluator

task = 'summarization'

# Prepare data for pre-trained evaluators:
# output = first machine summary, src = document text, ref = first human summary
data = convert_to_json(output_list=summeval_summaries_1,
                       src_list=summeval_references , ref_list=[i[0] for i in summeval['test']['human_summaries']])
# Initialize evaluator for a specific task
evaluator = get_evaluator(task)
# Get multi-dimensional evaluation scores
import nltk
nltk.download('punkt')

# Only the evaluation itself is timed (not cloning, installing or model loading).
timer = time_ns()
uni_scores = evaluator.evaluate(data, print_result=False)
runtimes['UniEval'] = time_ns() - timer

# Unlike `correlations` (Pearson, one score against all four dimensions), each
# UniEval dimension is compared with the matching human dimension using Spearman,
# against the first human rating of every example.
print("\n\n")
print("fluency: ", stats.spearmanr([i['fluency'] for i in uni_scores], [i[0] for i in summeval['test']['fluency']] )[:])
print("coherence: ", stats.spearmanr([i['coherence'] for i in uni_scores], [i[0] for  i in summeval['test']['coherence']] )[:])
print("relevance: ", stats.spearmanr([i['relevance'] for i in uni_scores], [i[0] for  i in summeval['test']['relevance']] )[:])
print("consistency: ", stats.spearmanr([i['consistency'] for i in uni_scores], [i[0] for  i in summeval['test']['consistency']] )[:])

# Return to the Colab working directory so later relative paths are unaffected.
%cd /content/

Cloning into 'UniEval'...
remote: Enumerating objects: 83, done.[K
remote: Counting objects: 100% (83/83), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 83 (delta 16), reused 72 (delta 13), pack-reused 0[K
Receiving objects: 100% (83/83), 1.97 MiB | 10.29 MiB/s, done.
Resolving deltas: 100% (16/16), done.
/content/UniEval
Collecting accelerate (from -r requirements.txt (line 2))
  Downloading accelerate-0.22.0-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting py7zr (from -r requirements.txt (line 8))
  Downloading py7zr-0.20.6-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.7/66.7 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting texttable (from py7zr->-r requirements.txt (line 8))
  Downloading texttable-1.6.7-py2.py3-none-any.whl (10 kB)
Collecting pycryptodomex>=3.6.6 (from py7zr->-r requirements

Downloading (…)lve/main/config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Evaluating coherence of 100 samples !!!


100%|██████████| 13/13 [00:28<00:00,  2.19s/it]


Evaluating consistency of 100 samples !!!


100%|██████████| 56/56 [01:42<00:00,  1.83s/it]


Evaluating fluency of 100 samples !!!


100%|██████████| 56/56 [00:07<00:00,  7.43it/s]


Evaluating relevance of 100 samples !!!


100%|██████████| 13/13 [00:06<00:00,  1.87it/s]




fluency:  (0.41634841618984264, 1.644030723306243e-05)
coherence:  (0.29600106446707386, 0.002788809636390165)
relevance:  (0.17024506653388619, 0.09037313971520285)
consistency:  (0.5380849427657046, 7.760069819348344e-09)
/content





In [14]:
#@title # Bertscore
bertscore = evaluate.load("bertscore")
timer = time_ns()
results_bert = bertscore.compute(
    predictions=summeval_summaries_1,
    references=summeval_references,
    lang="en",
)
runtimes['BERT'] = time_ns() - timer
# Correlate every BERTScore component with 'relevance', 'coherence', 'fluency', 'consistency'.
for component in ("f1", "precision", "recall"):
  print(component + ":")
  correlations(results_bert[component])

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


f1:
relevance:    PearsonRResult(statistic=0.5055569518428744, pvalue=8.099438543033638e-08)
fluency:      PearsonRResult(statistic=0.3907777835625099, pvalue=5.827230263918517e-05)
coherence:    PearsonRResult(statistic=0.4295630439165452, pvalue=8.201052741686972e-06)
consistency:  PearsonRResult(statistic=0.34489914809379085, pvalue=0.0004412420373098065)
average:      0.4176992318539301
precision:
relevance:    PearsonRResult(statistic=0.5550003683482198, pvalue=2.0704857880188876e-09)
fluency:      PearsonRResult(statistic=0.3784729329765165, pvalue=0.00010334207808121539)
coherence:    PearsonRResult(statistic=0.5071994107570016, pvalue=7.236660270060857e-08)
consistency:  PearsonRResult(statistic=0.43667609789356493, pvalue=5.571481631626548e-06)
average:      0.46933720249382566
recall:
relevance:    PearsonRResult(statistic=0.3746697959704175, pvalue=0.00012280085203946739)
fluency:      PearsonRResult(statistic=0.33418758886709526, pvalue=0.0006785320488427495)
coherence:    

In [2]:
#@title # questEval
# Fetch QuestEval plus its extra dependencies and import it from the checkout.
# NOTE: unlike the other cells that clone a repository, this cell never changes
# back to /content/, so the working directory stays QuestEval/ afterwards.
!git clone https://github.com/ThomasScialom/QuestEval.git
!pip install unidecode --quiet
!pip install SentencePiece --quiet
%cd QuestEval/
from questeval.questeval_metric import QuestEval
questeval = QuestEval(no_cuda=False)

# Small hand-written example inputs; nothing further down in this cell reads them.
source_1 = "Since 2000, the recipient of the Kate Greenaway medal has also been presented with the Colin Mears award to the value of 35000."
prediction_1 = "Since 2000, the winner of the Kate Greenaway medal has also been given to the Colin Mears award of the Kate Greenaway medal."
references_1 = [
    "Since 2000, the recipient of the Kate Greenaway Medal will also receive the Colin Mears Awad which worth 5000 pounds",
    "Since 2000, the recipient of the Kate Greenaway Medal has also been given the Colin Mears Award."
]

source_2 = "He is also a member of another Jungiery boyband 183 Club."
prediction_2 = "He also has another Jungiery Boyband 183 club."
references_2 = [
    "He's also a member of another Jungiery boyband, 183 Club.",
    "He belonged to the Jungiery boyband 183 Club."
]

# Timed run on SummEval: hypothesis = first machine summary, sources = document text.
# NOTE(review): each element passed as list_references is a single item (the first
# human summary), not a list of references - confirm that this is the shape
# corpus_questeval expects and that the references are actually used.
timer = time_ns()
score_quest = questeval.corpus_questeval(
    hypothesis=summeval_summaries_1,
    sources=summeval_references ,
    list_references=[i[0] for i in summeval['test']['human_summaries']]
)
runtimes['questEval'] = time_ns() - timer

# 'ex_level_scores' is the entry of the result that is correlated with the human ratings.
correlations(score_quest['ex_level_scores'])

fatal: destination path 'QuestEval' already exists and is not an empty directory.
/content/QuestEval


  self.metric_BERTScore = load_metric("bertscore")
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

relevance:    PearsonRResult(statistic=0.3158070047452836, pvalue=0.001370972449745073)
fluency:      PearsonRResult(statistic=0.043351589778926114, pvalue=0.6684567678044365)
coherence:    PearsonRResult(statistic=0.2857360823330423, pvalue=0.003954435328131328)
consistency:  PearsonRResult(statistic=0.18419882152041286, pvalue=0.06657073805694187)
average:      0.20727337459441625


In [None]:
#@title # FEQA (function returns error code)

# from google.colab import drive
# drive.mount('/content/gdrive')
# %cd /content/gdrive/MyDrive/b_thesis/feqa

# https://drive.google.com/drive/folders/1ckywlk60FYSEGhOF_qiqdIOabS2OaBzl?usp=drive_link # squad
# https://drive.google.com/drive/folders/1N-aS7yYEDYKd8otFJ9A0h4jaafeC7a1G?usp=drive_link # checkpts

# Clone FEQA, then download two Google Drive folders into the checkout. The folder
# IDs are the ones from the 'squad' and 'checkpts' links in the comments above.
!git clone https://github.com/esdurmus/feqa
%cd feqa/qa_models/
!gdown --folder 1ckywlk60FYSEGhOF_qiqdIOabS2OaBzl
%cd /content/feqa/bart_qg/
!gdown --folder 1N-aS7yYEDYKd8otFJ9A0h4jaafeC7a1G

# Dependencies are installed individually; the repository's requirements.txt is
# deliberately left commented out.
%cd /content/feqa/
# !pip install -r requirements.txt
!pip install benepar==0.1.3 --quiet # benepar[GPU]==0.1.3
!pip install fairseq --quiet

# `from feqa import FEQA` is resolved from the current directory (/content/feqa/).
import benepar
import nltk
from feqa import FEQA
nltk.download('punkt')
benepar.download('benepar_en2')
nltk.download('stopwords')
!python -m spacy download en_core_web_sm
scorer = FEQA(use_gpu=True) # maybe GPU works now? benepar is the issue I think


# Arguments are (documents, first machine summaries); aggregate=False is passed so
# that the result is not aggregated. Per the cell title this call currently fails.
# NOTE: this cell does not change back to /content/ afterwards.
feqa_scores = scorer.compute_score(summeval_references, summeval_summaries_1, aggregate=False)

In [None]:
#@title #gptscore (rate limit from openai)


# Clone GPTScore and install what it needs; `gpt3_score` is imported from the
# checkout, so the import only works after the `%cd GPTScore/`.
!git clone https://github.com/jinlanfu/GPTScore.git
%cd GPTScore/
!pip install mosestokenizer --quiet
!pip install openai --quiet
from gpt3_score import gpt3score
# NOTE(review): this passes `summeval_summaries` (all machine summaries of each
# example) whereas the other metric cells score `summeval_summaries_1` - confirm.
# The API key is intentionally blank; never commit a real key here.
gpt3score_results = gpt3score(summeval_references, summeval_summaries, 'curie', api_key='') # removed the api key, but it didn't work with mine anyways
# rate limit ?
print(gpt3score_results)
# correlation with 'relevance', 'coherence', 'fluency', 'consistency'
# Unlike `correlations`, this uses np.corrcoef against the mean over all ratings
# of an example rather than against the first rating only.
print("fluency: ", np.corrcoef(gpt3score_results, [np.mean(i) for i in summeval['test']['fluency']] )[0][1])
print("coherence: ", np.corrcoef(gpt3score_results, [np.mean(i) for i in summeval['test']['coherence']] )[0][1])
print("relevance: ", np.corrcoef(gpt3score_results, [np.mean(i) for i in summeval['test']['relevance']] )[0][1])
print("consistency: ", np.corrcoef(gpt3score_results, [np.mean(i) for i in summeval['test']['consistency']] )[0][1])
# !python gpt3_score.py --dataname "summeval" --use_demo False --use_ist False --gpt3_score True --gpt3model "curie" --out_dir_name "gpt3Score_based" --aspect 'quality'
# Return to the Colab working directory.
%cd /content/


In [None]:
#@title # summaC (incompatibility issues with pyarrow)
# pyarrow is pinned before installing summac; per the cell title the two are
# otherwise incompatible in this environment.
!pip install pyarrow==12.0.1
!pip install summac

from summac.model_summac import SummaCZS, SummaCConv

# NOTE(review): the zero-shot model is placed on "cuda" while the conv model is
# placed on "cpu" - confirm that the mixed devices are intended.
model_zs = SummaCZS(granularity="sentence", model_name="vitc", device="cuda") # If you have a GPU: switch to: device="cuda"
model_conv = SummaCConv(models=["vitc"], bins='percentile', granularity="sentence", nli_labels="e", device="cpu", start_file="default", agg="mean")

# Fixed demo document and two demo summaries; SummEval itself is not scored in this cell.
document = """Scientists are studying Mars to learn about the Red Planet and find landing sites for future missions.
One possible site, known as Arcadia Planitia, is covered instrange sinuous features.
The shapes could be signs that the area is actually made of glaciers, which are large masses of slow-moving ice.
Arcadia Planitia is in Mars' northern lowlands."""

# Summary 1 ends with a sentence that summary 2 leaves out.
summary1 = "There are strange shape patterns on Arcadia Planitia. The shapes could indicate the area might be made of glaciers. This makes Arcadia Planitia ideal for future missions."
score_zs1 = model_zs.score([document], [summary1])
score_conv1 = model_conv.score([document], [summary1])
print("[Summary 1] SummaCZS Score: %.3f; SummacConv score: %.3f" % (score_zs1["scores"][0], score_conv1["scores"][0])) # [Summary 1] SummaCZS Score: 0.582; SummacConv score: 0.536

# Summary 2 is summary 1 without its last sentence.
summary2 = "There are strange shape patterns on Arcadia Planitia. The shapes could indicate the area might be made of glaciers."
score_zs2 = model_zs.score([document], [summary2])
score_conv2 = model_conv.score([document], [summary2])
print("[Summary 2] SummaCZS Score: %.3f; SummacConv score: %.3f" % (score_zs2["scores"][0], score_conv2["scores"][0])) # [Summary 2] SummaCZS Score: 0.877; SummacConv score: 0.709

In [3]:
#@title # Runtime | A look at the state of summarization

# Elapsed time of every metric that ran in this session (stored in nanoseconds).
for metric_name, elapsed_ns in runtimes.items():
  print(f"{metric_name}: {round(elapsed_ns/10**9,1)} seconds")



# Human-rated dimensions, in the order they are reported below.
DIMENSIONS = ('relevance', 'fluency', 'coherence', 'consistency')
# Number of machine summaries (models) rated per example.
NUM_MODELS = 16

# absolute_scores[d][m] = mean rating of model m (0-based) in dimension d.
absolute_scores = [[] for _ in DIMENSIONS]
print("\n\nState of summarization:")
for dimension_scores, dimension_name in zip(absolute_scores, DIMENSIONS):
  ratings = summeval['test'][dimension_name]  # read the column once per dimension
  for i in range(NUM_MODELS):
    dimension_scores.append(np.mean([j[i] for j in ratings]))

# Best model (printed 1-based) and mean over all models, one pair of lines per dimension.
for dimension in absolute_scores:
  print("best :",round(max(dimension), 2), "model: ", dimension.index(max(dimension))+1)
  print("average: ", round(np.mean(dimension),2))


def _print_model_row(model_index):
  """Print one model's per-dimension means (rounded to 2 decimals) and their mean."""
  row = [round(i[model_index],2) for i in absolute_scores]
  print(row, np.mean(row))


# Indices 9..12 are models 10..13 in the 1-based numbering used by the "model:"
# output above and by "model 1:" below.
print("\n\nmodel 10 through 13:")
for model_index in range(9, 13):
  _print_model_row(model_index)
print("model 1:")
_print_model_row(0)

questEval: 924.9 seconds


State of summarization:
best : 4.26 model:  10
average:  3.78
best : 4.94 model:  12
average:  4.67
best : 4.18 model:  13
average:  3.41
best : 4.99 model:  11
average:  4.66


model 9 through 12:
[4.26, 4.88, 4.16, 4.91] 4.5525
[3.81, 4.83, 3.28, 4.99] 4.2275
[4.14, 4.94, 4.16, 4.98] 4.555
[4.25, 4.9, 4.18, 4.94] 4.5675
model 1:
[3.15, 3.65, 2.28, 3.27] 3.0875
