<a href="https://colab.research.google.com/github/ErikHersmann/quantifying-inconsistencies-in-abstractive-summarization/blob/main/Bachelorarbeit_(16_16).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

All metrics used in this notebook are credited in the thesis itself; please refer to its references appendix.

In [27]:
#@title # prerequisites
!pip install datasets --quiet
!pip install rouge_score --quiet
!pip install evaluate --quiet
!pip install bert_score --quiet
!pip install SentencePiece
!pip install transformers --quiet

import torch
import numpy as np
import pandas as pd
import datasets
import evaluate
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from scipy import stats
from time import time_ns



# Columns listed for the 'test' split:
# machine_summaries, human_summaries, relevance, coherence, fluency, consistency, text, id
summeval = load_dataset("mteb/summeval")
summeval_references, summeval_summaries = (
  summeval['test'][column] for column in ('text', 'machine_summaries')
)

# Elapsed time in nanoseconds per metric, keyed by metric name
runtimes = dict()

def correlations(scores):
  """Print Spearman correlations between metric scores and the SummEval ratings.

  scores: sequence indexed as scores[system][document]; the first 100
    entries of every inner sequence are read.

  Prints the number of paired observations, then one spearmanr result for
  each of relevance, fluency, coherence and consistency, and finally the
  mean of the four correlation coefficients. Returns None.
  """
  n_documents = 100  # entries read from every per-system score list
  dimensions = ('relevance', 'fluency', 'coherence', 'consistency')

  # Fetch each rating column a single time: these lookups do not depend on
  # the loop variables, so there is no need to repeat them per iteration.
  test_split = summeval['test']
  ratings = {dim: test_split[dim] for dim in dimensions}

  # Flatten system by system so that scores and ratings stay paired:
  # all documents of system 0, then all documents of system 1, ...
  new_scores = []
  new_dims = {dim: [] for dim in dimensions}
  for c, score in enumerate(scores):
    for i in range(n_documents):
      new_scores.append(score[i])
      for dim in dimensions:
        new_dims[dim].append(ratings[dim][i][c])

  print(len(new_dims['relevance']))

  # One spearmanr call per dimension; its coefficient is reused for the average.
  coefficients = []
  for dim in dimensions:
    result = stats.spearmanr(new_scores, new_dims[dim])
    print((dim + ":").ljust(13), result)
    coefficients.append(result[0])
  print("average:     ", np.mean(coefficients))



In [2]:
#@title NER-Overlap_OWN (16/16)


# "Davlan/distilbert-base-multilingual-cased-ner-hrl"
# Tokenizer and token-classification model are loaded from the same checkpoint
tokenizer, model = (
  loader.from_pretrained("dslim/bert-base-NER")
  for loader in (AutoTokenizer, AutoModelForTokenClassification)
)
# NER pipeline that entities() calls on every summary and reference
classifier_2 = pipeline("ner", tokenizer=tokenizer, model=model)




def entities(summaries, references):
  """Run the NER pipeline `classifier_2` over paired summaries and references.

  Returns a 3-tuple:
    - the pipeline output for every summary,
    - the pipeline output for every reference,
    - the summary/reference entity-count ratio per pair; pairs whose
      reference yields no entities are skipped, so this list can be
      shorter than the other two.
  """
  summary_entities, reference_entities, count_ratios = [], [], []
  for summary, reference in zip(summaries, references):
    found_in_summary = classifier_2(summary)
    summary_entities.append(found_in_summary)
    found_in_reference = classifier_2(reference)
    reference_entities.append(found_in_reference)
    if len(found_in_reference) == 0:
      continue  # the ratio is undefined without reference entities
    count_ratios.append(len(found_in_summary) / len(found_in_reference))
  return summary_entities, reference_entities, count_ratios

  # testing
def hallucination_absolute(summaries, references):
  """Count, per pair, the summary entity tokens whose word is absent from the reference.

  summaries, references: parallel sequences; every element is a sequence of
    mappings that carry a 'word' key.

  Returns a list with one integer per pair. A word that occurs several
  times in a summary is counted every time it occurs.
  """
  counter = []
  for summary_entities, reference_entities in zip(summaries, references):
    # A set makes each membership test constant-time.
    reference_words = {entity['word'] for entity in reference_entities}
    counter.append(
      sum(1 for entity in summary_entities if entity['word'] not in reference_words)
    )
  return counter

def hallucination_relative_d(summaries, references):
  """Per pair: unmatched summary entity tokens divided by the number of reference entity tokens.

  summaries, references: parallel sequences; every element is a sequence of
    mappings that carry a 'word' key.

  A summary token is unmatched when its word does not occur among the
  reference's words. Returns a list with one float per pair; a pair whose
  reference has no entity tokens scores 0.0.
  """
  counter = []
  for summary_entities, reference_entities in zip(summaries, references):
    reference_words = [entity['word'] for entity in reference_entities]
    known_words = set(reference_words)  # constant-time membership tests
    unmatched = sum(1 for entity in summary_entities if entity['word'] not in known_words)
    # The denominator counts every reference token, duplicates included.
    # An empty reference would divide by zero, so such pairs score 0.0.
    counter.append(unmatched / len(reference_words) if reference_words else 0.0)
  return counter

def hallucination_relative_s(summaries, references):
  """Per pair: unmatched summary entity tokens divided by the number of summary entity tokens.

  summaries, references: parallel sequences; every element is a sequence of
    mappings that carry a 'word' key.

  A summary token is unmatched when its word does not occur among the
  reference's words. Returns a list with one float per pair; a pair whose
  summary has no entity tokens scores 0.0.
  """
  counter = []
  for summary_entities, reference_entities in zip(summaries, references):
    summary_words = [entity['word'] for entity in summary_entities]
    known_words = {entity['word'] for entity in reference_entities}  # constant-time membership tests
    unmatched = sum(1 for word in summary_words if word not in known_words)
    # An empty summary would divide by zero, so such pairs score 0.0.
    counter.append(unmatched / len(summary_words) if summary_words else 0.0)
  return counter


def ner_overlap_d(references, summaries):
  """D-NER-Overlap: extract entities for every pair, then score them with hallucination_relative_d.

  Takes the references first and the summaries second, i.e. the reverse
  of the argument order that entities() uses.
  """
  extracted = entities(summaries=summaries, references=references)
  # extracted[2] (the entity-count ratios) is not needed for this score
  return hallucination_relative_d(extracted[0], extracted[1])

def ner_overlap_s(references, summaries):
  """S-NER-Overlap: extract entities for every pair, then score them with hallucination_relative_s.

  Takes the references first and the summaries second, i.e. the reverse
  of the argument order that entities() uses.
  """
  extracted = entities(summaries=summaries, references=references)
  # extracted[2] (the entity-count ratios) is not needed for this score
  return hallucination_relative_s(extracted[0], extracted[1])

# D-NER-Overlap: one list of per-document scores for each of the 16 summary slots
timer = time_ns()
result_ner_d = []
for i in range(16):
  system_summaries = [per_document[i] for per_document in summeval_summaries]
  result_ner_d.append(ner_overlap_d(summeval_references, system_summaries))
runtimes['D-NER_Overlap'] = time_ns() - timer

# S-NER-Overlap: same loop, timed separately
result_ner_s = []
timer = time_ns()
for i in range(16):
  system_summaries = [per_document[i] for per_document in summeval_summaries]
  result_ner_s.append(ner_overlap_s(summeval_references, system_summaries))
runtimes['S-NER_Overlap'] = time_ns() - timer

print("D-NER-Overlap results: ")
correlations(result_ner_d)
print("S-NER-Overlap results: ")
correlations(result_ner_s)

Downloading (…)okenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


D-NER-Overlap results: 


ValueError: ignored

In [26]:
# Convenience cell: prints the correlations for the already computed NER results
# (D-NER-Overlap first, then S-NER-Overlap). Running the cell above should yield the same results.
for ner_scores in (result_ner_d, result_ner_s):
  correlations(ner_scores)

1600
relevance:    SignificanceResult(statistic=-0.020717999755585165, pvalue=0.40757921137555053)
fluency:      SignificanceResult(statistic=-0.04608470116560033, pvalue=0.06533973651570367)
coherence:    SignificanceResult(statistic=-0.033242853164085824, pvalue=0.18383403839462265)
consistency:  SignificanceResult(statistic=-0.02427133350590347, pvalue=0.331928116908211)
average:      -0.031079221897793698
1600
relevance:    SignificanceResult(statistic=-0.04200061865698237, pvalue=0.09306382346243715)
fluency:      SignificanceResult(statistic=-0.02388600276278446, pvalue=0.3396638624365951)
coherence:    SignificanceResult(statistic=-0.07107135072799352, pvalue=0.004451898396630335)
consistency:  SignificanceResult(statistic=-0.0038864850662824984, pvalue=0.8765544709281461)
average:      -0.03521111430351071


In [28]:
#@title word2vec_metric_OWN (16/16)
# "https://github.com/RaRe-Technologies/gensim-data"
import gensim.downloader

info = gensim.downloader.info()  # show info about available models/datasets

# NOTE(review): this rebinds `model`, which the NER cell bound to the token-classification model.
model = gensim.downloader.load("glove-wiki-gigaword-300")
#########

timer = time_ns()
scores = [[] for _ in range(16)]
for j in range(16):

  for i in range(100):
    # For each Summary-Reference pair
    reference_words = summeval_references[i].split(" ")
    # Split the summary once per pair instead of once per reference word
    summary_words = summeval_summaries[i][j].split(" ")

    # One score per reference word; an entry stays 0 when no distance could be computed
    closeness = [0]*len(reference_words)
    for c, word1 in enumerate(reference_words):
      # Distances from word1 to every summary word that can be embedded
      temporary_distances = []
      for word2 in summary_words:
        try:
          temporary_distances.append(model.distance(word1, word2))
        except KeyError:
          pass # word1 or word2 doesn't exist in the vocabulary
      # NOTE(review): max() keeps the LARGEST distance, i.e. the least similar
      # summary word, although the list is called `closeness` — confirm this is intended.
      if temporary_distances:
        closeness[c] = max(temporary_distances) # might change this later
    # mean of the per-word scores of the reference is the final score
    scores[j].append(np.mean(closeness))

runtimes["word2vec"] = time_ns() - timer
correlations(scores)

1600
relevance:    SignificanceResult(statistic=0.16434640324589847, pvalue=3.7473821128850864e-11)
fluency:      SignificanceResult(statistic=0.04003328034844653, pvalue=0.10943863649232984)
coherence:    SignificanceResult(statistic=0.03229142724438922, pvalue=0.19670912900247242)
consistency:  SignificanceResult(statistic=0.09256016036291706, pvalue=0.00020933685437691995)
average:      0.08230781780041282
