In [None]:
!pip install datasets
!pip install bert_score
!pip install evaluate
!pip install rouge_score
!pip install nltk

In [None]:
from datasets import load_dataset

dataset = load_dataset("stanfordnlp/snli") #we used load_dataset("snli") but it seems it got changed to this
# Or to use the mnli dataset
# dataset = load_dataset("nyu-mll/multi_nli") #we used load_dataset("multi_nli") but it seems it got changed to this

In [None]:
premise = dataset['test']['premise'][:]
hypothesis = dataset['test']['hypothesis'][:]

In [None]:
s1 = dataset['train']['premise'][:]
s2 = dataset['train']['hypothesis'][:]

In [None]:
from bert_score import BERTScorer
scorer = BERTScorer(lang="en",rescale_with_baseline=True)

In [None]:
from tqdm import tqdm
import pickle
def bulk_bertscore(premise_array,hypothesis_array,step=10000, save_name = 'all_bertscore_scores_snli_.pkl'):
    """Run BERTScore over sentence pairs in chunks, pickling the results so far after each chunk.

    Relies on the module-level ``scorer`` object.

    Args:
        premise_array: sliceable sequence of sentences, passed as the first
            argument of ``scorer.score``.
        hypothesis_array: sliceable sequence of sentences, passed as the second
            argument of ``scorer.score``.
        step: number of pairs handled per chunk.
        save_name: path of the pickle file that is overwritten after every chunk.

    Returns:
        Dict with keys 'P', 'R' and 'F1'; each value is a list accumulated
        across all chunks.
    """
    result_keys = ('P', 'R', 'F1')
    scores = {key: [] for key in result_keys}

    for start in tqdm(range(0, len(premise_array), step)):
        stop = start + step
        chunk_results = scorer.score(premise_array[start:stop], hypothesis_array[start:stop])

        # The scorer returns its values in (P, R, F1) order.
        for key, values in zip(result_keys, chunk_results):
            scores[key].extend(values.tolist())

        # Rewrite the checkpoint with everything accumulated so far.
        with open(save_name, 'wb') as checkpoint:
            pickle.dump(scores, checkpoint)

    return scores

In [None]:
bert_scores = bulk_bertscore(s1,s2, step=10000, save_name = 'all_bertscore_scores_snli_.pkl')

In [None]:
from transformers import pipeline
nli_classifier = pipeline("text-classification", model = "microsoft/deberta-large-mnli", device=0)

In [None]:
from sentence_transformers import SentenceTransformer, util
similarity_model = SentenceTransformer('all-mpnet-base-v2')

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm

def cosmic(arr1,arr2):
    """Score sentence pairs by embedding similarity, NLI-based consistency and their product.

    Pair ``i`` is ``(arr1[i], arr2[i])``. Uses the module-level
    ``similarity_model`` (sentence embeddings) and ``nli_classifier``
    (text-classification pipeline).

    Args:
        arr1: sequence of first sentences.
        arr2: sequence of second sentences, same length as ``arr1``.

    Returns:
        Tuple ``(cosmic_scores, similarity_scores, entailment_scores)``; three
        lists with one entry per pair, where
        ``cosmic_scores[i] == similarity_scores[i] * entailment_scores[i]``.

    Raises:
        ValueError: if ``arr1`` and ``arr2`` have different lengths (the pairs
            could not be aligned).
    """
    first, second = list(arr1), list(arr2)
    if len(first) != len(second):
        raise ValueError(
            f"arr1 and arr2 must have the same length, got {len(first)} and {len(second)}"
        )
    n_pairs = len(first)
    if n_pairs == 0:
        return [], [], []

    similarity_model.max_seq_length = 512
    # Encode both sides in a single call; row i pairs with row i + n_pairs.
    embeddings = similarity_model.encode(first + second)
    similarity_scores = []
    for i in range(n_pairs):
        pair = np.array([np.array(embeddings[i]), np.array(embeddings[i + n_pairs])])
        # Negative cosine similarities are clamped to 0.
        similarity_scores.append(max(cosine_similarity(pair)[0][1], 0))

    # NOTE(review): the two sentences are joined with a single space rather than
    # passed to the pipeline as a (text, text_pair) input — confirm this is intended.
    paired_text = [first[i] + ' ' + second[i] for i in range(n_pairs)]
    # The original called an undefined `entailment_classifier2`; the NLI pipeline
    # created in this notebook is named `nli_classifier`.
    classifier_results = nli_classifier(paired_text,return_all_scores=True,truncation=True, max_length=512, batch_size=100)

    # 1 - score of the first label returned for each pair.
    # NOTE(review): presumably label 0 is CONTRADICTION for this model — verify against its config.
    entailment_scores = [1 - result[0]['score'] for result in classifier_results]
    cosmic_scores = [
        similarity * entailment
        for similarity, entailment in zip(similarity_scores, entailment_scores)
    ]
    return cosmic_scores,similarity_scores,entailment_scores

In [None]:
from tqdm import tqdm
import pickle
def bulk_cosmic(premise_array,hypothesis_array,step=10000, save_name = 'all_scores_snli_.pkl'):
    """Apply ``cosmic`` to sentence pairs chunk by chunk, pickling the results so far after each chunk.

    Args:
        premise_array: sliceable sequence of first sentences.
        hypothesis_array: sliceable sequence of second sentences.
        step: number of pairs handed to ``cosmic`` per chunk.
        save_name: path of the pickle file that is overwritten after every chunk.

    Returns:
        Dict with keys 'cosmic', 'similarity' and 'consistency', each a list
        accumulated across all chunks.
    """
    result_keys = ('cosmic', 'similarity', 'consistency')
    scores = {key: [] for key in result_keys}

    for start in tqdm(range(0, len(premise_array), step)):
        stop = start + step
        chunk_results = cosmic(premise_array[start:stop], hypothesis_array[start:stop])

        # `cosmic` returns (cosmic, similarity, entailment) lists in that order.
        for key, values in zip(result_keys, chunk_results):
            scores[key].extend(values)

        # Rewrite the checkpoint with everything accumulated so far.
        with open(save_name, 'wb') as checkpoint:
            pickle.dump(scores, checkpoint)

    return scores

In [None]:
cosmic_scores_snli = bulk_cosmic(s1, s2, save_name = 'all_scores_snli_2024.pkl')

In [None]:
def similarity_score(arr1,arr2):
    """Return one clamped cosine similarity per pair ``(arr1[i], arr2[i])``.

    Both lists are concatenated and encoded in one call to the module-level
    ``similarity_model``; the embeddings are then raised element-wise to the
    5th power before the cosine similarity is taken.

    Args:
        arr1: list of first sentences.
        arr2: list of second sentences.

    Returns:
        List of similarities, with negative values replaced by 0.
    """
    similarity_model.max_seq_length = 512
    # NOTE(review): the reason for the element-wise 5th power is not evident
    # from this notebook — confirm it is intentional.
    powered = similarity_model.encode(arr1 + arr2) ** 5
    half = len(powered) // 2

    def pair_similarity(k):
        # Row k pairs with row k + half of the concatenated batch.
        stacked = np.array([np.array(powered[k]), np.array(powered[k + half])])
        return max(cosine_similarity(stacked)[0][1], 0)

    return [pair_similarity(k) for k in range(half)]

In [None]:
from tqdm import tqdm
import pickle
def bulk_similarity(premise_array,hypothesis_array,step=10000, save_name = 'all_similarity_scores_snli_2024.pkl'):
    """Apply ``similarity_score`` chunk by chunk, pickling the results so far after each chunk.

    Args:
        premise_array: sliceable sequence of first sentences.
        hypothesis_array: sliceable sequence of second sentences.
        step: number of pairs per chunk.
        save_name: path of the pickle file that is overwritten after every chunk.

    Returns:
        Dict with the single key 'score' holding the accumulated list of similarities.
    """
    scores = {'score': []}

    for start in tqdm(range(0, len(premise_array), step)):
        stop = start + step
        scores['score'].extend(
            similarity_score(premise_array[start:stop], hypothesis_array[start:stop])
        )

        # Rewrite the checkpoint with everything accumulated so far.
        with open(save_name, 'wb') as checkpoint:
            pickle.dump(scores, checkpoint)

    return scores

In [None]:
bulk_similarity_scores = bulk_similarity(s1, s2)

In [None]:
#BLEU Score code from the evaluate library https://github.com/huggingface/evaluate/blob/main/metrics/bleu/bleu.py

import collections
import math


def _get_ngrams(segment, max_order):
  """Extracts all n-grams up to a given maximum order from an input segment.

  Args:
    segment: text segment (sequence of tokens) from which n-grams will be
        extracted.
    max_order: maximum length in tokens of the n-grams returned by this
        method.

  Returns:
    The Counter containing all n-grams up to max_order in segment
    with a count of how many times each n-gram occurred.
  """
  ngram_counts = collections.Counter()
  for order in range(1, max_order + 1):
    for i in range(0, len(segment) - order + 1):
      ngram = tuple(segment[i:i+order])
      ngram_counts[ngram] += 1
  return ngram_counts

def compute_bleu(reference_corpus, translation_corpus, max_order=4,
                 smooth=False):
  """Computes BLEU score of translated segments against one or more references.

  Args:
    reference_corpus: list of lists of references for each translation. Each
        reference should be tokenized into a list of tokens.
    translation_corpus: list of translations to score. Each translation
        should be tokenized into a list of tokens.
    max_order: Maximum n-gram order to use when computing BLEU score.
    smooth: Whether or not to apply Lin et al. 2004 smoothing.

  Returns:
    6-tuple ``(bleu, precisions, bp, ratio, translation_length,
    reference_length)``: the BLEU score, the list of n-gram precisions, the
    brevity penalty, the translation/reference length ratio and the two total
    lengths. Degenerate inputs no longer raise ZeroDivisionError: an empty
    translation gets a brevity penalty of 0.0, and when the total reference
    length is 0 the ratio is reported as 0.0.
  """
  matches_by_order = [0] * max_order
  possible_matches_by_order = [0] * max_order
  reference_length = 0
  translation_length = 0
  for (references, translation) in zip(reference_corpus,
                                       translation_corpus):
    # The shortest reference is used for the length statistics.
    reference_length += min(len(r) for r in references)
    translation_length += len(translation)

    # Union of reference n-grams: each n-gram keeps its maximum count over
    # all references.
    merged_ref_ngram_counts = collections.Counter()
    for reference in references:
      merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
    translation_ngram_counts = _get_ngrams(translation, max_order)
    # Clipped matches: minimum of translation and reference counts.
    overlap = translation_ngram_counts & merged_ref_ngram_counts
    for ngram in overlap:
      matches_by_order[len(ngram)-1] += overlap[ngram]
    for order in range(1, max_order+1):
      possible_matches = len(translation) - order + 1
      if possible_matches > 0:
        possible_matches_by_order[order-1] += possible_matches

  precisions = [0] * max_order
  for i in range(0, max_order):
    if smooth:
      precisions[i] = ((matches_by_order[i] + 1.) /
                       (possible_matches_by_order[i] + 1.))
    else:
      if possible_matches_by_order[i] > 0:
        precisions[i] = (float(matches_by_order[i]) /
                         possible_matches_by_order[i])
      else:
        precisions[i] = 0.0

  if min(precisions) > 0:
    p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
    geo_mean = math.exp(p_log_sum)
  else:
    geo_mean = 0

  # Guard the division: an empty reference side has no meaningful ratio.
  if reference_length > 0:
    ratio = float(translation_length) / reference_length
  else:
    ratio = 0.0

  if translation_length == 0:
    # Empty translation: the penalty exp(1 - 1/ratio) tends to 0 as ratio -> 0.
    bp = 0.0
  elif translation_length > reference_length:
    bp = 1.
  else:
    bp = math.exp(1 - 1. / ratio)

  bleu = geo_mean * bp

  return (bleu, precisions, bp, ratio, translation_length, reference_length)


from tqdm import tqdm

import re
from functools import lru_cache


class BaseTokenizer:
    """Identity tokenizer that serves as the base class for the tokenizers below."""

    def signature(self):
        """Return the short string identifying this tokenizer (``"none"`` here)."""
        return "none"

    def __call__(self, line):
        """Return ``line`` unchanged.

        :param line: a segment to tokenize
        :return: the same segment, untouched
        """
        return line


class TokenizerRegexp(BaseTokenizer):
    """Regex-based tokenizer that pads punctuation with spaces and returns a list of tokens."""

    def __init__(self):
        rules = (
            # language-dependent part (assuming Western languages)
            (r"([\{-\~\[-\` -\&\(-\+\:-\@\/])", r" \1 "),
            # tokenize period and comma unless preceded by a digit
            (r"([^0-9])([\.,])", r"\1 \2 "),
            # tokenize period and comma unless followed by a digit
            (r"([\.,])([^0-9])", r" \1 \2"),
            # tokenize dash when preceded by a digit
            (r"([0-9])(-)", r"\1 \2 "),
        )
        # Whitespace is not collapsed by a regex here; the final `split()`
        # in `__call__` takes care of it.
        self._re = [(re.compile(pattern), replacement) for pattern, replacement in rules]

    def signature(self):
        return "re"

    @lru_cache(maxsize=2**16)
    def __call__(self, line):
        """Apply every substitution rule in order, then split on whitespace.

        Results are memoized by ``lru_cache`` on the ``(self, line)`` arguments,
        so the returned list is the cached object and should not be mutated.

        :param line: a segment to tokenize
        :return: list of tokens (this differs from the upstream tokenizer,
            which re-joins the tokens into one string)
        """
        for pattern, replacement in self._re:
            line = pattern.sub(replacement, line)

        return line.split()


class Tokenizer13a(BaseTokenizer):
    """Tokenizer described in this file as equivalent to mteval-v13a; delegates the regex step to ``TokenizerRegexp``."""

    def __init__(self):
        self._post_tokenizer = TokenizerRegexp()

    def signature(self):
        return "13a"

    @lru_cache(maxsize=2**16)
    def __call__(self, line):
        """Clean up ``line`` and hand it to the regex tokenizer.

        Results are memoized by ``lru_cache`` on the ``(self, line)`` arguments.

        :param line: a segment to tokenize
        :return: the tokenized line, as returned by the post-tokenizer
        """
        # language-independent part: drop skip markers, undo hyphenated line
        # breaks, turn remaining newlines into spaces (order matters).
        for old, new in (("<skipped>", ""), ("-\n", ""), ("\n", " ")):
            line = line.replace(old, new)

        if "&" in line:
            # Unescape the four handled entities, in this exact order.
            entity_table = (
                ("&quot;", '"'),
                ("&amp;", "&"),
                ("&lt;", "<"),
                ("&gt;", ">"),
            )
            for entity, char in entity_table:
                line = line.replace(entity, char)

        return self._post_tokenizer(f" {line} ")

def calculate_blue(predictions, references, tokenizer):
    """Tokenize predictions and references and report BLEU statistics as a dict.

    Args:
        predictions: list of prediction strings.
        references: list of reference strings; each prediction is scored
            against exactly one reference.
        tokenizer: callable applied to every string before it is passed to
            ``compute_bleu``.

    Returns:
        Dict with keys "bleu", "precisions", "brevity_penalty", "length_ratio",
        "translation_length" and "reference_length", filled from the tuple
        returned by ``compute_bleu`` (max_order=4, no smoothing).
    """
    # compute_bleu expects a list of references per prediction; wrap each
    # tokenized reference in a single-element list.
    tokenized_references = [[tokenizer(ref)] for ref in references]
    tokenized_predictions = [tokenizer(pred) for pred in predictions]

    result = compute_bleu(
        reference_corpus=tokenized_references,
        translation_corpus=tokenized_predictions,
        max_order=4,
        smooth=False,
    )

    field_names = (
        "bleu",
        "precisions",
        "brevity_penalty",
        "length_ratio",
        "translation_length",
        "reference_length",
    )
    return dict(zip(field_names, result))

In [None]:
from tqdm import tqdm
import pickle
from nltk.translate.bleu_score import sentence_bleu

def bulk_bleuscore(premise_array,hypothesis_array,step=10000, save_name = 'all_BleuScore_scores_snli_.pkl'):
    """Compute a sentence-level BLEU score for every pair, pickling the results so far after each chunk.

    Each hypothesis is scored as the prediction against its premise as the
    single reference, via ``calculate_blue``.

    Args:
        premise_array: sliceable sequence of reference sentences.
        hypothesis_array: sliceable sequence of predicted sentences.
        step: number of pairs per chunk.
        save_name: path of the pickle file that is overwritten after every chunk.

    Returns:
        Dict with the single key 'score' holding one BLEU value per pair.
    """
    scores = {
        'score':[],
    }
    # One tokenizer for the whole run. Creating a new instance per pair (as
    # before) defeats its lru_cache, whose key includes the instance, and
    # leaves every discarded instance referenced by that cache.
    tokenizer = Tokenizer13a()

    for start in tqdm(range(0,len(premise_array),step)):
        premise = premise_array[start:start+step]
        hypothesis = hypothesis_array[start:start+step]

        # A separate index name avoids reusing the outer loop variable.
        bleu_scores = [
            calculate_blue([hypothesis[j]],[premise[j]],tokenizer)['bleu']
            for j in range(len(premise))
        ]

        scores['score'].extend(bleu_scores)

        # Rewrite the checkpoint with everything accumulated so far.
        with open(save_name, 'wb') as f:
            pickle.dump(scores, f)

    return scores

In [None]:
bulk_bleu_scores = bulk_bleuscore(s1,s2,step=10000, save_name = 'all_BleuScore_scores_snli_.pkl')

In [None]:
from tqdm import tqdm
import pickle
from rouge_score import rouge_scorer
rouge_scorer1 = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def bulk_rougescore(premise_array,hypothesis_array,step=10000, save_name = 'all_RougeScore_scores_snli_.pkl'):
    """Compute ROUGE-1/2/L F-measures per pair, pickling the results so far after each chunk.

    Uses the module-level ``rouge_scorer1``; for each pair the hypothesis is
    passed as the first argument of ``score`` and the premise as the second.

    Args:
        premise_array: sliceable sequence of sentences.
        hypothesis_array: sliceable sequence of sentences.
        step: number of pairs per chunk.
        save_name: path of the pickle file that is overwritten after every chunk.

    Returns:
        Dict with keys 'rouge1', 'rouge2' and 'rougeL', each a list of
        F-measures accumulated across all chunks.
    """
    rouge_types = ('rouge1', 'rouge2', 'rougeL')
    scores = {rouge_type: [] for rouge_type in rouge_types}

    for start in tqdm(range(0, len(premise_array), step)):
        premise = premise_array[start:start + step]
        hypothesis = hypothesis_array[start:start + step]

        for k in range(len(premise)):
            pair_result = rouge_scorer1.score(hypothesis[k], premise[k])
            for rouge_type in rouge_types:
                scores[rouge_type].append(pair_result[rouge_type].fmeasure)

        # Rewrite the checkpoint with everything accumulated so far.
        with open(save_name, 'wb') as checkpoint:
            pickle.dump(scores, checkpoint)

    return scores

In [None]:
bulk_rouge_scores = bulk_rougescore(s1,s2,step=10000, save_name = 'all_RougeScore_scores_snli_.pkl')

In [None]:
!git clone https://github.com/neulab/BARTScore.git

In [None]:
import sys
from os.path import dirname
sys.path.append('./BARTScore')

In [None]:
from bart_score import BARTScorer

In [None]:
bart_scorer = BARTScorer(device='cuda:0', checkpoint='facebook/bart-large-cnn')

In [None]:
from tqdm import tqdm
import pickle
def bulk_bartscore(premise_array,hypothesis_array,step=10000, save_name = 'all_BARTScore_scores_snli_.pkl'):
    """Run the module-level ``bart_scorer`` chunk by chunk, pickling the results so far after each chunk.

    Args:
        premise_array: sliceable sequence of sentences, passed as the first
            argument of ``bart_scorer.score``.
        hypothesis_array: sliceable sequence of sentences, passed as the second
            argument of ``bart_scorer.score``.
        step: number of pairs per chunk.
        save_name: path of the pickle file that is overwritten after every chunk.

    Returns:
        Dict with the single key 'score' holding the accumulated list of scores.
    """
    scores = {'score': []}

    for start in tqdm(range(0, len(premise_array), step)):
        stop = start + step
        scores['score'].extend(
            bart_scorer.score(premise_array[start:stop], hypothesis_array[start:stop])
        )

        # Rewrite the checkpoint with everything accumulated so far.
        with open(save_name, 'wb') as checkpoint:
            pickle.dump(scores, checkpoint)

    return scores

In [None]:
bulk_bart_scores = bulk_bartscore(s1,s2, step=10000, save_name = 'all_BARTScore_scores_snli_.pkl')

In [None]:
!git clone https://github.com/AIPHES/emnlp19-moverscore.git
# Due to deprecations, you may need to use numpy 1.26.4, and you may need to edit moverscore_v2.py and replace all occurrences of np.float with float.

Cloning into 'emnlp19-moverscore'...
remote: Enumerating objects: 459, done.[K
remote: Counting objects: 100% (24/24), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 459 (delta 21), reused 21 (delta 21), pack-reused 435 (from 2)[K
Receiving objects: 100% (459/459), 7.07 MiB | 5.29 MiB/s, done.
Resolving deltas: 100% (212/212), done.


In [None]:
!pip install -U pyemd

Collecting pyemd
  Downloading pyemd-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Downloading pyemd-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (666 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m666.6/666.6 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyemd
Successfully installed pyemd-1.0.0


In [None]:
!pip install --upgrade pytest



In [None]:
!pip install numpy==1.26.4

Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.24.0
    Uninstalling numpy-1.24.0:
      Successfully uninstalled numpy-1.24.0
Successfully installed numpy-1.26.4


In [None]:
import sys
from os.path import dirname
sys.path.append('./emnlp19-moverscore')

In [None]:
from moverscore_v2 import get_idf_dict, word_mover_score
from collections import defaultdict
# IDF lookups that return 1.0 for any key; presumably this gives every token
# the same weight in word_mover_score — confirm against moverscore_v2.
idf_dict_hyp = defaultdict(lambda: 1.)
idf_dict_ref = defaultdict(lambda: 1.)
# Single-pair call; the result is stored in `scores` and not used by the
# bulk_moverscore function below. NOTE(review): looks like a smoke test — confirm.
scores = word_mover_score(['This is interesting.'], ['This is fun'], idf_dict_ref, idf_dict_hyp, \
                          stop_words=[], n_gram=1, remove_subwords=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from tqdm import tqdm
import pickle
def bulk_moverscore(premise_array,hypothesis_array,step=10000, save_name = 'all_MoverScore_scores_snli_.pkl'):
    """Run ``word_mover_score`` chunk by chunk, pickling the results so far after each chunk.

    Uses the module-level ``idf_dict_ref`` and ``idf_dict_hyp`` and fixed
    options (no stop words, unigrams, ``remove_subwords=True``).

    Args:
        premise_array: sliceable sequence of sentences, passed as the first
            argument of ``word_mover_score``.
        hypothesis_array: sliceable sequence of sentences, passed as the second
            argument of ``word_mover_score``.
        step: number of pairs per chunk.
        save_name: path of the pickle file that is overwritten after every chunk.

    Returns:
        Dict with the single key 'score' holding the accumulated list of scores.
    """
    scores = {'score': []}

    for start in tqdm(range(0, len(premise_array), step)):
        stop = start + step
        chunk_scores = word_mover_score(
            premise_array[start:stop],
            hypothesis_array[start:stop],
            idf_dict_ref,
            idf_dict_hyp,
            stop_words=[],
            n_gram=1,
            remove_subwords=True,
        )
        scores['score'].extend(chunk_scores)

        # Rewrite the checkpoint with everything accumulated so far.
        with open(save_name, 'wb') as checkpoint:
            pickle.dump(scores, checkpoint)

    return scores

In [None]:
bulk_mover_scores = bulk_moverscore(s1,s2, step=10000, save_name = 'all_MoverScore_scores_snli_.pkl')

In [None]:
from itertools import compress

import numpy as np
import pandas as pd

# Correlation of every metric with embedding similarity and with consistency,
# over all pairs and separately over the consistent (> 0.5) and
# inconsistent (<= 0.5) subsets.

# NOTE(review): this rebinds `similarity_score`, which is also the name of the
# function defined earlier in the notebook; re-running bulk_similarity after
# this cell will fail until that function cell is run again.
similarity_score = cosmic_scores_snli['similarity']
consistency_score = cosmic_scores_snli['consistency']
high_consistency = [x > 0.5 for x in consistency_score]
low_consistency = [x <= 0.5 for x in consistency_score]

# Subsets that do not depend on the metric being compared: computed once.
# (`compress` was used without being imported, and the two consistency
# subsets were referenced without ever being defined.)
similarity_consistent = list(compress(similarity_score, high_consistency))
consistency_consistent = list(compress(consistency_score, high_consistency))
similarity_inconsistent = list(compress(similarity_score, low_consistency))
consistency_inconsistent = list(compress(consistency_score, low_consistency))


def _pearson(x, y):
    """Pearson correlation coefficient between two equal-length sequences."""
    return np.corrcoef(x, y)[0][1]


correlations = []
scores = [
    cosmic_scores_snli['cosmic'],
    similarity_score,
    bert_scores['F1'],  # bulk_bertscore result; bound as `bert_scores`, not `bulk_bert_scores`
    bulk_bart_scores['score'],
    bulk_mover_scores['score'],
    bulk_bleu_scores['score'],
    bulk_rouge_scores['rouge1'],
    bulk_rouge_scores['rouge2'],
    bulk_rouge_scores['rougeL'],
]

for score in scores:
    score_consistent = list(compress(score, high_consistency))
    score_inconsistent = list(compress(score, low_consistency))
    correlations.append([
        _pearson(similarity_score, score),
        _pearson(consistency_score, score),
        _pearson(similarity_consistent, score_consistent),
        _pearson(consistency_consistent, score_consistent),
        _pearson(similarity_inconsistent, score_inconsistent),
        _pearson(consistency_inconsistent, score_inconsistent),
    ])

df_snli = pd.DataFrame(
    correlations,
    columns=[
        'Embedding Similarity',
        'Consistency',
        'Embedding Similarity (Consistent)',
        'Consistency (Consistent)',
        'Embedding Similarity (inconsistent)',
        'Consistency (inconsistent)',
    ],
    index=['Cosmic','Embedding Similarity','BERTScore','BARTScore','MoverScore','BLEU', "ROUGE-1", 'ROUGE-2',"ROUGE-L"],
)
df_snli