In [1]:
!pip install evaluate
!pip install transformers
!pip install bert_score
!pip install sentence_transformers
!pip install tensorflow_hub
!pip install tensorflow-text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 81.4/81.4 kB 4.2 MB/s eta 0:00:00
Collecting huggingface-hub>=0.7.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 224.5/224.5 kB 18.5 MB/s eta 0:00:00
Collecting dill
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 110.5/110.5 kB 12.9 MB/s eta 0:00:00
Collecting datasets>=2.0.0
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 468.7/468.7 kB 34.3 MB/s eta 0:00:00
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 k

In [2]:
from evaluate import load
import pandas as pd
from ast import literal_eval
from sentence_transformers import CrossEncoder
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

import numpy as np
from scipy.special import softmax


In [3]:
# Module-level tokenizer state for the BEM model, adapted from
# https://colab.research.google.com/github/google-research-datasets/answer-equivalence-dataset/blob/main/Answer_Equivalence_BEM_example.ipynb#scrollTo=NND0honxyDaJ
VOCAB_PATH = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12/vocab.txt'

# Every whole line of the vocab file is a key; its line number is the token id.
_vocab_initializer = tf.lookup.TextFileInitializer(
    filename=VOCAB_PATH,
    key_dtype=tf.string,
    key_index=tf.lookup.TextFileIndex.WHOLE_LINE,
    value_dtype=tf.int64,
    value_index=tf.lookup.TextFileIndex.LINE_NUMBER,
)
vocab_table = tf.lookup.StaticVocabularyTable(_vocab_initializer, num_oov_buckets=1)

# Ids of the special tokens; bertify_example passes them to combine_segments.
_special_tokens = tf.convert_to_tensor(['[CLS]', '[SEP]'])
cls_id, sep_id = vocab_table.lookup(_special_tokens)

tokenizer = text.BertTokenizer(
    vocab_lookup_table=vocab_table,
    token_out_type=tf.int64,
    preserve_unused_token=True,
    lower_case=True,
)

In [4]:
# BERTScore metric object loaded through the `evaluate` library.
bertscore = load("bertscore")
# Cross-encoder used by compute_SAS_scores to score (reference, prediction) pairs.
SAS = CrossEncoder('cross-encoder/stsb-roberta-large')
# BEM answer-equivalence model loaded from TF Hub; called on the output of bertify_examples.
bem = hub.load('https://tfhub.dev/google/answer_equivalence/bem/1')


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/139 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [5]:
def compute_bertscore_f1(references, prediction):
    """Return the highest BERTScore F1 of ``prediction`` against any reference.

    Args:
        references: Sized collection of reference answers.
        prediction: The single candidate answer to score.

    Returns:
        The maximum of the ``'f1'`` values reported by ``bertscore.compute``.

    Raises:
        ValueError: From ``max`` if no F1 values come back (e.g. no references).
    """
    # Repeat the prediction so both lists passed to the metric have equal length.
    repeated = [prediction for _ in range(len(references))]
    result = bertscore.compute(
        predictions=repeated,
        references=references,
        model_type="roberta-large",
    )
    return max(result['f1'])

In [6]:
def compute_SAS_scores(references, prediction):
    """Return the highest cross-encoder score of ``prediction`` against any reference.

    Args:
        references: Sized collection of reference answers.
        prediction: The single candidate answer to score.

    Returns:
        The maximum of the scores returned by ``SAS.predict`` for the
        ``(reference, prediction)`` pairs.

    Raises:
        ValueError: From ``max`` if there are no scores (e.g. no references).
    """
    repeated = [prediction for _ in range(len(references))]
    # Each pair is ordered (reference, prediction).
    pairs = [*zip(references, repeated)]
    return max(SAS.predict(pairs))

In [7]:
#helper functions for the bem model code from https://colab.research.google.com/github/google-research-datasets/answer-equivalence-dataset/blob/main/Answer_Equivalence_BEM_example.ipynb#scrollTo=S_4PBlU5yj3f
def bertify_example(example):
    """Tokenize one BEM example and combine its three fields into model inputs.

    Args:
        example: Mapping with the keys ``'question'``, ``'reference'`` and
            ``'candidate'``.

    Returns:
        Dict with ``'input_ids'`` and ``'segment_ids'``, each the ``.numpy()``
        value of the corresponding output of ``text.combine_segments``.
    """
    def _tokens(field):
        # Tokenize the field, then merge dimensions 1 and 2 of the result.
        return tokenizer.tokenize(example[field]).merge_dims(1, 2)

    question = _tokens('question')
    reference = _tokens('reference')
    candidate = _tokens('candidate')

    # Segments are combined in the order candidate, reference, question.
    combined = text.combine_segments((candidate, reference, question), cls_id, sep_id)
    input_ids, segment_ids = combined

    return {'input_ids': input_ids.numpy(), 'segment_ids': segment_ids.numpy()}


def pad(a, length=512):
    """Flatten ``a`` and right-pad it with zeros up to ``length`` entries.

    Args:
        a: NumPy array of ids; the size of its last axis is taken as the
            current sequence length.
        length: Target sequence length (default 512).

    Returns:
        The result of ``np.append``: the flattened values of ``a`` followed by
        ``length - a.shape[-1]`` zeros.

    Raises:
        ValueError: If the sequence is already longer than ``length``.
    """
    current = a.shape[-1]
    if current > length:
        # Without this check np.zeros fails with the opaque
        # "negative dimensions are not allowed"; say what is actually wrong.
        # The input is deliberately not truncated: that would silently drop ids.
        raise ValueError(
            f"sequence of {current} ids exceeds the maximum length {length}; "
            "shorten the inputs before padding"
        )
    return np.append(a, np.zeros(length - current, np.int32))


def bertify_examples(examples):
    """Convert a batch of BEM examples into stacked, padded model inputs.

    Args:
        examples: Iterable of example mappings accepted by ``bertify_example``.

    Returns:
        Dict with ``'input_ids'`` and ``'segment_ids'``; each is ``np.stack``
        applied to the per-example arrays after ``pad`` (default length).
    """
    # map() is lazy, so each example is converted and padded before the next
    # one is touched.
    padded = [
        (pad(encoded['input_ids']), pad(encoded['segment_ids']))
        for encoded in map(bertify_example, examples)
    ]
    id_rows = [ids for ids, _ in padded]
    segment_rows = [segments for _, segments in padded]

    return {'input_ids': np.stack(id_rows), 'segment_ids': np.stack(segment_rows)}

def predict_example(examples):
    """Run the BEM model on ``examples`` and return one score as a float.

    Args:
        examples: List of example mappings; the callers in this notebook pass
            a single-element list.

    Returns:
        Element 1 of the softmax over the squeezed model output, as ``float``.
    """
    logits = np.squeeze(bem(bertify_examples(examples)))
    # softmax turns the raw logits into 'probabilities'; index 1 is reported.
    probabilities = softmax(logits)
    return float(probabilities[1])

def compute_BEM_scores(question, references, prediction):
    """Return the highest BEM score of ``prediction`` against any reference.

    Args:
        question: Question text shared by all comparisons.
        references: Iterable of reference answers.
        prediction: The single candidate answer to score.

    Returns:
        The maximum ``predict_example`` score over the references.

    Raises:
        ValueError: From ``max`` if ``references`` is empty.
    """
    # One single-example model call per reference.
    return max([
        predict_example([{'question': question, 'reference': gold, 'candidate': prediction}])
        for gold in references
    ])

In [8]:
# Smoke test: score one hand-written example before running on the dataset.
examples = [
    {
        'question': 'why is the sky blue',
        'reference': 'light scattering',
        'candidate': 'scattering of light',
    },
]

inputs = bertify_examples(examples)

# The model returns raw logits ...
raw_outputs = bem(inputs)

# ... which softmax turns into a classification 'probability'; index 1 is reported.
probabilities = softmax(np.squeeze(raw_outputs))
bem_score = float(probabilities[1])

print(f'BEM score: {bem_score}')

BEM score: 0.9891805052757263


In [9]:

# Load the answer pairs. 'Gold Answers' is parsed cell by cell with literal_eval
# (presumably stringified lists of reference answers — verify against the CSV).
df = pd.read_csv("/content/answer_pairs2.csv")
df['Gold Answers'] = df['Gold Answers'].map(literal_eval)


In [10]:
# Per row: score the GPT answer against that row's gold answers with BERTScore;
# compute_bertscore_f1 keeps the maximum F1 over the references.
df['BERTScore Similarities GPT'] = df.apply(lambda row: compute_bertscore_f1(row['Gold Answers'], row['GPT Answer']), axis=1)


Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

In [11]:
# Per row: score the GPT answer against that row's gold answers with the SAS
# cross-encoder; compute_SAS_scores keeps the maximum score over the references.
df['SAS Similarities GPT'] = df.apply(lambda row: compute_SAS_scores(row['Gold Answers'], row['GPT Answer']), axis=1)

In [12]:
# Per row: score the GPT answer against that row's gold answers with BEM, which
# also takes the question; compute_BEM_scores keeps the maximum over the references.
df['BEM Similarities GPT'] = df.apply(lambda row: compute_BEM_scores(row['Questions'], row['Gold Answers'], row['GPT Answer']), axis=1)

In [13]:
# Preview the first rows to spot-check the newly added similarity columns.
df.head()

Unnamed: 0.1,Unnamed: 0,Contexts,Questions,Gold Answers,GPT Answer,LERC Score GPT,Annotator Label - GPT Answer,Category,BERTScore Similarities GPT,SAS Similarities GPT,BEM Similarities GPT
0,0,"['Do - Right , segment of The Rocky and Bullwi...",who was the bad guy in dudley do right,[Snidely Whiplash],Snidely Whiplash,5.081576,3,Exact match,1.0,0.966667,0.989872
1,1,['Martin Deborah Knox Fiona MacDonald Janice R...,who won the women's gold medal curling,[Sweden (],It depends on the specific year and competitio...,1.372812,1,DATC,0.806277,0.039457,0.057492
2,2,[' The Dome of the Rock ( Arabic : قبة الصخ...,who built the dome of the rock mosque,[initially completed in 691 CE at the order of...,The dome of the rock mosque was built by Calip...,3.717231,3,Semantically the same,0.911975,0.656942,0.823226
3,3,['Chevrolet Zafira compact MPV . Chevrolet m...,what does hhr stand for on a chevy,[Heritage High Roof],HHR stands for Heritage High Roof.,4.726677,3,Semantically the same,0.812078,0.838378,0.989761
4,4,"[""was mentored by one of the show 's four judg...",who won x factor when olly murs came second,[Joe McElderry],Joe McElderry,5.079546,3,Exact match,1.0,0.963869,0.991353


In [14]:
# Persist the scored frame without writing the index as a column.
# NOTE(review): this relative path resolves against the working directory; if that
# is /content, the input file read earlier is overwritten in place — confirm intended.
df.to_csv("answer_pairs2.csv", index=False)