In [16]:
pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 20.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 48.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 54.9 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 14.9 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    U

In [28]:
from transformers import AutoTokenizer, AutoModel
import numpy as np
import torch
import json
from sklearn.utils import shuffle


In [18]:
#path = "../../data/data.json" # will be changed to suit data path later
#dataset = json.load(path)

with open("y.json", 'r', encoding='utf-8') as f:
 data = json.load(f)
#path =  # will be changed to suit data path later
dataset = data


1. Define a **NeuralEmbedder** class to abstract away the embedding process for the retriever

In [19]:
class NeuralEmbedder():
  def __init__(self, model_name, tokenizer_name):
    self.tokenizer = AutoTokenizer.from_pretrained(model_name) 
    self.bert_model = AutoModel.from_pretrained(tokenizer_name)
  def embed(self,text):
    return self.bert_model(**self.tokenizer(text,return_tensors="pt"))[0][:,0,:].squeeze(0).numpy()

2. Define **the search engine** class. We embedded the documents once and saved the representations in a numpy matrix so we would not have to compute them repeatedly.

In [29]:
class NeuralSearchEngine():

  def __init__(self, embedder):
    self.embedder = embedder

  def index(self, documents):
    self.documents = documents
    encoded_docs = []
    for d in documents:
      with torch.no_grad():
        d_encoded = self.embedder.embed(d)
      encoded_docs.append(d_encoded.reshape(-1,768))
    self.index = np.concatenate(encoded_docs,axis=0)
  
  def search(self, query):
    with torch.no_grad():
      q_encoded = self.embedder.embed(query).reshape(-1,768)
    scores = q_encoded.dot(self.index.T)[0]

    scores = shuffle(scores, random_state = 0)
    args = np.argsort(scores)[::-1]

    print("\nThe query:", query,"\nTop three:")

    predicted = ""
    for i in range(3):
      print((i+1),'-','Score:',scores[args[i]],'doc:',self.documents[args[i]])
      if i == 0:
        predicted = self.documents[args[i]]
       
    return predicted

    

**main code**

In [32]:
def tasb_score(dataset):
  # number of correct predictions
  correct = 0
  # h@1 evaluation metric
  total_hit_at_1 = 0
  # number of queries
  count = 0

  # create an embedder object the tokenizer and model 
  embedder = NeuralEmbedder("sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco","sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco")

  # loop through each query
  for query in dataset:
    count +=1
    print(count)

    docs = []
    for description in query["options"].values():
      docs.append(description)

    # create a search engine object for this query 
    engine = NeuralSearchEngine(embedder)
    # index the options into the search engine
    engine.index(docs)

    # check if model predicted the correct answer
    ## get the predicted description
    predicted_description = engine.search(query["query"])
    print("predicted: "+predicted_description)
    ## loop through all correct options to find the predicted id
    for option in query["options"]:
      print("option: "+query["options"][option])
      print("id: "+option)
      if query["options"][option] == predicted_description:
        predicted_id = option

    ## check if predicted id is the same as correct id
    if predicted_id == query["answer"]:
      print(True, ": The correct description has the highest score.","\n")
      correct += 1
      total_hit_at_1 += 1
    else:
      print(False, ": The correct description is:", (query["options"][query["answer"]]),"\n")
      
  print("Total correct =", correct)
  print("average h@1",total_hit_at_1/count)

tasb_score(dataset)

1

The query: I want a drink with chocolate or cinnamon for a cosy night in. 
Top three:
1 - Score: 94.44159 doc: Chocolate ice milk sundaes topped with chocolate vanilla sauce
2 - Score: 94.186745 doc: Iced coffee with cinnamon and anise
3 - Score: 92.37852 doc: Standard hot chocolate drink
predicted: Chocolate ice milk sundaes topped with chocolate vanilla sauce
option: Standard hot chocolate drink
id: 06653f1315
option: Chocolate ice milk sundaes topped with chocolate vanilla sauce
id: 6617a13bff
option: Baked cinnamon rolls with pecans
id: 56ecf3f01c
option: Classic baked chocolate brownies
id: 15d5cb830e
option: Iced coffee with cinnamon and anise
id: 3b4fbed7d5
False : The correct description is: Standard hot chocolate drink 

2

The query: It's freezing and I'm a craving a thick soup. 
Top three:
1 - Score: 94.17376 doc: Kielbasa sausage stew with potato, carrots and shrimp
2 - Score: 91.58474 doc: Creamy onion soup
3 - Score: 90.31273 doc: Traditional chicken soup
predicted: Ki

**old main code**

In [None]:
def tasb_score(dataset):
  # number of correct predictions
  correct = 0
  # h@1 evaluation metric
  total_hit_at_1 = 0
  # number of queries
  count = 0

  # create an embedder object the tokenizer and model 
  embedder = NeuralEmbedder("sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco","sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco")

  # loop through each query
  for query in dataset:
    count +=1
    print(count)

    docs = []
    for description in query["options"].values():
      docs.append(description)

    # create a search engine object for this query 
    engine = NeuralSearchEngine(embedder)
    # index the options into the search engine
    engine.index(docs)

    # check if model predicted the correct answer
    if engine.search(query["query"]) == list(query["correct_answer"].values())[0]:
      print(True, ": The correct description has the highest score.","\n")
      correct += 1
      total_hit_at_1 += 1
    else:
      print(False, ": The correct description is:", list(query["correct_answer"].values())[0],"\n")
      
  print("Total correct =", correct)
  print("average h@1",total_hit_at_1/count)

tasb_score(dataset)