# Setup

## Install packages

In [282]:
!pip install -U sentence-transformers rank_bm25 faiss-gpu datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Using cached datasets-2.6.1-py3-none-any.whl (441 kB)
Collecting huggingface-hub>=0.4.0
  Using cached huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
Collecting transformers<5.0.0,>=4.6.0
  Using cached transformers-4.24.0-py3-none-any.whl (5.5 MB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
Collecting tqdm
  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 3.0 MB/s 
Collecting pyarrow>=6.0.0
  Downloading pyarrow-10.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.4 MB)
[K     |████████████████████████████████| 35.4 MB 95.5 MB/s 
Installing collected packages: tqdm, tokenizers, huggingface-hub, transformers, pyarrow, datasets
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.4

## Mount your Google drive in order to save data

In [2]:
from google.colab import drive
from pathlib import Path
import os
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!mkdir -p drive/MyDrive/ai_agents/hw3
!mkdir -p drive/MyDrive/ai_agents/hw3/.cache

In [3]:
os.chdir("drive/MyDrive/ai_agents/hw3")

## Download a small corpus of Wikipedia articles and split it into snippets

We will use a corpus used by the SentenceTransformers author. This cell constructs a list, `passages`, containined `(title, snippet)` tuples.

In [4]:
import json
import gzip
import torch
from sentence_transformers import util

if not torch.cuda.is_available():
    print("Warning: No GPU found. Please add GPU to your notebook")


wikipedia_filepath = 'simplewiki-2020-11-01.jsonl.gz'

if not os.path.exists(wikipedia_filepath):
    util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz', wikipedia_filepath)


passages = []
with gzip.open(wikipedia_filepath, 'rt', encoding='utf8') as fIn:
    for line in fIn:
        data = json.loads(line.strip())
        for paragraph in data['paragraphs']:
            # We encode the passages as [title, text]
            passages.append(dict(title=data['title'], passage=paragraph))

In [5]:
from datasets import Dataset

passages = Dataset.from_list(passages)
type(passages)

datasets.arrow_dataset.Dataset

# Problem 3

## **3.1**: Build a BM25 Search Index

Construct a search index that, given a query span, returns a top-n list of support passages.

In [7]:
from typing import List,Tuple
import datasets
## Here is a base class that you should use for both sparse and dense retrieval

class RetrievalIndex:

  def __init__(self, corpus: datasets.arrow_dataset.Dataset):
    self.corpus = corpus
    
  def __getitem__(self, item):
    return [i for i in self.corpus.select([item])][0]

  @classmethod
  def build_index(cls, corpus, **kwargs):
    """
    Class method that constructs a retrieval index from the corpus
    """
    return cls(corpus, **kwargs)

  
  def lookup(self, query_strs: List[str], topk = 5) -> List[List[Tuple[str, str, float]]]:
    """
    Accepts a list of query strings and returns a list of lists of (title, passage, score) tuples
    """
    raise NotImplementedError()

In [368]:
from rank_bm25 import BM25Okapi
import numpy as np
import math,numpy as np
class BM25RetrievalIndex(RetrievalIndex):
  def __init__(self, corpus):
    """
    tokenize your corpus and initialize your BM25 index. 
    Follow the simple usage shown on the library's Github page: https://github.com/dorianbrown/rank_bm25
    """
    self.corpustitles = np.array([i['title'] for i in corpus])
    self.corpus = [document['passage'].split(" ") for document in corpus]
    self.bm25 = BM25Okapi(self.corpus)

  def lookup(self, query_strs, topk = 5) -> List[List[Tuple[str, str, float]]]:
    """
    Retrieve document scores from your BM25 index for each of a list of queries.
    make sure that each list of returned items is sorted by document score.
    """
    query_strs = [query.split(" ") for query in query_strs]
    final_tuples = []
    for query in query_strs:
      tuples=[]
      scores = self.bm25.get_scores(query)
      top_k = np.argsort(scores)[::-1][:topk]
      for i in range(topk):
        tuples.append((self.corpustitles[top_k[i]], self.corpus[top_k[i]], scores[top_k[i]]))
      final_tuples.append(tuples)
    return final_tuples


In [369]:
bm25_index = BM25RetrievalIndex.build_index(passages)

In [375]:
bm25_index.lookup(["why do birds fly in a v formation?"],5)

[[('Collective animal behaviour',
   ['3.',
    '"Easier',
    'movement":',
    'Groups',
    'of',
    'animals',
    'moving',
    'together',
    '(such',
    'as',
    'fish',
    'or',
    'birds)',
    'save',
    'energy.',
    'Many',
    'of',
    'the',
    'larger',
    'birds',
    'fly',
    'in',
    'flocks.',
    'Flying',
    'in',
    'flocks',
    'helps',
    'in',
    'reducing',
    'the',
    'energy',
    'needed.',
    'Many',
    'large',
    'birds',
    'fly',
    'in',
    'a',
    'V-formation,',
    'which',
    'helps',
    'individuals',
    'save',
    '12–20',
    '%',
    'of',
    'the',
    'energy',
    'they',
    'would',
    'need',
    'to',
    'fly',
    'alone.',
    'Red',
    'Knots',
    '"Calidris',
    'canutus"',
    'and',
    'Dunlins',
    '"Calidris',
    'alpina"',
    'were',
    'found',
    'in',
    'radar',
    'studies',
    'to',
    'fly',
    '5\xa0km',
    'per',
    'hour',
    'faster',
    'in',
    'flocks',
    't

#Passage Retrieval for Questions I used in my HW-1 with BM25

In [373]:
query_strs= ["How does the white ball pass through the chute during a game of paid pool on a scratch, but the other balls won't?",
 'How are military documentaries, which include deployment footage interviews, made?',
 'Why is it that getting raw meat in your blood (through a cut on your finger) or mucous membranes (under fingernails) does not cause food poisoning, but eating it can make you violently ill?',
 'Why is snow white when water and ice are clear?',
 'Why do we use a different letter "a" when typing as opposed to when writing?']
similars = bm25_index.lookup(query_strs,3)
for j in similars:
  for i in j:
    print("Title: ",i[0],"Passage: ", " ".join(i[1]),"Score: ",i[2])
  print("\n\n")

Title:  Straight pool Passage:  In straight pool, the person shooting may attempt to pocket any ball on the table. The aim of the game is to reach a set number of points. The amount of points needed to win is agreed to before the game. One point is scored for each ball pocketed legally (that is, without a foul). A typical game might require a player to score 100 points. This means that at least 100 balls must be pocketed to win. In professional competition, straight pool is usually played to 150 points. Straight pool is a call-pocket game. This means that the player must call what pocket they mean to sink the ball into on every shot. It does not matter how balls reach the pocket. As long as no foul is involved, and the balls goes into the pocket that is called, a point is scored. Score:  55.17981286144577
Title:  Straight pool Passage:  In the first rack in straight pool (when you place all the balls together before the break), the fifteen object balls (the colored balls) are racked in

## **3.2**: Building a Dense Retrieval Index

In [214]:
import faiss 
from sentence_transformers import SentenceTransformer, CrossEncoder, util


class DenseRetrievalIndex(RetrievalIndex):
  def __init__(self, corpus: List[Tuple[str,str]], precomputed_index : str =None):
    """ 
    compute the embeddings for each passage in the wiki corpus, then feed them 
    to the `add_faiss_index` builtin function from HuggingFace's Dataset class
    https://huggingface.co/docs/datasets/v1.2.1/faiss_and_ea.html

    (Optional but recommended) if the filepath argument `precomputed_index` is not None, 
    then this should not compute the embeddings but rather call load_faiss_index on the path

    """
    self.ds_with_embeddings=[]
    if precomputed_index != None:
      self.corpustitles = np.array([i['title'] for i in corpus])


      self.encoder = SentenceTransformer('msmarco-MiniLM-L-6-v3')
      print("done creating")
      print(type(corpus))
      self.ds_with_embeddings = corpus.map(lambda example:  {'embeddings': self.encoder.encode(example['title'] + ' ~x~ ' + example['passage'])})
      self.ds_with_embeddings.add_faiss_index(column='embeddings')
    else:
      corpus.load_faiss_index('embeddings', 'msmarco_sbert.faiss')


  def save(self, file):
    """
    (Optional but recommended) helper that saves the index to a file using `save_faiss_index` 
    """
    self.ds_with_embeddings.save_faiss_index('embeddings', file)
  def lookup(self, query_strs, topk=10):

    finaltuple = []
    for query in query_strs:
      question_emb = self.encoder.encode(query)
      scores,examples = self.ds_with_embeddings.get_nearest_examples('embeddings', question_emb, k=topk)
      scores_examples=[]
      for i in range(len(scores)):
        scores_examples.append((scores[i],(examples['title'][i],examples['passage'][i],scores[i])))
      scores_examples.sort(reverse=True)
      finaltuple.append([i[1] for i in scores_examples[:topk]])

    return finaltuple

  @classmethod
  def from_file(cls, corpus, file):
      """
      (Optional but recommended) helper that loads the index from the specified filepath 
      """
      assert os.path.exists(file)
      return cls(corpus=corpus, precomputed_index=file)

In [215]:
dense_index = DenseRetrievalIndex.build_index(passages)

## Uncomment this line and initialize this way if you have already computed and saved the index
# dense_index = DenseRetrievalIndex.from_file(passages, 'msmarco_sbert.faiss')


done creating
<class 'datasets.arrow_dataset.Dataset'>


  0%|          | 0/509663 [00:00<?, ?ex/s]

In [348]:
dense_index.lookup(["why do flocks of birds fly in a v formation?"], topk=5)

[[('Collective animal behaviour',
   '3. "Easier movement": Groups of animals moving together (such as fish or birds) save energy. Many of the larger birds fly in flocks. Flying in flocks helps in reducing the energy needed. Many large birds fly in a V-formation, which helps individuals save 12–20 % of the energy they would need to fly alone. Red Knots "Calidris canutus" and Dunlins "Calidris alpina" were found in radar studies to fly 5\xa0km per hour faster in flocks than when they were flying alone.',
   44.239742),
  ('Flightless birds',
   'Despite this general picture, there have been birds who lost the power of flight soon after flight evolved. The first fossil flightless birds occurred in the Cretaceous period. It has long been recognised that there are circumstances where it is definitely not a good thing to have wings. The connection between oceanic islands and flightlessness was known to Darwin. The explanation is that, first, oceanic islands have few predators. Second, that 

#Passage Retrieval for Questions I used in my HW-1 with Dense Retreival

In [347]:
query_strs= ["How does the white ball pass through the chute during a game of paid pool on a scratch, but the other balls won't?",
 'How are military documentaries, which include deployment footage interviews, made?',
 'Why is it that getting raw meat in your blood (through a cut on your finger) or mucous membranes (under fingernails) does not cause food poisoning, but eating it can make you violently ill?',
 'Why is snow white when water and ice are clear?',
 'Why do we use a different letter "a" when typing as opposed to when writing?']
dense_index.lookup(query_strs, topk=5)

[[('Bat-and-ball games',
   'The name bat-and-ball games comes from some common parts of most of these games. In most games, a player on the fielding team puts a ball into play, mostly by throwing the ball. (How the player does this depends on the game\'s rules.) Then a player on the batting team tries to hit the ball, usually with a "bat." A bat is a kind of club, though the size and shape depend on the rules. This player can then run between the safe zones in the field to score points, though the fielding team can use the ball on the player from the batting team, or a target in one of the safe zones, to get the player "out" if they are not in a safe zone. (In cricket, a player who is out can not score points for the rest of their team\'s scoring turn, while in other bat-and-ball sports, they might have to wait before batting again.) Thus, the further the batting player can hit the ball away from the defensive players, the more points they are likely to be able to score.',
   38.81210

## **3.3** Using a Reranking Cross Encoder

In [358]:
from sentence_transformers import CrossEncoder

class RerankingDenseRetrievalIndex(DenseRetrievalIndex):
  def __init__(self, *args, **kwargs):
    self.cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
    self.encoder = SentenceTransformer('msmarco-MiniLM-L-6-v3')

  def lookup(self, query_strs, topk=5, initial_topk=50):
    """
    retrieve `initial_topk` candidates as in `DenseRetrievalIndex` class, but 
    then rerank them according to scores of `self.cross_encoder`
    """
    finaltuple = []
    for query in query_strs:
      reranked=[]
      tuples=[]
      question_emb = self.encoder.encode(query)
      scores,examples = dense_index.ds_with_embeddings.get_nearest_examples('embeddings', question_emb, k=initial_topk)
      scores_examples=[]
      for i in range(len(scores)):
        scores_examples.append((examples['title'][i],examples['passage'][i],scores[i]))

      for i in range(initial_topk):
        tuples.append((query,scores_examples[i][1]))

      scores = self.cross_encoder.predict(tuples)

      for i in range(len(scores)):
        reranked.append((scores[i],scores_examples[i]))
      reranked.sort(reverse=True)
      
      finaltuple.append([i[1] for i in reranked[:topk]])
    return finaltuple

In [359]:
## you should not need to recompute the embeddings or index if you implemented the 
## recommended helper functions
ranking_index = RerankingDenseRetrievalIndex.from_file(passages, 'msmarco_sbert.faiss')

In [357]:
ranking_index.lookup(["why is the sky blue?", "why do flocks of birds fly in a v formation?"], topk=5)

[[('Sky',
   'The sky, which is made up of gas molecules, is blue because of the random scattering of sunlight by the molecules. Rayleigh scattering defines the amount of scattering of light rays. Blue light scatters much more than red, which is why the sky appears blue on a clear day. Depending on the time of day, the sky may appear different colors. At dawn or dusk the sky may appear red, orange, or even green and purple depending on how low the sun is and how close it is to night.',
   32.083675),
  ('Moon',
   'In the Earth, the sky is blue because the blue rays of the sun bounce off the gases in the atmosphere, making it look like blue light is coming from the sky. But on the moon, because there is no atmosphere, the sky looks black, even in the daytime. There is no atmosphere to protect the moon from the rocks that fall from outer space, and these meteorites crash right into the moon and make wide, shallow holes called craters. The moon has thousands of them. Newer craters gradua

#Passage Retrieval for Questions I used in my HW-1 with Reranking

In [360]:
query_strs= ["How does the white ball pass through the chute during a game of paid pool on a scratch, but the other balls won't?",
 'How are military documentaries, which include deployment footage interviews, made?',
 'Why is it that getting raw meat in your blood (through a cut on your finger) or mucous membranes (under fingernails) does not cause food poisoning, but eating it can make you violently ill?',
 'Why is snow white when water and ice are clear?',
 'Why do we use a different letter "a" when typing as opposed to when writing?']
ranking_index.lookup(query_strs,topk=3)

[[('Chess',
   "During the game the two opponents take turns to move one of their pieces to a different square of the board. One player ('White') has pieces of a light color; the other player ('Black') has pieces of a dark color. There are rules about how pieces move, and about taking the opponent's pieces off the board. The player with white pieces always makes the first move. Because of this, White has a small advantage, and wins more often than Black in tournament games.",
   40.524796),
  ('Straight pool',
   "In the first rack in straight pool (when you place all the balls together before the break), the fifteen object balls (the colored balls) are racked in a triangular frame, usually made from aluminum, wood, or plastic. The center of the ball at the top of the rack is placed over the a spot on the table called the foot spot. Traditionally, the 1 ball is placed at the rack's right corner, and the 5 ball is placed at the rack's left corner. Other balls are placed randomly. All th

# Problem 4

In [290]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

qar_tokenizer = AutoTokenizer.from_pretrained('yjernite/bart_eli5')
qar_model = AutoModelForSeq2SeqLM.from_pretrained('yjernite/bart_eli5')

from huggingface_hub import notebook_login
notebook_login()

In [324]:
!pip install rouge
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [294]:
# TODO Write an eval loop that retrieves top-1 documents for each ELI5 dev question
# then feeds them (concatenated to the question) to the qar_model wrapped in a `pipeline`. 
from transformers import pipeline
from datasets import load_dataset, load_metric
text2text = pipeline("text2text-generation", model = qar_model,tokenizer = qar_tokenizer )

In [299]:
raw_datasets = load_dataset("eli5", split="test_eli5[10:20]")



In [328]:
from rouge import Rouge

#Retrieve documents for evaluation
retrieval = ranking_index.lookup(raw_datasets["title"],topk=1)

model_output = []
inputs=[]
for i in range(10):
  #Concatenate the question and passage for input. Call the passage as 'Context'
  inputs.append("Question: " + raw_datasets["title"][i] + "\nContext: " + retrieval[i][1][1][1])

#Generate answers
model_output = text2text(inputs, max_length = 100)

#Evaluate using mean values of rouge1, rouge2, rougel
rouge = Rouge()
rouge.get_scores([raw_datasets["answers"][i]['text'][0] for i in range(10)],[i['generated_text'] for i in model_output], avg=True)


{'rouge-1': {'r': 0.36863615143026907,
  'p': 0.07473015816677091,
  'f': 0.11503037809824619},
 'rouge-2': {'r': 0.0634536541889483,
  'p': 0.009620773664406248,
  'f': 0.015104426167607293},
 'rouge-l': {'r': 0.34879488158899924,
  'p': 0.07134126927788201,
  'f': 0.1092993839461994}}

# Problem 5

In [392]:
## TODO feed your 5 questions from the end of HW1 through the pipeline and perform
## qualitative analysis

query_strs= ["How does the white ball pass through the chute during a game of paid pool on a scratch, but the other balls won't?",
 'How are military documentaries, which include deployment footage interviews, made?',
 'Why is it that getting raw meat in your blood (through a cut on your finger) or mucous membranes (under fingernails) does not cause food poisoning, but eating it can make you violently ill?',
 'Why is snow white when water and ice are clear?',
 'Why do we use a different letter "a" when typing as opposed to when writing?']

#BM25 Retreival
BM25retrieval = bm25_index.lookup(query_strs,1)

#Dense Retrieval
denseretrieval = dense_index.lookup(query_strs, topk=1)

#Reranking Retreieval
rerankingretrieval = ranking_index.lookup(query_strs,topk=1)

inputs1=[]
for i in range(5):
  inputs1.append("Question: " + query_strs[i] + "\nContext: " + " ".join(BM25retrieval[i][0][1]))
BM25_model_output = text2text(inputs1, max_length = 100)

inputs2=[]
for i in range(5):
  inputs2.append("Question: " + query_strs[i] + "\nContext: " + denseretrieval[i][0][1])
dense_model_output = text2text(inputs2, max_length = 100)

inputs3=[]
for i in range(5):
  inputs3.append("Question: " + query_strs[i] + "\nContext: " + rerankingretrieval[i][0][1])
reranking_model_output = text2text(inputs3, max_length = 100)


In [395]:
for i in range(len(inputs1)):
  print("input: ",query_strs[i])
  print()
  print("BM25 Context: ", " ".join(BM25retrieval[i][0][1]))
  print("Answer: ",BM25_model_output[i]["generated_text"])
  print()
  print("Dense Context: ",denseretrieval[i][0][1])
  print("Answer: ",dense_model_output[i]["generated_text"])
  print()
  print("Reranking Context: ", rerankingretrieval[i][0][1])
  print("Answer: ",reranking_model_output[i]["generated_text"])

  print("\n")

input:  How does the white ball pass through the chute during a game of paid pool on a scratch, but the other balls won't?

BM25 Context:  In straight pool, the person shooting may attempt to pocket any ball on the table. The aim of the game is to reach a set number of points. The amount of points needed to win is agreed to before the game. One point is scored for each ball pocketed legally (that is, without a foul). A typical game might require a player to score 100 points. This means that at least 100 balls must be pocketed to win. In professional competition, straight pool is usually played to 150 points. Straight pool is a call-pocket game. This means that the player must call what pocket they mean to sink the ball into on every shot. It does not matter how balls reach the pocket. As long as no foul is involved, and the balls goes into the pocket that is called, a point is scored.
Answer:   The white ball is a "call-pocket" ball. It is a ball that is not supposed to be in the chute

##Part of Q3.3: How well do the methods do with a question that you came up with?

In [403]:
#Q3.3 How well do the methods do with a question that you came up with?
query_strs= ["Which university did Albert Einstein teach in?"]

#BM25 Retreival
BM25retrieval = bm25_index.lookup(query_strs,5)

#Dense Retrieval
denseretrieval = dense_index.lookup(query_strs, topk=1)

#Reranking Retreieval
rerankingretrieval = ranking_index.lookup(query_strs,topk=5)

for i in range(5):
  print("BM25 Context: ", " ".join(BM25retrieval[0][i][1]))
  print()
print()
for i in range(5):
  print("Dense Context: ",denseretrieval[0][i][1])
  print()
print()
for i in range(5):
  print("Reranking Context: ", rerankingretrieval[0][i][1])
  print()
print()



BM25 Context:  In 1916 Albert Einstein wrote an explanation of gravity called general relativity.

BM25 Context:  Albert Einstein and Norbert Wiener also studied Brownian Movement, with greater mathematical precision.

BM25 Context:  The concept of zero-point energy was developed in Germany by Albert Einstein and Otto Stern in 1913.

BM25 Context:  Albert Einstein kept a photograph of Faraday on his study wall alongside pictures of Isaac Newton and James Clerk Maxwell.

BM25 Context:  In 2017, he won the Albert Einstein World Award of Science. In 2018, Yaghi won the Wolf Prize in Chemistry.


Dense Context:  Einstein was born in Ulm, Württemberg, Germany, on 14 March 1879. His family was Jewish, but was not very . However, later in life Einstein became very interested in his Judaism. Einstein did not begin speaking until he was 2 years old. According to his younger sister, Maja, "He had such difficulty with language that those around him feared he would never learn". When Einstein was 