# Baseline using BM25

## Imports

In [34]:
import pandas as pd
import numpy as np
from rank_bm25 import BM25Okapi
import nltk
import string

In [36]:
nltk.download("stopwords")
STOPWORDS = set(nltk.corpus.stopwords.words("english"))
PUNCTUATIONS = string.punctuation

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Magnus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [65]:
STOPWORDS

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

## Read data

In [4]:
# Queries dev, train or eval
queries_dev_df = pd.read_csv("data/queries/queries.dev.tsv", sep='\t', header=None)
print(len(queries_dev_df))

# Not used currently
queries_train_df = pd.read_csv("data/queries/queries.train.tsv", sep='\t', header=None)
print(len(queries_train_df))
queries_eval_df = pd.read_csv("data/queries/queries.eval.tsv", sep='\t', header=None)
print(len(queries_eval_df))

101093
808731
101092


In [3]:
# Passages to rank based on query
collection_df = pd.read_csv("data/collection/collection.tsv", sep='\t', header=None)
len(collection_df)

8841823

In [8]:
query_sample = queries_dev_df.head(500)
passage_sample = collection_df.head(5000)

## Preprocessing

In [87]:
queries_id = np.array(query_sample.iloc[:, 0])
queries = np.array(query_sample.iloc[:, -1])
queries[0]

'cost of endless pools/swim spa'

In [88]:
passages_id = np.array(passage_sample.iloc[:, 0])
passages = np.array(passage_sample.iloc[:, -1])
passages[0]

'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.'

In [56]:
def tokenize(corpus):
    tokenized_corpus = []

    for doc in corpus:
        # Remove specific punctuations
        for punctuation in PUNCTUATIONS:
            doc = doc.replace(punctuation, " ")

        # Get only the words, not the whitespace
        words = [word for word in doc.split(" ") if word]

        # Remove specific stopwords
        words = [word for word in words if word not in STOPWORDS]

        # Add to the list of tokenized docs
        tokenized_corpus.append(words)

    return tokenized_corpus

In [58]:
tokenized_queries = tokenize(queries)
tokenized_queries[0:3]

[['cost', 'endless', 'pools', 'swim', 'spa'], ['pcnt'], ['pcb', 'waste']]

In [59]:
tokenized_passages = tokenize(passages)
tokenized_passages[0:3]

[['The',
  'presence',
  'communication',
  'amid',
  'scientific',
  'minds',
  'equally',
  'important',
  'success',
  'Manhattan',
  'Project',
  'scientific',
  'intellect',
  'The',
  'cloud',
  'hanging',
  'impressive',
  'achievement',
  'atomic',
  'researchers',
  'engineers',
  'success',
  'truly',
  'meant',
  'hundreds',
  'thousands',
  'innocent',
  'lives',
  'obliterated'],
 ['The',
  'Manhattan',
  'Project',
  'atomic',
  'bomb',
  'helped',
  'bring',
  'end',
  'World',
  'War',
  'II',
  'Its',
  'legacy',
  'peaceful',
  'uses',
  'atomic',
  'energy',
  'continues',
  'impact',
  'history',
  'science'],
 ['Essay',
  'The',
  'Manhattan',
  'Project',
  'The',
  'Manhattan',
  'Project',
  'The',
  'Manhattan',
  'Project',
  'see',
  'making',
  'atomic',
  'bomb',
  'possible',
  'The',
  'success',
  'project',
  'would',
  'forever',
  'change',
  'world',
  'forever',
  'making',
  'known',
  'something',
  'powerful',
  'manmade']]

In [89]:
# Dictionary to look up id
query_lookup = {}
for idx, query in enumerate(queries):
    query_lookup[query] = queries_id[idx]

passage_lookup = {}
for idx, passage in enumerate(passages):
    passage_lookup[passage] = passages_id[idx]

## BM25 Implementation
- https://pypi.org/project/rank-bm25/
- http://www.cs.otago.ac.nz/homepages/andrew/papers/2014-2.pdf

In [60]:
bm25 = BM25Okapi(tokenized_passages)
query1 = tokenized_queries[0]
query1

['cost', 'endless', 'pools', 'swim', 'spa']

In [61]:
doc_scores = bm25.get_scores(query1)
print(len(doc_scores))
print(doc_scores)

5000
[0. 0. 0. ... 0. 0. 0.]


In [92]:
top_3 = bm25.get_top_n(query1, passages, n=3)
top_3

['The Army Navy Country Club has been operational for over 85 years and offers two golf courses, six swimming pools, 20 tennis courts (6 indoor and 14 outdoor), a fitness center and many more amenities.',
 'Difference Between Electromagnetic Waves and Radio Waves. Electromagnetic waves are a type of wave that is present in nature. The applications of electromagnetic waves are endless. The theory of electromagnetism is a vast field in classical mechanics and in modern physics, as well.',
 "If you're going to grow mint in a container, I would chose a nice BIG and deep one and mix 'em up (spearmint, peppermint, orange mint, apple mint, chocolate mint, lime mint, the varieties are endless!). It's nice to have different types."]

In [95]:
query1_id = query_lookup[queries[0]]
query1_id

1048578

In [97]:
for passage in top_3:
    print(passage_lookup[passage])

894
1839
3789


In [None]:
# TODO:
    # Use train/eval to see how well the bm25 works

In [109]:
qrels_dev_df = pd.read_csv("data/qrels.dev.tsv", sep='\t', header=None)
qrels_dev_df.head(10)

Unnamed: 0,0,1,2,3
0,1102432,0,2026790,1
1,1102431,0,7066866,1
2,1102431,0,7066867,1
3,1090282,0,7066900,1
4,39449,0,7066905,1
5,76162,0,7066915,1
6,195512,0,7066971,1
7,1090280,0,7067004,1
8,331318,0,5309290,1
9,300674,0,7067032,1


In [110]:
len(qrels_dev_df)

59273

In [115]:
len(qrels_dev_df.iloc[:, 0].unique())

55578

In [117]:
qrels_dev_df.iloc[:, 3].unique()

array([1], dtype=int64)

In [None]:
# Qid, literal 0, Pid, relevance

## Resultater
- Se M5-retrieval_evaluation for å se hvordan man skal sammenlikne rank og ground truth