In [None]:
!pip install ir_datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ir_datasets
  Downloading ir_datasets-0.5.4-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.5/311.5 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting warc3-wet-clueweb09>=0.2.5
  Downloading warc3-wet-clueweb09-0.2.5.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyautocorpus>=0.1.1
  Downloading pyautocorpus-0.1.9-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (293 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.3/293.3 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
Collecting lz4>=3.1.1
  Downloading lz4-4.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
Collecting zlib-state>=0.1.3
  Downloading zlib_state-0.1.5-cp39-

In [None]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import ir_datasets
dataset = ir_datasets.load("cranfield")

In [None]:
corpus = [doc.text for doc in dataset.docs_iter()]

[INFO] [starting] http://ir.dcs.gla.ac.uk/resources/test_collections/cran/cran.tar.gz
[INFO] [finished] http://ir.dcs.gla.ac.uk/resources/test_collections/cran/cran.tar.gz: [00:00] [507kB] [962kB/s]


In [None]:
corpus[:2]

['experimental investigation of the aerodynamics of a\nwing in a slipstream .\n  an experimental study of a wing in a propeller slipstream was\nmade in order to determine the spanwise distribution of the lift\nincrease due to slipstream at different angles of attack of the wing\nand at different free stream to slipstream velocity ratios .  the\nresults were intended in part as an evaluation basis for different\ntheoretical treatments of this problem .\n  the comparative span loading curves, together with\nsupporting evidence, showed that a substantial part of the lift increment\nproduced by the slipstream was due to a /destalling/ or\nboundary-layer-control effect .  the integrated remaining lift\nincrement, after subtracting this destalling lift, was found to agree\nwell with a potential flow theory .\n  an empirical evaluation of the destalling effects was made for\nthe specific configuration of the experiment .',
 "simple shear flow past a flat plate in an incompressible fluid of sm

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# prepare data 

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import math
from tqdm import tqdm

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
preprocessed_corpus = []
for doc in corpus:
    tokens = word_tokenize(doc.lower())
    filtered_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    preprocessed_corpus.append(' '.join(filtered_tokens))

In [None]:
queries = [query.text for query in dataset.queries_iter()]
rels = {}
for qrel in dataset.qrels_iter():
  rels[int(qrel.query_id)] = []
for qrel in dataset.qrels_iter():
  rels[int(qrel.query_id)].append(int(qrel.doc_id))

In [None]:
def build_doc_tfidf(docs):
  appear_in_docs = {}
  posting_list = {}

  # init 
  for doc in docs:
    for term in doc.split():
      appear_in_docs[term] = 0
      posting_list[term] = []


  for i, doc in enumerate(docs):
    appear_in_this = {}
    for term in doc.split():
      if not term in appear_in_this:
        appear_in_docs[term] += 1
        posting_list[term].append(i)

      appear_in_this[term] = 1

  idf = {}
  for term in posting_list:
    idf[term] = math.log2(1400 / appear_in_docs[term])

  docs_weight = []
  for doc in docs:
    cur_weight = {}
    tf = {}
    for term in doc.split():
      tf[term] = 0
    for term in doc.split():
      tf[term] += 1

    for term in tf:
      cur_weight[term] = tf[term] * idf[term]
    
    docs_weight.append(cur_weight)

  return docs_weight, posting_list, idf

In [None]:
doc_tfidf, posting, idf = build_doc_tfidf(preprocessed_corpus)

# query

In [None]:
res = {}
for i, query in tqdm(enumerate(queries)): 

  tokens = word_tokenize(query.lower())
  terms = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

  # calc weights for query
  qtf = {} 
  for term in terms:
    qtf[term] = 0
  for term in terms:
    qtf[term] += 1
  
  myDocScore = {}
  for term in terms:
    if term in posting:
      for doc_id in posting[term]:
        if doc_id not in myDocScore:
          myDocScore[doc_id] = doc_tfidf[doc_id][term] * (qtf[term] * idf[term])
        else:
          myDocScore[doc_id] += doc_tfidf[doc_id][term] * (qtf[term] * idf[term])

  # divide by len because the weight is not normalized yet
  for id in myDocScore:
    myDocScore[id] /= len(doc_tfidf[id])
    
  sorted_score = [key+1 for key, value in sorted(myDocScore.items(), key=lambda item: -item[1])]
  res[i+1] = sorted_score

225it [00:00, 288.10it/s]


# evaluate

In [None]:
def calculate_interpolated_map(queries, relevant_docs):
    sum=0
    lens=len(queries.keys())
    for query in queries.keys():
        ranked_docs = queries[query]
        precision = []
        recall = []
        relevant = set(relevant_docs[query])
        retrieved = set()
        for i, doc in enumerate(ranked_docs):
            if doc in relevant:
                retrieved.add(doc)
            precision.append(len(retrieved) / (i + 1))
            recall.append(len(retrieved) / len(relevant))
        precision_sum=0
        for i in range(11):
            recall_level=i/10
            precision_list=[precision[i] for i in range(len(precision)) if recall[i] >= recall_level]
            if(len(precision_list)==0):
                precision_list=0
            else:
              precision_list=max(precision_list)
            precision_sum+=(precision_list/11)
        sum+=(precision_sum/lens)

    return sum

In [26]:
result_vector_score = res
true_answer = rels

In [27]:
calculate_interpolated_map(result_vector_score, true_answer)

0.3436494952901474