In [1]:
!pip install KafNafParserPy
!pip install nltk

Now using node v22.17.0 (npm v10.9.2)
Collecting KafNafParserPy
  Downloading KafNafParserPy-1.896.tar.gz (37 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting lxml (from KafNafParserPy)
  Downloading lxml-6.0.1-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Downloading lxml-6.0.1-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl (5.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m29.8 MB/s[0m  [33m0:00:00[0m
[?25hBuilding wheels for collected packages: KafNafParserPy
  Building wheel for KafNafParserPy (pyproject.toml) ... [?25ldone
[?25h  Created wheel for KafNafParserPy: filename=kafnafparserpy-1.896-py3-none-any.whl size=51545 sha256=60e33d389168699ec56f1d504f8cf6172118c21e753af522e859470a5528385c
  Stored in directory: /home/erich/.cache/pip/wheels/a3/41/21/8d195

In [13]:
import numpy as np
import pandas as pd
from KafNafParserPy import KafNafParser
import nltk

# Implementación métricas de evaluación de IR

In [14]:
def precision(query_relevance: list):
  return sum(query_relevance)/len(query_relevance)

def precision_at_k(query_relevance: list, k: int):
  return sum(query_relevance[:k])/k

def recall_at_k(query_relevance: list, relevant_docs: int, k: int):
  return sum(query_relevance[:k])/relevant_docs

def average_precision(query_relevance: list):
  relevant_docs = sum(query_relevance)
  found_relevant = 0
  k = 0
  acc_precision = 0
  while found_relevant < relevant_docs:
    if query_relevance[k] == 0:
      k+=1
      continue
    acc_precision += precision(query_relevance[:k+1])
    found_relevant += 1
    k += 1
  return acc_precision/relevant_docs

def mean_average_precision(queries_relevance: list[list]):
  return sum([average_precision(x) for x in queries_relevance])/len(queries_relevance)

def dcg_at_k(query_relevance: list, k):
    acc_dcg = 0
    for i in range(k):
        acc_dcg += query_relevance[i]/np.log2(max(i+1,2))
    return acc_dcg

def ndcg_at_k(query_relevance: list, k: int):
    dcg_k = dcg_at_k(query_relevance, k)
    best_dcg_k = dcg_at_k(sorted(query_relevance, reverse=True), k)
    return dcg_k/best_dcg_k


# precision([0, 0, 0, 1])
# precision_at_k([0, 0, 0, 1], 1)
# recall_at_k([0, 0, 0, 1], 4, 1)
# average_precision([0, 1, 0, 1, 1, 1, 1])
# mean_average_precision([[0, 1, 0, 1, 1, 1, 1], [0, 0, 0, 1]])
# dcg_at_k([4,4,3,0,0,1,3,3,3,0], 6)
# ndcg_at_k([4,3,4,2,0,0,0,1,1,0], 2)


# Implementación de motores de búsqueda

### Setup

#### NLTK Setup

In [15]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
stop_words = nltk.corpus.stopwords.words('english')

wptk = nltk.WordPunctTokenizer()
ps = PorterStemmer()
lm = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /home/erich/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/erich/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### Load data

In [16]:

import os

# Define the path to the desired directory
directory_path = './data'

# Check if the directory exists
if os.path.exists(directory_path):
    print(f"Directory '{directory_path}' exists.")
else:
    print(f"Directory '{directory_path}' does not exist.")
    print("Please make sure the path is correct and the folder is shared with your account if it's not your own.")


Directory './data' exists.


## Búsqueda binaria con índice invertido

### Normalize documents

In [None]:
import re
def normalize_document(doc):
  doc = re.sub(r"[^a-zA-Z1-9\s]", " ", doc, flags=re.I|re.A|re.MULTILINE)
  doc = doc.lower()
  doc = doc.strip()
  tokens = wptk.tokenize(doc)
  filtered_tokens = [ps.stem(token) for token in tokens if token not in stop_words]
  # filtered_tokens = [lm.lemmatize(token) for token in tokens if token not in stop_words]
  doc = ' '.join(filtered_tokens)
  return doc, filtered_tokens

In [18]:
files = []
tokens = []
for file in os.listdir(directory_path + '/docs-raw-texts'):
  id = file.split('.')[1]
  doc = KafNafParser(directory_path + f"/docs-raw-texts/{file}")
  title = str(doc.get_header().get_fileDesc().get_title())
  text, new_tokens = normalize_document(str(doc.get_raw()))
  for token in new_tokens:
    tokens.append((token, id))
  files.append((id, title, text))

# Inv Index has the following shape:
# {<term1>: {<doc_1>: tf_doc1, <doc_2>: tf_doc2, ...} }

inv_index = {}

for t in tokens:
  if t[0] not in inv_index:
    inv_index[t[0]] = {}

  if t[1] not in inv_index[t[0]]:
    inv_index[t[0]][t[1]] = 1
  else:
    inv_index[t[0]][t[1]] += 1

for t, v in inv_index.items():
  inv_index[t] = {'freq': len(v), 'docs': v}

docs = pd.DataFrame(files, columns=['id', 'title', 'text'])

In [40]:
inv_index["barber"]

{'freq': 2, 'docs': {'d162': 1, 'd086': 1}}

In [19]:
# Inv Index API

def get_term_docs(inv_index, term):
  return set(inv_index[term]['docs'].keys())

def get_term_tf(inv_index, term, doc):
  return inv_index[term]['docs'].get(doc, 0)

def get_term_df(inv_index, term):
  return inv_index[term]['freq']

In [20]:

# for t, v in inv_index.items():
#   if list(filter(lambda x: x>1, v['docs'].values())):
#     print(f"{t} has repeated docs: {v}")

  # repeated = len(v['docs']) == len(set(v['docs']))
  # if repeated:
  #   print(f"{t} has repeated docs")

In [21]:
# Normalization + Tokenize + Stemmer = 13702 terms
# Regex filter + Normalization + Tokenize + Stemmer = 13339 terms
# Regex filter + Normalization + Tokenize + Lemmatizer = 166623 terms
len(inv_index)

13339

### Queries

In [41]:


def parse_query(query: str):
  query_split = query.split()

  include_terms = set()
  exclude_terms = set()

  not_pattern = r'NOT\s+([a-zA-Z1-9]+)'
  not_matches = re.findall(not_pattern, query, re.IGNORECASE)
  exclude_terms = {normalize_document(term)[0] for term in not_matches}

  query_without_not = re.sub(r'\s+NOT\s+\w+', '', query, flags=re.IGNORECASE)
  word_pattern = r'\b(?!AND\b|NOT\b)(\w+)\b'
  include_matches = re.findall(word_pattern, query_without_not, re.IGNORECASE)
  include_terms = {normalize_document(term)[0] for term in include_matches}

  return include_terms, exclude_terms

def calculate_query(query: str):
  include_terms, exclude_terms = parse_query(query)

  docs = []
  for t in include_terms:
    if t not in inv_index:
      continue
    docs.append(set(inv_index[t]['docs'].keys()))

  result = set.intersection(*docs)
  for t in exclude_terms:
    excluded_docs = set(inv_index[t]['docs'].keys())
    result = result.difference(excluded_docs)

  return result


# query = "move AND three AND chair" # d006, d312
query = "Move AND Three AND Chair NOT brigham" # d006
calculate_query(query)

{'d006'}

### Calculate queries

In [42]:
queries = []
for file in os.listdir(directory_path + '/queries-raw-texts'):
  id = file.split('.')[1]
  doc = KafNafParser(directory_path + f"/queries-raw-texts/{file}")
  _, tokens = normalize_document(str(doc.get_raw()))
  queries.append((id, ' AND '.join(tokens)))
queries = sorted(queries)

In [43]:
results = []
for query in queries:
  result = calculate_query(query[1])
  results.append((query[0], result))

In [44]:
with open(directory_path + '/BSII-AND-queries_results.tsv', 'w') as f:
  for r in results:
    res_queries_str = ",".join([f"{doc}:1" for doc in r[1]])
    f.write(f"{r[0]}\t{res_queries_str}")
    f.write("\n")


# Recuperación ranqueada y vectorización

## Estrategia construcción de representación tf.idf

**TODO**: Describir estrategia

In [45]:
def create_tf_idf(inv_index):
  term_index = []
  doc_map = {}
  doc_index = []
  doc_i = 0

  # Calculate number of docs and assing indexes
  for t, v in inv_index.items():
    term_index.append(t)
    for d, tf in v['docs'].items():
      if d not in doc_map:
        doc_map[d] = len(doc_index)
        doc_index.append(d)

  N = len(doc_index)
  tf_idf = np.zeros((len(term_index), N))

  # Calculate idf per term and tf-idf per document per term
  for i, t in enumerate(term_index):
    idf = np.log10(N/inv_index[t]['freq'])

    for d, tf in inv_index[t]['docs'].items():
      tf_idf[i, doc_map[d]] = np.log10(1+tf) * idf

  return tf_idf, term_index, doc_index, doc_map

tf_idf, term_index, doc_index, doc_map = create_tf_idf(inv_index)

In [46]:
def cos_similarity(doc1_v, doc2_v):
  norm_doc1_v = doc1_v/np.linalg.norm(np.array(doc1_v))
  norm_doc2_v = doc2_v/np.linalg.norm(np.array(doc2_v))
  return np.dot(norm_doc1_v, norm_doc2_v)

In [47]:
term_index.index('poetri')

2617

In [48]:
def calculate_doc_tfidf(query):
  _, filtered_tokens = normalize_document(query)
  tokens_index = {term: filtered_tokens.count(term) for term in filtered_tokens if term in inv_index}
  query_tf_idf = np.zeros(len(term_index))

  for t, tf in tokens_index.items():
    idf = np.log10(len(doc_index)/inv_index[t]['freq'])
    tfidf = np.log10(1+tf)*idf
    query_tf_idf[term_index.index(t)] = tfidf

  return query_tf_idf

# Sample term index: {'famou': 156, 'german': 894, 'poetri': 204}
# query = "famous German poetry"
# calculate_doc_tfidf(query)

In [49]:
queries = []
for file in os.listdir(directory_path + '/queries-raw-texts'):
  id = file.split('.')[1]
  doc = KafNafParser(directory_path + f"/queries-raw-texts/{file}")
  _, tokens = normalize_document(str(doc.get_raw()))
  queries.append((id, ' AND '.join(tokens)))
queries = sorted(queries)

In [50]:
def calculate_query_rank(query):
  query_tf_idf = calculate_doc_tfidf(query)
  similarity = [cos_similarity(query_tf_idf, tf_idf[:, i]) for i in range(tf_idf.shape[1])]
  query_result = zip(doc_index, similarity)
  query_result = list(filter(lambda x: x[1] > 0, query_result))
  query_result = sorted(query_result, key=lambda x: x[1], reverse=True)
  return query_result

results = []
for query in queries:
  result = calculate_query_rank(query[1])
  results.append((query[0], result))
results = sorted(results)

In [51]:
with open(directory_path + '/RRDV-consultas_resultados.tsv', 'w') as f:
  for r in results:
    res_queries_str = ",".join([f"{doc[0]}:{doc[1]}" for doc in r[1]])
    f.write(f"{r[0]}\t{res_queries_str}")
    f.write("\n")

# Judgements

In [52]:
judgements = pd.read_csv(directory_path + '/relevance-judgments.tsv', sep='\t', header=None)
judgements[1] = judgements[1].str.split(',', expand=False)

In [None]:
metrics = [] # query, P@M, R@M, NDCG@M
queries_precision = []

for q in range(len(results)):
# for q in range(1):
  query_id, relevant_docs = judgements.iloc[q]
  M = len(relevant_docs)
  relevant_docs_bin = {doc.split(':')[0]: doc.split(':')[1] for doc in relevant_docs}

  bin_relevance = np.zeros(M)
  ranked_relevance = np.zeros(M)
  query_result = results[q][1]
  for k in range(M):
    if query_result[k][0] in relevant_docs_bin:
      bin_relevance[k] = 1

    ranked_relevance[k] = relevant_docs_bin.get(query_result[k][0], 0)

  metrics.append((
      query_id,
      precision_at_k(bin_relevance, M),
      recall_at_k(bin_relevance, M, M),
      ndcg_at_k(ranked_relevance, M)
  ))
  queries_precision.append(bin_relevance)

res_map = mean_average_precision(queries_precision)
res_map

In [68]:
queries_precision

[array([0., 0., 1.]),
 array([1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.]),
 array([1., 1., 0., 1., 0., 0.]),
 array([1., 1., 0., 1., 1., 1., 0.]),
 array([1., 1., 1., 0., 1., 0.]),
 array([0., 1., 0., 0.]),
 array([1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1.]),
 array([1., 1., 1., 1., 1., 0.]),
 array([1., 0., 1., 0., 1., 0., 0., 0.]),
 array([1., 1., 1., 0.]),
 array([1., 1., 0., 1., 1.]),
 array([0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 1., 0.]),
 array([0., 1.]),
 array([1., 1., 1., 0.]),
 array([1., 1., 1., 0., 1., 1., 1.]),
 array([1., 0.]),
 array([1., 1., 1., 0., 0., 0., 1.]),
 array([1., 1., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 1., 0.]),
 array([1., 1., 0., 0.]),
 array([1.]),
 array([1., 1., 0., 1., 0., 0., 0., 0.]),
 array([1., 1., 0.]),
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0.]),
 array([1., 1., 1., 1., 1.]),
 array([1.]),
 array([1., 1., 1., 1., 1., 1., 1., 1., 0., 1.]),
 array([1., 1., 0.]),
 array([1., 0., 1., 0., 0., 1., 0., 0.]),
 array([1., 1.

In [69]:
len(queries_precision)

35

In [66]:
queries_precision[0]

array([0., 0., 1.])

In [67]:
queries_precision[1]

array([1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.])

In [54]:
import csv

with open(directory_path + '/IR_Metrics.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['query_id', 'P@M', 'R@M', 'NDCG@M'])
    writer.writerows(metrics)

In [None]:
res_map = mean_average_recall(queries_precision)
res_map

0.8698643828142127

In [64]:
queries_precision

[array([0., 0., 1.]),
 array([1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.]),
 array([1., 1., 0., 1., 0., 0.]),
 array([1., 1., 0., 1., 1., 1., 0.]),
 array([1., 1., 1., 0., 1., 0.]),
 array([0., 1., 0., 0.]),
 array([1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1.]),
 array([1., 1., 1., 1., 1., 0.]),
 array([1., 0., 1., 0., 1., 0., 0., 0.]),
 array([1., 1., 1., 0.]),
 array([1., 1., 0., 1., 1.]),
 array([0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 1., 0.]),
 array([0., 1.]),
 array([1., 1., 1., 0.]),
 array([1., 1., 1., 0., 1., 1., 1.]),
 array([1., 0.]),
 array([1., 1., 1., 0., 0., 0., 1.]),
 array([1., 1., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 1., 0.]),
 array([1., 1., 0., 0.]),
 array([1.]),
 array([1., 1., 0., 1., 0., 0., 0., 0.]),
 array([1., 1., 0.]),
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0.]),
 array([1., 1., 1., 1., 1.]),
 array([1.]),
 array([1., 1., 1., 1., 1., 1., 1., 1., 0., 1.]),
 array([1., 1., 0.]),
 array([1., 0., 1., 0., 0., 1., 0., 0.]),
 array([1., 1.

In [56]:
docs.to_csv(directory_path + '/docs.csv', index=False)