## Francisco Teixeira Rocha Aragão - 2021031726
## Lorenzo Carneiro Magalhães - 2021031505

### Implementação do problema de Entity Search - Information Retrieval


### Primeira submissão

In [None]:
import csv
import json
import re
import string
from pathlib import Path
from typing import List

from rank_bm25 import BM25Okapi
from tqdm import tqdm
import nltk

from nltk.tokenize import wordpunct_tokenize

# definindo constantes para normalização dos textos
STOPWORDS = set(nltk.corpus.stopwords.words("english"))
PUNCT = set(string.punctuation)
REMOVE_SPACE = re.compile(r"\s+") # regex para substituir múltiplos espaços por um único espaço

def normalize(text: str) -> List[str]:
     """
     • lower-case
     • tokeniza com wordpunct_tokenize
     • remove stop-words / pontuação
     • descarta tokens de 1 caractere
     """
     text = REMOVE_SPACE.sub(" ", text.lower())
     tokens = [
         t for t in wordpunct_tokenize(text)
         if t not in STOPWORDS and t not in PUNCT and len(t) > 1
     ]
     return tokens


# definindo constantes e caminhos para arquivos
DATA_DIR     = Path("data")
CORPUS_PATH  = DATA_DIR / "corpus.jsonl"
TEST_PATH    = DATA_DIR / "test_queries.csv"
SUBM_PATH    = Path("submission.csv")
TOP_K        = 100  # máx. de entidades por query como descrito no enunciado


# salvando estruturas de indice pra armazenar informações do corpus
docs_tokens: List[List[str]] = []
entity_ids: List[str]        = []

with CORPUS_PATH.open(encoding="utf-8") as f:
    for line in tqdm(f, desc="corpus.jsonl"):
        doc = json.loads(line)

        # concatena campos relevantes de cada documento no corpus
        combined = " ".join([
            doc.get("title", ""),
            doc.get("text",  ""),
            " ".join(doc.get("keywords", [])),
        ])

        docs_tokens.append(normalize(combined))
        entity_ids.append(doc["id"])

bm25 = BM25Okapi(docs_tokens)
print(f"Num : {len(entity_ids):,} documentos indexados.\n")


print("ranking BM25")
rows_out: List[List[str]] = []

# pegando os scores de cada query e salvando os resultados
with TEST_PATH.open(encoding="utf-8") as f:
    
    reader = csv.DictReader(f)
    
    for row in tqdm(reader, desc="test_queries.csv"):
        
        qid, query = row["QueryId"], row["Query"]
        
        q_tokens   = normalize(query)

        if not q_tokens: # isso é só pra evitar queries vazias, idealmente não ocorre 
            continue
        
        scores   = bm25.get_scores(q_tokens)
        best_idx = sorted(range(len(scores)),
                          key=scores.__getitem__, reverse=True)[:TOP_K]
        
        rows_out.extend([[qid, entity_ids[i]] for i in best_idx])

print(f"total  de linhas na saída: {len(rows_out):,}\n")


# escrevendo arquivo de saida com os resultados
with SUBM_PATH.open("w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    
    writer.writerow(["QueryId", "EntityId"])
    writer.writerows(rows_out)

print(f"fim")


### Segunda submissão

In [24]:
import os
import csv, json, tqdm, numpy as np
from pathlib import Path
#from pyserini.search import SimpleSearcher
from pyserini.search.lucene import LuceneSearcher
from sentence_transformers import CrossEncoder

# caminhos aos arquivos importantes
DATA      = Path("data/ir-20251-rc")
TEST_FILE = DATA / "test_queries.csv"
SUBM_FILE = Path("submission.csv")
INDEX_DIR = Path("index_entities")

# parametros do pipeline de ranking
CAND_K = 1000   # numero de candidatos que recupero inicialmente
FINAL_K= 100    # numero final de entidades
W_CE = 0.7    # peso do cross-encoder na interpolação com o bm25

# trabalhando com formatação do corpus para Anserini
path_in  = DATA/ "corpus.jsonl"

os.makedirs("corp_anserini", exist_ok=True)
path_out = Path("corp_anserini", "corpus.jsonl")


with path_in.open(encoding="utf-8") as fin, \
     path_out.open("w", encoding="utf-8") as fout:
    for line in tqdm.tqdm(fin, desc="convert"):
        obj = json.loads(line)
        contents = " ".join([
            obj.get("title",""),
            " ".join(obj.get("keywords", [])),
            obj.get("text","")
        ])
        fout.write(json.dumps({"id": obj["id"], "contents": contents}) + "\n")

# essa variavel de ambiente evita que o java use muita memoria
# instalei o java com : sudo apt install openjdk-17-sdk
os.environ['_JAVA_OPTIONS'] = '-Xms4g -Xmx24g'

convert: 4641784it [00:37, 123545.53it/s]


In [None]:


import subprocess

cmd = [
    'python', '-m', 'pyserini.index.lucene',
    '-collection', 'JsonCollection',
    '-generator', 'DefaultLuceneDocumentGenerator',
    '-input', './corp_anserini',
    '-index', 'index_entities',
    '-threads', '16',
    '-storePositions', '-storeDocvectors', '-storeRaw'
]

subprocess.run(cmd, check=True)




Picked up _JAVA_OPTIONS: -Xms4g -Xmx24g


2025-06-21 14:51:10,783 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:205) - Setting log level to INFO
2025-06-21 14:51:10,785 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:209) - AbstractIndexer settings:
2025-06-21 14:51:10,785 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:210) -  + DocumentCollection path: ./corp_anserini
2025-06-21 14:51:10,785 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:211) -  + CollectionClass: JsonCollection
2025-06-21 14:51:10,786 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:212) -  + Index path: index_entities
2025-06-21 14:51:10,786 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:213) -  + Threads: 16
2025-06-21 14:51:10,786 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:214) -  + Optimize (merge segments)? false
2025-06-21 14:51:10,811 INFO  [main] index.IndexCollection (IndexCollection.java:246) - Using DefaultEnglishAnalyzer
2025-06-21 14:51:10,811 INFO  [main] index.IndexCollect

Jun 21, 2025 2:51:10 PM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false


2025-06-21 14:52:10,912 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:259) - 1,330,000 documents indexed


In [8]:

# agora sim inicio das tarefas de ranqueamento
searcher = LuceneSearcher(str(INDEX_DIR))
searcher.set_bm25(k1=0.92, b=0.36) # parametros do bm25, k1 controla a sensibilidade ao tamanho do documento, b controla a normalização
searcher.set_rm3(fb_terms=10, fb_docs=50, original_query_weight=0.5) # parâmetros do RM3, fb_terms é o número de termos de feedback, fb_docs é o número de documentos de feedback, original_query_weight é o peso da query original

ce = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

# como vou fazer interpolação dos modelos, preciso normalizar os scores dos metodos para poder agregalos
def normalize_scores(d):
    vals = np.array(list(d.values()))
    return {k: (v - vals.min()) / (np.ptp(vals) + 1e-9) for k, v in d.items()}

rows_out = []

with TEST_FILE.open() as f:
    reader = csv.DictReader(f)
    for row in tqdm.tqdm(reader, desc="Queries"):
        qid, query = row["QueryId"], row["Query"]

        # inicialmente uso rm3 + bm25 para achar candidatos relevantes para a query
        hits = searcher.search(query, CAND_K)
        cand_ids  = [h.docid for h in hits]
        bm25_dict = {h.docid: h.score for h in hits}

        # agora que tenho os candidatos, uso o cross-encoder para re-ranquear esses candidatos
        texts = [searcher.doc(did).raw() for did in cand_ids]
        ce_scores = ce.predict([(query, t) for t in texts], batch_size=32)
        ce_dict = dict(zip(cand_ids, ce_scores))

        # jutno os scores com interpolação
        b_norm = normalize_scores(bm25_dict)
        c_norm = normalize_scores(ce_dict)
        final_scores = {d: W_CE*c_norm[d] + (1-W_CE)*b_norm[d] for d in cand_ids}

        top_ids = sorted(final_scores, key=final_scores.get, reverse=True)[:FINAL_K]
        rows_out.extend([[qid, did] for did in top_ids])

# salvo os resultados obtidos 
with SUBM_FILE.open("w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["QueryId", "EntityId"])
    writer.writerows(rows_out)

print("fim")

Jun 21, 2025 2:19:48 PM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false
Queries: 13it [09:04, 41.86s/it]


KeyboardInterrupt: 

## Terceira submissão

In [None]:
import os
import csv, json, tqdm, numpy as np
import re
from pathlib import Path
#from pyserini.search import SimpleSearcher
from pyserini.search.lucene import LuceneSearcher
from sentence_transformers import CrossEncoder
import string
import nltk

from typing import List

from nltk.tokenize import wordpunct_tokenize

import subprocess

# definindo constantes para normalização dos textos
STOPWORDS = set(nltk.corpus.stopwords.words("english"))
PUNCT = set(string.punctuation)
REMOVE_SPACE = re.compile(r"\s+") # regex para substituir múltiplos espaços por um único espaço

def normalize(text: str) -> str:
     """
     • lower-case
     • tokeniza com wordpunct_tokenize
     • remove stop-words / pontuação
     • descarta tokens de 1 caractere
     """
     text = REMOVE_SPACE.sub(" ", text.lower())
     tokens = [
         t for t in wordpunct_tokenize(text)
         if t not in STOPWORDS and t not in PUNCT and len(t) > 1
     ]
     return " ".join(tokens)

In [None]:


# caminhos aos arquivos importantes
DATA      = Path("data/ir-20251-rc")
TEST_FILE = DATA / "test_queries.csv"
SUBM_FILE = Path("submission.csv")
INDEX_DIR = Path("index_entities")



# trabalhando com formatação do corpus para Anserini
path_in  = DATA/ "corpus.jsonl"

os.makedirs("corp_anserini", exist_ok=True)
path_out = Path("corp_anserini", "corpus.jsonl")


with path_in.open(encoding="utf-8") as fin, \
     path_out.open("w", encoding="utf-8") as fout:
    for line in tqdm.tqdm(fin, desc="convert"):
        obj = json.loads(line)
        contents = " ".join([
            obj.get("title",""),
            " ".join(obj.get("keywords", [])),
            obj.get("text","")
        ])
        fout.write(json.dumps({"id": obj["id"], "contents": normalize(contents)}) + "\n")



  from .autonotebook import tqdm as notebook_tqdm
convert: 4641784it [03:24, 22689.42it/s]


In [None]:

# essa variavel de ambiente evita que o java use muita memoria
# instalei o java com : sudo apt install openjdk-17-sdk
os.environ['_JAVA_OPTIONS'] = '-Xms4g -Xmx24g'

cmd = [
    'python', '-m', 'pyserini.index.lucene',
    '-collection', 'JsonCollection',
    '-generator', 'DefaultLuceneDocumentGenerator',
    '-input', './corp_anserini',
    '-index', 'index_entities',
    '-threads', '4',
    '-storePositions', '-storeDocvectors', '-storeRaw'
]

subprocess.run(cmd, check=True)

Picked up _JAVA_OPTIONS: -Xms4g -Xmx24g


2025-06-21 15:09:53,396 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:205) - Setting log level to INFO
2025-06-21 15:09:53,398 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:209) - AbstractIndexer settings:
2025-06-21 15:09:53,398 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:210) -  + DocumentCollection path: ./corp_anserini
2025-06-21 15:09:53,399 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:211) -  + CollectionClass: JsonCollection
2025-06-21 15:09:53,399 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:212) -  + Index path: index_entities
2025-06-21 15:09:53,399 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:213) -  + Threads: 4
2025-06-21 15:09:53,399 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:214) -  + Optimize (merge segments)? false
2025-06-21 15:09:53,420 INFO  [main] index.IndexCollection (IndexCollection.java:246) - Using DefaultEnglishAnalyzer
2025-06-21 15:09:53,421 INFO  [main] index.IndexCollecti

Jun 21, 2025 3:09:53 PM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false


2025-06-21 15:10:53,510 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:259) - 1,540,000 documents indexed
2025-06-21 15:11:53,511 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:259) - 3,080,000 documents indexed
2025-06-21 15:12:53,511 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:259) - 4,610,000 documents indexed
2025-06-21 15:13:17,629 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:307) - Indexing Complete! 4,641,784 documents indexed
2025-06-21 15:13:17,629 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:309) - indexed:        4,641,784
2025-06-21 15:13:17,630 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:310) - unindexable:            0
2025-06-21 15:13:17,630 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:311) - empty:                  0
2025-06-21 15:13:17,630 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:312) - skipped:                0
2025-06-21 15:13:17,630 INFO  [main] index.AbstractIndexer (A

CompletedProcess(args=['python', '-m', 'pyserini.index.lucene', '-collection', 'JsonCollection', '-generator', 'DefaultLuceneDocumentGenerator', '-input', './corp_anserini', '-index', 'index_entities', '-threads', '4', '-storePositions', '-storeDocvectors', '-storeRaw'], returncode=0)

In [8]:
# parametros do pipeline de ranking
CAND_K = 500   # numero de candidatos que recupero inicialmente
FINAL_K= 100    # numero final de entidades
W_CE = 0.8    # peso do cross-encoder na interpolação com o bm25

# agora sim inicio das tarefas de ranqueamento
searcher = LuceneSearcher(str(INDEX_DIR))
searcher.set_bm25(k1=1.2, b=0.75) # parametros do bm25, k1 controla a sensibilidade ao tamanho do documento, b controla a normalização
searcher.set_rm3(fb_terms=10, fb_docs=50, original_query_weight=0.6) # parâmetros do RM3, fb_terms é o número de termos de feedback, fb_docs é o número de documentos de feedback, original_query_weight é o peso da query original

ce = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

# como vou fazer interpolação dos modelos, preciso normalizar os scores dos metodos para poder agregalos
def normalize_scores(d):
    vals = np.array(list(d.values()))
    return {k: (v - vals.min()) / (np.ptp(vals) + 1e-9) for k, v in d.items()}

rows_out = []

with TEST_FILE.open() as f:
    reader = csv.DictReader(f)
    for row in tqdm.tqdm(reader, desc="Queries"):
        qid, query = row["QueryId"], row["Query"]

        # inicialmente uso rm3 + bm25 para achar candidatos relevantes para a query
        hits = searcher.search(query, CAND_K)
        cand_ids  = [h.docid for h in hits]
        bm25_dict = {h.docid: h.score for h in hits}

        # agora que tenho os candidatos, uso o cross-encoder para re-ranquear esses candidatos
        texts = [searcher.doc(did).raw() for did in cand_ids]
        ce_scores = ce.predict([(query, t) for t in texts], batch_size=32)
        ce_dict = dict(zip(cand_ids, ce_scores))

        # jutno os scores com interpolação
        b_norm = normalize_scores(bm25_dict)
        c_norm = normalize_scores(ce_dict)
        final_scores = {d: W_CE*c_norm[d] + (1-W_CE)*b_norm[d] for d in cand_ids}

        top_ids = sorted(final_scores, key=final_scores.get, reverse=True)[:FINAL_K]
        rows_out.extend([[qid, did] for did in top_ids])

# salvo os resultados obtidos 
with SUBM_FILE.open("w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["QueryId", "EntityId"])
    writer.writerows(rows_out)

print("fim")

Queries: 233it [36:22,  9.37s/it]

fim





### Quarta submissão

In [2]:
# parametros do pipeline de ranking
CAND_K = 1000   # numero de candidatos que recupero inicialmente
FINAL_K= 100    # numero final de entidades

# agora sim inicio das tarefas de ranqueamento
searcher = LuceneSearcher(str(INDEX_DIR))
searcher.set_bm25(k1=1.2, b=0.75) # parametros do bm25, k1 controla a sensibilidade ao tamanho do documento, b controla a normalização
searcher.set_rm3(fb_terms=12, fb_docs=50, original_query_weight=0.6) # parâmetros do RM3, fb_terms é o número de termos de feedback, fb_docs é o número de documentos de feedback, original_query_weight é o peso da query original

ce = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

# como vou fazer interpolação dos modelos, preciso normalizar os scores dos metodos para poder agregalos
def normalize_scores(d):
    vals = np.array(list(d.values()))
    return {k: (v - vals.min()) / (np.ptp(vals) + 1e-9) for k, v in d.items()}

rows_out = []

with TEST_FILE.open() as f:
    reader = csv.DictReader(f)
    for row in tqdm.tqdm(reader, desc="Queries"):
        qid, query = row["QueryId"], row["Query"]

        # inicialmente uso rm3 + bm25 para achar candidatos relevantes para a query
        hits = searcher.search(query, CAND_K)
        cand_ids  = [h.docid for h in hits]
        bm25_dict = {h.docid: h.score for h in hits}

        # agora que tenho os candidatos, uso o cross-encoder para re-ranquear esses candidatos
        texts = [searcher.doc(did).raw() for did in cand_ids]
        ce_scores = ce.predict([(query, t) for t in texts], batch_size=32)
        ce_dict = dict(zip(cand_ids, ce_scores))

        # jutno os scores com interpolação
        b_norm = normalize_scores(bm25_dict)
        c_norm = normalize_scores(ce_dict)

        final_scores = {d: c_norm[d] for d in cand_ids}

        top_ids = sorted(final_scores, key=final_scores.get, reverse=True)[:FINAL_K]
        rows_out.extend([[qid, did] for did in top_ids])

# salvo os resultados obtidos 
with SUBM_FILE.open("w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["QueryId", "EntityId"])
    writer.writerows(rows_out)

print("fim")

Jun 21, 2025 4:06:52 PM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false
Queries: 233it [1:20:11, 20.65s/it]

fim



