**Examen Final – Diseño e Implementación de un Sistema de Recuperación de Información**

**Nombre:** Aarón Yumancela

**Objetivo del sistema**


El sistema de recuperación de información debe ser capaz de responder consultas textuales como,
por ejemplo:



*   “International conflict”
*   “Economic policy news”
*   “Natural disasters”


El sistema deberá retornar un ranking de documentos ordenados por relevancia, utilizando un pipeline
de recuperación semántica y re-ranking.


**0) Setup (instalación + imports)**

In [1]:
# Instalación de dependencias
!pip -q install kagglehub faiss-cpu sentence-transformers nltk tqdm

# Librerías principales del sistema
import os, json, re
import numpy as np
import faiss
from tqdm import tqdm

import nltk
# Recursos lingüísticos NLTK
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sentence_transformers import SentenceTransformer, CrossEncoder
import kagglehub


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/23.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/23.8 MB[0m [31m157.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━[0m [32m14.1/23.8 MB[0m [31m208.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m23.2/23.8 MB[0m [31m259.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m23.8/23.8 MB[0m [31m253.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m113.2 MB/s[0m eta [36m0:00:00[0m
[?25h

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


**1) Preprocesamiento (tokenización, minúsculas, stopwords, stemming)**

In [2]:
# Preprocessing configuration shared by documents and queries:
# the English stopword list from NLTK (as a set for fast membership tests)
# and a single Porter stemmer instance reused across calls.
STOPWORDS = set(stopwords.words("english"))
STEMMER = PorterStemmer()

# Text preprocessing
def preprocess(text: str) -> str:
    """Normalize a text into a space-joined string of stemmed tokens.

    Steps: lowercase, collapse whitespace runs into single spaces, tokenize
    with NLTK's ``word_tokenize``, keep only alphanumeric tokens that are not
    in ``STOPWORDS``, and stem the survivors with ``STEMMER``.

    Args:
        text: Raw input text. Any falsy value (``""``, ``None``) is accepted.

    Returns:
        The processed tokens joined by single spaces, or ``""`` when the
        input is falsy.
    """
    if not text:
        return ""

    normalized = re.sub(r"\s+", " ", text.lower())
    stems = [
        STEMMER.stem(tok)
        for tok in word_tokenize(normalized)
        if tok.isalnum() and tok not in STOPWORDS
    ]
    return " ".join(stems)


**2) Cargar dataset (arXiv vía KaggleHub)**

In [3]:
# Fetch the arXiv dataset through KaggleHub; `path` is the local directory
# that holds the downloaded files.
path = kagglehub.dataset_download("Cornell-University/arxiv")
print("Dataset path:", path)

data_file = os.path.join(path, "arxiv-metadata-oai-snapshot.json")
# Explicit runtime check instead of `assert`: assertions are removed when
# Python runs with -O, which would let a missing file slip through to open().
if not os.path.exists(data_file):
    raise FileNotFoundError(f"No se encontró el archivo: {data_file}")


Using Colab cache for faster access to the 'arxiv' dataset.
Dataset path: /kaggle/input/arxiv


In [31]:
# Maximum number of lines read from the snapshot file
N_DOCS = 5000

raw_docs = []
# Each line of the snapshot is parsed as one JSON record. The file is opened
# with an explicit UTF-8 encoding so decoding does not depend on the
# platform's default locale encoding.
with open(data_file, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= N_DOCS:
            break
        paper = json.loads(line)
        # `or ""` also covers records where the field is present but null
        title = paper.get("title", "") or ""
        abstract = paper.get("abstract", "") or ""
        text = (title + " " + abstract).strip()
        # Records with neither title nor abstract are skipped
        if text:
            raw_docs.append({
                "doc_id": paper.get("id", f"doc_{i}"),
                "title": title.strip(),
                "text": text
            })

len(raw_docs), raw_docs[0]["doc_id"], raw_docs[0]["title"][:80]


(5000,
 '0704.0001',
 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC e')

**3) Embeddings de documentos + estructura vectorial (FAISS)**

In [32]:
# Parallel lists describing the indexed collection: position i in each list
# refers to the same document.
doc_ids = []
doc_texts_proc = []
doc_texts_raw = []

# Preprocess every loaded document
for d in raw_docs:
    raw = d["text"]
    proc = preprocess(raw)

    # Keep only documents that still have content after preprocessing,
    # appending to all three lists together so they stay aligned
    if proc.strip():
        doc_ids.append(d["doc_id"])
        doc_texts_proc.append(proc)
        doc_texts_raw.append(raw)


# Sanity check: the three lists must have the same length
len(doc_ids), len(doc_texts_proc), len(doc_texts_raw)



(5000, 5000, 5000)

In [33]:

# The texts that get embedded are the preprocessed (stemmed) versions
doc_texts = doc_texts_proc


In [34]:
# Lookup table: doc_id -> original (unprocessed) text
doc_text_by_id = dict(zip(doc_ids, doc_texts_raw))


In [35]:
# Bi-encoder model; the same model embeds documents here and queries in
# first_stage_retrieval
bi_encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

BATCH_SIZE = 64  # number of texts passed to each encode() call
emb_list = []

# Encode the documents batch by batch, with a progress bar.
# normalize_embeddings=True is requested on every call, matching the query side.
for i in tqdm(range(0, len(doc_texts), BATCH_SIZE)):
    batch = doc_texts[i:i+BATCH_SIZE]
    emb = bi_encoder.encode(batch, convert_to_numpy=True, normalize_embeddings=True)
    emb_list.append(emb)

# Stack the per-batch arrays into one (n_docs, dim) matrix and cast it to
# float32 before it is added to the FAISS index
doc_emb = np.vstack(emb_list).astype("float32")
doc_emb.shape


100%|██████████| 79/79 [00:09<00:00,  8.04it/s]


(5000, 384)

In [36]:
dim = doc_emb.shape[1]
# Flat inner-product FAISS index. The embeddings were encoded with
# normalize_embeddings=True, so the inner product is intended to act as
# cosine similarity.
index = faiss.IndexFlatIP(dim)
index.add(doc_emb)

# Number of vectors stored in the index
index.ntotal


5000

**4) Recuperación inicial (First-stage retrieval top-k)**

In [37]:
# First-stage vector retrieval
def first_stage_retrieval(query_text: str, k: int = 50):
    """Return up to ``k`` candidate documents for a query from the FAISS index.

    The query goes through the same ``preprocess`` step as the indexed
    documents, is embedded with ``bi_encoder`` and searched in ``index``.

    Args:
        query_text: Free-text query.
        k: Maximum number of candidates to return.

    Returns:
        A list of ``(doc_id, score)`` tuples in the order returned by the
        index search. The list is empty when ``k`` is not positive or when
        nothing is left of the query after preprocessing.
    """
    # A non-positive k cannot yield any candidate; answer without searching.
    if k <= 0:
        return []

    q = preprocess(query_text)
    # An empty query, or one made only of stopwords/punctuation, preprocesses
    # to "". Embedding that string would return neighbours unrelated to the
    # user's intent, so report "no candidates" instead.
    if not q:
        return []

    q_emb = bi_encoder.encode(
        [q], convert_to_numpy=True, normalize_embeddings=True
    ).astype("float32")
    scores, idxs = index.search(q_emb, k)

    # Positions reported as -1 are filtered out (no document at that rank,
    # e.g. when k exceeds the number of indexed vectors).
    return [
        (doc_ids[i], float(s))
        for i, s in zip(idxs[0].tolist(), scores[0].tolist())
        if i != -1
    ]


**5) Re-ranking (Cross-Encoder) sobre top-k**

In [38]:
# Cross-encoder model used for re-ranking
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# doc_id -> original text lookup used by rerank (the same mapping is also
# built in an earlier cell)
doc_text_by_id = {did: raw for did, raw in zip(doc_ids, doc_texts_raw)}

# Semantic re-ranking
def rerank(query_text: str, candidates, top_n: int = 10):
    """Re-score first-stage candidates with the cross-encoder.

    Each candidate id is resolved to its original text through
    ``doc_text_by_id``; candidates without a (non-empty) text are dropped.
    The remaining ``(query, text)`` pairs are scored by ``cross_encoder``.

    Args:
        query_text: The raw query, passed to the cross-encoder unprocessed.
        candidates: Iterable of ``(doc_id, score)`` pairs; the incoming score
            is ignored.
        top_n: Number of results to keep after sorting.

    Returns:
        Up to ``top_n`` ``(doc_id, ce_score)`` pairs sorted by descending
        cross-encoder score, or ``[]`` if no candidate has text.
    """
    resolved = ((did, doc_text_by_id.get(did)) for did, _ in candidates)
    usable = [(did, txt) for did, txt in resolved if txt]
    if not usable:
        return []

    kept_ids = [did for did, _ in usable]
    query_doc_pairs = [(query_text, txt) for _, txt in usable]

    ce_scores = cross_encoder.predict(query_doc_pairs)
    scored = list(zip(kept_ids, ce_scores))
    scored.sort(key=lambda item: item[1], reverse=True)
    return scored[:top_n]


**6) Simulación de consultas (múltiples queries) + mostrar antes/después**

In [47]:
# Evaluation queries: each entry carries an id and the free-text query.
# NOTE(review): Q1-Q3 are news-style topics while the indexed collection is
# arXiv titles/abstracts, so few documents may match — confirm this is intended.
queries = [
    {"query_id": "Q1", "text": "international conflict"},
    {"query_id": "Q2", "text": "economic policy news"},
    {"query_id": "Q3", "text": "natural disasters"},
    {"query_id": "Q4", "text": "neural networks"},
    {"query_id": "Q5", "text": "information retrieval"},
]


In [48]:
# Result display

def show_results(qid, query_text, first_stage, reranked, n_show=5, snippet_len=200):
    """Print the first-stage and re-ranked rankings for one query.

    Args:
        qid: Query identifier shown in the header.
        query_text: Query text shown in the header.
        first_stage: List of ``(doc_id, score)`` from the vector search.
        reranked: List of ``(doc_id, ce_score)`` from the cross-encoder.
        n_show: How many entries of each list to print.
        snippet_len: Number of characters of document text shown per entry.

    Returns:
        None. Output goes to stdout.
    """

    def _print_entry(position, did, score_field):
        # Snippet: leading characters of the original text with newlines
        # flattened; "..." is always appended.
        body = doc_text_by_id.get(did, "")
        preview = body[:snippet_len].replace("\n", " ")
        print(f"{position:02d}. {did} | {score_field}")
        print(f"     ↳ {preview}...\n")

    print("=" * 90)
    print(f"QID: {qid}")
    print(f"QUERY: {query_text}\n")

    print("First-stage retrieval (FAISS):")
    for position, (did, sc) in enumerate(first_stage[:n_show], start=1):
        _print_entry(position, did, f"score={sc:.4f}")

    print("Re-ranked results (Cross-Encoder):")
    for position, (did, sc) in enumerate(reranked[:n_show], start=1):
        # Cross-encoder scores are converted to a plain float before formatting
        _print_entry(position, did, f"ce_score={float(sc):.4f}")


In [49]:
K_CANDIDATES = 50  # candidates requested from the first stage
TOP_FINAL = 10  # results kept after re-ranking

# Run the two-stage pipeline for every query and print both rankings
for q in queries:
    qid = q["query_id"]
    qtext = q["text"]

    # Stage 1: vector search; stage 2: cross-encoder re-ranking of those candidates
    cand = first_stage_retrieval(qtext, k=K_CANDIDATES)
    final_rank = rerank(qtext, cand, top_n=TOP_FINAL)

    show_results(qid, qtext, cand, final_rank, n_show=5)


QID: Q1
QUERY: international conflict

First-stage retrieval (FAISS):
01. 0704.0525 | score=0.2792
     ↳ On the Energy-Momentum Problem in Static Einstein Universe   This paper has been removed by arXiv administrators because it plagiarizes gr-qc/0410004, gr-qc/0603075, and others.   This paper also has ...

02. 0705.0403 | score=0.2578
     ↳ Tracking control for multi-agent consensus with an active leader and   variable topology   In this paper, we consider the coordination control of a group of autonomous mobile agents with multiple lead...

03. 0704.3862 | score=0.2533
     ↳ An Integrated Human-Computer System for Controlling Interstate Disputes   In this paper we develop a scientific approach to control inter-country conflict. This system makes use of a neural network an...

04. 0705.0233 | score=0.2442
     ↳ Coordination for a Group of Autonomous Mobile Agents with Multiple   Leaders   In this paper, we consider the coordination control of a group of autonomous mobile agents w

**7) Evaluación: Precision@k y Recall@k + impacto del re-ranking**

In [50]:
def build_proxy_qrels(queries, m_rels=20, k_candidates=200):
    """Build proxy relevance judgments from the first-stage ranking.

    For each query, the first ``m_rels`` documents returned by
    ``first_stage_retrieval`` (called with ``k=k_candidates``) are treated as
    the relevant set.

    NOTE: these labels are the first stage's own top results, so scoring the
    first stage against them measures agreement with itself rather than true
    relevance; they are a stand-in for human judgments.

    Args:
        queries: Iterable of dicts with ``"query_id"`` and ``"text"`` keys.
        m_rels: Number of top-ranked documents taken as relevant.
        k_candidates: Number of candidates requested from the first stage.

    Returns:
        Dict mapping each query id to a set of doc ids.
    """

    def _top_ids(query_text):
        ranked = first_stage_retrieval(query_text, k=k_candidates)
        return {did for did, _ in ranked[:m_rels]}

    return {q["query_id"]: _top_ids(q["text"]) for q in queries}

# Build the proxy judgments for all queries and show how many documents
# were labelled relevant per query
qrels = build_proxy_qrels(queries, m_rels=20, k_candidates=200)
{k: len(v) for k, v in qrels.items()}


{'Q1': 20, 'Q2': 20, 'Q3': 20, 'Q4': 20, 'Q5': 20}

In [51]:
def precision_at_k(ranked_doc_ids, rel_set, k):
    """Return Precision@k: relevant documents in the top ``k`` divided by ``k``.

    The denominator is always ``k``, even when fewer than ``k`` documents
    were ranked.

    Args:
        ranked_doc_ids: Doc ids in ranked order.
        rel_set: Collection of relevant doc ids.
        k: Cut-off rank.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when ``k`` is not positive.
    """
    if k > 0:
        relevant_in_cut = [d for d in ranked_doc_ids[:k] if d in rel_set]
        return len(relevant_in_cut) / k
    return 0.0

def recall_at_k(ranked_doc_ids, rel_set, k):
    """Return Recall@k: relevant documents in the top ``k`` over all relevant.

    Args:
        ranked_doc_ids: Doc ids in ranked order.
        rel_set: Collection of relevant doc ids.
        k: Cut-off rank.

    Returns:
        ``hits / len(rel_set)`` as a float; ``0.0`` when ``rel_set`` is empty
        or ``k`` is not positive.
    """
    # Same guard as precision_at_k: without it a negative k would slice
    # `[:k]` from the end and count almost the whole ranking as the "top k".
    if k <= 0 or not rel_set:
        return 0.0
    topk = ranked_doc_ids[:k]
    hits = sum(1 for d in topk if d in rel_set)
    return hits / len(rel_set)


In [52]:
def evaluate(queries, qrels, k_eval=10, k_candidates=50, top_final=10):
    """Compute mean Precision@k and Recall@k before and after re-ranking.

    For every query the first-stage ranking and its re-ranked version are
    scored against ``qrels`` at cut-off ``k_eval``; the per-query values are
    then averaged with ``np.mean``.

    The re-ranked list is cut to ``top_final`` by ``rerank`` before scoring,
    so with ``k_eval > top_final`` the re-rank metrics see at most
    ``top_final`` documents.

    Args:
        queries: Iterable of dicts with ``"query_id"`` and ``"text"`` keys.
        qrels: Dict mapping query id to the collection of relevant doc ids;
            a missing query id is treated as having no relevant documents.
        k_eval: Cut-off rank for both metrics.
        k_candidates: Number of first-stage candidates to retrieve.
        top_final: Number of results kept after re-ranking.

    Returns:
        Dict with keys ``"P@k_first"``, ``"R@k_first"``, ``"P@k_rerank"``
        and ``"R@k_rerank"`` holding float means.
    """
    per_query = {
        "P@k_first": [],
        "R@k_first": [],
        "P@k_rerank": [],
        "R@k_rerank": [],
    }

    for q in queries:
        relevant = qrels.get(q["query_id"], set())

        stage_one = first_stage_retrieval(q["text"], k=k_candidates)
        stage_two = rerank(q["text"], stage_one, top_n=top_final)

        ids_by_stage = {
            "first": [did for did, _ in stage_one],
            "rerank": [did for did, _ in stage_two],
        }
        for stage, ids in ids_by_stage.items():
            per_query[f"P@k_{stage}"].append(precision_at_k(ids, relevant, k_eval))
            per_query[f"R@k_{stage}"].append(recall_at_k(ids, relevant, k_eval))

    return {metric: float(np.mean(values)) for metric, values in per_query.items()}

# Evaluate both stages at cut-off 10 against the proxy judgments
results = evaluate(queries, qrels, k_eval=10, k_candidates=50, top_final=10)

# NOTE: the "k = 10" / "@10" labels below are hard-coded and must be kept in
# sync with the k_eval value passed to evaluate() above.
print("\n" + "="*60)
print("EVALUACIÓN DEL SISTEMA (k = 10)")
print("="*60)

# Metrics of the vector-search ranking
print("First-stage retrieval (FAISS):")
print(f"   Precision@10 = {results['P@k_first']:.3f}")
print(f"   Recall@10    = {results['R@k_first']:.3f}\n")

# Metrics of the cross-encoder ranking
print("Re-ranking (Cross-Encoder):")
print(f"   Precision@10 = {results['P@k_rerank']:.3f}")
print(f"   Recall@10    = {results['R@k_rerank']:.3f}\n")



EVALUACIÓN DEL SISTEMA (k = 10)
First-stage retrieval (FAISS):
   Precision@10 = 1.000
   Recall@10    = 0.500

Re-ranking (Cross-Encoder):
   Precision@10 = 0.620
   Recall@10    = 0.310

