# Examen Final – Diseño e Implementación de un Sistema de Recuperación de Información


## 0.-Carga del dataset

### Import desde Kaggle

In [48]:
import kagglehub

# Download the latest version of the arXiv metadata dataset; `path` is the
# local directory that holds the downloaded files.
path = kagglehub.dataset_download("Cornell-University/arxiv")

print("Path to dataset files:", path)


Using Colab cache for faster access to the 'arxiv' dataset.
Path to dataset files: /kaggle/input/arxiv


In [49]:
import os
# List the files in the dataset directory to find the metadata snapshot's name.
os.listdir(path)


['arxiv-metadata-oai-snapshot.json']

In [51]:
import os
import json
from itertools import islice

# Number of JSON lines to read from the snapshot. The original loop appended a
# record and only then broke at i == 10000, i.e. it kept 10,001 records; the
# constant makes that count explicit and easy to tune.
MAX_DOCS = 10_001

file_path = os.path.join(path, "arxiv-metadata-oai-snapshot.json")

documents = []

# The snapshot is JSON Lines (one record per line). Decode it explicitly as
# UTF-8 so non-ASCII author names/titles do not depend on the platform's
# default encoding.
with open(file_path, "r", encoding="utf-8") as f:
    for line in islice(f, MAX_DOCS):
        line = line.strip()
        if not line:
            # Tolerate blank lines instead of failing inside json.loads.
            continue
        documents.append(json.loads(line))

print(len(documents))
print(documents[0].keys())


10001
dict_keys(['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi', 'report-no', 'categories', 'license', 'abstract', 'versions', 'update_date', 'authors_parsed'])


### Creación del corpus

In [52]:
# Build two parallel lists: the searchable text of each record
# (title + abstract) and the arXiv id it belongs to. Records whose combined
# text is empty after stripping are left out of both lists.
corpus, doc_ids = [], []

for record in documents:
    combined = record.get("title", "") + " " + record.get("abstract", "")
    combined = combined.strip()
    if not combined:
        continue
    corpus.append(combined)
    doc_ids.append(record.get("id"))

print(f"Total de documentos en corpus: {len(corpus)}")
# Preview the first 300 characters of the first document.
print(corpus[0][:300], "...")


Total de documentos en corpus: 10001
Calculation of prompt diphoton production cross sections at Tevatron and
  LHC energies   A fully differential calculation in perturbative quantum chromodynamics is
presented for the production of massive photon pairs at hadron colliders. All
next-to-leading order perturbative contributions from qua ...


### Convertir corpus a df

In [54]:
import pandas as pd

# DataFrame with the relevant fields.
# Every column (including "id") is taken from `documents`, so all columns have
# exactly one entry per record. The previous version mixed the filtered
# `doc_ids` list with unfiltered `documents` columns, which raises a
# length-mismatch error as soon as one record has empty text.
# `or ""` also covers fields that are present but null.
df = pd.DataFrame({
    "id": [doc.get("id") for doc in documents],
    "title": [doc.get("title") or "" for doc in documents],
    "abstract": [doc.get("abstract") or "" for doc in documents],
    "categories": [doc.get("categories") or "" for doc in documents],
    "authors": [doc.get("authors") or "" for doc in documents]
})

# Full-text column used for retrieval (title + abstract), stripped the same
# way as the `corpus` list.
df["text"] = (df["title"] + " " + df["abstract"]).str.strip()

# Drop records without any text (the same filter applied when building
# `corpus`) and renumber rows so positional lookups stay contiguous.
df = df[df["text"] != ""].reset_index(drop=True)

# Show the first rows.
df.head()



Unnamed: 0,id,title,abstract,categories,authors,text
0,704.0001,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,hep-ph,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...
1,704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...",math.CO cs.CG,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions We ...
2,704.0003,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,physics.gen-ph,Hongjun Pan,The evolution of the Earth-Moon system based o...
3,704.0004,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,math.CO,David Callan,A determinant of Stirling cycle numbers counts...
4,704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,math.CA math.FA,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...


## 1.-Preprocesamiento

### Preparar librerías

In [64]:
#Preparar librerias
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Download the NLTK resources needed for tokenization and stop-word removal.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")

# English stop words as a set, so membership tests during preprocessing are fast.
stop_words = set(stopwords.words("english"))

# Lemmatization: imports plus the WordNet and POS-tagger resources it relies on.
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("averaged_perceptron_tagger")
# Last expression of the cell: its return value is what the cell displays.
nltk.download("averaged_perceptron_tagger_eng")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

### Tokenización y limpieza

In [56]:
#Tokenizacion y limpieza
def preprocess_text(text):
    """Lower-case, clean and tokenize `text`, dropping stop words.

    Every character other than a-z, 0-9 or whitespace is replaced by a space
    before tokenizing. Tokens that are English stop words or a single
    character long are discarded.

    Returns:
        List of the remaining tokens, in their original order.
    """
    # Normalize case first so the character class below only needs a-z.
    cleaned = re.sub(r"[^a-z0-9\s]", " ", text.lower())

    kept = []
    for token in word_tokenize(cleaned):
        if len(token) > 1 and token not in stop_words:
            kept.append(token)
    return kept


In [59]:
# Tokenize the 'text' column: one list of cleaned tokens per document.
df["tokens"] = df["text"].apply(preprocess_text)

# Show the first 20 tokens of the first document.
print(df["tokens"].iloc[0][:20], "…")


['calculation', 'prompt', 'diphoton', 'production', 'cross', 'sections', 'tevatron', 'lhc', 'energies', 'fully', 'differential', 'calculation', 'perturbative', 'quantum', 'chromodynamics', 'presented', 'production', 'massive', 'photon', 'pairs'] …


### Lematización

In [61]:
#Funcion de lematizacion
from nltk.corpus import wordnet
from nltk import pos_tag

def get_wordnet_pos(tag):
    """Translate a POS tag into the WordNet POS constant for lemmatization.

    The decision is made on the tag's first letter: J -> adjective,
    V -> verb, N -> noun, R -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ("J", wordnet.ADJ),
        ("V", wordnet.VERB),
        ("N", wordnet.NOUN),
        ("R", wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wn_pos
    # Unknown tag: treat the word as a noun.
    return wordnet.NOUN


# Single lemmatizer instance shared by every call to lemmatize_tokens.
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    """Lemmatize `tokens`, using each token's POS tag to pick the lemma.

    Args:
        tokens: List of token strings (e.g. the output of preprocess_text).

    Returns:
        List of lemmas, one per input token, in the same order.
    """
    lemmas = []
    # pos_tag yields (token, tag) pairs; the tag selects the WordNet POS.
    for token, tag in pos_tag(tokens):
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return lemmas



In [65]:
# Lemmatize the token list of every document.
df["tokens_lemmatized"] = df["tokens"].apply(lemmatize_tokens)

# Example: first 20 lemmas of the first document.
print(df["tokens_lemmatized"].iloc[0][:20], "…")


['calculation', 'prompt', 'diphoton', 'production', 'cross', 'section', 'tevatron', 'lhc', 'energy', 'fully', 'differential', 'calculation', 'perturbative', 'quantum', 'chromodynamics', 'present', 'production', 'massive', 'photon', 'pair'] …


## 2.-Representación mediante Embeddings

### Preparar librerías

In [66]:
!pip install -q sentence-transformers faiss-cpu

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m83.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [67]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Load the pretrained sentence-embedding model used to encode both the documents and the queries.
model = SentenceTransformer('all-MiniLM-L6-v2')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Embeddings

In [68]:
# Rebuild one cleaned string per document from its lemmatized tokens.
df["text_lemmatized"] = df["tokens_lemmatized"].map(" ".join)

# Documents to encode, in DataFrame row order.
corpus_texts = list(df["text_lemmatized"])

# Encode every document into a dense vector (one row per document).
corpus_embeddings = model.encode(
    corpus_texts,
    show_progress_bar=True,
    convert_to_numpy=True,
)

print("Shape de los embeddings:", corpus_embeddings.shape)


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Shape de los embeddings: (10001, 384)


### Índice FAISS

In [69]:
# Vector dimensionality, taken from the second axis of the embedding matrix.
embedding_dim = corpus_embeddings.shape[1]

# Create the FAISS index.
# NOTE(review): per the FAISS documentation IndexFlatL2 performs exact search
# and reports squared L2 distances - confirm before interpreting the
# "distance" values as Euclidean distances.
index = faiss.IndexFlatL2(embedding_dim)

# Add the document embeddings; the position of a vector in the index matches
# the row position of its document in df (search results are read back with
# df.iloc).
index.add(corpus_embeddings)

print("Número de vectores en el índice:", index.ntotal)


Número de vectores en el índice: 10001


### Función de búsqueda

In [70]:
def search_query(query, top_k=5):
    """Return up to `top_k` documents closest to `query` in the FAISS index.

    The query goes through the same cleaning and lemmatization as the corpus
    (preprocess_text + lemmatize_tokens) before it is embedded, so query and
    document vectors are built from comparable text.

    Args:
        query: Free-text query string.
        top_k: Maximum number of documents to return. Values larger than the
            index size are clamped; values <= 0 yield an empty list.

    Returns:
        List of dicts with keys "id", "title", "abstract" (first 300
        characters followed by "...") and "distance", in the order returned
        by the index.
    """
    # Never request more neighbours than the index holds, and return early on
    # a non-positive request instead of failing inside FAISS.
    top_k = min(top_k, index.ntotal)
    if top_k <= 0:
        return []

    # Preprocess the query exactly like the documents.
    query_tokens = lemmatize_tokens(preprocess_text(query))
    query_text = " ".join(query_tokens)

    # Embed the query and search the index.
    query_embedding = model.encode([query_text], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, top_k)

    results = []
    for dist, idx in zip(distances[0], indices[0]):
        # FAISS pads missing neighbours with label -1; df.iloc[-1] would
        # silently return the LAST row, so skip those slots.
        if idx < 0:
            continue
        row = df.iloc[idx]  # one positional lookup per hit
        results.append({
            "id": row["id"],
            "title": row["title"],
            "abstract": row["abstract"][:300] + "...",
            "distance": dist
        })
    return results


In [71]:
# Smoke test: run one query and print the five nearest documents.
query = "quantum computing optimization"
results = search_query(query, top_k=5)

for r in results:
    print(f"Doc ID: {r['id']} | Distance: {r['distance']:.4f}")
    print(r['title'])
    print(r['abstract'])
    print("-"*80)


Doc ID: 0705.0017 | Distance: 0.6554
Checking Equivalence of Quantum Circuits and States
  Quantum computing promises exponential speed-ups for important simulation and
optimization problems. It also poses new CAD problems that are similar to, but
more challenging, than the related problems in classical (non-quantum) CAD,
such as determining if two states or circuits are functionally eq...
--------------------------------------------------------------------------------
Doc ID: 0704.0202 | Distance: 0.6746
Towards Minimal Resources of Measurement-based Quantum Computation
  We improve the upper bound on the minimal resources required for
measurement-based quantum computation. Minimizing the resources required for
this model is a key issue for experimental realization of a quantum computer
based on projective measurements. This new upper bound allows also to reply in
...
--------------------------------------------------------------------------------
Doc ID: 0705.2784 | Distance: 0.6963


## 3.-Recuperación Inicial (First-Stage Retrieval)

### Función de búsqueda top_k

In [72]:
# First-stage retrieval: fetch the top_k nearest documents from the FAISS index.
def first_stage_retrieval(query, top_k=10):
    """Return the first-stage candidate documents for `query`.

    Args:
        query: Free-text query string.
        top_k: Number of candidates to retrieve (default 10).

    Returns:
        List of dicts with keys "id", "title", "abstract" (first 300
        characters followed by "...") and "distance", as produced by
        search_query.
    """
    # This function used to repeat search_query step for step (preprocess ->
    # lemmatize -> embed -> FAISS search -> build result dicts). Delegating
    # keeps a single implementation so the two cannot drift apart; only the
    # default top_k differs.
    return search_query(query, top_k=top_k)


In [73]:
# Smoke test: retrieve and print the first-stage candidates for one query.
query = "machine learning for quantum optimization"
top_k = 5

candidates = first_stage_retrieval(query, top_k=top_k)

# Print the candidates with their 1-based rank.
for i, doc in enumerate(candidates, 1):
    print(f"{i}. Doc ID: {doc['id']} | Distance: {doc['distance']:.4f}")
    print(doc["title"])
    print(doc["abstract"])
    print("-"*80)


1. Doc ID: 0705.3333 | Distance: 0.9093
Simulation of Quantum Algorithms with a Symbolic Programming Language
  This study examines the simulation of quantum algorithms on a classical
computer. The program code implemented on a classical computer will be a
straight connection between the mathematical formulation of quantum mechanics
and computational methods. The computational language will include formulat...
--------------------------------------------------------------------------------
2. Doc ID: 0704.3630 | Distance: 0.9096
Adiabatic Rotation, Quantum Search and Preparation of Superposition
  States
  We introduce the idea of using adiabatic rotation to generate superpositions
of a large class of quantum states. For quantum computing this is an
interesting alternative to the well-studied "straight line" adiabatic
evolution. In ways that complement recent results, we show how to efficiently
prep...
--------------------------------------------------------------------------------
3. 

## 4.-Re-ranking de Resultados

### Preparar librerías

In [74]:
!pip install -q sentence-transformers

### Cargar modelo

In [75]:
from sentence_transformers import CrossEncoder

# Pretrained Cross-Encoder used to score (query, document) pairs during re-ranking.
cross_encoder_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

### Función de re-ranking

In [76]:
# Re-order first-stage candidates using the Cross-Encoder.
def rerank_candidates(query, candidates):
    """Re-rank `candidates` by Cross-Encoder relevance to `query`.

    Args:
        query: Query string, passed to the Cross-Encoder as given.
        candidates: List of dicts, each with an "id" key matching df["id"]
            (e.g. the output of first_stage_retrieval).

    Returns:
        A new list with a copy of every candidate plus a "score" key, sorted
        by score in descending order (ties keep their input order). The input
        list and its dicts are not modified. An empty input yields [].

    Raises:
        KeyError: If a candidate id is not present in df.
    """
    # Nothing to score: avoid calling the model with an empty batch.
    if not candidates:
        return []

    # Fetch the lemmatized text of all candidates with a single pass over df
    # instead of one full boolean scan per candidate.
    wanted_ids = {c['id'] for c in candidates}
    matches = df.loc[df['id'].isin(wanted_ids), ['id', 'text_lemmatized']]
    text_by_id = {}
    for doc_id, text in zip(matches['id'], matches['text_lemmatized']):
        # Keep the first occurrence of an id, as `.values[0]` did.
        text_by_id.setdefault(doc_id, text)

    # Prepare the (query, document) pairs in candidate order.
    cross_inp = [(query, text_by_id[c['id']]) for c in candidates]

    # One relevance score per pair.
    scores = cross_encoder_model.predict(cross_inp)

    # Attach the scores to copies so the caller's first-stage results stay intact.
    scored = [{**c, 'score': s} for c, s in zip(candidates, scores)]

    # Highest score first.
    return sorted(scored, key=lambda x: x['score'], reverse=True)


### Pipeline de búsqueda y re-ranking

In [86]:
# End-to-end example: first-stage search followed by re-ranking.
query = "energy physics experiments"
top_k = 10

# First stage: nearest neighbours from the FAISS index.
candidates = first_stage_retrieval(query, top_k=top_k)

# Second stage: re-order those candidates with the Cross-Encoder.
final_ranking = rerank_candidates(query, candidates)

# Print the re-ranked list with both the Cross-Encoder score and the FAISS distance.
for i, doc in enumerate(final_ranking, 1):
    print(f"{i}. Doc ID: {doc['id']} | Score: {doc['score']:.4f} | Distance: {doc['distance']:.4f}")
    print(doc["title"])
    print(doc["abstract"])
    print("-"*80)


1. Doc ID: 0705.4583 | Score: 0.5917 | Distance: 1.2051
Photons as a Probe of Minicharged Particles
  Low energy experiments with photons can provide deep insights into
fundamental physics. In this note we concentrate on minicharged particles. We
discuss how they can arise in extensions of the standard model and how we can
search for them using a variety of laboratory experiments.
...
--------------------------------------------------------------------------------
2. Doc ID: 0705.1868 | Score: -0.6440 | Distance: 1.2662
Nonlinear optical response of wave packets on quantized potential energy
  surfaces
  We calculated the dynamics of nuclear wave packets in coupled
electron-vibration systems and their nonlinear optical responses. We found that
the quantized nature of the vibrational modes is observed in pump-probe spectra
particularly in weakly interacting electron-vibration systems such as cyanin...
--------------------------------------------------------------------------------
3. Do

## 5.-Simulación de Consultas

In [78]:
# Set of simulated user queries; reused by the multi-query run and the evaluation.
queries = [
    "quantum computing optimization",
    "machine learning for physics simulations",
    "neural networks for natural language processing",
    "graph algorithms in computer science",
    "high energy particle physics experiments"
]


### Ejecución del pipeline

In [79]:
def _shorten_snippet(snippet, limit=200):
    """Cut an abstract snippet to `limit` characters followed by one "..."."""
    # First-stage snippets already end in "..."; remove it before cutting so
    # short abstracts do not come out as "text......".
    if snippet.endswith("..."):
        snippet = snippet[:-3]
    return snippet[:limit] + "..."


def run_pipeline(query, top_k=5, candidate_k=None):
    """Run first-stage retrieval and Cross-Encoder re-ranking for `query`.

    Args:
        query: Free-text query string.
        top_k: Number of results reported in each of the two rankings.
        candidate_k: Number of first-stage candidates handed to the
            re-ranker. None (the default) or a value below `top_k` means
            `top_k`, in which case the re-ranker can only permute the same
            `top_k` documents. A larger value lets the re-ranker promote
            documents from outside the first-stage top_k.

    Returns:
        Tuple (initial_results, final_results_formatted):
        - initial_results: first `top_k` first-stage hits, each a dict with
          "id", "title", "abstract" (shortened) and "distance".
        - final_results_formatted: first `top_k` re-ranked hits with the same
          keys plus "score".
    """
    pool_size = top_k if candidate_k is None else max(top_k, candidate_k)

    # First-stage retrieval
    candidates = first_stage_retrieval(query, top_k=pool_size)

    # Snapshot of the first-stage ranking, built as fresh dicts so later
    # processing of `candidates` cannot alter it.
    initial_results = [{
        "id": c["id"],
        "title": c["title"],
        "abstract": _shorten_snippet(c["abstract"]),
        "distance": c["distance"]
    } for c in candidates[:top_k]]

    # Re-ranking over the whole candidate pool, then cut back to top_k.
    final_results = rerank_candidates(query, candidates)[:top_k]

    # Same layout as the initial results, plus the Cross-Encoder score.
    final_results_formatted = [{
        "id": c["id"],
        "title": c["title"],
        "abstract": _shorten_snippet(c["abstract"]),
        "score": c["score"],
        "distance": c["distance"]
    } for c in final_results]

    return initial_results, final_results_formatted


### Resultados de múltiples consultas


In [80]:
# Number of results shown per query in each ranking.
top_k = 5

# Run the full pipeline for every simulated query and print both rankings
# (FAISS order vs. Cross-Encoder order) for side-by-side comparison.
for q in queries:
    print(f"\n=== Consulta: {q} ===\n")

    initial, final = run_pipeline(q, top_k=top_k)

    print(">>> Resultados antes del re-ranking (FAISS top-k):")
    for i, doc in enumerate(initial, 1):
        print(f"{i}. {doc['title']} | Distance: {doc['distance']:.4f}")
    print("\n>>> Resultados después del re-ranking (Cross-Encoder):")
    for i, doc in enumerate(final, 1):
        print(f"{i}. {doc['title']} | Score: {doc['score']:.4f} | Distance: {doc['distance']:.4f}")
    print("="*100)



=== Consulta: quantum computing optimization ===

>>> Resultados antes del re-ranking (FAISS top-k):
1. Checking Equivalence of Quantum Circuits and States | Distance: 0.6554
2. Towards Minimal Resources of Measurement-based Quantum Computation | Distance: 0.6746
3. Quantum algorithms for hidden nonlinear structures | Distance: 0.6963
4. Adiabatic Rotation, Quantum Search and Preparation of Superposition
  States | Distance: 0.6981
5. Simulation of Quantum Algorithms with a Symbolic Programming Language | Distance: 0.7091

>>> Resultados después del re-ranking (Cross-Encoder):
1. Checking Equivalence of Quantum Circuits and States | Score: 2.7619 | Distance: 0.6554
2. Adiabatic Rotation, Quantum Search and Preparation of Superposition
  States | Score: 0.8547 | Distance: 0.6981
3. Quantum algorithms for hidden nonlinear structures | Score: -0.5802 | Distance: 0.6963
4. Towards Minimal Resources of Measurement-based Quantum Computation | Score: -0.7337 | Distance: 0.6746
5. Simulation 

## 6.-Evaluación del Sistema

### Definir qrels

In [94]:
# Relevance judgments (qrels): for each query, the set of document ids treated as relevant.
# NOTE(review): the "relevant" documents are simply the first 3 hits that this
# same system returns from FAISS, so the labels are circular. Any evaluation
# that retrieves at least 3 documents from the same index will contain all of
# them (e.g. Precision@5 = 3/5 and Recall@5 = 1.0 regardless of the query).
# Independent judgments (manual labels or another ground truth) are needed for
# the metrics to measure retrieval quality.
qrels = {}

for query in queries:
    # Top-10 FAISS documents; only the first 3 are kept as "relevant".
    candidates = first_stage_retrieval(query, top_k=10)
    relevant_ids = {c['id'] for c in candidates[:3]}
    qrels[query] = relevant_ids


### Funciones para métricas

In [88]:
def precision_at_k(retrieved_ids, relevant_ids, k):
    """Precision@k: fraction of the first `k` retrieved ids that are relevant.

    Args:
        retrieved_ids: Ranked sequence of retrieved document ids.
        relevant_ids: Iterable of ids judged relevant.
        k: Cut-off rank. The denominator is always `k`, even when fewer than
            `k` documents were retrieved.

    Returns:
        Value in [0, 1]. Returns 0.0 when k <= 0 (previously k == 0 raised
        ZeroDivisionError and a negative k produced a meaningless value).
    """
    if k <= 0:
        return 0.0
    retrieved_k = retrieved_ids[:k]
    # Sets make the hit count insensitive to duplicated ids.
    return len(set(retrieved_k) & set(relevant_ids)) / k

def recall_at_k(retrieved_ids, relevant_ids, k):
    """Recall@k: fraction of the relevant ids found in the first `k` retrieved.

    Args:
        retrieved_ids: Ranked sequence of retrieved document ids.
        relevant_ids: Iterable of ids judged relevant (duplicates are ignored).
        k: Cut-off rank.

    Returns:
        Value in [0, 1]. Returns 0.0 when there are no relevant ids or when
        k <= 0.
    """
    # Deduplicate once: the denominator must count distinct relevant
    # documents, otherwise a list with repeated ids understates recall.
    relevant = set(relevant_ids)
    if not relevant or k <= 0:
        return 0.0
    retrieved_k = retrieved_ids[:k]
    return len(set(retrieved_k) & relevant) / len(relevant)


### Evaluación del pipeline

In [95]:
# Cut-off used both for retrieval and for the @k metrics.
top_k = 5

# Compare Precision@k / Recall@k before and after re-ranking for every query.
# NOTE(review): run_pipeline re-ranks the same top_k documents it retrieved,
# and both metrics ignore order within the top k, so the "before" and "after"
# values are computed on the same set of ids and cannot differ here.
for query in queries:
    initial_results, final_results = run_pipeline(query, top_k=top_k)

    # Document ids in ranked order for each stage.
    initial_ids = [doc["id"] for doc in initial_results]
    final_ids = [doc["id"] for doc in final_results]

    # Queries without judgments fall back to an empty relevant set.
    relevant_ids = qrels.get(query, set())

    # Compute the metrics for both rankings.
    prec_initial = precision_at_k(initial_ids, relevant_ids, top_k)
    rec_initial = recall_at_k(initial_ids, relevant_ids, top_k)

    prec_final = precision_at_k(final_ids, relevant_ids, top_k)
    rec_final = recall_at_k(final_ids, relevant_ids, top_k)

    print(f"\n=== Query: {query} ===")
    print(f"Precision@{top_k} - Antes: {prec_initial:.2f} | Después: {prec_final:.2f}")
    print(f"Recall@{top_k}    - Antes: {rec_initial:.2f} | Después: {rec_final:.2f}")
    print("="*80)



=== Query: quantum computing optimization ===
Precision@5 - Antes: 0.60 | Después: 0.60
Recall@5    - Antes: 1.00 | Después: 1.00

=== Query: machine learning for physics simulations ===
Precision@5 - Antes: 0.60 | Después: 0.60
Recall@5    - Antes: 1.00 | Después: 1.00

=== Query: neural networks for natural language processing ===
Precision@5 - Antes: 0.60 | Después: 0.60
Recall@5    - Antes: 1.00 | Después: 1.00

=== Query: graph algorithms in computer science ===
Precision@5 - Antes: 0.60 | Después: 0.60
Recall@5    - Antes: 1.00 | Después: 1.00

=== Query: high energy particle physics experiments ===
Precision@5 - Antes: 0.60 | Después: 0.60
Recall@5    - Antes: 1.00 | Después: 1.00


## 7.-Análisis de Resultados

El pipeline basado en embeddings y búsqueda vectorial con FAISS permitió recuperar documentos semánticamente cercanos a las consultas, ofreciendo una recuperación inicial rápida y eficiente, aunque en algunos casos los documentos no estaban perfectamente alineados con la intención exacta de la consulta.

Los scores que el Cross-Encoder asigna durante el re-ranking no están limitados al intervalo [0, 1] ni a valores positivos: pueden ser negativos si el modelo estima que el documento es poco relevante. Sus valores son relativos, no absolutos, ya que lo importante es el orden de los scores y no su valor exacto.

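El re-ranking modificó el orden de los resultados: los documentos que el Cross-Encoder consideró más pertinentes subieron en el ranking, mientras que los menos relevantes descendieron, lo que permite afinar la consulta y poner los documentos más relevantes al alcance de los usuarios. Cabe señalar que Precision@5 y Recall@5 no reflejan este cambio (son idénticas antes y después del re-ranking), porque el re-ranking solo reordena los mismos 5 candidatos y estas métricas no dependen del orden dentro del top-5; además, los documentos relevantes (qrels) se definieron a partir de los propios resultados de FAISS, por lo que una evaluación concluyente requiere juicios de relevancia independientes y métricas sensibles al orden (por ejemplo, MRR o nDCG).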