In [13]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json


In [16]:
!pip install sentence-transformers faiss-cpu nltk kagglehub



In [5]:
!pip install -q ir_measures

# Examen Final Arciniegas

## Parte 1: Importaciones

In [21]:
%%capture
!pip install faiss-gpu sentence-transformers

import pandas as pd
import numpy as np
import json
import os
import nltk
import faiss
from sentence_transformers import SentenceTransformer, CrossEncoder
from tqdm.notebook import tqdm
import torch

# Configuración del dispositivo
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Usando dispositivo: {device}")

# Descargas silenciosas de NLTK
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

## Parte 2: carga de corpus, queries y qrels

In [38]:
json_file_path = '/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json'
try:
    df = pd.read_json(json_file_path, lines=True, nrows=10000)
    print(f"Dataset cargado. Filas: {len(df)}")
    
except ValueError as e:
    print("No se encontró el archivo.")
    raise e

# Inicializamos estructuras
docs_store = {}
queries_store = {}
qrels_store = {}

print("Generando documentos")

# generar docs_store
for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Procesando Docs"):
    doc_id = str(row['id'])
    title = str(row.get('title', '')).replace('\n', ' ').strip()
    abstract = str(row.get('abstract', '')).replace('\n', ' ').strip()
    
    docs_store[doc_id] = f"{title}. {abstract}"

# generar queries y qrels
sample_queries = df.sample(20, random_state=42)

for index, row in sample_queries.iterrows():
    qid = f"Q_{row['id']}"   
    doc_id = str(row['id'])  
    queries_store[qid] = str(row['title']).replace('\n', ' ').strip()
    qrels_store[qid] = {doc_id: 1}

print(f"Total Documentos (docs_store): {len(docs_store)}")
print(f"Total Consultas (queries_store): {len(queries_store)}")
print(f"Total Qrels (qrels_store):       {len(qrels_store)}")

Dataset cargado. Filas: 10000
Generando documentos


Procesando Docs:   0%|          | 0/10000 [00:00<?, ?it/s]

Total Documentos (docs_store): 10000
Total Consultas (queries_store): 20
Total Qrels (qrels_store):       20


## Parte 3: Preprocesamiento

Eliminacion de stopwords, normalizacio y stemming

In [23]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
translator = str.maketrans('', '', string.punctuation)

def preprocess_text(text):
    # Normalización
    text = text.lower()
    text = text.translate(translator)
    
    # Tokenizacion
    tokens = text.split()
    
    # Stopwords y Stemming
    cleaned_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    
    return " ".join(cleaned_tokens)

sample_text = "International conflict and economic policy news."
print(f"Original: {sample_text}")
print(f"Procesado: {preprocess_text(sample_text)}")

Original: International conflict and economic policy news.
Procesado: intern conflict econom polici news


## Parte 4: Embeddings y FAISS

In [27]:
model_embedding = SentenceTransformer('all-MiniLM-L6-v2', device=device)

doc_ids = list(docs_store.keys())


print("Preprocesando")
passages = [preprocess_text(docs_store[did]) for did in tqdm(doc_ids, desc="Preprocesando")]

# Generar Embeddings
print("Generando embeddings")
doc_embeddings = model_embedding.encode(
    passages, 
    batch_size=128, 
    show_progress_bar=True, 
    convert_to_numpy=True, 
    normalize_embeddings=True
)

# FAISS
d = doc_embeddings.shape[1]
index = faiss.IndexFlatIP(d) 
index.add(doc_embeddings)
print(f"FAISS creado con {index.ntotal} documentos.")

def retrieve_candidates(query_text, top_k=50):
    query_processed = preprocess_text(query_text) 
    q_embed = model_embedding.encode([query_processed], convert_to_numpy=True, normalize_embeddings=True)
    D, I = index.search(q_embed, top_k)
    
    results = {}
    for score, idx in zip(D[0], I[0]):
        if idx < len(doc_ids): 
            results[doc_ids[idx]] = float(score)
    return results

Preprocesando


Preprocesando:   0%|          | 0/10000 [00:00<?, ?it/s]

Generando embeddings


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

FAISS creado con 10000 documentos.


## Parte 5: Re Rankings - Cross enconder

In [36]:
import numpy as np
import pandas as pd

cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device=device)


def rerank_results(query_text, initial_results_dict, top_k=50):
    if not initial_results_dict:
        return {}
    
    pairs = []
    doc_ids_list = []
    
    for doc_id in list(initial_results_dict.keys())[:top_k]:
        doc_text = docs_store.get(doc_id, "")
        pairs.append([query_text, doc_text])
        doc_ids_list.append(doc_id)
    
    if not pairs: return {}
    scores = cross_encoder.predict(pairs, show_progress_bar=False)
    
    reranked = {}
    for i, doc_id in enumerate(doc_ids_list):
        reranked[doc_id] = float(scores[i])
        
    # Ordenar descendente
    return dict(sorted(reranked.items(), key=lambda x: x[1], reverse=True))

In [37]:
id_demo = list(queries_store.keys())[0]
query_demo = queries_store[qid_demo]

print(f"Consulta ID: {qid_demo}")
print(f"Texto: '{query_demo}'")

res_stage1 = retrieve_candidates(query_demo, top_k=50)

res_stage2 = rerank_results(query_demo, res_stage1, top_k=50)

print("\nComparativa Top-5 Documentos:")
print(f"{'Pos':<4} | {'ID (FAISS)':<15} {'Score':<10} || {'ID (Re-rank)':<15} {'Score':<10}")
print("-" * 65)

top_ids_1 = list(res_stage1.items())[:5]
top_ids_2 = list(res_stage2.items())[:5]

for i in range(5):
    id1, sc1 = top_ids_1[i]
    id2, sc2 = top_ids_2[i]
    print(f"{i+1:<4} | {id1:<15} {sc1:.4f}     || {id2:<15} {sc2:.4f}")

correct_doc = list(qrels_store[qid_demo].keys())[0]
rank_pos = list(res_stage2.keys()).index(correct_doc) + 1 if correct_doc in res_stage2 else ">50"
print(f"\nEl documento({correct_doc}) quedó en la posición: {rank_pos}")

Consulta ID: Q_705.225
Texto: 'Topological Quiver Matrix Models and Quantum Foam'

Comparativa Top-5 Documentos:
Pos  | ID (FAISS)      Score      || ID (Re-rank)    Score     
-----------------------------------------------------------------
1    | 705.225         0.8188     || 705.225         9.9021
2    | 705.3236        0.4935     || 704.0278        0.8805
3    | 704.1712        0.4735     || 705.3892        0.4875
4    | 704.0278        0.4619     || 705.1645        -0.7480
5    | 704.0796        0.4544     || 704.1291        -1.8485

El documento(705.225) quedó en la posición: 1


## Parte 6: Evaluacion del sistema y simulacion de consulta

In [35]:

def calculate_metrics(results_dict, qrels_dict, k=10):
    # Tomamos los top-k 
    top_hits = list(results_dict.keys())[:k]
    relevant_retrieved = sum([1 for doc in top_hits if qrels_dict.get(doc, 0) > 0])
    total_relevant = sum([1 for rel in qrels_dict.values() if rel > 0])
    
    # Precision
    precision = relevant_retrieved / k
    
    # Recall
    recall = relevant_retrieved / total_relevant if total_relevant > 0 else 0.0
    
    return precision, recall

print(f"Evaluando el sistema sobre {len(queries_store)} consultas de prueba...")

metrics_data = []
for qid, query_text in tqdm(queries_store.items(), desc="Evaluando"):
    qrels_query = qrels_store.get(qid, {})
    if not qrels_query: continue
    
    # recuperación FAISS  50
    res_stage1 = retrieve_candidates(query_text, top_k=50)
    p10_s1, r10_s1 = calculate_metrics(res_stage1, qrels_query, k=10)
    
    # Cross-Encoder
    res_stage2 = rerank_results(query_text, res_stage1, top_k=50)
    p10_s2, r10_s2 = calculate_metrics(res_stage2, qrels_query, k=10)
    
    metrics_data.append({
        "Query ID": qid,
        "P@10 (Inicial)": p10_s1,
        "R@10 (Inicial)": r10_s1,
        "P@10 (Re-rank)": p10_s2,
        "R@10 (Re-rank)": r10_s2
    })

df_metrics = pd.DataFrame(metrics_data)


print(f"Promedio Precision@10 (Inicial): {df_metrics['P@10 (Inicial)'].mean():.4f}")
print(f"Promedio Precision@10 (Final):   {df_metrics['P@10 (Re-rank)'].mean():.4f}")
print(f"Promedio Recall@10 (Inicial):    {df_metrics['R@10 (Inicial)'].mean():.4f}")
print(f"Promedio Recall@10 (Final):      {df_metrics['R@10 (Re-rank)'].mean():.4f}")

print("\nDetalle por consulta (Primeras 5):")
display(df_metrics.head())

Evaluando el sistema sobre 20 consultas de prueba...


Evaluando:   0%|          | 0/20 [00:00<?, ?it/s]

Promedio Precision@10 (Inicial): 0.0950
Promedio Precision@10 (Final):   0.1000
Promedio Recall@10 (Inicial):    0.9500
Promedio Recall@10 (Final):      1.0000

Detalle por consulta (Primeras 5):


Unnamed: 0,Query ID,P@10 (Inicial),R@10 (Inicial),P@10 (Re-rank),R@10 (Re-rank)
0,Q_705.225,0.1,1.0,0.1,1.0
1,Q_705.0682,0.0,0.0,0.1,1.0
2,Q_704.1732,0.1,1.0,0.1,1.0
3,Q_705.074,0.1,1.0,0.1,1.0
4,Q_705.0519,0.1,1.0,0.1,1.0


## Parte 7: Analisis

En los resultados se puede observar una precisión de 0.1000 que parece muy baja, pero en esta simulación representa un puntaje perfecto. Esto se debe a que para cada consulta de prueba definimos que existe exactamente un documento relevante. Ademas, si el sistema devuelve 10 resultados y solo 1 es correcto, la nota máxima posible es 1/10 = 0.10. Por lo tanto, obtener este valor significa que el sistema encontró la respuesta correcta el 100% de las veces y la colocó exitosamente dentro de la primera página de resultados demostrando una busqueda exitos.


La comparación entre etapas demuestra por qué el re-ranking es vital. Mientras que la búsqueda inicial atrapa candidatos rápidamente, a veces deja el documento correcto en posiciones bajas. La etapa de Re-ranking revisó esos candidatos y rescató los documentos relevantes, subiéndolos a las primeras posiciones. Esto se evidencia en que el Recall, ya que se mantuvo en 1.0, asegurando que el usuario final siempre vea la respuesta correcta en el top 10, corrigiendo cualquier error de ordenamiento.