### Importación de librerías

In [4]:
import os
import requests
import re
import pandas as pd
import numpy as np
import pandas as pd
import json

from dotenv import load_dotenv
from bs4 import BeautifulSoup

from openai import OpenAI
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from rank_bm25 import BM25Okapi

import nltk
from nltk import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

from sentence_transformers import SentenceTransformer
from tqdm import tqdm

import faiss

  from .autonotebook import tqdm as notebook_tqdm


### Creación del corpus

In [5]:
# Path to the arXiv metadata snapshot (one JSON record per line)
filename = "arxiv-metadata-oai-snapshot.json"

# Fraction of the file to keep and the list that collects the kept records
target_percent = 0.01  # 1%
docs = []

# First pass: count how many lines the file has
total_lines = 0
with open(filename, "r", encoding="utf-8") as f:
    for _ in f:
        total_lines += 1

# Keep at least one record, even for very small files
subset_size = max(1, int(target_percent * total_lines))

# Second pass: keep only the leading `subset_size` lines of the file
wanted_fields = ("id", "title", "abstract", "authors", "categories", "update_date")
with open(filename, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= subset_size:
            break
        try:
            item = json.loads(line)
        except json.JSONDecodeError:
            # Malformed lines are skipped but still count towards the subset size
            continue
        # Keep only the columns the rest of the notebook needs
        docs.append({field: item.get(field, "") for field in wanted_fields})

# Build the DataFrame
df = pd.DataFrame(docs)
df



Unnamed: 0,id,title,abstract,authors,categories,update_date
0,0704.0001,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",hep-ph,2008-11-26
1,0704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...",Ileana Streinu and Louis Theran,math.CO cs.CG,2008-12-13
2,0704.0003,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,Hongjun Pan,physics.gen-ph,2008-01-13
3,0704.0004,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,David Callan,math.CO,2007-05-23
4,0704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,Wael Abu-Shammala and Alberto Torchinsky,math.CA math.FA,2013-10-15
...,...,...,...,...,...,...
27918,0710.0971,The Extending for Composite Skyrme Model,"In this paper, we have extended the composit...","Pham Thuc Tuyen, Do Quoc Tuan",nucl-th,2007-10-05
27919,0710.0972,A Floer homology for exact contact embeddings,In this paper we construct the Floer homolog...,"Kai Cieliebak, Urs Frauenfelder",math.SG,2007-10-05
27920,0710.0973,Modulation invariant bilinear T(1) theorem,We prove a T(1) theorem for bilinear singula...,"Arpad Benyi, Ciprian Demeter, Andrea R. Nahmod...",math.CA math.AP,2007-10-05
27921,0710.0974,Hawking radiation in GHS and non-extremal D1-D...,We apply the method of Banerjee and Kulkarni...,"Sunandan Gangopadhyay, Shailesh Kulkarni",hep-th,2008-11-26


### Preprocesamiento

In [6]:
# Merge title and abstract into a single text column.
# A list-valued abstract is joined with ', '; anything else is cast to str.
abstract_text = df['abstract'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))
df['raw'] = df['title'] + '. ' + abstract_text
df

Unnamed: 0,id,title,abstract,authors,categories,update_date,raw
0,0704.0001,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",hep-ph,2008-11-26,Calculation of prompt diphoton production cros...
1,0704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...",Ileana Streinu and Louis Theran,math.CO cs.CG,2008-12-13,Sparsity-certifying Graph Decompositions. We...
2,0704.0003,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,Hongjun Pan,physics.gen-ph,2008-01-13,The evolution of the Earth-Moon system based o...
3,0704.0004,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,David Callan,math.CO,2007-05-23,A determinant of Stirling cycle numbers counts...
4,0704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,Wael Abu-Shammala and Alberto Torchinsky,math.CA math.FA,2013-10-15,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...
...,...,...,...,...,...,...,...
27918,0710.0971,The Extending for Composite Skyrme Model,"In this paper, we have extended the composit...","Pham Thuc Tuyen, Do Quoc Tuan",nucl-th,2007-10-05,The Extending for Composite Skyrme Model. In...
27919,0710.0972,A Floer homology for exact contact embeddings,In this paper we construct the Floer homolog...,"Kai Cieliebak, Urs Frauenfelder",math.SG,2007-10-05,A Floer homology for exact contact embeddings....
27920,0710.0973,Modulation invariant bilinear T(1) theorem,We prove a T(1) theorem for bilinear singula...,"Arpad Benyi, Ciprian Demeter, Andrea R. Nahmod...",math.CA math.AP,2007-10-05,Modulation invariant bilinear T(1) theorem. ...
27921,0710.0974,Hawking radiation in GHS and non-extremal D1-D...,We apply the method of Banerjee and Kulkarni...,"Sunandan Gangopadhyay, Shailesh Kulkarni",hep-th,2008-11-26,Hawking radiation in GHS and non-extremal D1-D...


In [7]:
# Shared resources for the preprocessing function
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_doc(doc):
    """Normalize a document into a space-separated string of lemmas.

    The text is lowercased and split into ``\\w+`` tokens; tokens found in
    ``stop_words`` are dropped and the remaining ones are lemmatized with
    ``lemmatizer``.
    """
    words = regexp_tokenize(doc.lower(), r'\w+')
    # Filter stopwords first, then lemmatize what is left
    lemmas = (lemmatizer.lemmatize(word) for word in words if word not in stop_words)
    return ' '.join(lemmas)

In [8]:
# Run the preprocessing over the merged title + abstract column
df["preprocessed"] = df["raw"].map(preprocess_doc)
df

Unnamed: 0,id,title,abstract,authors,categories,update_date,raw,preprocessed
0,0704.0001,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",hep-ph,2008-11-26,Calculation of prompt diphoton production cros...,calculation prompt diphoton production cross s...
1,0704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...",Ileana Streinu and Louis Theran,math.CO cs.CG,2008-12-13,Sparsity-certifying Graph Decompositions. We...,sparsity certifying graph decomposition descri...
2,0704.0003,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,Hongjun Pan,physics.gen-ph,2008-01-13,The evolution of the Earth-Moon system based o...,evolution earth moon system based dark matter ...
3,0704.0004,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,David Callan,math.CO,2007-05-23,A determinant of Stirling cycle numbers counts...,determinant stirling cycle number count unlabe...
4,0704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,Wael Abu-Shammala and Alberto Torchinsky,math.CA math.FA,2013-10-15,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,dyadic lambda_ alpha lambda_ alpha paper show ...
...,...,...,...,...,...,...,...,...
27918,0710.0971,The Extending for Composite Skyrme Model,"In this paper, we have extended the composit...","Pham Thuc Tuyen, Do Quoc Tuan",nucl-th,2007-10-05,The Extending for Composite Skyrme Model. In...,extending composite skyrme model paper extende...
27919,0710.0972,A Floer homology for exact contact embeddings,In this paper we construct the Floer homolog...,"Kai Cieliebak, Urs Frauenfelder",math.SG,2007-10-05,A Floer homology for exact contact embeddings....,floer homology exact contact embeddings paper ...
27920,0710.0973,Modulation invariant bilinear T(1) theorem,We prove a T(1) theorem for bilinear singula...,"Arpad Benyi, Ciprian Demeter, Andrea R. Nahmod...",math.CA math.AP,2007-10-05,Modulation invariant bilinear T(1) theorem. ...,modulation invariant bilinear 1 theorem prove ...
27921,0710.0974,Hawking radiation in GHS and non-extremal D1-D...,We apply the method of Banerjee and Kulkarni...,"Sunandan Gangopadhyay, Shailesh Kulkarni",hep-th,2008-11-26,Hawking radiation in GHS and non-extremal D1-D...,hawking radiation ghs non extremal d1 d5 black...


### Carga del modelo de embedding

In [9]:
# Notebook-level embedding model; a later cell uses it to encode the queries.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [10]:
# Embed the corpus in small batches (the original author ran out of RAM otherwise).
def generate_embeddings(df, text_column='preprocessed', model_name='all-MiniLM-L6-v2', batch_size=32, model=None):
    """Encode one text column of ``df`` into sentence embeddings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts. It is modified in place: an ``'embeddings'``
        column (one Python list per row) is added or overwritten.
    text_column : str
        Column with the texts to encode; missing values are encoded as ``''``.
    model_name : str
        SentenceTransformer model to load when ``model`` is not given.
    batch_size : int
        Number of texts passed to ``model.encode`` per call. Must be >= 1.
    model : object, optional
        Already-loaded model exposing ``encode`` (and
        ``get_sentence_embedding_dimension`` for the empty-input case).
        Passing it avoids loading a second copy of the model.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        The same ``df`` object and the stacked embedding matrix, one row per
        input text.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Only load the model when the caller did not supply one.
    if model is None:
        model = SentenceTransformer(model_name)

    texts = df[text_column].fillna('').tolist()

    batches = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Generando embeddings"):
        batch = texts[start:start + batch_size]
        batches.append(model.encode(batch, convert_to_numpy=True))

    if batches:
        # Stack the per-batch matrices into a single matrix
        embeddings = np.vstack(batches)
    else:
        # np.vstack([]) raises, so build an explicit empty matrix for an empty frame
        dim = model.get_sentence_embedding_dimension() or 0
        embeddings = np.empty((0, dim), dtype=np.float32)

    # Store one vector per row in the frame
    df['embeddings'] = embeddings.tolist()

    return df, embeddings


In [11]:
# Create the corpus embeddings from the preprocessed text
# (the recorded run of this cell took roughly ten minutes).
df, embeddings = generate_embeddings(df, text_column='preprocessed', batch_size=32)

Generando embeddings: 100%|██████████| 873/873 [09:53<00:00,  1.47it/s]


In [12]:
# Display the frame, which now includes the 'embeddings' column
df

Unnamed: 0,id,title,abstract,authors,categories,update_date,raw,preprocessed,embeddings
0,0704.0001,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",hep-ph,2008-11-26,Calculation of prompt diphoton production cros...,calculation prompt diphoton production cross s...,"[-0.12137093394994736, 0.014926150441169739, 0..."
1,0704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...",Ileana Streinu and Louis Theran,math.CO cs.CG,2008-12-13,Sparsity-certifying Graph Decompositions. We...,sparsity certifying graph decomposition descri...,"[-0.008536569774150848, 0.04919973015785217, -..."
2,0704.0003,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,Hongjun Pan,physics.gen-ph,2008-01-13,The evolution of the Earth-Moon system based o...,evolution earth moon system based dark matter ...,"[-0.022492259740829468, -0.06854434311389923, ..."
3,0704.0004,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,David Callan,math.CO,2007-05-23,A determinant of Stirling cycle numbers counts...,determinant stirling cycle number count unlabe...,"[-0.0460694283246994, 0.015293452888727188, -0..."
4,0704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,Wael Abu-Shammala and Alberto Torchinsky,math.CA math.FA,2013-10-15,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,dyadic lambda_ alpha lambda_ alpha paper show ...,"[-0.006292410660535097, 0.015691002830863, -0...."
...,...,...,...,...,...,...,...,...,...
27918,0710.0971,The Extending for Composite Skyrme Model,"In this paper, we have extended the composit...","Pham Thuc Tuyen, Do Quoc Tuan",nucl-th,2007-10-05,The Extending for Composite Skyrme Model. In...,extending composite skyrme model paper extende...,"[-0.025608807802200317, -0.049002766609191895,..."
27919,0710.0972,A Floer homology for exact contact embeddings,In this paper we construct the Floer homolog...,"Kai Cieliebak, Urs Frauenfelder",math.SG,2007-10-05,A Floer homology for exact contact embeddings....,floer homology exact contact embeddings paper ...,"[-0.07691294699907303, -0.006913523655384779, ..."
27920,0710.0973,Modulation invariant bilinear T(1) theorem,We prove a T(1) theorem for bilinear singula...,"Arpad Benyi, Ciprian Demeter, Andrea R. Nahmod...",math.CA math.AP,2007-10-05,Modulation invariant bilinear T(1) theorem. ...,modulation invariant bilinear 1 theorem prove ...,"[-0.006705889943987131, -0.0131675498560071, 0..."
27921,0710.0974,Hawking radiation in GHS and non-extremal D1-D...,We apply the method of Banerjee and Kulkarni...,"Sunandan Gangopadhyay, Shailesh Kulkarni",hep-th,2008-11-26,Hawking radiation in GHS and non-extremal D1-D...,hawking radiation ghs non extremal d1 d5 black...,"[-0.024427659809589386, -0.015633659437298775,..."


### Carga de queries y embeddings

In [13]:
# Read queries.txt from the working directory: one query per line, blank lines ignored
with open("queries.txt", "r", encoding="utf-8") as f:
    queries = [text for text in map(str.strip, f) if text]

# Show the queries
queries

['Evolution of the earth',
 'determinant of Stirling',
 'Skyrme Model',
 'Floer homology for exact contact embeddings',
 'Hawking radiation']

In [14]:
# Encode every query with the notebook-level model (one row per query).
# NOTE(review): queries are encoded as raw text, while the corpus embeddings
# were built from the 'preprocessed' column - confirm this asymmetry is intended.
query_embeddings = model.encode(queries, convert_to_numpy=True)

print(query_embeddings[0])

[-6.07385896e-02  2.50446256e-02  7.46295974e-02 -2.53126171e-04
  4.90168370e-02  6.78570336e-03 -1.24885170e-02  1.37290359e-02
  2.52804104e-02  4.19690758e-02  1.95630118e-02 -7.63973594e-02
 -4.48138677e-02 -4.83865254e-02 -2.36435905e-02 -3.38195190e-02
 -1.14691928e-01 -9.32855308e-02 -3.34198549e-02 -2.23895833e-02
 -4.46170568e-02  8.86856392e-02 -9.60974116e-03  7.87472278e-02
 -5.51036671e-02  6.93922304e-03  4.37120646e-02  5.24423346e-02
  1.74720809e-02 -8.74584466e-02  4.48246896e-02 -2.77381446e-02
  5.91677986e-02 -2.82426421e-02 -4.30252627e-02  3.15402783e-02
 -4.51159887e-02  1.22309942e-02  4.39122953e-02 -8.68358184e-04
 -1.53085096e-02 -8.20049420e-02 -1.21291298e-02 -3.44455764e-02
  2.42257211e-02  3.07961018e-03 -1.76169351e-02 -4.35783732e-04
 -4.25018296e-02  3.13176960e-02 -1.42509928e-02 -5.37619777e-02
 -7.70565271e-02 -2.32678819e-02  2.02874206e-02 -1.23174116e-02
  1.61325297e-04 -6.48085028e-02  9.84463394e-02 -7.54429027e-02
  4.62686047e-02 -6.53818

### TF-IDF, BM25 y FAISS

In [25]:
# Cosine-similarity search over TF-IDF vectors
def search_tfidf(df, query, top_k=10):
    """Rank the documents in ``df`` against ``query`` with TF-IDF + cosine similarity.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'preprocessed'``, ``'id'``, ``'title'`` and ``'abstract'``.
    query : str
        Free-text query. It is passed through ``preprocess_doc`` so it is
        normalized the same way as the ``'preprocessed'`` column.
    top_k : int
        Maximum number of rows to return; values below 1 give an empty result.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``title``, ``abstract`` and ``score``, best match first.

    Notes
    -----
    The vectorizer is refit on the whole corpus on every call.
    """
    columns = ["id", "title", "abstract"]
    # Clamp top_k: the old `[-top_k:]` slice returned the whole corpus for top_k=0
    k = max(0, min(int(top_k), len(df)))
    if k == 0:
        # Also covers an empty frame, which the vectorizer cannot be fit on
        return df.iloc[0:0][columns].assign(score=np.array([], dtype=float))

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df["preprocessed"])
    query_vec = vectorizer.transform([preprocess_doc(query)])
    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Descending order, then keep the k best
    top_indices = np.argsort(similarities)[::-1][:k]
    return df.iloc[top_indices][columns].assign(score=similarities[top_indices])

In [26]:
# BM25 search
def search_bm25(df, query, top_k=10):
    """Rank the documents in ``df`` against ``query`` with BM25 (Okapi).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'preprocessed'``, ``'id'``, ``'title'`` and ``'abstract'``.
    query : str
        Free-text query. It is passed through ``preprocess_doc`` so its tokens
        are normalized the same way as the ``'preprocessed'`` column.
    top_k : int
        Maximum number of rows to return; values below 1 give an empty result.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``title``, ``abstract`` and ``score``, best match first.
    """
    columns = ["id", "title", "abstract"]
    # Clamp top_k: the old `[-top_k:]` slice returned the whole corpus for top_k=0
    k = max(0, min(int(top_k), len(df)))
    if k == 0:
        # Also avoids building a BM25 index over an empty corpus
        # (presumably a division by zero inside rank_bm25 - TODO confirm)
        return df.iloc[0:0][columns].assign(score=np.array([], dtype=float))

    tokenized_corpus = [doc.split() for doc in df["preprocessed"]]
    bm25 = BM25Okapi(tokenized_corpus)
    # A plain lower().split() keeps punctuation and inflected forms in the
    # query, and those never match the preprocessed corpus tokens
    query_tokens = preprocess_doc(query).split()
    scores = np.asarray(bm25.get_scores(query_tokens))

    # Descending order, then keep the k best
    top_indices = np.argsort(scores)[::-1][:k]
    return df.iloc[top_indices][columns].assign(score=scores[top_indices])

In [27]:
# Build a FAISS index that uses Euclidean (L2) distance
def build_faiss_index(embeddings):
    """Create a flat L2 FAISS index and add ``embeddings`` to it.

    Parameters
    ----------
    embeddings : array-like of shape (n_vectors, dim)
        Vectors to index. They are converted to a C-contiguous float32 array,
        which is the layout FAISS expects, so float64 arrays and nested lists
        are accepted as well.

    Returns
    -------
    faiss.IndexFlatL2
        Index containing the given vectors.

    Raises
    ------
    ValueError
        If ``embeddings`` is not two-dimensional.
    """
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError("embeddings must be a 2-D array of shape (n_vectors, dim)")

    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)
    if vectors.shape[0] > 0:
        index.add(vectors)
    return index

# Search the FAISS index
def search_faiss(df, query_embedding, index, top_k=10):
    """Return the rows of ``df`` closest to ``query_embedding`` according to ``index``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose row positions match the order in which vectors were added
        to ``index``; must contain ``'id'``, ``'title'`` and ``'abstract'``.
    query_embedding : array-like
        A single query vector, either 1-D ``(dim,)`` or 2-D ``(1, dim)``.
        Only the first row of a 2-D input is used.
    index : object
        Index exposing ``search(queries, k) -> (distances, indices)``.
    top_k : int
        Number of neighbours requested; values below 1 give an empty result.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``title``, ``abstract`` and ``distance``, nearest first.
    """
    columns = ["id", "title", "abstract"]
    if top_k < 1:
        empty = df.iloc[0:0][columns].copy()
        empty["distance"] = np.array([], dtype=np.float32)
        return empty

    # FAISS expects a float32 matrix with one query per row
    query = np.ascontiguousarray(query_embedding, dtype=np.float32)
    if query.ndim == 1:
        query = query.reshape(1, -1)

    distances, indices = index.search(query, top_k)
    hit_positions = np.asarray(indices)[0]
    hit_distances = np.asarray(distances)[0]

    # FAISS pads missing neighbours with -1; without this filter df.iloc[-1]
    # would silently return the last row of the frame
    valid = hit_positions >= 0
    results = df.iloc[hit_positions[valid]][columns].copy()
    results["distance"] = hit_distances[valid]
    return results

### Retrieval

In [28]:
# Build the FAISS index once, outside the query loop
faiss_index = build_faiss_index(embeddings)

# Result tables per retrieval method, keyed by the query's position in `queries`
tfidf_all_results = {}
bm25_all_results = {}
faiss_all_results = {}

# Run the three retrievers for every query and print their top-3 hits
for i, query in enumerate(queries):
    print(f"\n--- Consulta {i+1}: {query} ---\n")

    # Lexical retrieval: TF-IDF + cosine similarity
    print("TF-IDF:")
    tfidf_results = search_tfidf(df, query, top_k=3)
    tfidf_all_results[i] = tfidf_results
    for score, title in zip(tfidf_results["score"], tfidf_results["title"]):
        print(f"[{score:.4f}] {title[:80]}")

    # Lexical retrieval: BM25
    print("\nBM25:")
    bm25_results = search_bm25(df, query, top_k=3)
    bm25_all_results[i] = bm25_results
    for score, title in zip(bm25_results["score"], bm25_results["title"]):
        print(f"[{score:.4f}] {title[:80]}")

    # Dense retrieval: nearest neighbours in embedding space (L2 distance)
    print("\nFAISS:")
    query_vector = query_embeddings[i].reshape(1, -1)
    faiss_results = search_faiss(df, query_vector, faiss_index, top_k=3)
    faiss_all_results[i] = faiss_results
    for distance, title in zip(faiss_results["distance"], faiss_results["title"]):
        print(f"[{distance:.4f}] {title[:80]}")

    # Side-by-side list of the document ids each method retrieved
    print("\nComparación de IDs:")
    print("TF-IDF:", list(tfidf_results["id"]))
    print("BM25:", list(bm25_results["id"]))
    print("FAISS:", list(faiss_results["id"]))



--- Consulta 1: Evolution of the earth ---

TF-IDF:
[0.4921] On the Planetary acceleration and the Rotation of the Earth
[0.4766] The evolution of the Earth-Moon system based on the dark matter field
  fluid mo
[0.4321] Geochemistry of U and Th and its Influence on the Origin and Evolution
  of the 

BM25:
[14.0559] The evolution of the Earth-Moon system based on the dark matter field
  fluid mo
[14.0537] Geochemistry of U and Th and its Influence on the Origin and Evolution
  of the 
[12.5302] Origin of the Ocean on the Earth: Early Evolution of Water D/H in a
  Hydrogen-r

FAISS:
[0.9060] Evolutionary Catastrophes and the Goldilocks Problem
[0.9957] Geochemistry of U and Th and its Influence on the Origin and Evolution
  of the 
[1.1336] On the change of latitude of Arctic East Siberia at the end of the
  Pleistocene

Comparación de IDs:
TF-IDF: ['0708.0666', '0704.0003', '0706.1089']
BM25: ['0704.0003', '0706.1089', '0709.2025']
FAISS: ['0709.2309', '0706.1089', '0704.2489']

--- C

En la consulta "Evolution of the earth", que es la primera realizada, los métodos TF-IDF y BM25 comparten dos documentos relevantes, mientras que FAISS recupera resultados diferentes, con solo un documento en común con los otros métodos. Esto refleja que TF-IDF y BM25 priorizan las coincidencias textuales, mientras que FAISS se enfoca en la similitud semántica. La similitud entre rankings es parcial, lo que sugiere enfoques complementarios. Por ello, es importante revisar si la respuesta generada por el RAG realmente usa la información de los documentos recuperados y si responde de manera clara y coherente a lo que se pregunta.
Es importante mencionar que se eligió FAISS como método de IR para el RAG porque permite encontrar documentos similares a nivel semántico, y no solo por coincidencia exacta de palabras, a diferencia de TF-IDF y BM25.

### RAG

In [19]:
# Load variables from a .env file (if any) and read the OpenAI key from the environment
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

# Create the client used by the RAG cell
client = OpenAI(api_key=api_key)

In [29]:
# Generate one answer per query, using the stored FAISS hits as context
for i, query in enumerate(queries):
    faiss_results = faiss_all_results[i]

    # Keep at most the first 3 retrieved documents
    top_faiss = faiss_results.head(3)

    # Build a numbered context from the 'raw' (title + abstract) text of those documents;
    # the result index labels are used to look the rows up in df
    context = "\n".join(
        f"{j+1}. {df.loc[idx, 'raw']}" for j, idx in enumerate(top_faiss.index)
    )

    # Prompt combining the instructions, the retrieved context and the current query
    prompt = f"""Eres una aplicación de tipo Retrieval-Augmented Generation (RAG) especializada en artículos científicos.
Tu tarea es ayudar al usuario a encontrar información relevante sobre su consulta, utilizando únicamente el contexto proporcionado.
La respuesta debe ser clara, precisa y estar basada únicamente en el contenido recuperado. 
Si no encuentras información relacionada en el contexto, responde: "Lo siento, no encontré información relevante en los documentos recuperados."

Contexto:
{context}

Pregunta del usuario:
{query}
"""

    # Call the model (one API request per query)
    response = client.responses.create(
        model="gpt-4.1",
        input=prompt
    )

    # Print the generated answer
    print(f"\n--- Respuesta a la consulta {i+1}: {query} ---\n")
    print(response.output_text)




--- Respuesta a la consulta 1: Evolution of the earth ---

Basándome en el contexto proporcionado, la evolución de la Tierra está estrechamente relacionada con varios factores:

1. **Eventos Catastróficos y Evolución Biológica:** Según el concepto del "problema de Ricitos de Oro", ciertos parámetros planetarios, incluidas las catástrofes evolutivas y las extinciones masivas, desempeñan un papel fundamental en la evolución biológica. Sin embargo, existe un sesgo observacional (sesgo de confianza antropocéntrica) que dificulta discernir si los procesos evolutivos terrestres son únicos en la galaxia. Esto apoya la importancia de la investigación astrobiológica y SETI para entender mejor la posible excepcionalidad de la Tierra (Contexto 1).

2. **Geoquímica de U y Th en la Evolución de la Corteza y de la Vida:** La migración y distribución de uranio (U) y torio (Th) han sido decisivas en la formación y evolución de la corteza terrestre. Estos elementos, al combinarse con componentes volát

#### Diferencias entre modelos y utilidad del RAG.

Los modelos de recuperación como TF-IDF, BM25 y FAISS presentan diferencias en cómo identifican la relevancia de los documentos: mientras que TF-IDF y BM25 se basan en coincidencias textuales, FAISS utiliza la similitud semántica a través de embeddings. Estas diferencias afectan los resultados recuperados y su orden. La utilidad del enfoque RAG radica en que combina la recuperación de información con un LLM: recupera información relevante del corpus y genera respuestas más completas y contextualizadas, lo que mejora la calidad de las respuestas frente a un modelo generativo puro.