### Parámetros de Configuración

In [37]:
# Librerías Usadas
import re
import os
import requests
import xml.etree.ElementTree as ET

# arXiv API: template for the search endpoint; placeholders are filled in with str.format().
ARXIV_API_URL = (
    "http://export.arxiv.org/api/query"
    "?search_query={type_query}:{query}"
    "&start={start}&max_results={max_results}"
    "&sortBy={sortby}&sortOrder={sortorder}"
)

# GitHub API: base URL plus the access token, read from the environment
# (falls back to an empty string when GITHUB_TOKEN is not set).
GITHUB_API_URL = "https://api.github.com"
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN", "")

# Default arXiv query parameters (for example, the topic "RAG").
QUERY_TOPIC = "RAG"           # search term
TYPE_QUERY = "all"            # field prefix placed before the search term
START = 0                     # value for the `start` URL parameter
MAX_RES = 10                  # value for the `max_results` URL parameter
SORTBY = "submittedDate"      # value for the `sortBy` URL parameter
SORTORDER = "descending"      # value for the `sortOrder` URL parameter

### Consulta a la API

In [38]:
# Build the query URL from the configuration constants.
url = ARXIV_API_URL.format(
    type_query=TYPE_QUERY,
    query=QUERY_TOPIC,
    start=START,
    max_results=MAX_RES,
    sortby=SORTBY,
    sortorder=SORTORDER,
)
# requests applies no timeout by default, so an unresponsive server would hang
# the cell forever; 30 seconds bounds the wait.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of handing an error body to the parser.
response.raise_for_status()

### Parser de información

In [41]:
def parse_response(xml_data):
    """Parse an Atom feed (as returned by the arXiv API) into article dicts.

    Args:
        xml_data: The Atom XML document, as accepted by ``ET.fromstring``.

    Returns:
        A list with one dict per ``<entry>`` element, in document order.
        Each dict has the keys ``title``, ``published``, ``summary``,
        ``primary_category``, ``updated``, ``authors`` (a list of
        ``{"name": ...}`` dicts), ``link_article`` and ``version``.
        Missing or empty text fields become ``""``; ``version`` is the digit
        string after the trailing ``v`` of the entry id, or ``None`` when the
        id does not end in ``v<digits>``.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_data`` is not well-formed XML.
    """
    ns = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}

    def _text(parent, path):
        """Return the stripped text of the first child matching ``path``, or ""."""
        # findtext yields the default for a missing element and "" for an
        # element without text, so this never dereferences None.
        return (parent.findtext(path, default="", namespaces=ns) or "").strip()

    root = ET.fromstring(xml_data)
    articles = []
    for entry in root.findall("atom:entry", ns):
        # Version: taken from the end of the entry id (e.g. ...v1, ...v2).
        # The id text must be read first; the previous code used an
        # undefined name here and raised NameError on every entry.
        id_text = _text(entry, "atom:id")
        m = re.search(r'v(\d+)$', id_text)
        version = m.group(1) if m else None

        # Primary category: stored in the "term" attribute of the element.
        primary_cat_elem = entry.find("arxiv:primary_category", ns)
        primary_category = (
            primary_cat_elem.attrib.get("term", "") if primary_cat_elem is not None else ""
        )

        # Authors: one {"name": ...} dict per <author> element.
        authors = [
            {"name": _text(author, "atom:name")}
            for author in entry.findall("atom:author", ns)
        ]

        # Article link: href of the first <link> with rel="alternate".
        link_article = next(
            (
                link.attrib.get("href", "")
                for link in entry.findall("atom:link", ns)
                if link.attrib.get("rel") == "alternate"
            ),
            "",
        )

        articles.append({
            "title": _text(entry, "atom:title"),
            "published": _text(entry, "atom:published"),
            "summary": _text(entry, "atom:summary"),
            "primary_category": primary_category,
            "updated": _text(entry, "atom:updated"),
            "authors": authors,
            "link_article": link_article,
            "version": version,
        })
    return articles

### Resultados

In [43]:
# Show the body of the HTTP response as text; as the last expression in the
# cell, its value is rendered as the cell output.
response.text

'<?xml version="1.0" encoding="UTF-8"?>\n<feed xmlns="http://www.w3.org/2005/Atom">\n  <link href="http://arxiv.org/api/query?search_query%3Dall%3ARAG%26id_list%3D%26start%3D0%26max_results%3D10" rel="self" type="application/atom+xml"/>\n  <title type="html">ArXiv Query: search_query=all:RAG&amp;id_list=&amp;start=0&amp;max_results=10</title>\n  <id>http://arxiv.org/api/j/JCFc+b/wEUWgIEysf/YgADPTs</id>\n  <updated>2025-02-25T00:00:00-05:00</updated>\n  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1579</opensearch:totalResults>\n  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>\n  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">10</opensearch:itemsPerPage>\n  <entry>\n    <id>http://arxiv.org/abs/2502.18418v1</id>\n    <updated>2025-02-25T18:14:06Z</updated>\n    <published>2025-02-25T18:14:06Z</published>\n    <title>Rank1: Test-Time Compute for Reranking in Inf