In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
import time
import re
import os
import polars as pl

# URL of PubMed's "trending" listing; the browser session is pointed at this page first.
PUBMED_TRENDING_URL = "https://pubmed.ncbi.nlm.nih.gov/trending/"

In [None]:
def parse_nbib_data(nbib_text: str) -> dict | None:
    """
    Analiza un bloque de texto en formato NBIB/MEDLINE, extrae los campos requeridos
    y valida que todos estén presentes.
    """
    # Mapeo de campos requeridos a sus etiquetas NBIB
    nbib_map = {
        'Title': 'TI',
        'Authors': 'AU',
        'Abstract': 'AB',
        'Journal': 'JT',
        'Date': 'DP',
        'DOI': 'LID'
    }
    required_fields = list(nbib_map.keys())
    
    data = {}
    authors = []
    last_tag = None

    for line in nbib_text.strip().split('\n'):
        line = line.strip()
        if not line:
            continue

        match = re.match(r'^([A-Z]{2,4})\s*-\s*(.*)', line)
        if match:
            tag, value = match.groups()
            last_tag = tag.strip()

            # Solo captura el valor de LID si la línea contiene la etiqueta [doi]
            if last_tag == nbib_map['DOI'] and '[doi]' in line:
                data['DOI'] = value.split(' ')[0].strip()
            
            # Manejar autores (puede haber múltiples)
            if last_tag == nbib_map['Authors']:
                authors.append(value.strip())
            # Manejar otros campos
            elif last_tag in nbib_map.values():
                field_name = [k for k, v in nbib_map.items() if v == last_tag][0]
                if field_name not in data:
                    data[field_name] = value.strip()
        # Manejar campos multi-línea como el Abstract (AB)
        elif last_tag and last_tag == nbib_map['Abstract'] and 'Abstract' in data:
            data['Abstract'] += ' ' + line

    if authors:
        data['Authors'] = ", ".join(authors)

    # Limpieza final y validación
    if 'DOI' in data:
        data['DOI'] = data['DOI'].split(' ')[0] # Extraer solo el DOI

    for field in required_fields:
        if field not in data or not data[field]:
            print(f"Artículo descartado. Campo requerido '{field}' no encontrado en el NBIB.")
            return None
            
    return data

In [None]:
# Chrome saves downloads into the current working directory, which is where
# the scraping cell later looks for the exported .nbib files.
download_prefs = {"download.default_directory": os.path.abspath(".")}

options = webdriver.ChromeOptions()
options.add_experimental_option("prefs", download_prefs)
# Headless mode is intentionally left off: options.add_argument("--headless")
options.add_argument("--start-maximized")

# webdriver_manager resolves/installs a chromedriver binary and returns its path.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

# Shared explicit wait (10 s timeout) used by the following cells.
wait = WebDriverWait(driver, 10)

driver.get(PUBMED_TRENDING_URL)
actions = ActionChains(driver)

In [None]:
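# --- Show the first 300 records (disabled; kept for reference only) ---
# NOTE: re-enabling this cell also requires `from selenium.webdriver.support.ui import Select`,
# which is not among the notebook's imports.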
# wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[data-ga-category="display_options"]'))).click()
# select_element = driver.find_element(By.ID, "id_size")
# select_object = Select(select_element)
# select_object.select_by_visible_text("100")
# wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[data-ga-category="display_options"]'))).click()
# for i in range (3):
#     time.sleep(1)
#     wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'button[data-ga-action="show_more"]'))).click()
# wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'button[data-ga-action="show_more"]'))).click()

In [None]:
# Scraping parameters.
TARGET_ARTICLES = 300  # stop once this many articles passed validation
BATCH_SIZE = 10        # articles visited between two "show more" clicks

article_data = []  # parsed records (dicts returned by parse_nbib_data)
reviewed = 0       # index of the next result link to open, across batches

while len(article_data) < TARGET_ARTICLES:
    article_urls = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'a.docsum-title')))
    # NOTE(review): these element references are reused after navigating away and
    # back; if that ever raises StaleElementReferenceException, re-locate them per article.
    for _ in range(BATCH_SIZE):
        # Stop the batch when enough articles were collected or when the page
        # holds no unvisited link (avoids an IndexError on article_urls).
        if len(article_data) >= TARGET_ARTICLES or reviewed >= len(article_urls):
            break

        article_urls[reviewed].click()
        time.sleep(1)
        # Presumably the article page's "Cite" button, which opens the export dialog — TODO confirm.
        wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[5]/aside/div/div[2]/div/button[1]'))).click()
        time.sleep(1)
        # The 4th "/"-separated piece of the export form's action URL is used as the file stem.
        export_form = wait.until(EC.visibility_of_element_located((By.XPATH, '/html/body/div[5]/div[2]/div/div[2]/div[2]/form')))
        file_name = export_form.get_attribute("action").split("/")[3] + ".nbib"
        wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[5]/div[2]/div/div[2]/div[2]/form/button'))).click()
        driver.execute_script("window.history.go(-1)")
        reviewed += 1

        time.sleep(1)
        # A fixed sleep does not guarantee the download has finished: poll until the
        # file shows up in the download directory (raises TimeoutException otherwise).
        wait.until(lambda _driver: os.path.exists(file_name))
        try:
            with open(file_name, 'r', encoding='utf-8') as f:
                contenido = f.read()
        finally:
            # Never leave downloaded files behind, even if reading fails.
            os.remove(file_name)

        contenido_parseado = parse_nbib_data(contenido)
        if contenido_parseado:
            article_data.append(contenido_parseado)

    if len(article_data) < TARGET_ARTICLES:
        # Ask the listing for the next page of results.
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'button[data-ga-action="show_more"]'))).click()
    print(f"len(article_data)={len(article_data)}")
    time.sleep(1)




In [None]:
# Column layout required by the assignment specification.
column_order = ["DOI", "Title", "Authors", "Abstract", "Journal", "Date"]
# The file keeps the .csv name but is written tab-separated.
output_file = "../pubmed_raw_corpus.csv"

# Build the Polars DataFrame from the list of dicts and fix the column order in one step.
df = pl.DataFrame(article_data).select(column_order)

# Persist the corpus to disk.
df.write_csv(output_file, separator='\t')

In [None]:
# Show the exported DataFrame as this cell's output.
df

In [5]:

def formatear_fecha(fecha: str) -> str:
    """
    Convert a date written as ``YYYY Mon DD`` into ``YYYY/MM/DD``.

    Accepted inputs are ``YYYY Mon DD``, ``YYYY Mon`` and ``YYYY``, where
    ``Mon`` is an English three-letter month abbreviation or a range such as
    ``Jan-Dec`` (the last month of the range is used).

    Args:
        fecha: Date string with whitespace-separated components.

    Returns:
        The date as ``year/month/day``. A component that is missing (or a
        month token that is not a known abbreviation, e.g. a season) is left
        empty, so ``"2025 Mar"`` gives ``"2025/03/"`` and ``"2025"`` gives
        ``"2025//"``. The day is copied as written.
    """
    mes_num = {"Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05", "Jun": "06",
               "Jul": "07", "Aug": "08", "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12"}

    parts = fecha.split()
    year = parts[0] if parts else ""
    # Check the length before indexing: year-only dates have no month token.
    month_token = parts[1].split("-")[-1] if len(parts) >= 2 else ""
    month = mes_num.get(month_token, "")
    # The day is only taken from the plain three-component form.
    day = parts[2] if len(parts) == 3 else ""
    return f"{year}/{month}/{day}"

# Example: for a month range such as "Jan-Dec", the last month of the range is used.
formatear_fecha("2025 Jan-Dec 14")

'2025/12/14'