# 0. Libraries

In [None]:
import numpy as np
import pandas as pd
from Bio import Entrez
import xml.etree.ElementTree as ET
import os
import re
from datetime import datetime
from typing import List, Dict, Optional, Tuple
import requests
from urllib.parse import urljoin, urlparse
import time
import json

# 5. Download html of articles

In [2]:
import os
import time
import random
import requests
import pandas as pd
from bs4 import BeautifulSoup

def get_html_from_pmc(pmc_id: str) -> BeautifulSoup:
    """Descarga y devuelve el contenido HTML del artículo de PubMed Central (PMC),
    con headers y una pausa aleatoria para evitar bloqueos."""
    
    url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc_id}/"

    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }

    try:
        response = requests.get(url, headers=headers, timeout=15)

        # Pausa entre 1 y 3 segundos para evitar bloqueo
        time.sleep(random.uniform(1, 3))

        if response.status_code == 200:
            return BeautifulSoup(response.text, "html.parser")
        else:
            print(f"⚠️ Error {response.status_code} al descargar {pmc_id}")
            return None

    except requests.exceptions.RequestException as e:
        print(f"❌ Error al conectar con {pmc_id}: {e}")
        return None


def download_html_from_csv(csv_file: str, output_folder: str = "pmc_htmls") -> None:
    """Descarga los archivos HTML de todos los artículos del CSV y los guarda localmente."""
    df = pd.read_csv(csv_file)

    # Crear carpeta si no existe
    os.makedirs(output_folder, exist_ok=True)

    for _, row in df.iterrows():
        pmc_id = row["PMC_ID"]
        print(f"📥 Descargando {pmc_id} ...")
        
        url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc_id}/"
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }

        try:
            response = requests.get(url, headers=headers, timeout=15)

            # Pausa entre 1 y 3 segundos para evitar bloqueo
            time.sleep(random.uniform(1, 3))

            if response.status_code == 200:
                file_path = os.path.join(output_folder, f"{pmc_id}.html")
                with open(file_path, "w", encoding="utf-8") as f:
                    f.write(response.text)
                print(f"✅ Guardado en {file_path}")
            else:
                print(f"⚠️ Error {response.status_code} al descargar {pmc_id}")

        except requests.exceptions.RequestException as e:
            print(f"❌ Error al conectar con {pmc_id}: {e}")

# 6. Add the others columns to csv with 
df = pd.read_csv("articles_with_pmcid.csv")

# 4. Main

In [3]:
if __name__ == "__main__":
    # Ejemplo de uso
    csv_file = "articles_with_pmcid.csv"
    download_html_from_csv(csv_file)  
    print("🎉 Descarga completada de todos los artículos HTML.")


📥 Descargando PMC4136787 ...
✅ Guardado en pmc_htmls/PMC4136787.html
📥 Descargando PMC3630201 ...
✅ Guardado en pmc_htmls/PMC3630201.html
📥 Descargando PMC11988870 ...
✅ Guardado en pmc_htmls/PMC11988870.html
📥 Descargando PMC7998608 ...
✅ Guardado en pmc_htmls/PMC7998608.html
📥 Descargando PMC5587110 ...
✅ Guardado en pmc_htmls/PMC5587110.html
📥 Descargando PMC8396460 ...
✅ Guardado en pmc_htmls/PMC8396460.html
📥 Descargando PMC5666799 ...
✅ Guardado en pmc_htmls/PMC5666799.html
📥 Descargando PMC5460236 ...
✅ Guardado en pmc_htmls/PMC5460236.html
📥 Descargando PMC6222041 ...
✅ Guardado en pmc_htmls/PMC6222041.html
📥 Descargando PMC6813909 ...
✅ Guardado en pmc_htmls/PMC6813909.html
📥 Descargando PMC4095884 ...
✅ Guardado en pmc_htmls/PMC4095884.html
📥 Descargando PMC3040128 ...
✅ Guardado en pmc_htmls/PMC3040128.html
📥 Descargando PMC3177255 ...
✅ Guardado en pmc_htmls/PMC3177255.html
📥 Descargando PMC11500582 ...
✅ Guardado en pmc_htmls/PMC11500582.html
📥 Descargando PMC5387210 ...
✅

In [5]:
def extract_pmc_id(url_or_id):
    """Extrae el PMC ID de una URL o devuelve el ID si ya está limpio"""
    if isinstance(url_or_id, str):
        if 'PMC' in url_or_id:
            # Buscar patrón PMC seguido de números
            match = re.search(r'PMC\d+', url_or_id)
            if match:
                return match.group()
    return url_or_id

def get_article_xml(pmc_id):
    """Descarga el XML del artículo desde PMC"""
    try:
        # URL de la API de PMC para obtener el XML
        url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc_id}/?tool=python_script&email=researcher@example.com"
        
        # También intentamos con la API de efetch
        efetch_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id={pmc_id.replace('PMC', '')}&rettype=xml"
        
        response = requests.get(efetch_url, timeout=30)
        response.raise_for_status()
        
        return response.text
    except Exception as e:
        print(f"Error descargando {pmc_id}: {e}")
        return None

def extract_authors(root):
    """Extrae los autores del XML"""
    authors = []
    
    # Buscar autores en diferentes ubicaciones del XML
    for contrib in root.findall(".//contrib[@contrib-type='author']"):
        author_parts = []
        
        # Nombre
        given_names = contrib.find(".//given-names")
        if given_names is not None:
            author_parts.append(given_names.text)
        
        # Apellido
        surname = contrib.find(".//surname")
        if surname is not None:
            author_parts.append(surname.text)
        
        if author_parts:
            authors.append(" ".join(author_parts))
    
    return "; ".join(authors) if authors else "N/A"

def extract_keywords(root):
    """Extrae las palabras clave del XML"""
    keywords = []
    
    # Buscar keywords en diferentes ubicaciones
    for kwd in root.findall(".//kwd"):
        if kwd.text:
            keywords.append(kwd.text.strip())
    
    return "; ".join(keywords) if keywords else "N/A"

def extract_references_count(root):
    """Cuenta las referencias del artículo"""
    refs = root.findall(".//ref")
    return len(refs)

def extract_abstract(root):
    """Extrae el abstract del XML"""
    abstract_parts = []
    
    # Buscar abstract
    abstract_elem = root.find(".//abstract")
    if abstract_elem is not None:
        # Obtener todo el texto del abstract
        for p in abstract_elem.findall(".//p"):
            if p.text:
                abstract_parts.append(p.text.strip())
    
    abstract_text = " ".join(abstract_parts) if abstract_parts else "N/A"
    
    # Limpiar el texto
    abstract_text = re.sub(r'\s+', ' ', abstract_text).strip()
    return abstract_text

def extract_publication_info(root):
    """Extrae información de publicación"""
    # Año
    year = "N/A"
    pub_date = root.find(".//pub-date[@pub-type='epub']") or root.find(".//pub-date[@pub-type='ppub']") or root.find(".//pub-date")
    if pub_date is not None:
        year_elem = pub_date.find(".//year")
        if year_elem is not None:
            year = year_elem.text
    
    # Tipo de publicación
    article_type = "N/A"
    article_elem = root.find(".//article[@article-type]")
    if article_elem is not None:
        article_type = article_elem.get('article-type', 'N/A')
    
    return year, article_type

def process_csv_file(input_file):
    """Procesa el archivo CSV y extrae los datos de cada artículo"""
    
    # Leer el CSV
    try:
        df = pd.read_csv(input_file)
        print(f"Archivo CSV cargado con {len(df)} filas")
    except Exception as e:
        print(f"Error leyendo el CSV: {e}")
        return
    
    # Preparar lista para los resultados
    results = []
    
    # Procesar cada fila
    for idx, row in df.iterrows():
        print(f"Procesando artículo {idx + 1}/{len(df)}...")
        
        # Extraer PMC ID
        pmc_id = extract_pmc_id(row.get('PMC_ID', ''))
        if not pmc_id or not pmc_id.startswith('PMC'):
            print(f"PMC ID inválido en fila {idx}: {pmc_id}")
            continue
        
        # Descargar XML
        xml_content = get_article_xml(pmc_id)
        if not xml_content:
            print(f"No se pudo descargar XML para {pmc_id}")
            continue
        
        try:
            # Parsear XML
            root = ET.fromstring(xml_content)
            
            # Extraer título
            title_elem = root.find(".//title-group/article-title") or root.find(".//article-title")
            title = title_elem.text if title_elem is not None else row.get('Title', 'N/A')
            
            # Extraer datos
            authors = extract_authors(root)
            keywords = extract_keywords(root)
            references = extract_references_count(root)
            abstract = extract_abstract(root)
            year, pub_type = extract_publication_info(root)
            
            # URL original
            url = row.get('Link', 'N/A')
            
            # Agregar a resultados
            results.append({
                'PMC_ID': pmc_id,
                'Title': title,
                'Authors': authors,
                'Keywords': keywords,
                'References_Count': references,
                'Abstract': abstract,
                'Year': year,
                'Publication_Type': pub_type,
                'URL': url
            })
            
            print(f"✓ Procesado exitosamente: {pmc_id}")
            
        except ET.ParseError as e:
            print(f"Error parseando XML para {pmc_id}: {e}")
        except Exception as e:
            print(f"Error procesando {pmc_id}: {e}")
        
        # Pausa para no sobrecargar el servidor
        time.sleep(1)
    
    # Guardar resultados
    if results:
        output_df = pd.DataFrame(results)
        output_df.to_csv('result2', index=False, encoding='utf-8')
        print(f"\n✓ Archivo guardado: result2.csv ({len(results)} artículos procesados)")
        
        # Mostrar resumen
        print("\nResumen de datos extraídos:")
        print(f"- Artículos procesados: {len(results)}")
        print(f"- Con autores: {sum(1 for r in results if r['Authors'] != 'N/A')}")
        print(f"- Con keywords: {sum(1 for r in results if r['Keywords'] != 'N/A')}")
        print(f"- Con abstract: {sum(1 for r in results if r['Abstract'] != 'N/A')}")
    else:
        print("No se pudieron procesar artículos.")

# Función principal
def main():
    input_file = "articles_with_pmcid.csv"  # Cambia esto por el nombre de tu archivo
    
    print("=== Extractor de Datos de Artículos PMC ===")
    print(f"Procesando archivo: {input_file}")
    print("Esto puede tomar varios minutos dependiendo del número de artículos...\n")
    
    process_csv_file(input_file)

In [6]:
if __name__ == "__main__":
    main()

=== Extractor de Datos de Artículos PMC ===
Procesando archivo: articles_with_pmcid.csv
Esto puede tomar varios minutos dependiendo del número de artículos...

Archivo CSV cargado con 607 filas
Procesando artículo 1/607...


  title_elem = root.find(".//title-group/article-title") or root.find(".//article-title")
  pub_date = root.find(".//pub-date[@pub-type='epub']") or root.find(".//pub-date[@pub-type='ppub']") or root.find(".//pub-date")


✓ Procesado exitosamente: PMC4136787
Procesando artículo 2/607...
✓ Procesado exitosamente: PMC3630201
Procesando artículo 3/607...
✓ Procesado exitosamente: PMC11988870
Procesando artículo 4/607...
✓ Procesado exitosamente: PMC7998608
Procesando artículo 5/607...
✓ Procesado exitosamente: PMC5587110
Procesando artículo 6/607...
✓ Procesado exitosamente: PMC8396460
Procesando artículo 7/607...
✓ Procesado exitosamente: PMC5666799
Procesando artículo 8/607...
✓ Procesado exitosamente: PMC5460236
Procesando artículo 9/607...
✓ Procesado exitosamente: PMC6222041
Procesando artículo 10/607...
✓ Procesado exitosamente: PMC6813909
Procesando artículo 11/607...
✓ Procesado exitosamente: PMC4095884
Procesando artículo 12/607...
✓ Procesado exitosamente: PMC3040128
Procesando artículo 13/607...
✓ Procesado exitosamente: PMC3177255
Procesando artículo 14/607...
✓ Procesado exitosamente: PMC11500582
Procesando artículo 15/607...
✓ Procesado exitosamente: PMC5387210
Procesando artículo 16/607...
✓

In [20]:
dfr = pd.read_csv("articles_data.csv")
dfr.head()

Unnamed: 0,PMC_ID,Title,Authors,Introduction,Development/Methods,Results,Discussion,References
0,PMC4136787,,[],"After a 16-year hiatus, Russia resumed in 2013...",The study was approved by IACUC of MSU Institu...,Living conditions for animals considered optim...,Living conditions for animals considered optim...,[]
1,PMC3630201,,[],"On Earth, at 1 g, mechanical loading of mammal...",All experimental animal procedures for STS-131...,All flight and ground control mice were observ...,"In this study, we investigated cellular and mo...",[]
2,PMC11988870,,[],"Microgravity, a condition characterized by min...",A comprehensive literature review was conducte...,,Recent research demonstrates that microgravity...,[]
3,PMC7998608,,[],Human adipose-derived stem cells (hASCs) are e...,Human lipoaspirates were obtained from 12 heal...,Cells were expanded for three passages before ...,"Through novel advances in cell biology, adult ...",[]
4,PMC5587110,,[],The ISS National Laboratory is a unique resear...,"In order to validate the system, a number of g...",In order to assess the functionality of PCR in...,One of the major obstacles to space exploratio...,[]


In [21]:
dfr2 = pd.read_csv("result2")
dfr2.head()

Unnamed: 0,PMC_ID,Title,Authors,Keywords,References_Count,Abstract,Year,Publication_Type,URL
0,PMC4136787,Mice in Bion-M 1 Space Mission: Training and S...,Alexander Andreev-Andrievskiy; Anfisa Popova; ...,,37,"After a 16-year hiatus, Russia has resumed its...",2014,research-article,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
1,PMC3630201,Microgravity Induces Pelvic Bone Loss through ...,Elizabeth A. Blaber; Natalya Dvorochkin; Chial...,,74,Bone is a dynamically remodeled tissue that re...,2013,research-article,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...
2,PMC11988870,Microgravity and Cellular Biology: Insights in...,Nelson Adolfo López Garzón; María Virginia Pin...,microgravity; tissue effects; immune system; c...,70,"Microgravity, defined by minimal gravitational...",2025,review-article,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
3,PMC7998608,Selective Proliferation of Highly Functional A...,Takanobu Mashiko; Koji Kanayama; Natsumi Saito...,adipose-derived stem cell; microgravity cultur...,48,Therapeutic effects of adult stem-cell transpl...,2021,research-article,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...
4,PMC5587110,Microgravity validation of a novel system for ...,Macarena Parra; Jimmy Jung; Travis D. Boone; L...,,38,The International Space Station (ISS) National...,2017,research-article,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...


# Change the bad columns

In [14]:
are_equal = dfr["PMC_ID"].equals(dfr2["PMC_ID"])
print(f"The columns PMC_ID are equal: {are_equal}")

The columns PMC_ID are equal: True


In [23]:
dfr.columns

Index(['PMC_ID', 'Title', 'Authors', 'Introduction', 'Development/Methods',
       'Results', 'Discussion', 'References'],
      dtype='object')

In [24]:
dfr2.columns

Index(['PMC_ID', 'Title', 'Authors', 'Keywords', 'References_Count',
       'Abstract', 'Year', 'Publication_Type', 'URL'],
      dtype='object')

In [28]:
# Change the bad columns
dfr[["Title", "Authors", "References"]] = dfr2[["Title", "Authors", "References_Count"]]
dfr = dfr.rename(columns={"References": "References_Count"})
dfr.head()

Unnamed: 0,PMC_ID,Title,Authors,Introduction,Development/Methods,Results,Discussion,References_Count
0,PMC4136787,Mice in Bion-M 1 Space Mission: Training and S...,Alexander Andreev-Andrievskiy; Anfisa Popova; ...,"After a 16-year hiatus, Russia resumed in 2013...",The study was approved by IACUC of MSU Institu...,Living conditions for animals considered optim...,Living conditions for animals considered optim...,37
1,PMC3630201,Microgravity Induces Pelvic Bone Loss through ...,Elizabeth A. Blaber; Natalya Dvorochkin; Chial...,"On Earth, at 1 g, mechanical loading of mammal...",All experimental animal procedures for STS-131...,All flight and ground control mice were observ...,"In this study, we investigated cellular and mo...",74
2,PMC11988870,Microgravity and Cellular Biology: Insights in...,Nelson Adolfo López Garzón; María Virginia Pin...,"Microgravity, a condition characterized by min...",A comprehensive literature review was conducte...,,Recent research demonstrates that microgravity...,70
3,PMC7998608,Selective Proliferation of Highly Functional A...,Takanobu Mashiko; Koji Kanayama; Natsumi Saito...,Human adipose-derived stem cells (hASCs) are e...,Human lipoaspirates were obtained from 12 heal...,Cells were expanded for three passages before ...,"Through novel advances in cell biology, adult ...",48
4,PMC5587110,Microgravity validation of a novel system for ...,Macarena Parra; Jimmy Jung; Travis D. Boone; L...,The ISS National Laboratory is a unique resear...,"In order to validate the system, a number of g...",In order to assess the functionality of PCR in...,One of the major obstacles to space exploratio...,38


In [32]:
# Add other columns
dfr[["Keywords", "Abstract", "Year", "Publicaction_type", "URL"]] = dfr2[["Keywords", "Abstract", "Year", "Publication_Type", "URL"]]
dfr.head(10)

Unnamed: 0,PMC_ID,Title,Authors,Introduction,Development/Methods,Results,Discussion,References_Count,Keywords,Abstract,Year,Publicaction_type,URL
0,PMC4136787,Mice in Bion-M 1 Space Mission: Training and S...,Alexander Andreev-Andrievskiy; Anfisa Popova; ...,"After a 16-year hiatus, Russia resumed in 2013...",The study was approved by IACUC of MSU Institu...,Living conditions for animals considered optim...,Living conditions for animals considered optim...,37,,"After a 16-year hiatus, Russia has resumed its...",2014,research-article,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
1,PMC3630201,Microgravity Induces Pelvic Bone Loss through ...,Elizabeth A. Blaber; Natalya Dvorochkin; Chial...,"On Earth, at 1 g, mechanical loading of mammal...",All experimental animal procedures for STS-131...,All flight and ground control mice were observ...,"In this study, we investigated cellular and mo...",74,,Bone is a dynamically remodeled tissue that re...,2013,research-article,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...
2,PMC11988870,Microgravity and Cellular Biology: Insights in...,Nelson Adolfo López Garzón; María Virginia Pin...,"Microgravity, a condition characterized by min...",A comprehensive literature review was conducte...,,Recent research demonstrates that microgravity...,70,microgravity; tissue effects; immune system; c...,"Microgravity, defined by minimal gravitational...",2025,review-article,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
3,PMC7998608,Selective Proliferation of Highly Functional A...,Takanobu Mashiko; Koji Kanayama; Natsumi Saito...,Human adipose-derived stem cells (hASCs) are e...,Human lipoaspirates were obtained from 12 heal...,Cells were expanded for three passages before ...,"Through novel advances in cell biology, adult ...",48,adipose-derived stem cell; microgravity cultur...,Therapeutic effects of adult stem-cell transpl...,2021,research-article,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...
4,PMC5587110,Microgravity validation of a novel system for ...,Macarena Parra; Jimmy Jung; Travis D. Boone; L...,The ISS National Laboratory is a unique resear...,"In order to validate the system, a number of g...",In order to assess the functionality of PCR in...,One of the major obstacles to space exploratio...,38,,The International Space Station (ISS) National...,2017,research-article,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
5,PMC8396460,Spaceflight Modulates the Expression of Key Ox...,Akhilesh Kumar; Candice G. T. Tahimic; Eduardo...,Responses to spaceflight include cardiovascula...,All animal procedures were conducted in accord...,"Immediately after landing, all flight (FLT) an...",Our findings provide new insight into how the ...,76,spaceflight; microgravity; heart; gene express...,Spaceflight causes cardiovascular changes due ...,2021,research-article,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8...
6,PMC5666799,Dose- and Ion-Dependent Effects in the Oxidati...,Joshua S. Alwood; Luan H. Tran; Ann-Sofie Schr...,Structural degradation and oxidative stress fo...,"Male C57BL/6J mice (Jackson Laboratories, Bar ...",To evaluate the individual effects of radiatio...,Heavy-ion irradiation during space missions is...,57,cancellous bone; osteoblast; ionizing radiatio...,Space radiation may pose a risk to skeletal he...,2017,research-article,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
7,PMC5460236,From the bench to exploration medicine: NASA l...,Joshua S. Alwood; April E. Ronca; Richard C. M...,With the International Space Station (ISS) ava...,,,,87,,NASA’s Space Biology and Human Research Progra...,2017,brief-report,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
8,PMC6222041,High-precision method for cyclic loading of sm...,Megan M. Pendleton; Saghi Sadoughi; Alfred Li;...,"A number of diseases, including osteoporosis (...",We first present our new method in detail. The...,"Of all three methods tested, the new (K FEA ) ...",Our new method for cyclic loading of small-ani...,36,Fatigue; Bone mechanics; Mouse; Vertebrae; Bon...,One potentially important bone quality charact...,2018,research-article,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...
9,PMC6813909,Effects of,Megan M. Pendleton; Shannon R. Emerzian; Jenni...,"For a variety of clinical applications, bones ...","Forty-eight female, 20-week old (skeletally-ma...","For monotonic compression testing, the vertebr...",These results demonstrate that the monotonic s...,0,ionizing radiation; bone strength; fatigue; co...,Bone can become brittle when exposed to ionizi...,2019,research-article,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...


In [41]:
# Guardar el abstract de la primera fila
with open('abstract.txt', 'w', encoding='utf-8') as archivo:
    archivo.write(dfr["Abstract"].iloc[0])

print("Abstract guardado en abstract.txt")

Abstract guardado en abstract.txt
