# 0. Libraries

In [165]:
import numpy as np
import pandas as pd
from Bio import Entrez
import xml.etree.ElementTree as ET
import os
import re
from datetime import datetime
from typing import List, Dict, Optional, Tuple
import requests
from urllib.parse import urljoin, urlparse
import time
import json

# 1. Modules

In [166]:
# Return a PMC article XML given its ID
def fetch_pmc_xml(pmc_id, filename=None, email="your_email@example.com"):
    """
    Download the XML of a PMC article given its ID.

    Args:
        pmc_id (str): The PMC ID, e.g. "PMC4136787" or just "4136787".
        filename (str): Optional, file name to save the XML.
        email (str): Your email (required by NCBI).

    Returns:
        str: The XML content of the article as string.
    """
    # Set email for NCBI (required)
    Entrez.email = email
    
    # Clean PMC ID (remove PMC prefix if present)
    clean_id = pmc_id.replace("PMC", "") if pmc_id.startswith("PMC") else pmc_id
    
    try:
        # Fetch XML data
        handle = Entrez.efetch(db="pmc", id=clean_id, rettype="full", retmode="xml")
        xml_data = handle.read()
        handle.close()
        
        # Convert bytes to string
        if isinstance(xml_data, bytes):
            xml_content = xml_data.decode('utf-8')
        else:
            xml_content = xml_data
        
        # Save to file if filename provided
        if filename:
            with open(filename, "w", encoding="utf-8") as f:
                f.write(xml_content)
            print(f"XML saved to {filename}")
        
        return xml_content
        
    except Exception as e:
        print(f"Error fetching PMC ID {pmc_id}: {e}")
        return None

In [167]:
def load_xml(xml_content_or_file):
    """
    Load XML content (string or file path).
    """
    try:
        # Try parsing from string
        root = ET.fromstring(xml_content_or_file)
    except ET.ParseError:
        # Otherwise parse from file
        tree = ET.parse(xml_content_or_file)
        root = tree.getroot()
    return root

In [168]:
def get_authors(root):
    """Extract list of authors."""
    authors = []
    for contrib in root.findall(".//contrib[@contrib-type='author']"):
        surname = contrib.find("name/surname")
        given = contrib.find("name/given-names")
        if surname is not None and given is not None:
            authors.append(f"{given.text} {surname.text}")
    return authors

In [169]:
def get_title(root):
    """Extract article title."""
    title_elem = root.find(".//article-title")
    return title_elem.text if title_elem is not None else None

In [170]:
def get_abstract(root):
    """Extract abstract text."""
    abstract_elems = root.findall(".//abstract//p")
    return " ".join("".join(p.itertext()) for p in abstract_elems) if abstract_elems else None

In [171]:
def get_journal_info(root):
    """Extract journal information."""
    journal_info = {}
    
    # Journal title
    journal_title = root.find(".//journal-title")
    journal_info['journal_title'] = journal_title.text if journal_title is not None else None
    
    # ISSN
    issn = root.find(".//issn")
    journal_info['issn'] = issn.text if issn is not None else None
    
    # Volume
    volume = root.find(".//volume")
    journal_info['volume'] = volume.text if volume is not None else None
    
    # Issue
    issue = root.find(".//issue")
    journal_info['issue'] = issue.text if issue is not None else None
    
    return journal_info

In [172]:
def get_publication_date(root):
    """Extract publication date."""
    pub_date = root.find(".//pub-date[@pub-type='epub']") or root.find(".//pub-date")
    if pub_date is not None:
        year = pub_date.find("year")
        month = pub_date.find("month")
        day = pub_date.find("day")
        
        year_text = year.text if year is not None else "1900"
        month_text = month.text if month is not None else "01"
        day_text = day.text if day is not None else "01"
        
        try:
            return f"{year_text}-{month_text.zfill(2)}-{day_text.zfill(2)}"
        except:
            return year_text
    return None

In [173]:
def get_doi(root):
    """Extract DOI."""
    doi = root.find(".//article-id[@pub-id-type='doi']")
    return doi.text if doi is not None else None

In [174]:
def get_keywords(root):
    """Extract keywords."""
    keywords = []
    for kwd in root.findall(".//kwd"):
        if kwd.text:
            keywords.append(kwd.text.strip())
    return keywords

In [175]:
def clean_text(text):
    """Clean and normalize text."""
    if not text:
        return None
    # Remove excessive whitespace and normalize
    text = re.sub(r'\s+', ' ', text.strip())
    return text

In [176]:
def get_section_by_title(root, section_titles):
    """Extract content from sections by matching titles."""
    content = []
    
    for sec in root.findall(".//sec"):
        title_elem = sec.find("title")
        if title_elem is not None:
            title = title_elem.text.lower() if title_elem.text else ""
            
            # Check if any of the section titles match
            if any(sect_title.lower() in title for sect_title in section_titles):
                # Get all paragraphs in this section
                paragraphs = sec.findall(".//p")
                section_text = " ".join("".join(p.itertext()) for p in paragraphs)
                content.append(clean_text(section_text))
    
    return " ".join(content) if content else None

In [177]:
def get_introduction(root):
    """Extract introduction section."""
    intro_titles = ["introduction", "background", "rationale"]
    return get_section_by_title(root, intro_titles)

In [178]:
def get_methods(root):
    """Extract methods/methodology section."""
    method_titles = ["methods", "methodology", "materials and methods", 
                    "experimental procedures", "procedures", "approach"]
    return get_section_by_title(root, method_titles)

In [179]:
def get_results(root):
    """Extract results section."""
    result_titles = ["results", "findings", "observations"]
    return get_section_by_title(root, result_titles)

In [180]:
def get_discussion(root):
    """Extract discussion section."""
    discussion_titles = ["discussion", "interpretation", "analysis"]
    return get_section_by_title(root, discussion_titles)

In [181]:
def get_conclusions(root):
    """Extract conclusions section."""
    conclusion_titles = ["conclusion", "conclusions", "summary", "implications", 
                        "future work", "future directions"]
    return get_section_by_title(root, conclusion_titles)

In [182]:
def get_references(root):
    """Extract references."""
    references = []
    
    for ref in root.findall(".//ref"):
        ref_text = "".join(ref.itertext()).strip()
        if ref_text:
            references.append(clean_text(ref_text))
    
    return references

In [183]:
def get_figures_and_tables(root):
    """Extract information about figures and tables."""
    figures = []
    tables = []
    
    # Get figures
    for fig in root.findall(".//fig"):
        fig_info = {}
        
        # Figure ID
        fig_id = fig.get("id")
        fig_info['id'] = fig_id
        
        # Figure caption
        caption = fig.find(".//caption")
        if caption is not None:
            fig_info['caption'] = clean_text("".join(caption.itertext()))
        
        figures.append(fig_info)
    
    # Get tables
    for table in root.findall(".//table-wrap"):
        table_info = {}
        
        # Table ID
        table_id = table.get("id")
        table_info['id'] = table_id
        
        # Table caption
        caption = table.find(".//caption")
        if caption is not None:
            table_info['caption'] = clean_text("".join(caption.itertext()))
        
        tables.append(table_info)
    
    return figures, tables

In [184]:
def get_num_images(xml_root):
    """
    Devuelve el número de imágenes encontradas en un artículo.
    Busca etiquetas <fig> y <graphic>.
    """
    num_figs = len(xml_root.findall(".//fig"))
    num_graphics = len(xml_root.findall(".//graphic"))
    return num_figs + num_graphics

In [185]:
def get_image_links(xml_root):
    """
    Devuelve una lista con los links de las imágenes encontradas en el artículo.
    Busca atributos xlink:href en etiquetas <graphic>.
    """
    ns = {"xlink": "http://www.w3.org/1999/xlink"}  # namespace usado en los XML de PMC
    links = []

    for graphic in xml_root.findall(".//graphic", ns):
        href = graphic.get("{http://www.w3.org/1999/xlink}href")
        if href:
            links.append(href)

    return links


In [186]:
def extract_article_data(xml_file_path):
    """
    Extract comprehensive data from a PMC XML file.
    
    Args:
        xml_file_path (str): Path to the XML file
        
    Returns:
        dict: Dictionary containing all extracted data
    """
    try:
        root = load_xml(xml_file_path)
        
        # Extract all available data
        data = {
            'pmc_id': os.path.basename(xml_file_path).replace('.xml', ''),
            'title': get_title(root),
            'authors': get_authors(root),
            'abstract': get_abstract(root),
            'journal_info': get_journal_info(root),
            'publication_date': get_publication_date(root),
            'doi': get_doi(root),
            'keywords': get_keywords(root),
            'introduction': get_introduction(root),
            'methods': get_methods(root),
            'results': get_results(root),
            'discussion': get_discussion(root),
            'conclusions': get_conclusions(root),
            'references': get_references(root),
            'num_references': len(get_references(root)),
            'extraction_date': datetime.now().isoformat()
        }
        
        # Get figures and tables
        figures, tables = get_figures_and_tables(root)
        data['figures'] = figures
        data['tables'] = tables
        data['num_figures'] = len(figures)
        data['num_tables'] = len(tables)
        
        return data
        
    except Exception as e:
        print(f"Error extracting data from {xml_file_path}: {e}")
        return None

In [187]:
def create_dataframe_from_extracted_data(json_file):
    """
    Create a pandas DataFrame from the extracted JSON data.
    
    Args:
        json_file (str): Path to the JSON file with extracted data
        
    Returns:
        pandas.DataFrame: DataFrame with the extracted data
    """
    import json
    
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # Flatten the data for DataFrame creation
    flattened_data = []
    
    for article in data:
        flat_article = {
            'pmc_id': article.get('pmc_id'),
            'title': article.get('title'),
            'authors': ', '.join(article.get('authors', [])),
            'num_authors': len(article.get('authors', [])),
            'journal_title': article.get('journal_info', {}).get('journal_title'),
            'issn': article.get('journal_info', {}).get('issn'),
            'volume': article.get('journal_info', {}).get('volume'),
            'issue': article.get('journal_info', {}).get('issue'),
            'publication_date': article.get('publication_date'),
            'doi': article.get('doi'),
            'keywords': ', '.join(article.get('keywords', [])),
            'num_keywords': len(article.get('keywords', [])),
            'abstract': article.get('abstract'),
            'abstract_length': len(article.get('abstract', '') or ''),
            'introduction': article.get('introduction'),
            'introduction_length': len(article.get('introduction', '') or ''),
            'methods': article.get('methods'),
            'methods_length': len(article.get('methods', '') or ''),
            'results': article.get('results'),
            'results_length': len(article.get('results', '') or ''),
            'discussion': article.get('discussion'),
            'discussion_length': len(article.get('discussion', '') or ''),
            'conclusions': article.get('conclusions'),
            'conclusions_length': len(article.get('conclusions', '') or ''),
            'num_references': article.get('num_references', 0),
            'num_figures': article.get('num_figures', 0),
            'num_tables': article.get('num_tables', 0),
            'extraction_date': article.get('extraction_date')
        }
        
        flattened_data.append(flat_article)
    
    df = pd.DataFrame(flattened_data)
    return df


In [188]:
def process_all_xmls(xml_folder_path, output_file="extracted_articles_data.json"):
    """
    Process all XML files in a folder and extract comprehensive data.
    
    Args:
        xml_folder_path (str): Path to folder containing XML files
        output_file (str): Output file name for the results
        
    Returns:
        list: List of dictionaries containing extracted data
    """
    import json
    
    all_articles = []
    xml_files = [f for f in os.listdir(xml_folder_path) if f.endswith('.xml')]
    
    print(f"Processing {len(xml_files)} XML files...")
    
    for i, xml_file in enumerate(xml_files, 1):
        xml_path = os.path.join(xml_folder_path, xml_file)
        print(f"Processing {i}/{len(xml_files)}: {xml_file}")
        
        article_data = extract_article_data(xml_path)
        if article_data:
            all_articles.append(article_data)
        
        # Save progress every 50 files
        if i % 50 == 0:
            with open(f"temp_{output_file}", 'w', encoding='utf-8') as f:
                json.dump(all_articles, f, indent=2, ensure_ascii=False)
            print(f"Progress saved: {i} files processed")
    
    # Save final results
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_articles, f, indent=2, ensure_ascii=False)
    
    print(f"Extraction complete! Data saved to {output_file}")
    print(f"Total articles processed: {len(all_articles)}")
    
    return all_articles


# 2. Extract the data

In [189]:
# Download the articles CSV file
!wget https://raw.githubusercontent.com/jgalazka/SB_publications/main/SB_publication_PMC.csv -O SB_publication_PMC.csv

--2025-09-23 19:33:57--  https://raw.githubusercontent.com/jgalazka/SB_publications/main/SB_publication_PMC.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 97057 (95K) [text/plain]
Saving to: ‘SB_publication_PMC.csv’


2025-09-23 19:33:57 (950 KB/s) - ‘SB_publication_PMC.csv’ saved [97057/97057]



In [190]:
df = pd.read_csv("SB_publication_PMC.csv")
# Create a new column with just the PMC ID
df["PMC_ID"] = df["Link"].str.extract(r'(PMC\d+)')
# Guardar el DataFrame en un nuevo CSV
df.to_csv("articles_with_pmcid.csv", index=False)
# Show the dataframe
df.head()

Unnamed: 0,Title,Link,PMC_ID
0,Mice in Bion-M 1 space mission: training and s...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,PMC4136787
1,Microgravity induces pelvic bone loss through ...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,PMC3630201
2,Stem Cell Health and Tissue Regeneration in Mi...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,PMC11988870
3,Microgravity Reduces the Differentiation and R...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,PMC7998608
4,Microgravity validation of a novel system for ...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,PMC5587110


In [191]:
df["PMC_ID"].duplicated().sum()

np.int64(35)

In [192]:
df["PMC_ID"].value_counts()

PMC_ID
PMC12008199    4
PMC5748516     3
PMC8044432     3
PMC11477029    2
PMC10831389    2
              ..
PMC12040010    1
PMC11833055    1
PMC9146534     1
PMC11492218    1
PMC9865768     1
Name: count, Length: 572, dtype: int64

In [193]:
607-35

572

In [194]:
# Create a directory to save XML files
os.makedirs("pmc_xmls", exist_ok=True)

In [195]:
# Setting Email for NCBI Entrez
email = "atauchimamani@gmail.com"

# Download XMLs for each article
for idx, row in df.iterrows():
    pmc_id = row["PMC_ID"]
    if pd.notna(pmc_id):
        filename = os.path.join("pmc_xmls", f"{pmc_id}.xml")
        if not os.path.exists(filename):  # Avoid re-downloading
            print(f"Fetching {pmc_id}...")
            fetch_pmc_xml(pmc_id, filename=filename, email=email)
        else:
            print(f"{pmc_id} already downloaded.")

PMC4136787 already downloaded.
PMC3630201 already downloaded.
PMC11988870 already downloaded.
PMC7998608 already downloaded.
PMC5587110 already downloaded.
PMC8396460 already downloaded.
PMC5666799 already downloaded.
PMC5460236 already downloaded.
PMC6222041 already downloaded.
PMC6813909 already downloaded.
PMC4095884 already downloaded.
PMC3040128 already downloaded.
PMC3177255 already downloaded.
PMC11500582 already downloaded.
PMC5387210 already downloaded.
PMC4642138 already downloaded.
PMC5387210 already downloaded.
PMC2915878 already downloaded.
PMC3901686 already downloaded.
PMC6985101 already downloaded.
PMC6387434 already downloaded.
PMC6371294 already downloaded.
PMC7072278 already downloaded.
PMC8441986 already downloaded.
PMC9400218 already downloaded.
PMC9267413 already downloaded.
PMC9576569 already downloaded.
PMC10789781 already downloaded.
PMC10772081 already downloaded.
PMC11166946 already downloaded.
PMC11166944 already downloaded.
PMC11166968 already downloaded.
P

In [196]:
# Count a xml articles
folder_name = "pmc_xmls"

# Contar los archivos XML en la carpeta
xml_files = [f for f in os.listdir(folder_name) if f.endswith(".xml")]
print(f"Total de archivos XML: {len(xml_files)}")

Total de archivos XML: 572


In [None]:
# Ejemplo de uso:
if __name__ == "__main__":

    # Carpeta donde están los XML
    xml_folder = "pmc_xmls"

    # Archivos de salida
    json_output = "comprehensive_articles_data.json"
    csv_output = "comprehensive_articles_data.csv"

    # Procesar todos los XMLs y guardar JSON y CSV
    all_data = process_all_xmls(xml_folder, json_output)

    # Guardar también en JSON
    with open(json_output, "w", encoding="utf-8") as f:
        json.dump(all_data, f, ensure_ascii=False, indent=2)

    # Crear DataFrame
    df = pd.DataFrame(all_data)

    # Guardar como CSV
    df.to_csv(csv_output, index=False, encoding="utf-8")

    print(f"✅ JSON guardado en: {json_output}")
    print(f"✅ CSV guardado en: {csv_output}")
    print(f"\nDataFrame creado con {len(df)} artículos")
    print(f"Columnas: {list(df.columns)}")

    # Mostrar estadísticas básicas
    print("\n📊 Estadísticas básicas:")
    if "abstract" in df.columns:
        print(f"Articles with abstract: {df['abstract'].notna().sum()}")
    if "introduction" in df.columns:
        print(f"Articles with introduction: {df['introduction'].notna().sum()}")
    if "methods" in df.columns:
        print(f"Articles with methods: {df['methods'].notna().sum()}")
    if "results" in df.columns:
        print(f"Articles with results: {df['results'].notna().sum()}")
    if "discussion" in df.columns:
        print(f"Articles with discussion: {df['discussion'].notna().sum()}")
    if "conclusions" in df.columns:
        print(f"Articles with conclusions: {df['conclusions'].notna().sum()}")
    if "num_references" in df.columns:
        print(f"Average number of references: {df['num_references'].mean():.1f}")
    if "num_figures" in df.columns:
        print(f"Average number of figures: {df['num_figures'].mean():.1f}")
    if "num_tables" in df.columns:
        print(f"Average number of tables: {df['num_tables'].mean():.1f}")


Processing 572 XML files...
Processing 1/572: PMC3570223.xml
Processing 2/572: PMC7610290.xml
Processing 3/572: PMC4035928.xml
Processing 4/572: PMC10715203.xml
Processing 5/572: PMC10472590.xml
Processing 6/572: PMC7555797.xml
Processing 7/572: PMC10390562.xml
Processing 8/572: PMC11053165.xml
Processing 9/572: PMC3748764.xml
Processing 10/572: PMC10926278.xml
Processing 11/572: PMC11166952.xml
Processing 12/572: PMC3005423.xml
Processing 13/572: PMC10025027.xml
Processing 14/572: PMC2824534.xml
Processing 15/572: PMC7667275.xml
Processing 16/572: PMC10503492.xml
Processing 17/572: PMC7516158.xml
Processing 18/572: PMC5666799.xml
Processing 19/572: PMC5761896.xml
Processing 20/572: PMC5460236.xml
Processing 21/572: PMC12034939.xml
Processing 22/572: PMC10370681.xml
Processing 23/572: PMC8739323.xml
Processing 24/572: PMC8220224.xml
Processing 25/572: PMC3774184.xml
Processing 26/572: PMC6597714.xml
Processing 27/572: PMC5052530.xml
Processing 28/572: PMC7787258.xml
Processing 29/572: 

  pub_date = root.find(".//pub-date[@pub-type='epub']") or root.find(".//pub-date")


Processing 35/572: PMC11166655.xml
Processing 36/572: PMC7467030.xml
Processing 37/572: PMC5454470.xml
Processing 38/572: PMC6081456.xml
Processing 39/572: PMC4923109.xml
Processing 40/572: PMC3982735.xml
Processing 41/572: PMC3615599.xml
Processing 42/572: PMC10284894.xml
Processing 43/572: PMC7012842.xml
Processing 44/572: PMC9549344.xml
Processing 45/572: PMC7235020.xml
Processing 46/572: PMC7393961.xml
Processing 47/572: PMC8113475.xml
Processing 48/572: PMC5114340.xml
Processing 49/572: PMC6124165.xml
Processing 50/572: PMC11167039.xml
Progress saved: 50 files processed
Processing 51/572: PMC5896955.xml
Processing 52/572: PMC7503278.xml
Processing 53/572: PMC4118556.xml
Processing 54/572: PMC3962621.xml
Processing 55/572: PMC6599637.xml
Processing 56/572: PMC8274610.xml
Processing 57/572: PMC6590338.xml
Processing 58/572: PMC11999716.xml
Processing 59/572: PMC4228280.xml
Processing 60/572: PMC11451251.xml
Processing 61/572: PMC11579474.xml
Processing 62/572: PMC10410709.xml
Proces

In [None]:
df1 = pd.read_csv("comprehensive_articles_data.csv")
df1["keywords"].head(10)

EmptyDataError: No columns to parse from file