# 0. Libraries

In [33]:
import numpy as np
import pandas as pd
from Bio import Entrez
import xml.etree.ElementTree as ET

# 1. Helper functions

In [34]:
# Return a PMC article XML given its ID
def fetch_pmc_xml(pmc_id, filename=None, email="your_email@example.com"):
    """
    Download the full-text XML of a PMC article given its ID.

    This is a best-effort helper: every failure (bad ID, network error,
    decoding error, file write error) is reported with ``print`` and
    signalled by a ``None`` return value rather than an exception, so callers
    must check the result before parsing it.

    Args:
        pmc_id (str | int): The PMC ID, e.g. "PMC4136787" or just "4136787".
            Surrounding whitespace and the letter case of the "PMC" prefix
            are ignored.
        filename (str): Optional, file name to save the XML (written as UTF-8).
        email (str): Your email (required by NCBI).

    Returns:
        str | None: The XML content of the article as string, or None when
        anything went wrong.
    """
    # Set email for NCBI (required)
    Entrez.email = email

    try:
        # Normalise the ID inside the try block so that an unusable ID is
        # reported like any other failure instead of escaping the handler.
        if pmc_id is None:
            raise ValueError("no PMC ID given")
        clean_id = str(pmc_id).strip()
        if clean_id.upper().startswith("PMC"):
            clean_id = clean_id[3:].strip()
        if not clean_id:
            raise ValueError("empty PMC ID")

        # Fetch XML data
        handle = Entrez.efetch(db="pmc", id=clean_id, rettype="full", retmode="xml")
        try:
            xml_data = handle.read()
        finally:
            # Always release the connection, even when read() fails midway.
            handle.close()

        # Convert bytes to string
        if isinstance(xml_data, bytes):
            xml_content = xml_data.decode('utf-8')
        else:
            xml_content = xml_data

        # Save to file if filename provided
        if filename:
            with open(filename, "w", encoding="utf-8") as f:
                f.write(xml_content)
            print(f"XML saved to {filename}")

        return xml_content

    except Exception as e:
        # Deliberately broad: the function's contract is "print and return None".
        print(f"Error fetching PMC ID {pmc_id}: {e}")
        return None

In [35]:
def load_xml(xml_content_or_file):
    """
    Load XML content (string or file path) and return its root element.

    The argument is first parsed as XML text. If that fails and the argument
    does not look like markup, it is treated as a path and the file is parsed
    instead.

    Args:
        xml_content_or_file (str | bytes): XML text, or the path of an XML file.

    Returns:
        xml.etree.ElementTree.Element: Root element of the parsed document.

    Raises:
        TypeError: If the argument is None (e.g. a failed download).
        xml.etree.ElementTree.ParseError: If the XML text or the file content
            is not well-formed.
        OSError: If the argument is treated as a path and cannot be opened.
    """
    if xml_content_or_file is None:
        raise TypeError(
            "load_xml() got None instead of XML content or a file path "
            "(did the download fail?)"
        )

    try:
        # Try parsing from string
        return ET.fromstring(xml_content_or_file)
    except ET.ParseError:
        # Content that starts with '<' is broken XML, not a path: surface the
        # real parse error instead of a misleading "file not found".
        first_char = xml_content_or_file.lstrip()[:1]
        markup_start = "<" if isinstance(first_char, str) else b"<"
        if first_char == markup_start:
            raise

    # Otherwise parse from file
    return ET.parse(xml_content_or_file).getroot()

In [36]:
def get_authors(root):
    """
    Extract list of authors.

    Looks at every ``<contrib contrib-type="author">`` below ``root`` and
    builds "given-names surname" from its ``<name>`` child. Missing or empty
    name parts are skipped, so an author with only a surname is still
    returned. Contributors without any usable ``<name>`` text are omitted.

    Args:
        root (xml.etree.ElementTree.Element): Parsed article (or any ancestor
            of the contributor elements).

    Returns:
        list[str]: Author names in document order; empty list if none found.
    """
    def _clean_text(elem):
        # Full text of the element (including nested inline markup) with
        # whitespace collapsed; "" when the element is absent or empty.
        if elem is None:
            return ""
        return " ".join("".join(elem.itertext()).split())

    authors = []
    for contrib in root.findall(".//contrib[@contrib-type='author']"):
        given = _clean_text(contrib.find("name/given-names"))
        surname = _clean_text(contrib.find("name/surname"))
        full_name = " ".join(part for part in (given, surname) if part)
        if full_name:
            authors.append(full_name)
    return authors

In [37]:
def get_title(root):
    """
    Extract article title.

    The article's own title (inside ``<title-group>``) is preferred; if there
    is none, the first ``<article-title>`` found anywhere below ``root`` is
    used. Text inside inline markup such as ``<italic>`` is kept, and runs of
    whitespace are collapsed to single spaces.

    Args:
        root (xml.etree.ElementTree.Element): Parsed article.

    Returns:
        str | None: The title text, or None if no title element exists or it
        is empty.
    """
    title_elem = root.find(".//title-group/article-title")
    if title_elem is None:
        title_elem = root.find(".//article-title")
    if title_elem is None:
        return None

    # itertext() walks nested elements too; .text alone would stop at the
    # first child tag and truncate titles that contain inline formatting.
    title = " ".join("".join(title_elem.itertext()).split())
    return title or None

In [38]:
def get_abstract(root):
    """
    Extract abstract text.

    Collects every ``<p>`` element nested anywhere inside an ``<abstract>``,
    flattens each paragraph (including text inside child elements) and joins
    the paragraphs with a single space.

    Returns:
        str | None: The combined abstract text, or None when no abstract
        paragraph is present.
    """
    paragraphs = root.findall(".//abstract//p")
    if not paragraphs:
        return None

    flattened = []
    for paragraph in paragraphs:
        # itertext() yields the paragraph's own text plus that of any nested tags.
        flattened.append("".join(paragraph.itertext()))
    return " ".join(flattened)


# 2. Extract the data

In [39]:
# Download the articles CSV file from the jgalazka/SB_publications GitHub repository.
# The leading `!` hands the line to the system shell, so `wget` must be available on the host.
# -O writes to a fixed local file name; re-running the cell replaces the earlier copy.
!wget https://raw.githubusercontent.com/jgalazka/SB_publications/main/SB_publication_PMC.csv -O SB_publication_PMC.csv

--2025-09-19 23:12:57--  https://raw.githubusercontent.com/jgalazka/SB_publications/main/SB_publication_PMC.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8003::154, 2606:50c0:8001::154, 2606:50c0:8002::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8003::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 97057 (95K) [text/plain]
Saving to: ‘SB_publication_PMC.csv’


2025-09-19 23:12:58 (663 KB/s) - ‘SB_publication_PMC.csv’ saved [97057/97057]



In [40]:
# Load the publication list written by the download cell above. The path is relative,
# so the notebook's working directory must be the one the CSV was saved into.
df = pd.read_csv("SB_publication_PMC.csv")
# Bare last expression: the notebook renders the first rows as the cell output.
df.head()

Unnamed: 0,Title,Link
0,Mice in Bion-M 1 space mission: training and s...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
1,Microgravity induces pelvic bone loss through ...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...
2,Stem Cell Health and Tissue Regeneration in Mi...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
3,Microgravity Reduces the Differentiation and R...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...
4,Microgravity validation of a novel system for ...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...


In [44]:
# Setting Email for NCBI Entrez
# NOTE(review): a personal address is hardcoded here; consider reading it from
# an environment variable before sharing the notebook.
email = "atauchimamani@gmail.com"

# Example PMC ID
pmc_id = "PMC4136787"

# Step 1: Fetch XML
xml_content = fetch_pmc_xml(pmc_id, filename=f"{pmc_id}.xml", email=email)
# fetch_pmc_xml signals failure by returning None (it only prints the error),
# so stop here instead of announcing success and crashing later in the parser.
if xml_content is None:
    raise RuntimeError(f"Download of {pmc_id} failed; see the error message printed above.")
print("XML downloaded!\n")

# Step 2: Parse it
root = load_xml(xml_content)

# Step 3: Extract metadata
title = get_title(root)
authors = get_authors(root)
abstract = get_abstract(root)

print("Title:", title)
print("Authors:", ", ".join(authors))
print("Abstract:", abstract)

XML saved to PMC4136787.xml
XML downloaded!

Title: Mice in Bion-M 1 Space Mission: Training and Selection
Authors: Alexander Andreev-Andrievskiy, Anfisa Popova, Richard Boyle, Jeffrey Alberts, Boris Shenkman, Olga Vinogradova, Oleg Dolgov, Konstantin Anokhin, Darya Tsvirkun, Pavel Soldatov, Tatyana Nemirovskaya, Eugeniy Ilyin, Vladimir Sychev
Abstract: After a 16-year hiatus, Russia has resumed its program of biomedical research in space, with the successful 30-day flight of the Bion-M 1 biosatellite (April 19–May 19, 2013). The principal species for biomedical research in this project was the mouse. This paper presents an overview of the scientific goals, the experimental design and the mouse training/selection program. The aim of mice experiments in the Bion-M 1 project was to elucidate cellular and molecular mechanisms, underlying the adaptation of key physiological systems to long-term exposure in microgravity. The studies with mice combined in vivo measurements, both in flight an