# 0. Libraries

In [None]:
import numpy as np
import pandas as pd
from Bio import Entrez
import xml.etree.ElementTree as ET
import os
import re
from datetime import datetime
from typing import List, Dict, Optional, Tuple
import requests
from urllib.parse import urljoin, urlparse
import time
import json

# 1. Modules

# 2. Extract the data

In [None]:
# Download the articles CSV file
!wget https://raw.githubusercontent.com/jgalazka/SB_publications/main/SB_publication_PMC.csv -O SB_publication_PMC.csv

--2025-09-24 14:31:03--  https://raw.githubusercontent.com/jgalazka/SB_publications/main/SB_publication_PMC.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8002::154, 2606:50c0:8001::154, 2606:50c0:8000::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8002::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 97057 (95K) [text/plain]
Saving to: ‘SB_publication_PMC.csv’


2025-09-24 14:31:03 (1.56 MB/s) - ‘SB_publication_PMC.csv’ saved [97057/97057]



In [None]:
df = pd.read_csv("SB_publication_PMC.csv")
# Create a new column with just the PMC ID
df["PMC_ID"] = df["Link"].str.extract(r'(PMC\d+)')
# Guardar el DataFrame en un nuevo CSV
df.to_csv("articles_with_pmcid.csv", index=False)
# Show the dataframe
df.head()

Unnamed: 0,Title,Link,PMC_ID
0,Mice in Bion-M 1 space mission: training and s...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,PMC4136787
1,Microgravity induces pelvic bone loss through ...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,PMC3630201
2,Stem Cell Health and Tissue Regeneration in Mi...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,PMC11988870
3,Microgravity Reduces the Differentiation and R...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,PMC7998608
4,Microgravity validation of a novel system for ...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,PMC5587110


In [192]:
df["PMC_ID"].value_counts()

PMC_ID
PMC12008199    4
PMC5748516     3
PMC8044432     3
PMC11477029    2
PMC10831389    2
              ..
PMC12040010    1
PMC11833055    1
PMC9146534     1
PMC11492218    1
PMC9865768     1
Name: count, Length: 572, dtype: int64

In [194]:
# Create a directory to save XML files
os.makedirs("pmc_xmls", exist_ok=True)

In [195]:
# Setting Email for NCBI Entrez
email = "atauchimamani@gmail.com"

# Download XMLs for each article
for idx, row in df.iterrows():
    pmc_id = row["PMC_ID"]
    if pd.notna(pmc_id):
        filename = os.path.join("pmc_xmls", f"{pmc_id}.xml")
        if not os.path.exists(filename):  # Avoid re-downloading
            print(f"Fetching {pmc_id}...")
            fetch_pmc_xml(pmc_id, filename=filename, email=email)
        else:
            print(f"{pmc_id} already downloaded.")

PMC4136787 already downloaded.
PMC3630201 already downloaded.
PMC11988870 already downloaded.
PMC7998608 already downloaded.
PMC5587110 already downloaded.
PMC8396460 already downloaded.
PMC5666799 already downloaded.
PMC5460236 already downloaded.
PMC6222041 already downloaded.
PMC6813909 already downloaded.
PMC4095884 already downloaded.
PMC3040128 already downloaded.
PMC3177255 already downloaded.
PMC11500582 already downloaded.
PMC5387210 already downloaded.
PMC4642138 already downloaded.
PMC5387210 already downloaded.
PMC2915878 already downloaded.
PMC3901686 already downloaded.
PMC6985101 already downloaded.
PMC6387434 already downloaded.
PMC6371294 already downloaded.
PMC7072278 already downloaded.
PMC8441986 already downloaded.
PMC9400218 already downloaded.
PMC9267413 already downloaded.
PMC9576569 already downloaded.
PMC10789781 already downloaded.
PMC10772081 already downloaded.
PMC11166946 already downloaded.
PMC11166944 already downloaded.
PMC11166968 already downloaded.
P