In [23]:
# Python 3.12.0
import requests, sys
from Bio import Entrez
import pandas as pd
import time
import os
import json
import requests, xml.etree.ElementTree as ET

In [None]:
import re

# Input file holding one genus name per line. The content is treated as
# RTF-flavoured text: control groups, control words and stray braces or
# backslashes are stripped before the names are extracted.
file_path = "input_lista_genus.txt"

with open(file_path, "r", encoding="utf-8") as f:
    content = f.read()

# Same pattern as before, only pre-compiled: drops "{\...}" groups,
# "\word123 " control words and any leftover "{", "}" or "\" characters.
clean_text = re.compile(r"{\\.*?}|\\[a-z]+\d* ?|[{}\\]").sub("", content)

lines = clean_text.splitlines()
# Trim every line and keep only those that still contain text.
lines_clean = list(filter(None, map(str.strip, lines)))

print(lines_clean)


['Eubacterium', 'Rickettsiella', 'Lautropia']


In [None]:
def taxid_from_genus(genus):
    """Resolve a genus name to a taxonomy identifier via the ENA taxonomy REST API.

    Queries the ``scientific-name`` endpoint and returns the ``taxId`` of the
    first hit whose ``rank`` field equals ``"genus"``.

    Args:
        genus: Scientific name of the genus (e.g. ``"Eubacterium"``).

    Returns:
        The ``taxId`` value of the matching hit, exactly as it appears in the
        JSON response.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.RequestException: On connection problems or when the
            request exceeds the timeout.
        ValueError: If none of the returned hits has rank ``"genus"``.
    """
    # Function-scope import keeps this cell self-contained.
    from urllib.parse import quote

    base_url = "https://www.ebi.ac.uk/ena/taxonomy/rest/scientific-name/"
    # Percent-encode the name so spaces or reserved characters ("/", "?", "#")
    # cannot alter the URL path.
    r = requests.get(
        base_url + quote(genus, safe=""),
        headers={"Accept": "application/json"},
        timeout=30,  # do not let a stalled connection hang the notebook
    )
    r.raise_for_status()

    for hit in r.json():
        if hit.get("rank") == "genus":
            return hit["taxId"]
    raise ValueError(f"No se encontró el género {genus}")

def species_from_genus(genus):
    """List the species-rank taxa found in the ENA XML record of a genus.

    The genus is first resolved to a taxonomy id with ``taxid_from_genus``;
    the ENA browser XML for that id is then downloaded and every ``taxon``
    element whose rank is ``"species"`` is collected.

    Args:
        genus: Scientific name of the genus.

    Returns:
        A list of dicts with the keys ``"taxId"`` and ``"name"``, one per
        species-rank ``taxon`` element. Each value is read from the element's
        attribute, falling back to a child element of the same name.

    Raises:
        ValueError: Propagated from ``taxid_from_genus`` when the genus is
            not found.
        requests.HTTPError: If the XML endpoint answers with an error status.
        requests.RequestException: On connection problems or timeout.
        xml.etree.ElementTree.ParseError: If the body is not well-formed XML.
    """
    taxid = taxid_from_genus(genus)
    response = requests.get(
        f"https://www.ebi.ac.uk/ena/browser/api/xml/{taxid}",
        timeout=60,  # do not let a stalled connection hang the notebook
    )
    # Fail with a clear HTTP error instead of handing an error page to the
    # XML parser, which would only produce an opaque ParseError.
    response.raise_for_status()
    root = ET.fromstring(response.content)

    species = []
    for tax in root.findall(".//taxon"):
        # Values may be carried as attributes or as child elements.
        rank = tax.get("rank") or tax.findtext("rank")
        if rank == "species":
            species.append({
                "taxId": tax.get("taxId") or tax.findtext("taxId"),
                "name": tax.get("scientificName") or tax.findtext("scientificName"),
            })
    return species

def get_pmids(taxon, specie, num_salida, date_from="1950/01/01", date_to="2024/12/31"):
    """Search PubMed for records matching two terms inside a creation-date window.

    The search term is built as ``"<taxon> AND <specie> AND <date range>"``
    and sent to ``Entrez.esearch`` against the ``pubmed`` database.

    Args:
        taxon: First search term (the caller passes a species name).
        specie: Second search term, combined with ``taxon`` through ``AND``.
        num_salida: Value forwarded as ``retmax`` (maximum number of ids
            requested).
        date_from: Lower bound of the ``[Date - Create]`` range, formatted
            ``YYYY/MM/DD``. Defaults to the previously hard-coded value.
        date_to: Upper bound of the ``[Date - Create]`` range, formatted
            ``YYYY/MM/DD``. Defaults to the previously hard-coded value.

    Returns:
        The ``"IdList"`` entry of the parsed esearch result.
    """
    Entrez.email = 'fernando.lucas@um.es'

    date_range = f'("{date_from}"[Date - Create] : "{date_to}"[Date - Create])'
    full_query = f"{taxon} AND {specie}" + ' AND ' + date_range

    search_handle = Entrez.esearch(db="pubmed",
                                   term=full_query,
                                   retmax=num_salida)
    try:
        search_results = Entrez.read(search_handle)
    finally:
        # Release the connection even when parsing the response fails.
        search_handle.close()

    return search_results["IdList"]

def fetch_article_details(pmids):
    """Fetch title, authors and abstract for a list of PubMed ids.

    Args:
        pmids: Sequence of PMID strings. An empty sequence returns ``[]``
            without contacting the service.

    Returns:
        A list with one dict per entry under ``"PubmedArticle"`` in the parsed
        efetch result. Each dict has the keys:

        * ``"title"``: the ``ArticleTitle`` value, or ``""`` when absent.
        * ``"authors"``: one string per ``AuthorList`` entry, built as
          ``"ForeName LastName"``; entries with neither field fall back to
          their ``CollectiveName`` (``""`` if that is missing too).
        * ``"abstract"``: the ``AbstractText`` parts joined by single spaces,
          or ``""`` when absent.
    """
    if not pmids:
        # An efetch call without ids has nothing to return.
        return []

    handle = Entrez.efetch(db="pubmed",
                           id=",".join(pmids),
                           rettype="medline",
                           retmode="xml")
    try:
        records = Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()

    articles = []
    for record in records['PubmedArticle']:
        article_data = record.get("MedlineCitation", {}).get("Article", {})

        authors = []
        for author in article_data.get("AuthorList", []):
            name_parts = (author.get('ForeName', ''), author.get('LastName', ''))
            full_name = ' '.join(part for part in name_parts if part)
            # Group authors carry no personal name fields.
            authors.append(full_name or author.get('CollectiveName', ''))

        abstract_parts = article_data.get("Abstract", {}).get("AbstractText", [])
        if isinstance(abstract_parts, str):
            # Guard against a single string so it is not joined per character.
            abstract_parts = [abstract_parts]

        articles.append({
            "title": article_data.get("ArticleTitle", ""),
            "authors": authors,
            # Separate the parts with a space so that the sections of a
            # structured abstract are not glued together word-to-word.
            "abstract": " ".join(abstract_parts),
        })
    return articles

def get_genes_pubtator(pmids):
    """Collect the gene annotations PubTator3 reports for a batch of PMIDs.

    Calls the PubTator3 ``biocjson`` export endpoint with ``full=true`` and
    walks every passage of every returned document, keeping the annotations
    whose ``infons.type`` equals ``"Gene"``.

    Args:
        pmids: Sequence of PMID strings. An empty sequence returns an empty
            frame without contacting the service.

    Returns:
        A ``pandas.DataFrame`` with the columns ``PMID``, ``Gene_ID``,
        ``Gene_Name`` and ``Section``, one row per gene annotation. The frame
        is empty, with the same columns, when there are no gene annotations,
        when the service answers with a non-200 status, or when the body is
        not valid JSON. A DataFrame is returned on every path so callers can
        always test ``.empty``.
    """
    columns = ["PMID", "Gene_ID", "Gene_Name", "Section"]
    if not pmids:
        return pd.DataFrame(columns=columns)

    url = "https://www.ncbi.nlm.nih.gov/research/pubtator3-api/publications/export/biocjson"
    params = {
        "pmids": ",".join(pmids),
        "full": "true"
    }

    response = requests.get(url, params=params, timeout=60)
    # Pause after every request so consecutive batches are spaced out.
    time.sleep(1)

    if response.status_code != 200:
        print(f"Error al recuperar datos: {response.status_code}")
        # Same type as the success path; the previous tuple return broke
        # callers that test `.empty` on the result.
        return pd.DataFrame(columns=columns)

    try:
        data = response.json()
    except ValueError:
        # JSON decoding errors derive from ValueError; treat an unparsable
        # body like any other failed retrieval.
        print("Error al recuperar datos: respuesta JSON no válida")
        return pd.DataFrame(columns=columns)

    # NOTE(review): the original defined a set of sections of interest
    # (TITLE, ABSTRACT, INTRO, METHODS, RESULTS, TABLE, FIG) but never applied
    # it. Annotations from every section are still kept, as before; confirm
    # whether filtering was intended.
    genes_extraidos = []

    for articulo in data.get("PubTator3", []):
        # Keep only the part of "_id" that precedes the first "|".
        pmid = articulo.get("_id", "Desconocido").split("|")[0]

        for passage in articulo.get("passages", []):
            section_type = passage.get("infons", {}).get("section_type", "").upper()

            for annotation in passage.get("annotations", []):
                infons = annotation.get("infons", {})
                if infons.get("type") == "Gene":
                    genes_extraidos.append({
                        "PMID": pmid,
                        "Gene_ID": infons.get("identifier"),
                        "Gene_Name": infons.get("name"),
                        "Section": section_type
                    })

    return pd.DataFrame(genes_extraidos, columns=columns)



In [None]:
# Upper bound requested from PubMed for each species query.
MAX_PMIDS_PER_QUERY = 100000
# Number of PMIDs sent to PubTator in a single request.
PUBTATOR_BATCH_SIZE = 100
# Second search term combined with every species name.
HUMAN_FILTER = '(Human OR Homo sapiens)'

results_list = []

for tax in lines_clean:

    print("Genus: ", tax)
    start_time = time.perf_counter()

    species = species_from_genus(tax)
    names_species = [sp["name"] for sp in species]

    for sp in names_species:

        pmids = get_pmids(sp, HUMAN_FILTER, MAX_PMIDS_PER_QUERY)

        # Walking the list in fixed-size slices covers short and long lists
        # alike; an empty list simply produces no iterations.
        for i in range(0, len(pmids), PUBTATOR_BATCH_SIZE):
            batch_pmids = pmids[i:i + PUBTATOR_BATCH_SIZE]
            df_genes = get_genes_pubtator(batch_pmids)

            if not df_genes.empty:
                # Tag every gene row with the species it was found for.
                df_genes["Taxon"] = sp
                results_list.append(df_genes)

    # Per-genus timing, matching the lines shown in the recorded output.
    elapsed = time.perf_counter() - start_time
    print(f"Tiempo transcurrido para {tax}: {elapsed:.2f} segundos")

# Build the final table once; concatenating an empty list would raise.
df_result = pd.concat(results_list, ignore_index=True) if results_list else pd.DataFrame()


Genus:  Eubacterium
Tiempo transcurrido para Eubacterium: 273.94 segundos
Genus:  Rickettsiella
Tiempo transcurrido para Rickettsiella: 11.38 segundos
Genus:  Lautropia
Tiempo transcurrido para Lautropia: 5.16 segundos
