In [90]:
import requests
from datetime import datetime, timedelta
import xml.etree.ElementTree as ET
import logging

logging.basicConfig(level=logging.INFO)

from tqdm.contrib.concurrent import (
    process_map,
)  # Import process_map for multiprocessing


def fetch_citation_count(pmid):
    """
    Fetch citation count for a given PMID.
    """
    elink_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
    params = {
        "dbfrom": "pubmed",
        "db": "pubmed",
        "id": pmid,
        "linkname": "pubmed_pubmed_citedin",  # Link to citing articles
        "retmode": "xml",
    }

    response = requests.get(elink_url, params=params)
    if response.status_code == 200:
        root = ET.fromstring(response.content)
        citation_count = len(root.findall(".//LinkSetDb/Link"))
        return citation_count
    return 0  # Return 0 if unable to fetch citation count


def fetch_citations_for_papers(papers):
    """
    Fetch citation counts for a list of papers in parallel.
    """
    pmids = [paper["pmid"] for paper in papers]
    citation_counts = process_map(fetch_citation_count, pmids, max_workers=10)
    for paper, citation_count in zip(papers, citation_counts):
        paper["citations"] = citation_count
    return papers


def fetch_recent_papers(query, days_ago=90, n_articles=10000):
    """
    Fetch recent papers from PubMed based on a query and include citation counts.
    """
    # Calculate date range
    end_date = datetime.today()
    start_date = end_date - timedelta(days=days_ago)
    start_date_str = start_date.strftime("%Y/%m/%d")
    end_date_str = end_date.strftime("%Y/%m/%d")

    # PubMed API endpoint
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": f'({query}) AND ("{start_date_str}"[Date - Publication] : "{end_date_str}"[Date - Publication])',
        "retmax": n_articles,
        "usehistory": "y",
    }

    response = requests.get(base_url, params=params)
    if response.status_code == 200:
        root = ET.fromstring(response.content)
        webenv = root.find(".//WebEnv").text
        query_key = root.find(".//QueryKey").text

        # Fetch actual paper details using efetch
        fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
        fetch_params = {
            "db": "pubmed",
            "query_key": query_key,
            "WebEnv": webenv,
            "retmode": "xml",
            "rettype": "abstract",
            "retmax": n_articles,
        }
        fetch_response = requests.get(fetch_url, params=fetch_params)
        if fetch_response.status_code == 200:
            fetch_root = ET.fromstring(fetch_response.content)
            papers = []
            for article in fetch_root.findall(".//PubmedArticle"):
                title = article.find(".//ArticleTitle").text
                pmid = article.find(".//PMID").text
                link = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"

                # Extract keywords
                keyword_list = article.find(".//KeywordList")

                keywords = (
                    [keyword.text for keyword in keyword_list.findall(".//Keyword")]
                    if keyword_list is not None
                    else []
                )
                papers.append(
                    {"title": title, "pmid": pmid, "link": link, "keywords": keywords}
                )
            return papers
    return "Failed to fetch papers or no papers found."


# Example usage
journals = [
    "Bioinformatics",
    "BMC Bioinformatics",
    "Nucleic Acids Research",
    "Genome Research",
    "Genome Biology",
    "PLOS Computational Biology",
    "Journal of Computational Biology",
    "Journal of Biomedical Informatics",
    "Journal of Cheminformatics",
    "Molecular Informatics",
    "Journal of Chemical Information and Modeling",
    "Nature Machine Intelligence",
    "IEEE/ACM Transactions on Computational Biology and Bioinformatics",
    "Computational and Structural Biotechnology Journal",
    "Briefings in Bioinformatics",
    "Systems Biology and Applications",
    "BioSystems",
    "Algorithms for Molecular Biology",
    "BMC Genomics",
    "BMC Systems Biology",
    "BMC Evolutionary Biology",
    "BioData Mining",
    "PLOS Genetics",
    "Frontiers in Genetics",  # Bioinformatics and Computational Biology section
    "Journal of Proteome Research",
    "Proteins: Structure, Function, and Bioinformatics",
    "Molecular Systems Biology",
    "Cell Systems",
    "GigaScience",
    "Nature Biomedical Engineering",  # While broader, it includes significant bioinformatics research.
    "Patterns (Cell Press)",  # Focuses on data science that impacts science and society, including bioinformatics.
]


# Constructing the query part for journals
journal_queries = ['"' + journal + '"[Journal]' for journal in journals]
joined_journal_query = " OR ".join(journal_queries)

query = 'chemoinformatics OR "AI biomedicine"'
papers = fetch_recent_papers(joined_journal_query)


logging.info(f"Nº of papers: {len(papers)}")

INFO:root:Nº of papers: 2676


In [3]:
import requests
from datetime import datetime, timedelta
import xml.etree.ElementTree as ET
import logging

In [91]:
logging.info(f"Nº of papers: {len(papers)}")

INFO:root:Nº of papers: 2676


In [92]:
papers = fetch_citations_for_papers(papers)

  citation_counts = process_map(fetch_citation_count, pmids, max_workers=10)
100%|██████████| 2676/2676 [02:20<00:00, 19.09it/s]


In [93]:
logging.info(
    f"Nº of papers w/ citations in less than 3 months: {len([p for p in papers if p['citations'] > 0])}"
)

INFO:root:Nº of papers w/ citations in less than 3 months: 45


In [94]:
[p for p in papers if p["citations"] > 0]

[{'title': 'Counterfactual formulation of patient-specific root causes of disease.',
  'pmid': '38191012',
  'link': 'https://pubmed.ncbi.nlm.nih.gov/38191012/',
  'keywords': ['Causal discovery',
   'Causal inference',
   'Computational medicine',
   'Precision medicine',
   'Root cause analysis'],
  'citations': 1},
 {'title': 'Structural coverage of the human interactome.',
  'pmid': '38180828',
  'link': 'https://pubmed.ncbi.nlm.nih.gov/38180828/',
  'keywords': ['AlphaFold2',
   'PDB',
   'homology modeling databases',
   'human interactome',
   'human proteome',
   'protein complexes',
   'structural coverage'],
  'citations': 1},
 {'title': 'Relative molecule self-attention transformer.',
  'pmid': '38173009',
  'link': 'https://pubmed.ncbi.nlm.nih.gov/38173009/',
  'keywords': ['Molecular property prediction',
   'Molecular self-attention',
   'Neural networks pre-training'],
  'citations': 2},
 {'title': 'Cohesin and CTCF do not assemble TADs in ',
  'pmid': '38129077',
  'lin

In [2]:
import tweepy

# Replace these with your Twitter API credentials
consumer_key = "YOUR_CONSUMER_KEY"
consumer_secret = "YOUR_CONSUMER_SECRET"
access_token = "1701547786188054528-FFAWFV8sikP81RXmUJN8TbtMaL3x52"
access_token_secret = "BX4rQpWpn4lLIh0S47xVYQDa17wjR79laS2nOTqqxKKQY"

# Authenticate to Twitter
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# Create API object
api = tweepy.API(auth)