In [1]:
from metapub import PubMedFetcher
from dotenv import load_dotenv
import os
from utils import peep
import tqdm
import time

load_dotenv()

fetcher = PubMedFetcher(cachedir="./.cache/")


In [2]:
SEED_PMID = 31875792


In [3]:
class Article:
    def __init__(self, pmid: str | int, include_bib: bool = False):
        if isinstance(pmid, int):
            pmid = str(pmid)

        if include_bib:
            article = fetcher.article_by_pmid(pmid)
            self.bib = {
                key: getattr(article, key)
                for key in [
                    "authors_str",
                    "title",
                    "doi",
                    "year",
                    "journal",
                    "keywords",
                    "abstract",
                ]
            }

        related = fetcher.related_pmids(pmid)
        self.related = {key: related.get(key, []) for key in ["pubmed", "citedin", "refs"]}


seed_articles = [Article(SEED_PMID, include_bib=True)]


In [4]:
def combine_pmids(articles: list[Article], topk: int = 10) -> set[str]:
    pmid_set = set()
    for article in articles:
        pmid_set |= set(article.related["pubmed"][:topk])
    return pmid_set

In [8]:
def extend_pmid_set(pmid_set: set[str], topk: int = 10, max_retries: int = 3) -> list[Article]:
    extended_articles = []

    for pmid in tqdm.tqdm(pmid_set):
        for attempt in range(max_retries):
            try:
                article = Article(pmid)
                time.sleep(1)
                extended_articles.append(article)
                break
            except Exception as e:
                if attempt == max_retries - 1:
                    print(f"Failed to fetch article {pmid} after {max_retries} attempts: {str(e)}")
                else:
                    time.sleep(1)  # Wait 1 second before retrying

    pmid_set = set()
    for article in extended_articles:
        pmid_set |= set(article.related["pubmed"][:topk])

    if len(pmid_set) < 20 and pmid_set:  # Check if we have any articles
        try:
            print(f"Length of extended_articles: {len(pmid_set)}")
            return extend_pmid_set(pmid_set, topk)
        except Exception as e:
            print(f"Stopping recursion due to error: {str(e)}")
            return pmid_set

    return pmid_set


# Usage
final_articles = extend_pmid_set(seed_articles, topk=10)


100%|██████████| 1/1 [00:04<00:00,  4.10s/it]

Failed to fetch article <__main__.Article object at 0x12104d400> after 3 attempts: Error parsing response object from NCBI: OK (200): NCBI C++ Exception:
    Error: TXCLIENT(CException::eUnknown) "/pubmed_gen/rbuild/version/20240724/entrez/2.19/src/internal/txclient/TxClient.cpp", line 1045: ncbi::CTxRawClientImpl::readAll() --- Read failed: EOF (the other side has unexpectedly closed connection), peer: 130.14.18.59:8064






In [None]:
len(combine_pmids(final_articles, topk=10))

In [None]:
seed_articles