In [3]:
import altair as alt
import pandas as pd
from vega_datasets import data

# Country outlines: the 'countries' feature of the world-110m TopoJSON file
countries = alt.topo_feature(data.world_110m.url, 'countries')

# Values to plot, one row per country, keyed by ISO numeric code
source = pd.DataFrame(
    [
        (840, 'United States', 300),
        (124, 'Canada', 100),
        (356, 'India', 1200),
    ],
    columns=['id', 'country', 'value'],
)

# Join `value` onto each shape by id, then replace a null `value`
# (shapes with no row in `source`) with 0 before colouring.
value_lookup = alt.LookupData(data=source, key='id', fields=['value'])

choropleth = (
    alt.Chart(countries)
    .mark_geoshape()
    .transform_lookup(lookup='id', from_=value_lookup)
    .transform_calculate(value="datum.value !== null ? datum.value : 0")
    .encode(color='value:Q')
    .project(type='naturalEarth1')
    .properties(width=800, height=400)
)

choropleth


In [None]:

from src.api_utils import ilove_access, cochrane_access, medline_class_access, ovid_new_access

# Run one PubMed-syntax query through the project helper `medline_class_access`
# (imported from src.api_utils; its behaviour is not visible in this notebook).
# `searchText` is passed a list containing a single query string made of:
#   1. review publication types combined with search-method / pooling terms, OR
#   2. meta-analysis / systematic-review terms,
#   3. AND immunization / vaccine terms (MeSH and title/abstract),
#   4. AND humans[filter],
#   5. AND an [edat] range from 2011 to 3000.
# NOTE(review): the top-level "OR ... AND ... AND" chain has no enclosing
# parentheses, so grouping depends on how PubMed orders Boolean operators;
# confirm it evaluates as (1 OR 2) AND 3 AND 4 AND 5.
# NOTE(review): the recorded output of this cell reports 0 IDs for this query;
# presumably the helper alters or mis-encodes the multi-line string. Verify.
medline_class_access(searchText=["""
(
  (review[pt] OR "review, tutorial"[pt] OR "review, academic"[pt])
  AND 
  (
    medline[tw] OR medlars[tw] OR embase[tw] OR pubmed[tw] OR cochrane[tw]
    OR scisearch[tw] OR psychinfo[tw] OR psycinfo[tw]
    OR psychlit[tw] OR psyclit[tw] 
    OR cinahl[tw] 
    OR ((hand[tw] AND search*[tw]) OR (manual*[tw] AND search*[tw]))
    OR ("electronic database*"[tw] OR "bibliographic database*"[tw] OR "computerized database*"[tw] OR "online database*"[tw])
    OR pooling[tw] OR pooled[tw] OR "mantel haenszel"[tw]
    OR peto[tw] OR dersimonian[tw] OR "der simonian"[tw] OR "fixed effect"[tw]
    OR "retraction of publication"[pt] OR "retracted publication"[pt]
  )
)
OR
(
  meta-analysis[pt] 
  OR meta-analysis[sh] 
  OR (meta-analys*[tw] OR meta analys*[tw] OR metaanalys*[tw])
  OR (systematic*[tw] AND review*[tw])
  OR (quantitative*[tw] AND review*[tw])
  OR (methodologic*[tw] AND review*[tw])
  OR ("integrative research review"[tw] OR "research integration"[tw])
)
AND
(
  immunization[mesh] 
  OR Immunization Programs[mesh] 
  OR vaccines[mesh]
  OR (immunisation[tiab] OR immunization[tiab] OR immunise[tiab] OR immunize[tiab] OR vaccine[tiab])
)
AND humans[filter]
AND 
("2011"[edat] : "3000"[edat])
"""])

Total IDs retrieved for query: 0
No results found for query.
No data to save.


In [6]:
from Bio import Entrez, Medline
import pandas as pd
from io import StringIO
import time
from tqdm import tqdm

class MedlineFetcher:
    """Search PubMed, download MEDLINE records and export the review-type ones.

    The pipeline is: ``search_pubmed`` (IDs) -> ``fetch_medline_records``
    (parsed MEDLINE dicts) -> ``process_records`` (filtered DataFrame) ->
    CSV file, all driven by ``run``.
    """

    # Columns built from each MEDLINE record, in output order. Declared once so
    # an empty record list still yields a frame with the expected columns.
    _COLUMNS = (
        "pmid", "title", "abstract", "authors", "publication_date", "journal",
        "country", "language", "mesh_terms", "publication_type", "doi",
    )

    def __init__(self, email, api_key=None):
        """Configure Biopython's Entrez module.

        Args:
            email: Contact address assigned to the module-level ``Entrez.email``.
            api_key: Optional key assigned to ``Entrez.api_key`` when truthy.

        Note:
            These are module-level settings, so they affect every other user of
            ``Bio.Entrez`` in the same process.
        """
        Entrez.email = email
        if api_key:
            Entrez.api_key = api_key

    def search_pubmed(self, query, retmax=10000):
        """Search PubMed with the given query and return all matching IDs.

        Pages through ESearch ``retmax`` IDs at a time, pausing 0.5 s between
        pages. Any exception is printed and ends the paging; the IDs collected
        up to that point are still returned.

        NOTE(review): NCBI documents that PubMed ESearch only serves the first
        10,000 matches, so pages beyond that are expected to raise and end the
        loop here. Confirm against the E-utilities documentation.

        Args:
            query: PubMed search expression.
            retmax: Page size requested from ESearch.

        Returns:
            list: PubMed IDs in the order returned by the service.
        """
        all_ids = []
        retstart = 0

        print("🔍 Executing PubMed search...")
        while True:
            try:
                handle = Entrez.esearch(
                    db="pubmed", term=query, retmax=retmax,
                    retstart=retstart, usehistory="y"
                )
                try:
                    record = Entrez.read(handle)
                finally:
                    # Release the connection even when parsing fails.
                    handle.close()

                ids = record.get("IdList", [])
                all_ids.extend(ids)

                print(f"🔹 Retrieved {len(ids)} IDs (total so far: {len(all_ids)})")

                # A short page means the last page; an empty page must also stop
                # the loop, otherwise retmax <= 0 would never terminate.
                if not ids or len(ids) < retmax:
                    break

                retstart += retmax
                time.sleep(0.5)

            except Exception as e:
                print(f"❌ Error: {e}")
                break

        return all_ids

    def fetch_medline_records(self, id_list, batch_size=500):
        """Fetch MEDLINE records for a list of PubMed IDs.

        IDs are requested in batches of ``batch_size``. A batch that raises is
        reported and skipped, so the result may hold fewer records than IDs.

        Args:
            id_list: Sequence of PubMed IDs (strings or integers).
            batch_size: Number of IDs sent per EFetch request.

        Returns:
            list: Records produced by ``Medline.parse``.
        """
        all_records = []

        for start in tqdm(range(0, len(id_list), batch_size), desc="📦 Fetching details"):
            batch_ids = id_list[start:start + batch_size]
            # str() so integer IDs can be joined as well.
            id_string = ",".join(str(pmid) for pmid in batch_ids)

            try:
                handle = Entrez.efetch(db="pubmed", id=id_string, rettype="medline", retmode="text")
                try:
                    records = list(Medline.parse(StringIO(handle.read())))
                finally:
                    # Close the handle whether or not reading/parsing succeeded.
                    handle.close()
                all_records.extend(records)
                time.sleep(0.5)
            except Exception as e:
                print(f"❌ Error in batch {start}: {e}")

        print(f"✅ Total MEDLINE records fetched: {len(all_records)}")
        return all_records

    def clean_doi(self, doi_list):
        """Clean and join DOIs if present in AID field.

        Only entries that end with the ``[doi]`` tag are kept. Matching the tag
        rather than the substring "doi" keeps out other identifier types whose
        value merely contains those letters.

        Args:
            doi_list: Iterable of AID strings such as ``"10.1/x [doi]"``.

        Returns:
            str: The DOI values joined with ``"; "``, or ``""`` when none.
        """
        tag = "[doi]"
        dois = []
        for entry in doi_list:
            text = str(entry).strip()
            if text.lower().endswith(tag):
                value = text[:-len(tag)].strip()
                if value:
                    dois.append(value)
        return "; ".join(dois)

    def process_records(self, records):
        """Convert MEDLINE records to DataFrame and filter Systematic Reviews/Meta-analyses.

        Args:
            records: Iterable of dict-like MEDLINE records.

        Returns:
            pandas.DataFrame: One row per unique ``pmid`` whose publication type
            contains "Systematic Review" or "Meta-Analysis" (case-insensitive),
            with the columns in ``_COLUMNS`` plus a nullable-integer ``year``.
            An empty input yields an empty frame with the same columns.
        """
        print("🧹 Cleaning and structuring records...")
        rows = [
            {
                "pmid": rec.get("PMID", ""),
                "title": rec.get("TI", ""),
                "abstract": rec.get("AB", ""),
                "authors": "; ".join(rec.get("AU", [])),
                "publication_date": rec.get("DP", ""),
                "journal": rec.get("JT", ""),
                "country": rec.get("PL", ""),
                "language": "; ".join(rec.get("LA", [])),
                "mesh_terms": "; ".join(rec.get("MH", [])),
                "publication_type": "; ".join(rec.get("PT", [])),
                "doi": self.clean_doi(rec.get("AID", [])),
            }
            for rec in records
        ]

        # Explicit columns keep the schema stable when `rows` is empty.
        df = pd.DataFrame(rows, columns=list(self._COLUMNS))

        # Filter by publication_type
        is_review = (
            df["publication_type"]
            .astype(str)
            .str.contains("Systematic Review|Meta-Analysis", case=False, na=False)
            .astype(bool)
        )
        # .copy() so the column assignment below acts on an independent frame.
        df = df.loc[is_review].copy()

        # DP values come in several shapes ("2020 Jan 15", "2019 Jan-Feb",
        # "2018"); taking the first 4-digit run avoids date-parsing failures.
        df["year"] = pd.to_numeric(
            df["publication_date"].astype(str).str.extract(r"(\d{4})", expand=False),
            errors="coerce",
        ).astype("Int64")

        return df.drop_duplicates(subset="pmid")

    def run(self, query, output_csv="medline_filtered_output.csv"):
        """Run search, fetch and filtering, then write the result to CSV.

        Prints a warning and returns early when the search or the fetch yields
        nothing. The CSV is written only when the filtered frame is non-empty.

        Args:
            query: PubMed search expression passed to ``search_pubmed``.
            output_csv: Destination path for the filtered records.
        """
        id_list = self.search_pubmed(query)
        if not id_list:
            print("⚠️ No articles found for the query.")
            return

        records = self.fetch_medline_records(id_list)
        if not records:
            print("⚠️ No records could be fetched.")
            return

        df = self.process_records(records)
        if df.empty:
            print("⚠️ No records matched the filter for publication_type.")
        else:
            df.to_csv(output_csv, index=False)
            print(f"📁 Saved {len(df)} filtered records to: {output_csv}")


if __name__ == "__main__":
    import os

    # PubMed query assembled from adjacent string literals, in three parts:
    #   (review publication types AND search-method/pooling terms)
    #   OR (meta-analysis / systematic-review terms)
    #   AND (immunization / vaccine terms) AND humans[filter] AND [edat] 2011-3000.
    # NOTE(review): the top-level OR/AND chain is not wrapped in parentheses;
    # confirm PubMed groups it as (part 1 OR part 2) AND the remaining filters.
    QUERY = (
        "("
            "(review[pt] OR \"review, tutorial\"[pt] OR \"review, academic\"[pt]) AND "
            "("
                "medline[tw] OR medlars[tw] OR embase[tw] OR pubmed[tw] OR cochrane[tw] OR "
                "scisearch[tw] OR psychinfo[tw] OR psycinfo[tw] OR psychlit[tw] OR psyclit[tw] OR "
                "cinahl[tw] OR ((hand[tw] AND search*[tw]) OR (manual*[tw] AND search*[tw])) OR "
                "\"electronic database*\"[tw] OR \"bibliographic database*\"[tw] OR \"computerized database*\"[tw] OR \"online database*\"[tw] OR "
                "pooling[tw] OR pooled[tw] OR \"mantel haenszel\"[tw] OR peto[tw] OR dersimonian[tw] OR \"der simonian\"[tw] OR \"fixed effect\"[tw] OR "
                "\"retraction of publication\"[pt] OR \"retracted publication\"[pt]"
            ")"
        ") "
        "OR "
        "("
            "meta-analysis[pt] OR meta-analysis[sh] OR meta-analys*[tw] OR meta analys*[tw] OR metaanalys*[tw] OR "
            "(systematic*[tw] AND review*[tw]) OR (quantitative*[tw] AND review*[tw]) OR "
            "(methodologic*[tw] AND review*[tw]) OR \"integrative research review\"[tw] OR \"research integration\"[tw]"
        ") "
        "AND ("
            "immunization[mesh] OR Immunization Programs[mesh] OR vaccines[mesh] OR "
            "immunisation[tiab] OR immunization[tiab] OR immunise[tiab] OR immunize[tiab] OR vaccine[tiab]"
        ") "
        "AND humans[filter] "
        "AND (\"2011\"[edat] : \"3000\"[edat])"
    )

    # Credentials are read from the environment so they are never stored in the
    # notebook. SECURITY: an API key was previously committed on this line and
    # should be revoked and replaced.
    ncbi_email = os.environ.get("NCBI_EMAIL")
    if not ncbi_email:
        raise RuntimeError("Set the NCBI_EMAIL environment variable before running this cell.")

    # NCBI_API_KEY is optional; MedlineFetcher skips it when it is None.
    fetcher = MedlineFetcher(email=ncbi_email, api_key=os.environ.get("NCBI_API_KEY"))
    fetcher.run(QUERY)


🔍 Executing PubMed search...


🔹 Retrieved 6997 IDs (total so far: 6997)


📦 Fetching details:  36%|███▌      | 5/14 [00:26<00:48,  5.38s/it]


KeyboardInterrupt: 