# IMPORT PACKAGES

In [None]:
import os
import time
import re
import csv
import requests
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# WEB SCRAPING PUBMED

In [None]:
# ---------------------------------------------------------------
# Configuration: output file and the topic x application x year grid
# ---------------------------------------------------------------

# The CSV is written into the directory the script/notebook runs from.
BASE_DIR = os.getcwd()
OUT_CSV = os.path.join(BASE_DIR, "genai_topics_applications_counts_pubmed.csv")

# Technology topics; each one is crossed with every application below.
TOPICS = [
    "Generative AI",
    "Digital twins",
    "Federated learning",
]

# Application areas; these strings are also the keys of APP_CONTEXT.
APPLICATIONS = [
    "Diagnostic imaging (classification/detection)",
    "Imaging segmentation / reconstruction",
    "Clinical decision support & risk prediction",
    "Clinical text / EHR modelling",
    "Drug discovery & design",
    "Protein / biomolecule structure",
    "Patient monitoring & physiological signals",
    "Surgical planning / training & simulation",
]

# Publication years to query: 2021 through 2025 inclusive.
YEARS = list(range(2021, 2026))

# --- Application context, expressed as PubMed search clauses ---

def _any_field(*terms):
    """Return '(t1[All Fields] OR t2[All Fields] ...)' for the given terms.

    Each term is inserted verbatim, so phrase terms must carry their own
    double quotes (e.g. '"medical imaging"').
    """
    return "(" + " OR ".join(f"{term}[All Fields]" for term in terms) + ")"


# Imaging-modality clause shared by both imaging applications.
_IMAGING_CLAUSE = _any_field(
    '"medical imaging"', 'radiology', '"MRI"', '"CT scan"', '"X-ray"', 'PET',
)

# Maps each application name to the clause that restricts a search to it.
APP_CONTEXT = {
    "Diagnostic imaging (classification/detection)": (
        _IMAGING_CLAUSE
        + " AND "
        + _any_field('diagnosis', 'diagnostic', 'classification', 'detection')
    ),

    "Imaging segmentation / reconstruction": (
        _IMAGING_CLAUSE
        + " AND "
        + _any_field(
            'segmentation', '"image segmentation"',
            'reconstruction', '"image reconstruction"',
        )
    ),

    "Clinical decision support & risk prediction": (
        _any_field(
            '"clinical decision support"', '"decision support system"',
            'prognosis', '"risk prediction"', '"risk model"',
        )
        + " AND "
        + _any_field('patient', 'clinical', 'hospital', 'healthcare')
    ),

    "Clinical text / EHR modelling": _any_field(
        '"clinical notes"', '"clinical text"',
        '"electronic health records"', '"EHR data"',
        'EHR', '"medical records"',
    ),

    "Drug discovery & design": _any_field(
        '"drug discovery"', '"drug design"',
        '"molecule generation"', '"de novo design"',
        '"virtual screening"',
    ),

    "Protein / biomolecule structure": _any_field(
        '"protein design"', '"protein structure"',
        '"protein sequence"', '"antibody design"',
        '"biomolecular structure"',
    ),

    "Patient monitoring & physiological signals": _any_field(
        '"vital signs"', '"physiological signals"',
        'ECG', 'EEG', '"time series"',
        '"patient monitoring"', 'ICU',
        '"intensive care"',
    ),

    "Surgical planning / training & simulation": _any_field(
        '"surgical planning"', '"surgical simulation"',
        '"surgery training"', '"surgical training"',
        '"operative planning"', '"preoperative planning"',
        '"virtual patient"', '"patient-specific model"',
    ),
}

def build_pubmed_query(topic: str, app: str) -> str:
    """Build the PubMed 'term' string for one topic/application pair.

    Args:
        topic: Technology topic. "Generative AI", "Digital twins" and
            "Federated learning" each add a technology clause; any other
            value adds none.
        app: Application name; must be a key of ``APP_CONTEXT``.

    Returns:
        ``"(<technology>) AND (<context>)"`` when the topic is recognised,
        otherwise just ``"(<context>)"``.

    Raises:
        KeyError: If ``app`` is not a key of ``APP_CONTEXT``.
    """
    context = APP_CONTEXT[app]

    # Technology clause per recognised topic.
    technology_by_topic = {
        "Generative AI": (
            '"generative ai"[All Fields] OR "generative model"[All Fields] '
            'OR "diffusion model"[All Fields] OR GAN[All Fields] '
            'OR "variational autoencoder"[All Fields]'
        ),
        "Digital twins": (
            '"digital twin"[All Fields] OR "digital twins"[All Fields]'
        ),
        "Federated learning": (
            '"federated learning"[All Fields] OR "federated training"[All Fields]'
        ),
    }
    technology = technology_by_topic.get(topic, "")

    # Unrecognised topic: search on the application context alone.
    if not technology:
        return f"({context})"
    return f"({technology}) AND ({context})"


# =======================================
# PubMed E-utilities helper
# =======================================

PUBMED_ESEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

# OPTIONAL but recommended: add your email (NCBI etiquette)
NCBI_EMAIL = "your.email@example.com"  # <- change or leave as placeholder


def pubmed_yearly_count(term: str, year: int, retries: int = 3,
                        backoff: float = 1.0) -> int:
    """Return the number of PubMed records matching ``term`` in one year.

    Calls the esearch endpoint with ``mindate == maxdate == year`` on the
    publication date and reads ``esearchresult.count`` from the JSON reply.

    A failed request is retried with exponential backoff so that a single
    transient error (timeout, rate-limit response, malformed reply) is not
    written to the results as a genuine zero.

    Args:
        term: PubMed query string.
        year: Publication year used as both lower and upper date bound.
        retries: Total number of attempts; values below 1 are treated as 1.
        backoff: Seconds to wait before the second attempt; the wait
            doubles after every further failure.

    Returns:
        The match count, or 0 if every attempt fails. A returned 0 is
        therefore ambiguous; check the printed warnings before trusting it.
    """
    params = {
        "db": "pubmed",
        "term": term,
        "mindate": str(year),
        "maxdate": str(year),
        "datetype": "pdat",   # publication date
        "retmode": "json",
        "rettype": "count",
        "email": NCBI_EMAIL,
    }

    attempts = max(1, retries)
    last_error = None
    for attempt in range(1, attempts + 1):
        try:
            r = requests.get(PUBMED_ESEARCH_URL, params=params, timeout=30)
            r.raise_for_status()
            return int(r.json()["esearchresult"]["count"])
        except (requests.RequestException, KeyError, TypeError, ValueError) as e:
            # RequestException: network/HTTP failures; the others cover a
            # reply that is not JSON or lacks a usable "count" field.
            last_error = e
            if attempt < attempts:
                wait = backoff * 2 ** (attempt - 1)
                print(f"[WARN] PubMed attempt {attempt}/{attempts} failed "
                      f"for year {year}: {e} (retrying in {wait:.1f}s)")
                time.sleep(wait)

    print(f"[WARN] PubMed error for year {year}: {last_error}")
    return 0


# =======================================
# MAIN LOOP – build table of counts
# =======================================

# One record per (topic, application, year) combination.
rows = []

for topic in TOPICS:
    print(f"\n=== Topic: {topic} ===")
    for app in APPLICATIONS:
        # The query text depends only on topic and application; the year
        # is passed separately to pubmed_yearly_count.
        term = build_pubmed_query(topic, app)
        print(f"  Application: {app}")
        print(f"  PubMed term: {term}")
        for year in YEARS:
            count = pubmed_yearly_count(term, year)
            print(f"    {year}: {count} results")
            rows.append(dict(topic=topic, application=app, year=year, count=count))
            # Short pause between requests to be polite to NCBI.
            time.sleep(0.4)

# ---------------------------------------
# Write the collected counts to CSV
# ---------------------------------------

df_counts = pd.DataFrame(rows)
df_counts.to_csv(OUT_CSV, index=False)

print(
    "\n[INFO] Finished. Saved PubMed counts to:",
    OUT_CSV,
    "\nPreview:",
    df_counts.head(),
    sep="\n",
)


# CIRCULAR BARPLOT

In [None]:


# =================== LOAD PUBMED COUNTS ===================

# Reads the CSV of per-topic/application/year counts; the code below uses
# its "topic", "application", "year" and "count" columns.
df_counts = pd.read_csv("genai_topics_applications_counts_pubmed.csv")

# Blank or non-numeric cells become 0 and the column is forced to int,
# matching how the bar-plot section prepares the same file.
df_counts["count"] = (
    pd.to_numeric(df_counts["count"], errors="coerce").fillna(0).astype(int)
)

# Derive the plotting axes from the data rather than from the scraping
# configuration, so the figure always reflects what is in the CSV.
YEARS   = sorted(df_counts["year"].unique())
TOPICS  = list(df_counts["topic"].unique())
APPS    = list(df_counts["application"].unique())

print("Topics:", TOPICS)
print("Years:", YEARS)
print("Applications:", APPS)

# =================== BUILD GROUPS (one per topic) ===================

groups = []   # list of (topic_label, gdf_topic)

# Fixed column layout: application, lvl1..lvlN (one per year, oldest first),
# total. Passing it explicitly keeps these columns present even when a topic
# has no non-zero application; the plotting code indexes them unconditionally.
level_cols = [f"lvl{i}" for i in range(1, len(YEARS) + 1)]
group_columns = ["application", *level_cols, "total"]

for topic in TOPICS:
    sub = df_counts[df_counts["topic"] == topic]

    # Rows = applications, columns = years. reindex() only fills labels it
    # adds, so fillna() covers combinations missing inside the pivot itself.
    pivot = (
        sub.pivot_table(index="application", columns="year",
                        values="count", aggfunc="sum")
           .reindex(index=APPS, columns=YEARS, fill_value=0)
           .fillna(0)
    )

    rows_topic = []
    for app in APPS:
        counts = pivot.loc[app, YEARS]
        total  = counts.sum()
        if total == 0:
            # skip applications that truly have 0 across all years for this topic
            continue
        row = {"application": app}
        for col, y in zip(level_cols, YEARS):
            row[col] = int(counts[y])
        row["total"] = int(total)
        rows_topic.append(row)

    gdf_topic = pd.DataFrame(rows_topic, columns=group_columns).reset_index(drop=True)
    groups.append((topic, gdf_topic))

# Compact captions for the applications, written inside the bars.
# Each caption is given as its individual text lines and joined with "\n".
_APP_CAPTION_LINES = [
    ("Diagnostic imaging (classification/detection)", ("Diag.", "Imaging")),
    ("Imaging segmentation / reconstruction", ("Segm./", "Recons.")),
    ("Clinical decision support & risk prediction", ("CDS /", "Risk")),
    ("Clinical text / EHR modelling", ("Text /", "EHR")),
    ("Drug discovery & design", ("Drug", "Design")),
    ("Protein / biomolecule structure", ("Protein", "Struct.")),
    ("Patient monitoring & physiological signals", ("Monitor", "Signals")),
    ("Surgical planning / training & simulation", ("Surg.", "Train/Sim")),
]
SHORT_APP = {
    full_name: "\n".join(caption_lines)
    for full_name, caption_lines in _APP_CAPTION_LINES
}

# Compact captions for the topics, used as sector labels.
SHORT_TOPIC = dict([
    ("Generative AI", "GenAI"),
    ("Digital twins", "Digital twins"),
    ("Federated learning", "Federated"),
])

# =================== CIRCULAR PLOT (WHITE STYLE, ONE REFERENCE LINE) ===================

INNER_RADIUS = 0.60        # radius of the blank centre disc; bars start here
GAP_DEGREES  = 8.0         # angular gap reserved per topic group
BAR_PAD_DEG  = 1.2         # padding at group edges and between adjacent bars
FIGSIZE      = (9, 9)      # figure size passed to plt.figure
DPI          = 300         # resolution for the figure and the saved file

# Colors per year (oldest = lightest)
_YEAR_PALETTE = ["#d4b9da", "#bcbddc", "#9e9ac8", "#756bb1", "#54278f"]

# zip() pairs each year with a shade and stops at the shorter sequence, so a
# CSV with fewer than five years no longer raises IndexError. Years beyond
# the palette get no entry; the plotting code already falls back to grey
# through YEAR_COLORS.get(year, "#CCCCCC").
YEAR_COLORS = dict(zip(YEARS, _YEAR_PALETTE))

LABEL_IN_BAR  = True       # True: label inside the bar; False: above its top
LABEL_PAD_OUT = 0.018      # radial offset added above the bar top (outside mode)
LABEL_PAD_IN  = 0.010      # radial offset subtracted from the bar midpoint (inside mode)

# ---------- angles & widths ----------
# Splits the circle evenly between the topic groups and computes, for each
# group, its angular span and the centre angle and width of every bar.
# All angles in this section are in degrees.

n_groups = len(groups)
# One GAP_DEGREES gap is reserved per group; the rest is shared equally.
# NOTE(review): n_groups == 0 (no topics in the CSV) would divide by zero below.
usable_deg     = 360.0 - n_groups * GAP_DEGREES
span_per_group = usable_deg / n_groups

group_spans = []          # (start_deg, end_deg) per group
bar_centers_and_w = []    # (array of bar centre angles, bar width) per group

theta0 = 90.0             # start angle of the first group
for g_idx, (_, gdfg) in enumerate(groups):
    # Groups are laid out towards decreasing angles, each followed by one gap.
    start = theta0 - g_idx * (span_per_group + GAP_DEGREES)
    end   = start - span_per_group
    group_spans.append((start, end))

    width     = span_per_group
    pad       = BAR_PAD_DEG
    width_eff = width - 2 * pad                    # span left after padding both edges
    n         = len(gdfg)                          # number of bars in this group
    tiny      = BAR_PAD_DEG if n > 1 else 0.0      # spacing between adjacent bars
    total_tiny = tiny * max(n - 1, 0)              # n bars have n - 1 gaps between them
    each      = (width_eff - total_tiny) / max(n, 1)   # width of one bar; max() avoids /0

    centers = []
    # First bar centre: half a bar width in from the padded start edge.
    cursor  = start - pad - each/2
    for _ in range(n):
        centers.append(cursor)
        cursor -= (each + tiny)
    bar_centers_and_w.append((np.array(centers), each))

# ---------- scaling ----------

# Largest per-application total across all groups; falls back to 1 when
# there are no bars at all.
all_totals = [int(v) for _, gdfg in groups for v in gdfg["total"].values]
global_max = max(all_totals) if all_totals else 1
# Radial room available to the bars (from INNER_RADIUS out to radius 1.0).
RANGE      = 1.0 - INNER_RADIUS

# Pick a rounded-up value to label the reference tick with.
def round_up_nice(x):
    """Round ``x`` up to the next multiple of a size-dependent step.

    The step is 1 for ``x <= 10``, 5 for ``x <= 50`` and 10 otherwise.

    Returns:
        The rounded value as a plain ``int``.
    """
    # (upper bound, step) pairs checked in ascending order.
    for upper_bound, candidate in ((10, 1), (50, 5)):
        if x <= upper_bound:
            step = candidate
            break
    else:
        step = 10
    n_steps = np.ceil(x / step)
    return int(n_steps * step)

# Value shown on the single reference circle, and the radius it is drawn at.
# NOTE(review): bars are scaled by global_max (tallest bar reaches radius
# 1.0) while the tick value is rounded UP from global_max, so this radius is
# >= 1.0 and can exceed the 1.02 radial limit set below - confirm the
# reference circle is visible for small counts.
max_label_val = round_up_nice(global_max)
single_tick_val = max_label_val
single_tick_radius = INNER_RADIUS + (single_tick_val / max(global_max, 1)) * RANGE

# ---------- plot ----------

# Start from a clean slate, then create a white polar axes.
plt.close("all")
fig = plt.figure(figsize=FIGSIZE, dpi=DPI, facecolor="white")
ax  = plt.subplot(111, polar=True)
ax.set_facecolor("white")
ax.set_theta_direction(-1)
ax.set_theta_offset(np.deg2rad(90))

# Remove the default polar decorations (grid, tick labels, outer spine).
ax.grid(False)
ax.set_yticklabels([])
ax.set_xticklabels([])
if "polar" in ax.spines:
    ax.spines["polar"].set_visible(False)
ax.set_rlim(0, 1.02)

# inner disk
# NOTE(review): transData._b is a private matplotlib attribute - presumably
# used to draw the circle in Cartesian rather than polar coordinates; confirm
# it exists in the matplotlib version in use.
ax.add_artist(plt.Circle((0, 0), INNER_RADIUS, transform=ax.transData._b,
                         color="white", zorder=5))

legend_handles = {}                  # year string -> first bar drawn for that year
outer_tops, bar_bottoms = [], []     # per group: outer and inner radius of each stacked bar

for g_idx, (topic_label, gdfg) in enumerate(groups):
    centers_deg, bar_w_deg = bar_centers_and_w[g_idx]
    centers   = np.deg2rad(centers_deg)
    bar_width = np.deg2rad(bar_w_deg)

    # Stack one segment per year on each bar, starting at the inner radius.
    bottom = np.full(len(gdfg), INNER_RADIUS)
    for i, year in enumerate(YEARS, start=1):
        vals    = gdfg[f"lvl{i}"].values.astype(float)
        # Scale so that the largest total across all groups spans RANGE.
        heights = (vals / global_max) * RANGE
        bars = ax.bar(
            centers,
            height=heights,
            width=bar_width,
            bottom=bottom,
            color=YEAR_COLORS.get(year, "#CCCCCC"),
            edgecolor="white",
            linewidth=0.7,
            label=str(year),
            zorder=6,
        )
        bottom = bottom + heights
        # Keep one handle per year for the legend (groups may be empty).
        if str(year) not in legend_handles and len(bars) > 0:
            legend_handles[str(year)] = bars[0]
    outer_tops.append(bottom.copy())
    bar_bottoms.append(np.full(len(gdfg), INNER_RADIUS))

# ----- SINGLE reference line (one circle) -----
th_arc = np.linspace(0, 2*np.pi, 400)
ax.plot(th_arc, np.full_like(th_arc, single_tick_radius),
        color="#CCCCCC", lw=1.0, zorder=4)

# label for that line at theta = 90 degrees
# NOTE(review): with the 90-degree theta offset and clockwise direction set
# above, theta = 90 degrees presumably lands on the right-hand side rather
# than the top of the figure - confirm the intended position.
ax.text(np.deg2rad(90), single_tick_radius,
        f"{single_tick_val}",
        ha="center", va="bottom",
        fontsize=8, color="#666666", fontweight="bold", zorder=7)

# also mark 0 near the inner radius, if you like
ax.text(np.deg2rad(90), INNER_RADIUS - 0.01,
        "0",
        ha="center", va="top",
        fontsize=8, color="#666666", fontweight="bold", zorder=7)

# ----- sector baselines + labels -----
# For every topic group: a black arc just inside the inner radius spanning
# the group, with the short topic name centred a little further inside.
for g_idx, (topic_label, _) in enumerate(groups):
    start_deg, end_deg = group_spans[g_idx]
    mid_deg = (start_deg + end_deg) / 2.0
    base_r  = INNER_RADIUS - 0.010
    th_arc  = np.linspace(np.deg2rad(end_deg), np.deg2rad(start_deg), 64)
    ax.plot(th_arc, np.full_like(th_arc, base_r),
            color="black", lw=0.9, zorder=8)
    text_label = SHORT_TOPIC.get(topic_label, topic_label)
    ax.text(
        np.deg2rad(mid_deg), INNER_RADIUS - 0.035, text_label,
        ha="center", va="center",
        fontsize=12, color="black", fontweight="bold", zorder=9,
    )

# ----- helper: on-screen tangent direction at a polar point -----
def screen_tangent_angle_deg(ax, theta_rad: float, r: float, eps: float = 1e-4) -> float:
    """Return the on-screen angle, in degrees, of the tangent at (theta, r).

    Two points at ``theta_rad - eps`` and ``theta_rad + eps`` on radius ``r``
    are mapped through ``ax.transData``; the angle of the line joining the
    mapped points is the result.

    Args:
        ax: Axes whose ``transData.transform`` maps data to display points.
        theta_rad: Angular data coordinate, in radians.
        r: Radial data coordinate.
        eps: Half-width of the angular step used for the finite difference.

    Returns:
        The angle in degrees. Angles outside [-90, 90] are shifted by +180
        so the direction is flipped by half a turn.
    """
    to_display = ax.transData.transform
    before = to_display((theta_rad - eps, r))
    after = to_display((theta_rad + eps, r))

    delta_x = after[0] - before[0]
    delta_y = after[1] - before[1]
    angle = np.degrees(np.arctan2(delta_y, delta_x))

    within_quarter_turn = -90 <= angle <= 90
    return angle if within_quarter_turn else angle + 180.0

# ----- application labels inside bars -----
# Walk the groups together with the radii recorded while drawing, and place
# one rotated caption per bar.
for (topic_label, gdfg), tops, bottoms, (centers_deg, _) in zip(
        groups, outer_tops, bar_bottoms, bar_centers_and_w):
    for ang_deg, r_top, r_bot, app in zip(
            centers_deg, tops, bottoms, gdfg["application"].values):
        theta = np.deg2rad(ang_deg)
        if LABEL_IN_BAR:
            # Radial midpoint of the stacked bar, nudged slightly inwards.
            r_text = (r_top + r_bot) / 2.0 - LABEL_PAD_IN
            va = "center"
        else:
            # Just above the bar top, capped at the radial axis limit.
            r_text = min(1.02, r_top + LABEL_PAD_OUT)
            va = "bottom"
        # Rotate the caption to follow the on-screen tangent at its position.
        rot = screen_tangent_angle_deg(ax, theta, r_text)
        label_text = SHORT_APP.get(app, app)   # fall back to the full name
        ax.text(
            theta, r_text, label_text,
            ha="center", va=va,
            rotation=rot, rotation_mode="anchor",
            fontsize=7, color="#444444", fontweight="bold", zorder=9,
        )

# ----- legend (years) -----
# Only years for which a bar was actually drawn, in chronological order.
order = [str(y) for y in YEARS if str(y) in legend_handles]
handles = [legend_handles[k] for k in order]
plt.legend(
    handles, order,
    title="Publication year",
    loc="lower center", bbox_to_anchor=(0.5, -0.08),
    ncol=len(order), frameon=False,
    handlelength=1.4, columnspacing=1.6,
)

# Save the figure as a PDF in the current working directory, then display it.
plt.tight_layout()
out_pdf = "genai_topics_applications_circularplot_pubmed.pdf"
fig.savefig(out_pdf, bbox_inches="tight", dpi=DPI)
print("Saved PDF:", out_pdf)

plt.show()


# BARPLOT

In [None]:

# ============ LOAD CSV ============
df = pd.read_csv("genai_topics_applications_counts_pubmed.csv")

# ensure numeric: blank or non-numeric cells become 0
df["count"] = pd.to_numeric(df["count"], errors="coerce").fillna(0).astype(int)

# Years come from the data; topics and applications are fixed lists so the
# panel and bar order is explicit.
YEARS  = sorted(df["year"].unique())
TOPICS = ["Generative AI", "Digital twins", "Federated learning"]

# Use the same application order as in your circular plot
APPLICATIONS = [
    "Diagnostic imaging (classification/detection)",
    "Imaging segmentation / reconstruction",
    "Clinical decision support & risk prediction",
    "Clinical text / EHR modelling",
    "Drug discovery & design",
    "Protein / biomolecule structure",
    "Patient monitoring & physiological signals",
    "Surgical planning / training & simulation",
]

# Short labels (to keep x-axis readable)
SHORT_APP = {
    "Diagnostic imaging (classification/detection)":   "Diag.\nImaging",
    "Imaging segmentation / reconstruction":           "Segm.\nRecons.",
    "Clinical decision support & risk prediction":     "CDS /\nRisk",
    "Clinical text / EHR modelling":                   "Text /\nEHR",
    "Drug discovery & design":                         "Drug\nDesign",
    "Protein / biomolecule structure":                 "Protein\nStruct.",
    "Patient monitoring & physiological signals":      "Monitor\nSignals",
    "Surgical planning / training & simulation":       "Surg.\nTrain/Sim",
}

SHORT_TOPIC = {
    "Generative AI": "GenAI",
    "Digital twins": "Digital twins",
    "Federated learning": "Federated",
}

# Colors per year (same as circular plot), oldest = lightest.
_YEAR_PALETTE = ["#d4b9da", "#bcbddc", "#9e9ac8", "#756bb1", "#54278f"]

# zip() stops at the shorter sequence, so a CSV with fewer than five years
# no longer raises IndexError. Years beyond the palette get no entry; the
# plotting code falls back to grey via YEAR_COLORS.get(year, "#CCCCCC").
YEAR_COLORS = dict(zip(YEARS, _YEAR_PALETTE))

# ============ FACETED BAR PLOTS (a, b, c) ============
# One panel per topic; within a panel, one cluster of year-coloured bars
# per application.

n_topics, n_apps, n_years = len(TOPICS), len(APPLICATIONS), len(YEARS)

# Each application owns one integer slot on the x-axis; the n_years bars of
# a slot share a total width of about 0.8.
x = np.arange(n_apps)
bar_width = 0.8 / n_years

fig, axes = plt.subplots(1, n_topics, figsize=(5 * n_topics, 5), sharey=True, dpi=300)
if n_topics == 1:
    # A single panel comes back as a bare Axes; wrap it so indexing works.
    axes = [axes]

# Legend entries are collected from the first panel only.
all_handles, all_labels = None, None

for i, topic in enumerate(TOPICS):
    ax = axes[i]
    is_first_panel = (i == 0)

    # Counts for this topic as an application x year table.
    sub = df[df["topic"] == topic]
    pivot = sub.pivot_table(index="application", columns="year",
                            values="count", aggfunc="sum")
    pivot = pivot.reindex(index=APPLICATIONS, columns=YEARS, fill_value=0)

    for j, year in enumerate(YEARS):
        # Shift year j so the whole cluster is centred on the integer slot.
        x_positions = x - 0.5*bar_width*(n_years - 1) + j*bar_width
        bars = ax.bar(
            x_positions,
            pivot[year].values,
            width=bar_width,
            color=YEAR_COLORS.get(year, "#CCCCCC"),
            edgecolor="white",
            linewidth=0.7,
            label=str(year),
        )

        if is_first_panel:
            if all_handles is None:
                all_handles, all_labels = [], []
            all_handles.append(bars[0])
            all_labels.append(str(year))

    # One tick per application, captioned with its short label.
    ax.set_xticks(x)
    ax.set_xticklabels([SHORT_APP.get(a, a) for a in APPLICATIONS], fontsize=8)

    # The y-axis is shared, so only the first panel carries its label.
    if is_first_panel:
        ax.set_ylabel("Number of publications")

    # Panel title: "(a) <topic>", "(b) <topic>", ...
    panel_letter = chr(ord("a") + i)
    ax.set_title(
        f"({panel_letter}) {SHORT_TOPIC.get(topic, topic)}",
        fontsize=11,
        fontweight="bold",
    )

    # Faint horizontal guide lines.
    ax.grid(axis="y", linestyle="--", alpha=0.3)

# ============ ONE SHARED LEGEND, CENTRED BELOW THE PANELS ============
if not (all_handles is None or all_labels is None):
    fig.legend(
        all_handles,
        all_labels,
        title="Publication year",
        loc="lower center",
        ncol=len(YEARS),
        frameon=False,
        bbox_to_anchor=(0.5, -0.02),
    )

# Reserve the bottom 8% of the figure for the legend.
plt.tight_layout(rect=[0, 0.08, 1, 1])

# Write the figure to a PDF in the current working directory.
out_pdf = "genai_topics_applications_faceted_pubmed.pdf"
fig.savefig(out_pdf, bbox_inches="tight")
print("Saved facet figure to:", out_pdf)

plt.show()
