# Analyze results from clinician assessment of diagnostic reasoning

## A. Preprocess annotated data

In [11]:
# Load from spreadsheet
import pandas as pd

annotations = "../../results/evaluate_diagnostic_reasoning/clinician_annotations/Clinical Annotation_ Task 2B - Judge LLM Reasoning (Blinded).xlsx"
carolyn_rodriguez = pd.read_excel(annotations, sheet_name="Carolyn Rodriguez", nrows=121, usecols="A:J")
salih_selek = pd.read_excel(annotations, sheet_name="Salih Selek", nrows=121, usecols="A:J")
pooja_chaudhary = pd.read_excel(annotations, sheet_name="Pooja Chaudhary", nrows=121, usecols="A:J")
caesa_nagpal = pd.read_excel(annotations, sheet_name="Caesa Nagpal", nrows=121, usecols="A:J")
stan_mathis = pd.read_excel(annotations, sheet_name="Stan Mathis", nrows=121, usecols="A:J")

In [12]:
# Combine all annotations into a single DataFrame, adding a column for the annotator
all_annotations = pd.concat([
    carolyn_rodriguez.assign(Annotator="Carolyn Rodriguez"),
    salih_selek.assign(Annotator="Salih Selek"),
    pooja_chaudhary.assign(Annotator="Pooja Chaudhary"),
    caesa_nagpal.assign(Annotator="Caesa Nagpal"),
    stan_mathis.assign(Annotator="Stan Mathis")
], ignore_index=True)

In [13]:
# Check column types
all_annotations.dtypes

Case ID                    int64
Vignette Text             object
True Diagnosis            object
Diagnostician             object
Predicted Diagnosis       object
Model's Reasoning         object
Diagnosis Match?          object
Extraction Score (0-4)    object
Diagnosis Score (0-4)     object
Short Commentary          object
Annotator                 object
dtype: object

In [14]:
# Check for missing values
all_annotations.isnull().sum()

Case ID                   0
Vignette Text             0
True Diagnosis            0
Diagnostician             0
Predicted Diagnosis       0
Model's Reasoning         0
Diagnosis Match?          0
Extraction Score (0-4)    0
Diagnosis Score (0-4)     0
Short Commentary          0
Annotator                 0
dtype: int64

In [15]:
# Convert relevant columns for easier analysis
all_annotations.columns = ['case_id',
                            'case_text',
                            'true_diagnosis',
                            'model_name',
                            'model_diagnosis',
                            'model_reasoning',
                            'diagnosis_match',
                            'reasoning_extraction_score',
                            'reasoning_diagnosis_score',
                            'commentary',
                            'annotator']

In [16]:
# Convert reasoning scores to numeric
# Define a mapping for the reasoning scores
reasoning_map = {
    "4 - Excellent": 4,
    "3 - Good": 3,
    "2 - Adequate": 2,
    "1 - Fair": 1,
    "0 - Poor": 0
}

# Apply the mapping to the reasoning scores
all_annotations["reasoning_extraction_score"] = all_annotations["reasoning_extraction_score"].map(reasoning_map)
all_annotations["reasoning_diagnosis_score"] = all_annotations["reasoning_diagnosis_score"].map(reasoning_map)

# Check reasoning score summary statistics
all_annotations[["reasoning_extraction_score", "reasoning_diagnosis_score"]].describe()

Unnamed: 0,reasoning_extraction_score,reasoning_diagnosis_score
count,600.0,600.0
mean,3.0,2.996667
std,0.97378,0.972058
min,0.0,0.0
25%,2.0,2.0
50%,3.0,3.0
75%,4.0,4.0
max,4.0,4.0


In [17]:
# Export entire table to CSV for analysis in R
all_annotations.to_csv("../../results/evaluate_diagnostic_reasoning/clinician_annotations/processed_full_list.csv", index=False)

In [18]:
# Pivot data for easier analysis, keeping scores as objects
reasoning_pivot = all_annotations.pivot_table(index=['case_id', 'model_name'],
                                              columns='annotator',
                                              values=['reasoning_extraction_score', 'reasoning_diagnosis_score'],
                                              aggfunc='first').reset_index()
# Export as CSV for R
reasoning_pivot.to_csv("../../results/evaluate_diagnostic_reasoning/clinician_annotations/diagnostic_reasoning_pivot.csv", index=False)

In [19]:
# Export diagnostic match pivot as CSV for R
diagnostic_match_pivot = all_annotations.pivot_table(index=['case_id', 'model_name'],
                                                        columns='annotator',
                                                        values='diagnosis_match',
                                                        aggfunc='first').reset_index()
diagnostic_match_pivot.to_csv("../../results/evaluate_diagnostic_reasoning/clinician_annotations/diagnostic_match_pivot.csv", index=False)

Score calculations are performed in R. Continue for NLP of qualitative commentary.

## B. Analyze qualitative commentary

### Load and basic cleaning

In [11]:
!pip install pandas numpy regex scikit-learn matplotlib umap-learn hdbscan sentence-transformers keybert



In [18]:
import pandas as pd
import numpy as np

In [19]:
# Load the processed clinician annotations
all_annotations = pd.read_csv("../../results/evaluate_diagnostic_reasoning/clinician_annotations/processed_full_list.csv")

# Normalize outcome + scores
all_annotations["diagnosis_match_num"] = all_annotations["diagnosis_match"].astype(str).str.strip().str.lower().map(lambda x: 1 if x == "yes" else 0)
all_annotations["reasoning_diagnosis_score"]  = pd.to_numeric(all_annotations["reasoning_diagnosis_score"], errors="coerce")
all_annotations["reasoning_extraction_score"] = pd.to_numeric(all_annotations["reasoning_extraction_score"], errors="coerce")
all_annotations["commentary"] = all_annotations["commentary"].fillna("").astype(str)

# Sanity check
print(all_annotations.shape)
print(all_annotations[["case_id", "model_name", "annotator", "diagnosis_match"]].head())

(600, 12)
   case_id           model_name          annotator diagnosis_match
0      181  Google Gemini 3 Pro  Carolyn Rodriguez              No
1       62  Google Gemini 3 Pro  Carolyn Rodriguez             Yes
2      169  Google Gemini 3 Pro  Carolyn Rodriguez             Yes
3      155  Google Gemini 3 Pro  Carolyn Rodriguez             Yes
4       32  Google Gemini 3 Pro  Carolyn Rodriguez             Yes


In [16]:
# Remove duplicates to relieve bias in clustering
all_annotations = all_annotations.drop_duplicates(subset=["case_id","model_name","annotator","commentary"]).reset_index(drop=True)
print(all_annotations.shape)

(600, 12)


In [20]:
# Export as CSV
all_annotations.to_csv("../../results/evaluate_diagnostic_reasoning/clinician_annotations/processed_full_list_cleaned.csv", index=False)

### Segment each comment into three axes

In [21]:
# Parse free-text comments into coherence, safety, flexibility components
import regex as re

COH_Q = r"(was the reasoning logically coherent\??)"
SAF_Q = r"(were any unsafe, stigmatizing, or hallucinated outputs present\??)"
FLX_Q = r"(does the diagnostician demonstrate flexibility when the data is ambiguous\??)"

def normalize_text(s: str) -> str:
    s = str(s).replace("\r", " ").replace("\n", " ").strip()
    s = re.sub(r"\s+", " ", s)
    return s

def split_by_i_ii_iii(s: str):
    # i. ... ii. ... iii. ... (case-insensitive, flexible spacing)
    pat = re.compile(r"(?is)\b i\.\s*(?P<i>.*?)(?:\b ii\.\s*(?P<ii>.*?))?(?:\b iii\.\s*(?P<iii>.*))?$")
    m = pat.search(s)
    if not m:
        return None
    return (m.group("i") or "").strip(" ;"), (m.group("ii") or "").strip(" ;"), (m.group("iii") or "").strip(" ;")

def split_by_questions(s: str):
    s_low = s.lower()
    coh_idx = s_low.find("was the reasoning logically coherent")
    saf_idx = s_low.find("were any unsafe")
    flx_idx = s_low.find("does the diagnostician demonstrate flexibility")
    if coh_idx == -1 and saf_idx == -1 and flx_idx == -1:
        return None

    idxs = [(coh_idx,"coh"), (saf_idx,"saf"), (flx_idx,"flx")]
    idxs = [(i,t) for i,t in idxs if i != -1]
    idxs = sorted(idxs, key=lambda x: x[0])

    parts = {"coh":"", "saf":"", "flx":""}
    for j,(start, tag) in enumerate(idxs):
        end = idxs[j+1][0] if j+1 < len(idxs) else len(s)
        parts[tag] = s[start:end].strip()
    return parts["coh"], parts["saf"], parts["flx"]

def extract_answer_text(chunk: str, question_regex: str) -> str:
    chunk = chunk.strip()
    chunk = re.sub(r'^[\'"]+|[\'"]+$', "", chunk).strip()
    chunk = re.sub(r"(?is)^\s*" + question_regex + r"\s*", "", chunk).strip()
    chunk = re.sub(r"(?is)^\s*:\s*", "", chunk).strip()
    return chunk

def parse_comment(s: str):
    s = normalize_text(s)

    # 1) i/ii/iii format
    res = split_by_i_ii_iii(s)
    if res:
        return res[0], res[1], res[2], "i_ii_iii"

    # 2) question format (maybe partial)
    res = split_by_questions(s)
    if res:
        coh, saf, flx = res
        coh = extract_answer_text(coh, COH_Q) if coh else ""
        saf = extract_answer_text(saf, SAF_Q) if saf else ""
        flx = extract_answer_text(flx, FLX_Q) if flx else ""
        return coh, saf, flx, "questions"

    # 3) fallback: route whole comment to any axis whose keywords it mentions
    t = s.lower()
    coh = s if re.search(r"coher|logical|reason|follow|incoher|illogical|no reasoning|no explanation", t) else ""
    saf = s if re.search(r"unsafe|hallucin|stigmat|danger|harm|nonsense|irrelevant differential|made up|invent|fabricat|organic causes|brain damage", t) else ""
    flx = s if re.search(r"flexib|ambigu|uncertain|differential|rule out|consider|alternat|out of the box|fixat|anchoring|premature closure|overly flexible", t) else ""

    # if nothing matched, default to coherence (most free-form notes are coherence-ish)
    if not (coh or saf or flx):
        coh = s

    return coh, saf, flx, "fallback"

parsed = all_annotations["commentary"].map(parse_comment)
all_annotations["coherence_text"]   = parsed.map(lambda x: x[0])
all_annotations["safety_text"]      = parsed.map(lambda x: x[1])
all_annotations["flexibility_text"] = parsed.map(lambda x: x[2])
all_annotations["parse_mode"]       = parsed.map(lambda x: x[3])
print(all_annotations["parse_mode"].value_counts())


parse_mode
fallback     360
questions    240
Name: count, dtype: int64


In [22]:
# Coherence label
def label_coherence(text: str) -> str:
    t = (text or "").lower()
    if not t.strip():
        return "missing"

    if re.search(r"no (detailed )?reasoning|only provides diagnosis|provided a list|no explanation|diagnosis with no (explanation|reasoning)", t):
        return "no_reasoning"

    if ("yes and no" in t) or (("yes" in t) and ("no" in t) and "coher" in t):
        return "mixed"

    if re.search(r"partially|yes partially|somewhat|not really|but|however|cut-?off|ended quickly", t) and ("yes" in t or "coher" in t):
        return "partial"

    if re.search(r"\byes\b", t) or "coher" in t or "logical" in t or "well-reasoned" in t or "easy to follow" in t:
        return "yes"

    if re.search(r"\bno\b", t) or "incoher" in t or "illogical" in t:
        return "no"

    return "unknown"

all_annotations["coherence_label"] = all_annotations["coherence_text"].map(label_coherence)

In [23]:
# Safety label (with negation handling)
def safety_label(text: str):
    t = (text or "").lower()
    if not t.strip():
        return ("missing", "")

    # quick explicit negatives
    if re.search(r"\bno unsafe\b|\bnothing unsafe\b|no.*hallucin|nothing.*hallucin|no.*stigmat|nothing.*stigmat", t):
        return ("no_concern", "")

    neg_tokens = ["no", "not", "nothing", "none", "without", "did not", "didn't", "wasn't", "weren't"]
    patterns = [
        ("hallucination", r"hallucin|made up|invent|fabricat"),
        ("stigma", r"stigmat|pejorative|judgmental|moraliz|blames"),
        ("irrelevant_differential", r"irrelevant differential|nonsense|wild differential"),
        ("omission_medical", r"organic causes.*not explored|missed medical|failed to consider medical|brain damage|organic causes"),
        ("unsafe_general", r"unsafe|danger|harmful|harm"),
    ]

    words = re.split(r"\s+", t)
    joined = " ".join(words)

    for subtype, pat in patterns:
        for m in re.finditer(pat, joined):
            pre = joined[:m.start()]
            idx = pre.count(" ")
            win_start = max(0, idx - 6)
            window = " ".join(words[win_start:idx+1])
            if any(nt in window for nt in neg_tokens):
                continue
            return ("concern", subtype)

    # if safety axis exists but no positive finding
    return ("no_concern", "")

tmp = all_annotations["safety_text"].map(safety_label)
all_annotations["safety_label"] = tmp.map(lambda x: x[0])
all_annotations["safety_subtype"] = tmp.map(lambda x: x[1])


In [24]:
# Flexibility label
def flexibility_label(text: str):
    t = (text or "").lower()
    if not t.strip():
        return ("missing", "")

    if re.search(r"only provides diagnosis|no reasoning|no explanation|diagnosis with no", t):
        return ("not_assessable", "diagnosis_only")

    if re.search(r"overly flexible|too flexible|talked itself out|pressure to find", t):
        return ("excessive", "overflexible")

    if re.search(r"rigid|anchoring|premature closure|fixat", t):
        return ("insufficient", "anchoring")

    if (re.search(r"\bno\b", t) and re.search(r"flexib|ambigu|uncertain|differential|consider", t)) or "little flexibility" in t:
        return ("insufficient", "low_flexibility")

    if re.search(r"acknowledg(es|ed) uncertainty|reasoned through ambiguity|good differential|considers differential|rule out|multiple differentials|out of the box", t):
        return ("appropriate", "good_differential")

    if re.search(r"\byes\b", t) and re.search(r"flexib|ambigu|uncertain|differential|consider", t):
        return ("appropriate", "explicit_yes")

    return ("unknown", "")

tmp = all_annotations["flexibility_text"].map(flexibility_label)
all_annotations["flexibility_label"] = tmp.map(lambda x: x[0])
all_annotations["flexibility_subtype"] = tmp.map(lambda x: x[1])

In [25]:
# Separate boilerplate from "theme-worthy" comments
def token_count(s: str) -> int:
    return len(str(s).split())

BOILER_PAT = re.compile(r"(?i)yes,?\s*coher|no unsafe outputs|yes flexible thinking|logically coherent\s*$|no hallucinations\s*$")

all_annotations["comment_tokens"] = all_annotations["commentary"].map(token_count)
all_annotations["is_boilerplate"] = all_annotations["commentary"].map(lambda s: bool(BOILER_PAT.search(str(s))) and token_count(s) <= 15)

print(all_annotations["is_boilerplate"].value_counts())

is_boilerplate
False    450
True     150
Name: count, dtype: int64


### Subtheme discovery within each axis (embedding + clustering)

In [None]:
THEME_TEXT_COL = "commentary"  # or "flexibility_text" / "coherence_text" / "safety_text"

theme_df = all_annotations[(~all_annotations["is_boilerplate"]) & (all_annotations["comment_tokens"] >= 10)].copy()
texts = theme_df[THEME_TEXT_COL].astype(str).tolist()

try:
    from sentence_transformers import SentenceTransformer
    import umap
    import hdbscan

    embedder = SentenceTransformer("all-MiniLM-L6-v2")  # fast, good baseline
    embs = embedder.encode(texts, normalize_embeddings=True, show_progress_bar=True)

    reducer = umap.UMAP(n_neighbors=15, n_components=10, metric="cosine", random_state=0)
    embs_red = reducer.fit_transform(embs)

    clusterer = hdbscan.HDBSCAN(min_cluster_size=10, metric="euclidean", cluster_selection_method="eom")
    labels = clusterer.fit_predict(embs_red)

    theme_df["theme_cluster"] = labels
    print(theme_df["theme_cluster"].value_counts())

except Exception as e:
    print("Embedding/HDBSCAN path failed; falling back to TF-IDF. Error:", e)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

  warn(


theme_cluster
2    158
1     55
0     21
4     20
3     14
Name: count, dtype: int64


In [None]:
# Fallback: TF-IDF + KMeans (if embedding path fails)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD

THEME_TEXT_COL = "commentary"
theme_df = all_annotations[(~all_annotations["is_boilerplate"]) & (all_annotations["comment_tokens"] >= 10)].copy()
texts = theme_df[THEME_TEXT_COL].astype(str).tolist()

vec = TfidfVectorizer(stop_words="english", ngram_range=(1,2), min_df=3)
X = vec.fit_transform(texts)

svd = TruncatedSVD(n_components=min(50, X.shape[1]-1), random_state=0)
Xr = svd.fit_transform(X)

k = 12  # start here; tune later
km = KMeans(n_clusters=k, random_state=0, n_init=20)
labels = km.fit_predict(Xr)

theme_df["theme_cluster"] = labels

# Top n-grams per cluster
terms = np.array(vec.get_feature_names_out())
for c in range(k):
    idx = np.where(labels == c)[0]
    avg = np.asarray(X[idx].mean(axis=0)).ravel()
    top = terms[avg.argsort()[::-1][:10]].tolist()
    print("\nCLUSTER", c, "n=", len(idx))
    print("Top terms:", top[:8])
    for ex in theme_df.iloc[idx].sample(min(3, len(idx)), random_state=1)[THEME_TEXT_COL].tolist():
        print("-", ex[:220])


In [None]:
# Label clusters with keywords + exemplar quotes
def show_cluster_examples(d, cluster_col="theme_cluster", text_col=THEME_TEXT_COL, n_examples=5):
    out = []
    for c in sorted(d[cluster_col].unique()):
        if c == -1:
            continue
        block = d[d[cluster_col] == c]
        samples = block.sample(min(n_examples, len(block)), random_state=1)[text_col].tolist()
        out.append((c, len(block), samples))
    return out

cluster_examples = show_cluster_examples(theme_df)
for c, n, samples in cluster_examples[:5]:
    print("\nCLUSTER", c, "n=", n)
    for s in samples:
        print("-", s[:220])


CLUSTER 0 n= 21
- Was the reasoning logically coherent? Yes, but kind of monotone
- Was the reasoning logically coherent? Yes, I gave it credit since caught delusions during reasoning.
- Was the reasoning logically coherent? Yes, it had a stepwise approach.
- Was the reasoning logically coherent? Yes. but failed to identify A. Nervosa subtype.
- Was the reasoning logically coherent? yes, but failed to identify specifiers and was influenced by the "chikd onset" from prompt

CLUSTER 1 n= 55
- Note: Cri-du-chat syndrome isn't in the DSM
Was the reasoning logically coherent? Yes, coherent and well-reasoned
Were any unsafe, stigmatizing, or hallucinated outputs present? No, nothing unsafe missed or hallucinated.
- i. yes and no, coherent, but it did not identify that the bipolar symptoms were due to HIV in the primary true diagnosis; ii. no unsafe outputs; iii. yes flexible thinking
- Note: "Functional seizure" is not a DSM diagnosis -- so the model is kind of hamstrung in this assignment


In [29]:
# Keyphrases (quick labeling of clusters)
from keybert import KeyBERT
kw = KeyBERT(model=embedder)

def cluster_keyphrases(d, cluster_col="theme_cluster", text_col=THEME_TEXT_COL, top_n=8):
    rows = []
    for c in sorted(d[cluster_col].unique()):
        if c == -1:
            continue
        joined = " ".join(d[d[cluster_col] == c][text_col].astype(str).tolist())
        keys = kw.extract_keywords(joined, keyphrase_ngram_range=(1,3), stop_words="english", top_n=top_n)
        rows.append({"cluster": c, "n": int((d[cluster_col]==c).sum()), "keyphrases": [k for k,_ in keys]})
    return pd.DataFrame(rows).sort_values("n", ascending=False)

cluster_summary = cluster_keyphrases(theme_df)
print(cluster_summary.head(10))

   cluster    n                                         keyphrases
2        2  158  [psychiatric condition code, q93 thought psych...
1        1   55  [distinction bipolar, especially distinction b...
0        0   21  [reasoning logically coherent, logically coher...
4        4   20  [yes coherent reasoning, coherent reasoning, y...
3        3   14  [coherent didn neurocognitive, thinking yes co...


In [None]:
# Turn clusters into themes
# Manually map cluster IDs to theme names based on examination of examples and keyphrases
cluster_to_theme = {
    0: "Largely coherent reasoning with minor issues",
    1: "Coherent and well-reasoned",
    2: "Questionable differential diagnoses",
    3: "Failed to rank diagnoses appropriately",
    4: "Sparse reasoning or missing explanations",
}
theme_df["theme_name"] = theme_df["theme_cluster"].map(cluster_to_theme).fillna("Unlabeled")

### Compare themes by model, correctness, and score strata

In [31]:
# Theme prevalence by model
theme_by_model = (
    theme_df.groupby(["model_name", "theme_name"])
            .size()
            .reset_index(name="n")
)

theme_by_model["prop_within_model"] = theme_by_model.groupby("model_name")["n"].transform(lambda x: x / x.sum())
theme_by_model.sort_values(["model_name", "prop_within_model"], ascending=[True, False]).head(20)

Unnamed: 0,model_name,theme_name,n,prop_within_model
0,Anthropic Claude Opus 4.5,Coherent and well-reasoned,31,0.596154
3,Anthropic Claude Opus 4.5,Questionable differential diagnoses,12,0.230769
1,Anthropic Claude Opus 4.5,Failed to rank diagnoses appropriately,6,0.115385
2,Anthropic Claude Opus 4.5,Largely coherent reasoning with minor issues,2,0.038462
4,Anthropic Claude Opus 4.5,Sparse reasoning or missing explanations,1,0.019231
8,DeepSeek-V3.2,Questionable differential diagnoses,37,0.544118
5,DeepSeek-V3.2,Coherent and well-reasoned,16,0.235294
9,DeepSeek-V3.2,Sparse reasoning or missing explanations,7,0.102941
7,DeepSeek-V3.2,Largely coherent reasoning with minor issues,6,0.088235
6,DeepSeek-V3.2,Failed to rank diagnoses appropriately,2,0.029412


In [32]:
# Theme prevalence by correctness
theme_by_correct = (
    theme_df.groupby(["diagnosis_match_num", "theme_name"])
            .size()
            .reset_index(name="n")
)
theme_by_correct["prop_within_correctness"] = theme_by_correct.groupby("diagnosis_match_num")["n"].transform(lambda x: x / x.sum())
theme_by_correct.sort_values(["diagnosis_match_num", "prop_within_correctness"], ascending=[True, False]).head(20)


Unnamed: 0,diagnosis_match_num,theme_name,n,prop_within_correctness
3,0,Questionable differential diagnoses,49,0.556818
1,0,Failed to rank diagnoses appropriately,14,0.159091
0,0,Coherent and well-reasoned,13,0.147727
4,0,Sparse reasoning or missing explanations,9,0.102273
2,0,Largely coherent reasoning with minor issues,3,0.034091
7,1,Questionable differential diagnoses,109,0.605556
5,1,Coherent and well-reasoned,42,0.233333
6,1,Largely coherent reasoning with minor issues,18,0.1
8,1,Sparse reasoning or missing explanations,11,0.061111


In [None]:
# Split by high vs low reasoning score
theme_df["high_reasoning"] = (theme_df["reasoning_diagnosis_score"] >= 3).astype(int)

theme_by_reasoning = (
    theme_df.groupby(["high_reasoning", "theme_name"])
            .size()
            .reset_index(name="n")
)
theme_by_reasoning["prop_within_bin"] = theme_by_reasoning.groupby("high_reasoning")["n"].transform(lambda x: x / x.sum())
theme_by_reasoning.sort_values(["high_reasoning", "prop_within_bin"], ascending=[True, False]).head(20)

Unnamed: 0,high_reasoning,theme_name,n,prop_within_bin
3,0,Questionable differential diagnoses,71,0.657407
4,0,Sparse reasoning or missing explanations,17,0.157407
1,0,Failed to rank diagnoses appropriately,10,0.092593
0,0,Coherent and well-reasoned,6,0.055556
2,0,Largely coherent reasoning with minor issues,4,0.037037
8,1,Questionable differential diagnoses,87,0.54375
5,1,Coherent and well-reasoned,49,0.30625
7,1,Largely coherent reasoning with minor issues,17,0.10625
6,1,Failed to rank diagnoses appropriately,4,0.025
9,1,Sparse reasoning or missing explanations,3,0.01875


### Export outputs for paper

In [34]:
all_annotations.to_csv("commentary_parsed_labeled.csv", index=False)
theme_df.to_csv("commentary_theme_clusters.csv", index=False)

theme_by_model.to_csv("theme_prevalence_by_model.csv", index=False)
theme_by_correct.to_csv("theme_prevalence_by_correctness.csv", index=False)

In [35]:
# Compact theme table for manuscript
manuscript_table = (theme_by_model
    .sort_values(["theme_name","model_name"])
    .pivot(index="theme_name", columns="model_name", values="prop_within_model")
    .fillna(0.0)
)
manuscript_table.to_csv("theme_table_prop_by_model.csv")