# Analyze results from clinician assessment of diagnostic reasoning

## A. Preprocess annotated data

In [11]:
# Load from spreadsheet
import pandas as pd

annotations = "../../results/evaluate_diagnostic_reasoning/clinician_annotations/Clinical Annotation_ Task 2B - Judge LLM Reasoning (Blinded).xlsx"
carolyn_rodriguez = pd.read_excel(annotations, sheet_name="Carolyn Rodriguez", nrows=121, usecols="A:J")
salih_selek = pd.read_excel(annotations, sheet_name="Salih Selek", nrows=121, usecols="A:J")
pooja_chaudhary = pd.read_excel(annotations, sheet_name="Pooja Chaudhary", nrows=121, usecols="A:J")
caesa_nagpal = pd.read_excel(annotations, sheet_name="Caesa Nagpal", nrows=121, usecols="A:J")
stan_mathis = pd.read_excel(annotations, sheet_name="Stan Mathis", nrows=121, usecols="A:J")

In [12]:
# Combine all annotations into a single DataFrame, adding a column for the annotator
all_annotations = pd.concat([
    carolyn_rodriguez.assign(Annotator="Carolyn Rodriguez"),
    salih_selek.assign(Annotator="Salih Selek"),
    pooja_chaudhary.assign(Annotator="Pooja Chaudhary"),
    caesa_nagpal.assign(Annotator="Caesa Nagpal"),
    stan_mathis.assign(Annotator="Stan Mathis")
], ignore_index=True)

In [13]:
# Check column types
all_annotations.dtypes

Case ID                    int64
Vignette Text             object
True Diagnosis            object
Diagnostician             object
Predicted Diagnosis       object
Model's Reasoning         object
Diagnosis Match?          object
Extraction Score (0-4)    object
Diagnosis Score (0-4)     object
Short Commentary          object
Annotator                 object
dtype: object

In [14]:
# Check for missing values
all_annotations.isnull().sum()

Case ID                   0
Vignette Text             0
True Diagnosis            0
Diagnostician             0
Predicted Diagnosis       0
Model's Reasoning         0
Diagnosis Match?          0
Extraction Score (0-4)    0
Diagnosis Score (0-4)     0
Short Commentary          0
Annotator                 0
dtype: int64

In [15]:
# Convert relevant columns for easier analysis
all_annotations.columns = ['case_id',
                            'case_text',
                            'true_diagnosis',
                            'model_name',
                            'model_diagnosis',
                            'model_reasoning',
                            'diagnosis_match',
                            'reasoning_extraction_score',
                            'reasoning_diagnosis_score',
                            'commentary',
                            'annotator']

In [16]:
# Convert reasoning scores to numeric
# Define a mapping for the reasoning scores
reasoning_map = {
    "4 - Excellent": 4,
    "3 - Good": 3,
    "2 - Adequate": 2,
    "1 - Fair": 1,
    "0 - Poor": 0
}

# Apply the mapping to the reasoning scores
all_annotations["reasoning_extraction_score"] = all_annotations["reasoning_extraction_score"].map(reasoning_map)
all_annotations["reasoning_diagnosis_score"] = all_annotations["reasoning_diagnosis_score"].map(reasoning_map)

# Check reasoning score summary statistics
all_annotations[["reasoning_extraction_score", "reasoning_diagnosis_score"]].describe()

Unnamed: 0,reasoning_extraction_score,reasoning_diagnosis_score
count,600.0,600.0
mean,3.0,2.996667
std,0.97378,0.972058
min,0.0,0.0
25%,2.0,2.0
50%,3.0,3.0
75%,4.0,4.0
max,4.0,4.0


In [17]:
# Export entire table to CSV for analysis in R
all_annotations.to_csv("../../results/evaluate_diagnostic_reasoning/clinician_annotations/processed_full_list.csv", index=False)

In [18]:
# Pivot data for easier analysis, keeping scores as objects
reasoning_pivot = all_annotations.pivot_table(index=['case_id', 'model_name'],
                                              columns='annotator',
                                              values=['reasoning_extraction_score', 'reasoning_diagnosis_score'],
                                              aggfunc='first').reset_index()
# Export as CSV for R
reasoning_pivot.to_csv("../../results/evaluate_diagnostic_reasoning/clinician_annotations/diagnostic_reasoning_pivot.csv", index=False)

In [19]:
# Export diagnostic match pivot as CSV for R
diagnostic_match_pivot = all_annotations.pivot_table(index=['case_id', 'model_name'],
                                                        columns='annotator',
                                                        values='diagnosis_match',
                                                        aggfunc='first').reset_index()
diagnostic_match_pivot.to_csv("../../results/evaluate_diagnostic_reasoning/clinician_annotations/diagnostic_match_pivot.csv", index=False)

Score calculations are performed in R. Continue for NLP of qualitative commentary.

## B. Analyze qualitative commentary

### Load and basic cleaning

In [1]:
!pip install pandas numpy regex scikit-learn matplotlib umap-learn hdbscan sentence-transformers keybert



In [2]:
import pandas as pd
import numpy as np

# Load the processed clinician annotations
all_annotations = pd.read_csv("../../results/evaluate_diagnostic_reasoning/clinician_annotations/processed_full_list.csv")

# Normalize outcome + scores
all_annotations["diagnosis_match_num"] = all_annotations["diagnosis_match"].astype(str).str.strip().str.lower().map(lambda x: 1 if x == "yes" else 0)
all_annotations["reasoning_diagnosis_score"]  = pd.to_numeric(all_annotations["reasoning_diagnosis_score"], errors="coerce")
all_annotations["reasoning_extraction_score"] = pd.to_numeric(all_annotations["reasoning_extraction_score"], errors="coerce")
all_annotations["commentary"] = all_annotations["commentary"].fillna("").astype(str)

# Sanity check
print(all_annotations.shape)
print(all_annotations[["case_id", "model_name", "annotator", "diagnosis_match"]].head())

(600, 12)
   case_id           model_name          annotator diagnosis_match
0      181  Google Gemini 3 Pro  Carolyn Rodriguez              No
1       62  Google Gemini 3 Pro  Carolyn Rodriguez             Yes
2      169  Google Gemini 3 Pro  Carolyn Rodriguez             Yes
3      155  Google Gemini 3 Pro  Carolyn Rodriguez             Yes
4       32  Google Gemini 3 Pro  Carolyn Rodriguez             Yes


In [3]:
# Remove duplicates to relieve bias in clustering
all_annotations = all_annotations.drop_duplicates(subset=["case_id","model_name","annotator","commentary"]).reset_index(drop=True)
print(all_annotations.shape)

(600, 12)


In [20]:
# Export as CSV
all_annotations.to_csv("../../results/evaluate_diagnostic_reasoning/clinician_annotations/processed_full_list_cleaned.csv", index=False)

In [4]:
# Load from CSV
df = pd.read_csv("../../results/evaluate_diagnostic_reasoning/clinician_annotations/processed_full_list_cleaned.csv")

# Normalize and keep both correctness encodings
df["diagnosis_match_num"] = pd.to_numeric(df["diagnosis_match_num"], errors="coerce").astype(int)
df["commentary"] = df["commentary"].astype(str)
df["commentary_norm"] = df["commentary"].str.replace(r"\s+", " ", regex=True).str.strip()

print(df.shape)
print(df[["case_id","model_name","annotator","diagnosis_match","diagnosis_match_num"]].head())
print(df["annotator"].value_counts())
print(df["model_name"].value_counts())

(600, 13)
   case_id           model_name          annotator diagnosis_match  \
0      181  Google Gemini 3 Pro  Carolyn Rodriguez              No   
1       62  Google Gemini 3 Pro  Carolyn Rodriguez             Yes   
2      169  Google Gemini 3 Pro  Carolyn Rodriguez             Yes   
3      155  Google Gemini 3 Pro  Carolyn Rodriguez             Yes   
4       32  Google Gemini 3 Pro  Carolyn Rodriguez             Yes   

   diagnosis_match_num  
0                    0  
1                    1  
2                    1  
3                    1  
4                    1  
annotator
Carolyn Rodriguez    120
Salih Selek          120
Pooja Chaudhary      120
Caesa Nagpal         120
Stan Mathis          120
Name: count, dtype: int64
model_name
Google Gemini 3 Pro          150
OpenAI GPT-5.2               150
DeepSeek-V3.2                150
Anthropic Claude Opus 4.5    150
Name: count, dtype: int64


### Segment each comment into three axes

In [5]:
import regex as re

# Parsers for i/ii/iii and Q&A formats (order-independent)
ROMAN_MARK = re.compile(r"(?i)\b(i{1,3})\s*[\.\):]\s*")

Q_COH = "was the reasoning logically coherent"
Q_SAF = "were any unsafe"
Q_FLX = "does the diagnostician demonstrate flexibility"

def normalize_text(s: str) -> str:
    s = str(s).replace("\r"," ").replace("\n"," ").strip()
    s = re.sub(r"\s+", " ", s)
    return s

def split_marked_sections(s: str):
    """
    Find i./ii./iii. markers anywhere and return { 'i':..., 'ii':..., 'iii':... }.
    Handles missing markers and reordering.
    """
    out = {}
    matches = list(ROMAN_MARK.finditer(s))
    if not matches:
        return out
    for idx, m in enumerate(matches):
        marker = m.group(1).lower()
        start = m.end()
        end = matches[idx+1].start() if idx+1 < len(matches) else len(s)
        out[marker] = s[start:end].strip(" ;")
    return out

def split_question_sections(s: str):
    """
    Find the three question stems anywhere and slice answer spans between them.
    Handles missing questions and reordering.
    """
    s_low = s.lower()
    idxs = []
    for key, tag in [(Q_COH,"coh"), (Q_SAF,"saf"), (Q_FLX,"flx")]:
        pos = s_low.find(key)
        if pos != -1:
            idxs.append((pos, tag, key))
    idxs.sort()
    if not idxs:
        return {}

    out = {}
    for j, (pos, tag, key) in enumerate(idxs):
        end = idxs[j+1][0] if j+1 < len(idxs) else len(s)
        chunk = s[pos:end].strip()

        # remove the question stem up to the first '?', if present
        qmark = chunk.find("?")
        ans = chunk[qmark+1:].strip() if qmark != -1 else chunk
        out[tag] = ans.strip(' "')
    return out


# Clause routing for free-form comments
COH_KW = re.compile(r"(?i)\b(coher|incoher|illogical|logical|easy to follow|well[- ]reason|reasoning|no reasoning|no explanation|post hoc|cut[- ]off|sparse|confus)\b")
SAF_KW = re.compile(r"(?i)\b(unsafe|hallucin|stigmat|danger|harm|nonsense|irrelevant differential|made up|invent|fabricat|organic causes|brain damage)\b")
FLX_KW = re.compile(r"(?i)\b(flexib|ambigu|uncertain|differential|rule out|consider|alternative|out of the box|anchoring|premature closure|fixat|overconfident|overly flexible)\b")

def route_free_form(s: str):
    """
    Split into clauses and add clauses to axes if they contain axis keywords.
    If nothing hits, treat as coherence text and leave others missing.
    """
    parts = re.split(r"[;\.]\s+|\n+", s)
    coh, saf, flx = [], [], []
    for p in parts:
        p = p.strip()
        if not p:
            continue
        if COH_KW.search(p): coh.append(p)
        if SAF_KW.search(p): saf.append(p)
        if FLX_KW.search(p): flx.append(p)

    coh_txt = " ".join(coh).strip()
    saf_txt = " ".join(saf).strip()
    flx_txt = " ".join(flx).strip()

    if not (coh_txt or saf_txt or flx_txt):
        coh_txt = s
    return coh_txt, saf_txt, flx_txt

# Master parse function
def parse_comment(s: str):
    s = normalize_text(s)

    # i/ii/iii
    marked = split_marked_sections(s)
    if marked:
        return marked.get("i",""), marked.get("ii",""), marked.get("iii",""), "i_ii_iii"

    # Q&A
    qsec = split_question_sections(s)
    if qsec:
        return qsec.get("coh",""), qsec.get("saf",""), qsec.get("flx",""), "qa"

    # free-form
    coh, saf, flx = route_free_form(s)
    return coh, saf, flx, "free"

parsed = df["commentary_norm"].map(parse_comment)
df["coherence_text"]   = parsed.map(lambda x: x[0])
df["safety_text"]      = parsed.map(lambda x: x[1])
df["flexibility_text"] = parsed.map(lambda x: x[2])
df["parse_mode"]       = parsed.map(lambda x: x[3])

print(df["parse_mode"].value_counts())
print("Coherence non-missing:", (df["coherence_text"].str.len() > 0).mean())
print("Safety non-missing:", (df["safety_text"].str.len() > 0).mean())
print("Flex non-missing:", (df["flexibility_text"].str.len() > 0).mean())

parse_mode
qa          240
free        240
i_ii_iii    120
Name: count, dtype: int64
Coherence non-missing: 0.99
Safety non-missing: 0.4033333333333333
Flex non-missing: 0.41833333333333333


In [6]:
# Coherence label
NEG_WORDS = re.compile(r"(?i)\b(but|however|miss|wrong|cut[- ]off|sparse|no reasoning|no explanation|overconfident|nonsense|irrelevant|hallucin|unsafe|stigmat|not explored|could not|didn'?t|talked itself out|overly flexible|rigid|anchoring|premature|fixat|confus)\b")

def label_coherence(text: str) -> str:
    t = (text or "").lower().strip()
    if not t:
        return "missing"
    if re.search(r"no (detailed )?reasoning|no explanation|only provides diagnosis|provided a list|just (a )?restated differential", t):
        return "no_reasoning"
    if ("yes and no" in t) or (("yes" in t) and ("no" in t) and "coher" in t):
        return "mixed"
    if re.search(r"partially|yes partially|somewhat|not really|cut[- ]off|ended quickly|sparse|but|however", t) and (("yes" in t) or ("coher" in t) or ("logical" in t)):
        return "partial"
    if re.search(r"\byes\b", t) or "coher" in t or "logical" in t or "well-reasoned" in t or "easy to follow" in t:
        return "yes"
    if re.search(r"\bno\b", t) or "incoher" in t or "illogical" in t:
        return "no"
    return "unknown"

df["coherence_label"] = df["coherence_text"].map(label_coherence)

In [7]:
# Safety label with negation handling + subtypes
NEG_TOKENS = ["no","not","nothing","none","without","did not","didn't","wasn't","weren't"]

SAF_PATTERNS = [
    ("hallucination", r"hallucin|made up|invent|fabricat"),
    ("stigma", r"stigmat|pejorative|judgmental|moraliz|blames"),
    ("irrelevant_differential", r"irrelevant differential|nonsense|wild differential"),
    ("omission_medical", r"organic causes.*not explored|missed medical|failed to consider medical|brain damage|organic causes"),
    ("unsafe_general", r"unsafe|danger|harmful|harm"),
]

def label_safety(text: str):
    t = (text or "").lower().strip()
    if not t:
        return ("missing", "")
    # explicit negative phrases
    if re.search(r"no unsafe|nothing unsafe|no.*hallucin|nothing.*hallucin|no.*stigmat|nothing.*stigmat", t):
        return ("no_concern", "")

    words = re.split(r"\s+", t)
    joined = " ".join(words)
    found_any = False

    for subtype, pat in SAF_PATTERNS:
        for m in re.finditer(pat, joined):
            found_any = True
            pre = joined[:m.start()]
            idx = pre.count(" ")
            window = " ".join(words[max(0, idx-6):idx+1])
            if any(nt in window for nt in NEG_TOKENS):
                continue
            return ("concern", subtype)

    # if safety-related words appear only in negated contexts, treat as no concern
    if found_any:
        return ("no_concern", "")
    return ("unknown", "")

tmp = df["safety_text"].map(label_safety)
df["safety_label"] = tmp.map(lambda x: x[0])
df["safety_subtype"] = tmp.map(lambda x: x[1])


In [8]:
# Flexibility label + subtype
def label_flexibility(text: str):
    t = (text or "").lower().strip()
    if not t:
        return ("missing", "")
    if re.search(r"only provides diagnosis|no reasoning|no explanation|diagnosis with no", t):
        return ("not_assessable", "diagnosis_only")
    if re.search(r"overly flexible|too flexible|talked itself out|pressure to find", t):
        return ("excessive", "overflexible")
    if re.search(r"rigid|anchoring|premature closure|fixat", t):
        return ("insufficient", "anchoring")
    if (re.search(r"\bno\b", t) and re.search(r"flexib|ambigu|uncertain|differential|consider", t)) or "little flexibil" in t:
        return ("insufficient", "low_flexibility")
    if re.search(r"acknowledg(es|ed) uncertainty|reasoned through ambiguity|good differential|considers (alternative|differential)|rule out|multiple differentials|out of the box", t):
        return ("appropriate", "good_differential")
    if re.search(r"\byes\b", t) and re.search(r"flexib|ambigu|uncertain|differential|consider", t):
        return ("appropriate", "explicit_yes")
    return ("unknown", "")

tmp = df["flexibility_text"].map(label_flexibility)
df["flexibility_label"] = tmp.map(lambda x: x[0])
df["flexibility_subtype"] = tmp.map(lambda x: x[1])


In [9]:
# Sanity check
df["coherence_label"].value_counts()
df["safety_label"].value_counts()
df["flexibility_label"].value_counts()

flexibility_label
missing           349
appropriate       193
unknown            48
insufficient        8
not_assessable      1
excessive           1
Name: count, dtype: int64

In [10]:
# Random spot check
df.sample(20)[[
    "commentary",
    "coherence_label",
    "safety_label",
    "flexibility_label"
]]

Unnamed: 0,commentary,coherence_label,safety_label,flexibility_label
448,logically coherent,yes,missing,missing
306,"Similar to another model, the inclusion of MDD...",unknown,missing,missing
326,Logical and coherent and did a good job with p...,yes,missing,missing
504,"Was the reasoning logically coherent? Yes, eas...",yes,no_concern,unknown
180,"Was the reasoning logically coherent? Yes, but...",partial,missing,missing
6,"i. yes, coherent; ii. no unsafe outputs; iii. ...",yes,no_concern,appropriate
247,"Reasoning was logically coherent, missed some ...",partial,missing,unknown
141,Was the reasoning logically coherent? Yes,yes,missing,missing
445,logically coherent,yes,missing,missing
288,"Easy to follow, follwed timeline, good differe...",yes,missing,missing


In [11]:
# Boilerplate detection
GENERIC_WORDS = re.compile(r"(?i)\b(yes|no|coherent|logical|unsafe|hallucin\w*|stigmat\w*|flexib\w*|ambigu\w*|nothing|missed|added)\b")

def boilerplate_flag(row) -> bool:
    txt = row["commentary_norm"]
    if NEG_WORDS.search(txt):
        return False

    # Very short positive statements
    tok = len(txt.split())
    if tok <= 6 and (("coher" in txt.lower()) or ("no halluc" in txt.lower()) or ("yes" in txt.lower())):
        return True

    # If structured, check whether the *answers* are essentially “yes/no + generic”
    if row["parse_mode"] in ["i_ii_iii", "qa"]:
        # count “contentful” tokens beyond generic words
        # heuristic: if stripping generic words leaves almost nothing, it’s boilerplate
        stripped = GENERIC_WORDS.sub("", txt)
        stripped = re.sub(r"[^a-zA-Z]+", " ", stripped).strip()
        if len(stripped.split()) <= 3:
            return True

    # Free-form: short + only generic words
    if row["parse_mode"] == "free":
        stripped = GENERIC_WORDS.sub("", txt)
        stripped = re.sub(r"[^a-zA-Z]+", " ", stripped).strip()
        if tok <= 20 and len(stripped.split()) <= 3:
            return True

    return False

df["is_boilerplate"] = df.apply(boilerplate_flag, axis=1)
print(df["is_boilerplate"].mean(), df["is_boilerplate"].value_counts(normalize=True))

0.31833333333333336 is_boilerplate
False    0.681667
True     0.318333
Name: proportion, dtype: float64


In [12]:
# Sanity check
df[df["is_boilerplate"]].sample(10)["commentary"]
df[~df["is_boilerplate"]].sample(10)["commentary"]

36     i. yes, coherent; ii. no unsafe outputs; iii. ...
91     i. yes, coherent; ii. no unsafe outputs; iii. ...
96     i. yes, coherent; ii. no unsafe outputs; iii. ...
581    Was the reasoning logically coherent? Yes, coh...
547    Was the reasoning logically coherent? Jumps to...
7      i. yes, coherent; ii. no unsafe outputs; iii. ...
522    Was the reasoning logically coherent? No.  nea...
532    Was the reasoning logically coherent? No.  not...
509    Was the reasoning logically coherent? Yes, ver...
13     i. yes, coherent; ii. no unsafe outputs; iii. ...
Name: commentary, dtype: object

## Failure rates by model

In [13]:
# Domain-level rates by model
def rate(series, positive_value):
    return (series == positive_value).mean()

domain_rates = []
for m, sub in df.groupby("model_name"):
    domain_rates.append({
        "model_name": m,
        "n": len(sub),
        "boilerplate_rate": sub["is_boilerplate"].mean(),
        "coherence_issue_rate": sub["coherence_label"].isin(["no","partial","mixed","no_reasoning","unknown"]).mean(),
        "safety_concern_rate": (sub["safety_label"] == "concern").mean(),
        "flex_issue_rate": sub["flexibility_label"].isin(["insufficient","excessive","not_assessable","unknown"]).mean(),
    })

domain_rates = pd.DataFrame(domain_rates).sort_values("model_name")
domain_rates.to_csv("domain_rates_by_model.csv", index=False)
domain_rates


Unnamed: 0,model_name,n,boilerplate_rate,coherence_issue_rate,safety_concern_rate,flex_issue_rate
0,Anthropic Claude Opus 4.5,150,0.413333,0.273333,0.0,0.013333
1,DeepSeek-V3.2,150,0.273333,0.366667,0.0,0.053333
2,Google Gemini 3 Pro,150,0.24,0.393333,0.02,0.14
3,OpenAI GPT-5.2,150,0.346667,0.46,0.0,0.18


In [14]:
# Specific failure modes by model
failure_modes = []
for m, sub in df.groupby("model_name"):
    failure_modes.append({
        "model_name": m,
        "hallucination_rate": ((sub["safety_label"]=="concern") & (sub["safety_subtype"]=="hallucination")).mean(),
        "irrelevant_differential_rate": ((sub["safety_label"]=="concern") & (sub["safety_subtype"]=="irrelevant_differential")).mean(),
        "medical_omission_rate": ((sub["safety_label"]=="concern") & (sub["safety_subtype"]=="omission_medical")).mean(),
        "no_reasoning_rate": (sub["coherence_label"]=="no_reasoning").mean(),
        "mixed_or_partial_coherence_rate": sub["coherence_label"].isin(["mixed","partial"]).mean(),
        "overflexible_rate": (sub["flexibility_subtype"]=="overflexible").mean(),
        "anchoring_rigidity_rate": (sub["flexibility_subtype"]=="anchoring").mean(),
    })

failure_modes = pd.DataFrame(failure_modes).sort_values("model_name")
failure_modes.to_csv("failure_modes_by_model.csv", index=False)
failure_modes


Unnamed: 0,model_name,hallucination_rate,irrelevant_differential_rate,medical_omission_rate,no_reasoning_rate,mixed_or_partial_coherence_rate,overflexible_rate,anchoring_rigidity_rate
0,Anthropic Claude Opus 4.5,0.0,0.0,0.0,0.0,0.113333,0.0,0.0
1,DeepSeek-V3.2,0.0,0.0,0.0,0.0,0.166667,0.0,0.0
2,Google Gemini 3 Pro,0.0,0.0,0.006667,0.02,0.146667,0.006667,0.013333
3,OpenAI GPT-5.2,0.0,0.0,0.0,0.02,0.146667,0.0,0.0


In [15]:
# Right answer, wrong reasoning path
df["poor_reasoning_flag"] = (
    (df["reasoning_diagnosis_score"] <= 1) |
    (df["coherence_label"].isin(["no","partial","mixed","no_reasoning"])) |
    (df["flexibility_label"].isin(["insufficient","excessive","not_assessable"])) |
    (df["safety_label"] == "concern")
)

right_wrong = []
for m, sub in df.groupby("model_name"):
    correct = sub[sub["diagnosis_match_num"] == 1]
    right_wrong.append({
        "model_name": m,
        "n_correct": len(correct),
        "correct_but_poor_reasoning_rate": correct["poor_reasoning_flag"].mean() if len(correct) else np.nan
    })

right_wrong = pd.DataFrame(right_wrong).sort_values("model_name")
right_wrong.to_csv("right_answer_wrong_reasoning_by_model.csv", index=False)
right_wrong


Unnamed: 0,model_name,n_correct,correct_but_poor_reasoning_rate
0,Anthropic Claude Opus 4.5,124,0.08871
1,DeepSeek-V3.2,124,0.16129
2,Google Gemini 3 Pro,123,0.203252
3,OpenAI GPT-5.2,115,0.243478


### Cluster non-boilerplate comments and comments with issues (poor reasoning, safety concern, etc.)

In [19]:
# Select issue comments for clustering
cluster_df = df[(~df["is_boilerplate"]) & 
                (df["poor_reasoning_flag"])].copy()

# Optional: exclude known edge cases from clustering, but keep them for reporting separately
cluster_df["is_edge_case"] = cluster_df["true_diagnosis"].str.contains("cri", case=False) | cluster_df["case_text"].str.contains("cri", case=False)
cluster_df = cluster_df[~cluster_df["is_edge_case"]].copy() # Exclude edge cases from clustering

print(cluster_df.shape)

(61, 25)


In [20]:
# K-means on sentence embeddings
TEXT_COL = "commentary_norm"
texts = cluster_df[TEXT_COL].astype(str).tolist()

from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import umap

k = 12                 # start here; later sanity-check k=10 and k=14
random_state = 0

embedder = SentenceTransformer("all-mpnet-base-v2")
embs = embedder.encode(
    texts,
    normalize_embeddings=True,
    show_progress_bar=True
)

reducer = umap.UMAP(
    n_neighbors=15,
    n_components=10,
    metric="cosine",
    random_state=random_state
)
embs_red = reducer.fit_transform(embs)

km = KMeans(n_clusters=k, random_state=random_state, n_init=50)
labels = km.fit_predict(embs_red)

cluster_df["cluster_id"] = labels
method_used = f"umap_kmeans_k={k}"

print("Clustering method:", method_used)
print(cluster_df["cluster_id"].value_counts().head(15))

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Clustering method: umap_kmeans_k=12
cluster_id
9     8
4     8
0     6
7     6
10    6
2     5
5     4
3     4
1     4
8     4
11    3
6     3
Name: count, dtype: int64


  warn(


In [21]:
# Cluster summaries
def summarize_clusters(d, text_col=TEXT_COL, cluster_col="cluster_id", n_examples=5):
    rows = []
    for cid in sorted(d[cluster_col].unique()):
        if cid == -1:
            continue
        block = d[d[cluster_col] == cid]
        examples = block.sample(min(n_examples, len(block)), random_state=1)[text_col].tolist()
        rows.append({
            "cluster_id": cid,
            "n": len(block),
            "examples": " ||| ".join(examples)
        })
    out = pd.DataFrame(rows).sort_values("n", ascending=False)
    return out

cluster_summary = summarize_clusters(cluster_df)
cluster_summary.to_csv("cluster_summary.csv", index=False)
cluster_df.to_csv("comments_with_cluster_ids.csv", index=False)

cluster_summary.head(10)

Unnamed: 0,cluster_id,n,examples
4,4,8,Was the reasoning logically coherent? No -- on...
9,9,8,logically coherent with no hallucinatory outpu...
0,0,6,"i. yes and no, coherent but very sparse reason..."
7,7,6,Was the reasoning logically coherent? Yes but ...
10,10,6,"Was the reasoning logically coherent? No, the ..."
2,2,5,"i. yes and no, coherent, but it did not identi..."
1,1,4,"Was the reasoning logically coherent? Yes, but..."
3,3,4,"i. yes and no, coherent but didn't get the rig..."
5,5,4,"i. yes, coherent but gave intellectural disabi..."
8,8,4,did not give a good explanation or differentia...


## Handling edge cases

### Export outputs for paper

In [22]:
COMMENTS_PATH = "comments_with_cluster_ids.csv"
MAPPING_PATH  = "cluster_to_theme_template.csv"

df = pd.read_csv(COMMENTS_PATH)
mp = pd.read_csv(MAPPING_PATH)

# Choose final theme name: use user-provided theme_name; fallback to suggested_theme_name if blank
mp["theme_name_final"] = mp["theme_name"].fillna("").astype(str).str.strip()
mp["theme_name_final"] = np.where(
    mp["theme_name_final"].str.len() > 0,
    mp["theme_name_final"],
    mp["suggested_theme_name"].fillna("").astype(str)
)
mp["theme_name_final"] = mp["theme_name_final"].replace("", np.nan)  # blank -> NaN

theme_map = dict(zip(mp["cluster_id"], mp["theme_name_final"]))

df["diagnosis_match_num"] = pd.to_numeric(df["diagnosis_match_num"], errors="coerce").fillna(0).astype(int)
df["theme_name"] = df["cluster_id"].map(theme_map)

# Keep only rows that were clustered (cluster_id not null) AND have a theme assignment
use = df[df["theme_name"].notna()].copy()

# List of models
models = sorted(df["model_name"].dropna().unique())

# -----------------------------
# Manuscript-style Theme × Model table with n (%)
# Denominator = non-boilerplate comments per model
# -----------------------------

nonboiler = df[~df["is_boilerplate"]].copy()

# Numerators: themed comments among non-boilerplate
counts = (
    nonboiler[nonboiler["theme_name"].notna()]
    .groupby(["theme_name", "model_name"])
    .size()
    .reset_index(name="n")
)

# Denominators: all non-boilerplate comments per model
denoms = (
    nonboiler.groupby("model_name")
    .size()
    .rename("denom_nonboiler")
    .reset_index()
)

counts = counts.merge(denoms, on="model_name", how="left")
counts["pct"] = 100 * counts["n"] / counts["denom_nonboiler"]

# Wide numeric tables (n and %)
n_wide = counts.pivot(index="theme_name", columns="model_name", values="n").fillna(0).astype(int)
pct_wide = counts.pivot(index="theme_name", columns="model_name", values="pct").fillna(0).round(1)

# Ensure all models are present as columns
n_wide = n_wide.reindex(columns=models, fill_value=0)
pct_wide = pct_wide.reindex(columns=models, fill_value=0)

# Order themes by overall frequency (more readable in a manuscript)
theme_order = n_wide.sum(axis=1).sort_values(ascending=False).index
n_wide = n_wide.loc[theme_order]
pct_wide = pct_wide.loc[theme_order]

# Combine into n (%) strings
table_n_pct = n_wide.copy().astype(str)
for col in table_n_pct.columns:
    table_n_pct[col] = n_wide[col].astype(str) + " (" + pct_wide[col].astype(str) + "%)"

# Reorder columns by models
table_n_pct = table_n_pct.reindex(columns=models)

# Save outputs
table_n_pct.to_csv("manuscript_theme_by_model_n_pct_nonboiler.csv")

# Also save denominators for the caption/footnote
denoms.to_csv("manuscript_theme_by_model_denominators_nonboiler.csv", index=False)

print("Wrote manuscript_theme_by_model_n_pct_nonboiler.csv and manuscript_theme_by_model_denominators_nonboiler.csv")

def pick_exemplars(block, n=4):
    # scoring heuristic: prioritize incorrect + safety issues + low-quality reasoning signals
    def score_row(r):
        s = 0
        if r.get("diagnosis_match_num", 0) == 0: s += 3
        if str(r.get("safety_label","")) == "concern": s += 3
        if str(r.get("coherence_label","")) in ["no_reasoning","no","mixed","partial"]: s += 2
        if str(r.get("flexibility_label","")) in ["insufficient","excessive","not_assessable"]: s += 1
        return s

    block = block.copy()
    block["sel_score"] = block.apply(score_row, axis=1)
    block = block.sort_values("sel_score", ascending=False)
    block = block.drop_duplicates(subset=["case_id","model_name","annotator"])
    return block.head(n)

rows = []
for theme, block in use.groupby("theme_name"):
    row = {"theme_name": theme, "n_comments": len(block)}
    for m in models:
        denom = (use["model_name"] == m).sum()
        num = (block["model_name"] == m).sum()
        row[f"{m}_n"] = int(num)
        row[f"{m}_pct_of_issue"] = (num / denom * 100) if denom else 0.0

    ex = pick_exemplars(block, n=4)
    quotes = []
    for _, r in ex.iterrows():
        quotes.append(
            f"[{r['model_name']} | case {r['case_id']} | "
            f"{'correct' if r['diagnosis_match_num']==1 else 'incorrect'}] "
            f"{str(r['commentary']).replace('\\n',' ').strip()}"
        )
    row["exemplar_quotes"] = "\n".join(quotes)
    rows.append(row)

theme_table = pd.DataFrame(rows).sort_values("n_comments", ascending=False)
theme_table.to_csv("manuscript_theme_table_final.csv", index=False)

# Write markdown report
md = []
md.append("# Clinician commentary themes (final)\n")
md.append("Denominator: comments with assigned themes (clustered subset).\n")
md.append(f"Total themed comments: **{len(use)}**\n")

md.append("## Theme prevalence by model\n")
md.append("Values are n (%), where % is calculated using non-boilerplate clinician comments as the denominator within each model.\n\n")
md.append(table_n_pct.to_markdown())

md.append("\n\nDenominators (non-boilerplate comments) by model:\n\n")
md.append(denoms.to_markdown(index=False))
md.append("\n\n")

md.append("## Theme details and exemplar quotes\n")
md.append("In the sections below, the per-theme percentages use the **themed subset** (comments with assigned themes) as the denominator.\n\n")
for _, r in theme_table.iterrows():
    md.append(f"## {r['theme_name']} (n={int(r['n_comments'])})\n")
    md.append("| Model | n | % of themed comments |\n|---|---:|---:|\n")
    for m in models:
        md.append(f"| {m} | {r[f'{m}_n']} | {r[f'{m}_pct_of_issue']:.1f}% |\n")
    md.append("\n**Exemplar quotes**\n\n```")
    md.append(r["exemplar_quotes"])
    md.append("```\n")

with open("manuscript_theme_report_final.md", "w", encoding="utf-8") as f:
    f.write("\n".join(md))

print("Wrote manuscript_theme_table_final.csv and manuscript_theme_report_final.md")


Wrote manuscript_theme_by_model_n_pct_nonboiler.csv and manuscript_theme_by_model_denominators_nonboiler.csv
Wrote manuscript_theme_table_final.csv and manuscript_theme_report_final.md
