In [None]:
# LLM Evaluation of Sinusitis Surgery Recommendation

# This script evaluates LLM clinical decision-making for sinusitis surgery.
# The workflow is as follows:
# 1.  Setup: Load libraries, import preprocessing functions, and configure API keys.
# 2.  Preprocessing:
#     - Group all records by patient ID.
#     - For each patient, create a sorted, longitudinal clinical note history.
#     - Identify the date of surgery (if any) and exclude pre-operative/post-operative notes.
#     - Censor any sentences mentioning surgical plans to create a "blinded" note for the LLM.
#     - Aggregate all other clinical data (labs, meds, demographics).
#     - Create a clean, flat DataFrame where each row is a unique patient.
# 3.  Data loading: Read the source tables from BigQuery (in the cells below this runs before preprocessing).
# 4.  LLM Analysis: Send the prompt to the GPT-4 API and parse the JSON response.
# 5.  Evaluation:
#     - Compare the LLM's decision against actual surgery CPT codes.
#     - Calculate accuracy, precision, recall, and F1-score.
#     - Analyze the LLM's confidence on correct vs. incorrect predictions.
# 6.  Output: Save the full results and a sample for human evaluation to CSV files.

In [None]:
# Libraries
import pandas as pd
import numpy as np
import os
import openai
import re
import json
import matplotlib.pyplot as plt

# For evaluation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# For progress bars
from tqdm.auto import tqdm 

# For BigQuery access
from google.cloud import bigquery

In [None]:
# Import preprocessing functions
from utils import (
    censor_surgical_plans,
    extract_ent_notes,
    check_list_for_keywords,
    process_procedures,
    extract_radiology_report,
    query_openai,
    generate_prompt,
    preprocess_all_patients,
    evaluate_predictions
)

# Import API keys and configuration
from config import (
    PROJECT_ID,
    DATASET_IDS,
    DATA_TABLES,
    SURGERY_CPT_CODES,
    LAB_KEYWORDS,
    MED_KEYWORDS,
    DIAGNOSTIC_ENDOSCOPY_CPT_CODES
)

In [None]:
# Load OpenAI API Key
# The key is read from a text file. The file location can be overridden with the
# OPENAI_KEY_FILE environment variable; the default is the original local path.
# If the file is missing, an OPENAI_API_KEY already present in the environment
# is used instead.
OPENAI_KEY_FILE = os.environ.get(
    "OPENAI_KEY_FILE",
    "/Users/joannalin/Github/ent-llm/data/openai_key.txt",
)

try:
    with open(OPENAI_KEY_FILE, "r") as f:
        api_key = f.read().strip()
except FileNotFoundError:
    # Fall back to a key that was exported before the notebook started.
    api_key = os.environ.get("OPENAI_API_KEY", "").strip()

if not api_key:
    # Raise rather than call exit(): the script cannot proceed without a key,
    # and an exception stops execution at this cell with a clear message
    # instead of asking the interpreter/kernel to exit.
    raise RuntimeError(
        f"No OpenAI API key available: '{OPENAI_KEY_FILE}' is missing or empty and "
        "OPENAI_API_KEY is not set. Provide the key file, point OPENAI_KEY_FILE at it, "
        "or export OPENAI_API_KEY."
    )

os.environ["OPENAI_API_KEY"] = api_key
openai.api_key = api_key

In [None]:
# Data Loading - Read in datasets from BigQuery
# For every table named in DATA_TABLES, the same-named table from each dataset
# in DATASET_IDS is stacked with UNION ALL and pulled into one pandas DataFrame.
# Results are collected in `dataframes`, keyed by table name.

print("Connecting to BigQuery to load datasets...")
client = bigquery.Client(project=PROJECT_ID)
dataframes = {}
failed_tables = []  # tables whose query raised; reported once the loop finishes

for table_name in DATA_TABLES:
    print(f"Combining table: '{table_name}'...")
    union_query = "\nUNION ALL\n".join(
        [f"SELECT * FROM `{PROJECT_ID}.{dataset_id}.{table_name}`" for dataset_id in DATASET_IDS]
    )
    full_query = f"""WITH unioned AS ({union_query}) SELECT * FROM unioned"""

    try:
        df = client.query(full_query).to_dataframe()
    except Exception as e:
        # Best-effort load: keep going so the remaining tables are still fetched,
        # but remember the failure so it is not lost in the log output.
        print(f"Failed to load table '{table_name}': {e}")
        failed_tables.append(table_name)
        continue

    dataframes[table_name] = df
    print(f"Loaded: {df.shape[0]} rows x {df.shape[1]} columns for '{table_name}'")

if failed_tables:
    print(
        f"\nWARNING: {len(failed_tables)} of {len(DATA_TABLES)} tables failed to load "
        f"and are missing from `dataframes`: {failed_tables}"
    )

# NOTE(review): the row previews below may contain patient-level data; clear the
# cell output before sharing or committing the notebook.
print("\n=== Loaded Tables Summary ===")
for table_name, df in dataframes.items():
    print(f"\n--- {table_name.upper()} ---")
    print("Columns:", df.columns.tolist())
    print("Top 5 rows:")
    print(df.head())


In [None]:
# Analysis - Process data
# Pass the loaded tables and the surgery CPT codes to preprocess_all_patients;
# the resulting frame is iterated row by row in the LLM analysis step.
print("Processing cases for LLM evaluation...")
cases_df = preprocess_all_patients(
    dataframes,
    SURGERY_CPT_CODES,
)

In [None]:
# Run LLM analysis
# Each row of cases_df is turned into a prompt, sent to the model, and the JSON
# reply is merged with the case's own columns. Cases whose reply cannot be
# parsed are skipped and recorded in failed_case_ids, because every skipped
# case shrinks the set the evaluation is computed on.


def _parse_llm_response(response_text):
    """Decode the model's reply into a dict.

    Accepts plain JSON or JSON wrapped in a Markdown code fence
    (three backticks, optionally tagged "json").

    Raises:
        TypeError: if the reply is not a string (e.g. None).
        ValueError: if the text is not valid JSON (json.JSONDecodeError is a
            ValueError subclass) or the JSON is not an object.
    """
    if not isinstance(response_text, str):
        raise TypeError(f"expected a text response, got {type(response_text).__name__}")

    cleaned = response_text.strip()
    fenced = re.fullmatch(r"```(?:json)?\s*(.*?)\s*```", cleaned, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        cleaned = fenced.group(1)

    parsed = json.loads(cleaned)
    if not isinstance(parsed, dict):
        raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
    return parsed


results = []
failed_case_ids = []
for _, case in tqdm(cases_df.iterrows(), total=len(cases_df), desc="Evaluating Cases"):
    prompt = generate_prompt(case)
    response_text = query_openai(prompt)
    try:
        parsed_response = _parse_llm_response(response_text)
    except (TypeError, ValueError) as e:
        # .get() so a missing 'patient_id' column cannot raise inside the handler.
        patient_id = case.get('patient_id')
        print(f"Failed to parse LLM response for case {patient_id}: {e}")
        failed_case_ids.append(patient_id)
        continue

    results.append({
        **case.to_dict(),
        'llm_decision': parsed_response.get("decision"),
        'llm_confidence': parsed_response.get("confidence"),
        'llm_reasoning': parsed_response.get("reasoning")
    })

eval_df = pd.DataFrame(results)
print(f"Parsed {len(results)} of {len(cases_df)} cases; {len(failed_case_ids)} skipped due to unparseable responses.")

In [None]:
# Evaluate predictions
# Hands the per-case LLM results (eval_df, built in the LLM analysis step) to
# evaluate_predictions from utils. The call is left as the last expression of
# the cell so any value it returns is still displayed by the notebook.
print("\nEvaluating predictions...")
evaluate_predictions(eval_df)

In [None]:
# Save the final complete dataframe to a CSV file.
HUMAN_REVIEW_SAMPLE_SIZE = 200  # maximum number of cases handed to human reviewers
HUMAN_REVIEW_SEED = 42          # fixed seed so the review sample is reproducible

print("\nSaving results...")
full_results_path = "sinusitis_llm_full_results.csv"
eval_df.to_csv(full_results_path, index=False)
print(f"Full results with all columns saved to '{full_results_path}'")

# Save a random sample of up to HUMAN_REVIEW_SAMPLE_SIZE cases to a separate CSV
# for human evaluation. Rows are drawn at random (not taken from the top) so the
# review set does not inherit the row order of eval_df; sort_index() restores
# the original relative order of the selected rows.
if len(eval_df) > 0:
    sample_path = "sinusitis_llm_human_review_sample.csv"
    sample_size = min(HUMAN_REVIEW_SAMPLE_SIZE, len(eval_df))
    review_sample = eval_df.sample(n=sample_size, random_state=HUMAN_REVIEW_SEED).sort_index()
    review_sample.to_csv(sample_path, index=False)
    print(f"A sample of {sample_size} cases for human review saved to '{sample_path}'")
else:
    print("No evaluated cases available; human review sample was not written.")