In [None]:
# LLM Evaluation of Sinusitis Surgery Recommendation

# This script evaluates LLM clinical decision-making for sinusitis surgery.
# The workflow is as follows:
# 1.  Setup: Load libraries and configure API keys.
# 2.  Data Loading: Load and combine multiple years of patient data from BigQuery.
# 3.  Preprocessing:
#     - Group all records by patient ID.
#     - For each patient, create a sorted, longitudinal clinical note history.
#     - Identify the date of surgery (if any) and exclude pre-operative/post-operative notes.
#     - Censor any sentences mentioning surgical plans to create a "blinded" note for the LLM.
#     - Aggregate all other clinical data (labs, meds, demographics).
#     - Create a clean, flat DataFrame where each row is a unique patient.
# 4.  LLM Analysis:
#     - Generate a structured prompt for each case.
#     - Send the prompt to the GPT-4 API and parse the JSON response.
# 5.  Evaluation:
#     - Compare the LLM's decision against actual surgery CPT codes.
#     - Calculate accuracy, precision, recall, and F1-score.
#     - Analyze the LLM's confidence on correct vs. incorrect predictions.
# 6.  Output: Save the full results and a sample for human evaluation to CSV files.

In [None]:
# Libraries
import pandas as pd
import numpy as np
import os
import re
import openai
import json
import matplotlib.pyplot as plt

# For evaluation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# For progress bars
from tqdm.auto import tqdm 

# For BigQuery access
from google.cloud import bigquery

In [None]:
from utils import (
    censor_surgical_plans,
    extract_ent_notes,
    check_list_for_keywords,
    process_procedures,
    extract_radiology_report,
    query_openai,
    generate_prompt,
    preprocess_all_patients
)

In [None]:
# Load OpenAI API Key
# The key is read from a local text file, exported through the environment
# variable OPENAI_API_KEY, and also assigned to `openai.api_key`.
try:
    with open("openai_key.txt", "r") as f:
        api_key = f.read().strip()
except FileNotFoundError:
    print("OpenAI key file not found. Make sure 'openai_key.txt' is in the directory,")
    # Re-raise rather than calling exit(): inside a notebook exit() shuts the
    # kernel down (losing all state), and as a plain script it terminates with
    # a success status. Propagating the error stops execution with a traceback.
    raise

# A blank key file would otherwise only surface later as an authentication
# failure on the first API call, so fail here with a clear message.
if not api_key:
    raise ValueError("'openai_key.txt' is empty; it must contain the OpenAI API key.")

os.environ["OPENAI_API_KEY"] = api_key
openai.api_key = api_key

In [None]:
# Data Loading - Read in datasets from BigQuery
# For every table name in DATA_TABLES, the same-named table is read from each
# dataset in DATASET_IDS and the rows are concatenated with UNION ALL.
# NOTE(review): PROJECT_ID, DATA_TABLES and DATASET_IDS are not assigned in any
# visible cell -- confirm they are defined in a configuration cell that runs
# before this one.

print("Connecting to BigQuery to load datasets...")
client = bigquery.Client(project=PROJECT_ID)


# Create dictionary to hold dataframes (keyed by table name)
dataframes = {}

for table_name in DATA_TABLES:
    print(f"Loading and unioning table: '{table_name}'...")
    try:
        # Construct a query to UNION this specific table across all yearly datasets
        union_query = "\nUNION ALL\n".join(
            [f"SELECT * FROM `{PROJECT_ID}.{dataset_id}.{table_name}`" for dataset_id in DATASET_IDS]
        )
        # Load the data and store it in our dictionary
        dataframes[table_name] = client.query(union_query).to_dataframe()
        print(f" -> Loaded {len(dataframes[table_name])} total rows for '{table_name}'.")
    except Exception as e:
        print(f"\n--- QUERY FAILED for table '{table_name}' ---")
        print(f"An error occurred: {e}")
        print("Please check that this table exists in all of your yearly datasets.")
        # Re-raise instead of exit(): exit() would shut down the notebook
        # kernel and discard the traceback; re-raising halts the run while
        # keeping the original error available for debugging.
        raise

In [None]:
# Analysis - Preprocess the loaded tables, then send each case to the LLM
# Preprocessing runs once over all loaded tables *before* the loop so that
# `cases_df` exists when it is iterated (Restart & Run All safe), instead of
# being re-invoked for every row with a single case as its argument.
# NOTE(review): SURGERY_CPT_CODES is not assigned in any visible cell --
# confirm it is defined in a configuration cell that runs before this one.
cases_df = preprocess_all_patients(dataframes, SURGERY_CPT_CODES)

# One entry per row of `cases_df`, in iteration order, holding whatever
# `query_openai` returned for that case, so later cells can parse/evaluate it.
llm_responses = []
for _, case in tqdm(cases_df.iterrows(), total=len(cases_df), desc="Evaluating Cases"):
    prompt = generate_prompt(case)
    llm_responses.append(query_openai(prompt))

In [None]:
# Outline of the end-to-end pipeline. The `...` lines are placeholders for
# stages that are not filled in here.
# NOTE(review): inside a notebook kernel __name__ is "__main__", so this guard
# does not prevent the body from running when the cell is executed.
if __name__ == "__main__":
    print("Loading data...")
    ...

    print("Running preprocessing...")
    # Builds the per-case DataFrame from the loaded tables and the surgery CPT codes.
    cases_df = preprocess_all_patients(dataframes, SURGERY_CPT_CODES)

    print("Running LLM inference...")
    ...

    print("Evaluating predictions...")
    # NOTE(review): `evaluate_predictions` is not among the names imported from
    # `utils`, and `eval_df` is not assigned in any visible cell -- confirm
    # where both are defined before relying on this step.
    evaluate_predictions(eval_df)

    print("Saving results...")
    ...

In [None]:
# Save the final complete dataframe to a CSV file.
full_results_path = "sinusitis_llm_full_results.csv"
cases_df.to_csv(full_results_path, index=False)
print(f"\nFull results with all columns saved to '{full_results_path}'")

# Save a random sample of up to 200 cases to a separate CSV for human evaluation.
# A seeded random draw is used instead of the first N rows so the review set is
# not biased by row order, while staying reproducible across runs.
HUMAN_REVIEW_SAMPLE_SIZE = 200
HUMAN_REVIEW_SEED = 42
if len(cases_df) > 0:
    sample_path = "sinusitis_llm_human_review_sample.csv"
    # DataFrame.sample raises if n exceeds the number of rows (without
    # replacement), so cap the request at the number of available cases.
    sample_size = min(HUMAN_REVIEW_SAMPLE_SIZE, len(cases_df))
    review_sample = cases_df.sample(n=sample_size, random_state=HUMAN_REVIEW_SEED)
    review_sample.to_csv(sample_path, index=False)
    print(f"A sample of {sample_size} cases for human review saved to '{sample_path}'")