In [1]:
## o3-mini ED disposition prediction

In [2]:
import pandas as pd
# Lift every pandas display limit so frames are rendered without truncation.
for _display_option in (
    'display.max_rows',      # all rows
    'display.max_columns',   # all columns
    'display.max_colwidth',  # full cell contents
    'display.width',         # no fixed output width
):
    pd.set_option(_display_option, None)

# Source table for the whole notebook: the ER-Reason dataset.
df = pd.read_csv('results.csv')

In [3]:
import re
def extract_until_medical_decision(text):
    """Return the part of ``text`` that precedes the first "Medical Decision" heading.

    The heading is matched case-insensitively as a whole phrase (word boundaries on
    both sides). When it does not occur, the whole text is returned. The result is
    stripped of surrounding whitespace. Missing values (None / NaN) yield None.
    """
    if pd.isna(text):
        return None

    marker = re.search(r'\bMedical Decision\b', text, flags=re.IGNORECASE)
    # Everything before the marker, or the full text when there is no marker.
    leading = text if marker is None else text[:marker.start()]
    return leading.strip()

# Derive the presentation text (provider note up to the "Medical Decision" section)
# for every row and store it in a new column.
df['ED_Presentations'] = df['ED_Provider_Notes_Text'].map(extract_until_medical_decision)


In [4]:
import pandas as pd
import os
import re
import json
import base64
import requests
import time
import urllib.parse

# Show DataFrames in full when displayed (no row, column or width truncation).
pd.set_option('display.max_rows', None)  # Display all rows
pd.set_option('display.max_columns', None)  # Display all columns
pd.set_option('display.max_colwidth', None)  # Set max column width to None
pd.set_option('display.width', None)  # Set width to None

# Azure OpenAI connection settings.
# The key and endpoint are read from the environment when available so that real
# credentials never have to be saved inside the notebook. The literals are placeholders
# kept only as a fallback for quick local testing.
API_KEY = os.environ.get('AZURE_OPENAI_API_KEY') or 'x='  ##### or paste your API key between the quotes #####
# Preview API version. For the latest GA release see:
# https://learn.microsoft.com/en-us/azure/ai-services/openai/api-version-deprecation#latest-ga-api-release
API_VERSION = '2024-12-01-preview'
# Partial URL that request URLs are built from. Any trailing slash is stripped so that
# appending "/openai/..." never produces a double slash.
RESOURCE_ENDPOINT = (os.environ.get('AZURE_OPENAI_ENDPOINT') or 'x').rstrip('/')
DEPLOYMENT_NAME = "o3-mini-2025-01-31"

In [5]:
from dotenv import load_dotenv, find_dotenv
from openai import AzureOpenAI

In [6]:
# Build the Azure OpenAI SDK client from the connection settings defined above.
# NOTE(review): the request code visible later in this notebook posts to the REST
# endpoint with `requests` and never references this object - confirm it is still needed.
client = AzureOpenAI(azure_endpoint=RESOURCE_ENDPOINT, api_version=API_VERSION, api_key=API_KEY)

# Bare expression so the notebook displays the client's repr.
client

<openai.lib.azure.AzureOpenAI at 0x7ff07bc6a8d0>

In [None]:
import pandas as pd
import numpy as np
import requests
import time
import os
import json


def debug_api_response(response, context="API Call"):
    """Print a structural summary of a chat-completions HTTP response.

    Args:
        response: Response object exposing ``status_code``, ``json()`` and ``text``
            (the call sites in this notebook pass the result of ``requests.post``).
        context: Label shown in the header line of the debug output.

    Returns:
        None. The function only prints; any error raised while inspecting the
        payload is reported together with the first 500 characters of the raw body.
    """
    print(f"--- DEBUG: {context} ---")
    print(f"Status code: {response.status_code}")
    try:
        json_data = response.json()
        print(f"JSON structure keys: {list(json_data.keys())}")
        # `or []` also covers an explicit null "choices" value.
        choices = json_data.get("choices") or []
        if len(choices) > 0:
            choice = choices[0]
            print(f"First choice keys: {list(choice.keys())}")
            if "message" in choice:
                message = choice["message"]
                print(f"Message keys: {list(message.keys())}")
                if "content" in message:
                    content = message["content"]
                    if isinstance(content, str):
                        print(f"Content (first 100 chars): {content[:100]}...")
                    else:
                        # A null/non-text content is a valid payload shape, not a
                        # parsing failure, so report it instead of letting the
                        # slice raise into the generic error branch below.
                        print(f"Content is not text (got {type(content).__name__})")
                else:
                    print("No 'content' in message")
            else:
                print("No 'message' in first choice")
        else:
            print("No 'choices' in response or empty choices")
    except Exception as e:
        print(f"Error parsing response: {e}")
        print(f"Raw response text: {response.text[:500]}...")
    print("-------------------")

def _truncate_text(text, max_chars=4000):
    """Cap ``text`` at ``max_chars`` characters, appending "..." when it was cut."""
    if text and len(text) > max_chars:
        return text[:max_chars] + "..."
    return text


def _parse_requested_notes(notes_text, available_notes):
    """Map the model's free-text reply onto the note types that are available.

    Names may be separated by commas, semicolons or newlines, may carry list
    bullets / trailing punctuation, and are matched case-insensitively. Unknown or
    unavailable names are dropped and duplicates are kept only once, in reply order.
    """
    canonical_names = {name.lower(): name for name in available_notes}
    selected = []
    for token in notes_text.replace("\n", ",").replace(";", ",").split(","):
        cleaned = token.strip(" \t\r-*.:'\"`").lower()
        name = canonical_names.get(cleaned)
        if name is not None and available_notes[name] and name not in selected:
            selected.append(name)
    return selected


def _request_chat_content(url, headers, payload, context, failure_label,
                          max_retries=3, retry_delay=5, timeout=300):
    """POST a chat-completions payload and return the first choice's message text.

    Args:
        url, headers, payload: Passed to ``requests.post`` (payload as JSON body).
        context: Label forwarded to ``debug_api_response``.
        failure_label: Prefix of the message printed when an attempt fails.
        max_retries: Total number of attempts.
        retry_delay: Seconds to wait between attempts.
        timeout: Seconds before a stalled request is abandoned; without it a hung
            connection would block the run indefinitely.

    Returns:
        ``(succeeded, content)``. ``succeeded`` is False only when every attempt
        raised. ``content`` is the stripped message text ("" when the message has no
        or null content), or None when the response carried no usable choice/message.
    """
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.post(url, headers=headers, json=payload, timeout=timeout)
            response.raise_for_status()
            debug_api_response(response, context)

            data = response.json()
            choices = (data.get("choices") if isinstance(data, dict) else None) or []
            first_choice = choices[0] if choices else None
            message = first_choice.get("message") if isinstance(first_choice, dict) else None
            if not isinstance(message, dict):
                return True, None
            # `or ""` guards against an explicit null content value.
            return True, (message.get("content") or "").strip()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            print(f"{failure_label} request failed: {e}. Retrying {attempt}/{max_retries}...")
            if attempt < max_retries:
                time.sleep(retry_delay)
    return False, None


def predict_disposition_with_dynamic_notes(row):
    """Predict the ED disposition for one patient row using two model calls.

    First the model is asked which of the available prior notes it wants to read;
    then it is asked for the disposition given the chief complaint, age, sex, ED
    presentation and the (truncated) text of the selected notes.

    Args:
        row: Mapping-like record (e.g. a pandas Series) with the keys
            'primarychiefcomplaintname', 'sex', 'Age', 'ED_Presentations' and,
            optionally, the note text columns listed in ``note_type_to_column``.

    Returns:
        ``(prediction, notes_requested_str)``:
        - ``(None, None)`` when the chief complaint is missing (no request is made);
        - ``prediction`` is the model's stripped reply, or None when the prediction
          request failed on every attempt or no message could be extracted;
        - ``notes_requested_str`` is the model's raw note selection, annotated when
          the Discharge Summary was auto-added or a default selection was used.
    """
    chief_complaint = row['primarychiefcomplaintname']
    sex = row['sex']
    age = row['Age']
    presentation = row['ED_Presentations']

    # Nothing to predict from without a chief complaint.
    if pd.isna(chief_complaint):
        return None, None

    # Note type shown to the model -> column holding that note's text.
    note_type_to_column = {
        'Discharge Summary': 'Discharge_Summary_Text',
        'Progress Notes': 'Progress_Note_Text',
        'H&P': 'HP_Note_Text',
        'Echo': 'Echo_Text',
        'Imaging': 'Imaging_Text',
        'Consult': 'Consult_Text',
        'ECG': 'ECG_Text',
    }
    # Whether each note type has text for this patient (same order as above).
    available_notes = {
        note_type: not pd.isna(row.get(column))
        for note_type, column in note_type_to_column.items()
    }

    url = f"{RESOURCE_ENDPOINT}/openai/deployments/{DEPLOYMENT_NAME}/chat/completions?api-version={API_VERSION}"
    headers = {
        "Content-Type": "application/json",
        "api-key": API_KEY
    }

    # Step 1: ask the model which notes it wants to see.
    availability_lines = "".join(
        f"- {note_type}: {'Available' if is_available else 'Not available'}\n"
        for note_type, is_available in available_notes.items()
    )
    selection_payload = {
        "messages": [
            {"role": "user", "content": f"You are an experienced Emergency Department (ED) physician. Your task is to decide which medical notes you need to read to predict the patient's ED disposition based on the chief complaint, PMH, physical exam findings, age, and sex. Patient basic info: {age}yo {sex} with chief complaint: {chief_complaint}\n\n"
                                      f"Available notes (respond ONLY with the names of notes you want to see, separated by commas):\n"
                                      f"{availability_lines}"
                                      f"Based on the chief complaint, list ONLY the note types you need to review (comma-separated, no explanation). Always include Discharge Summary if available:"}
        ]
    }

    requested_notes = []
    notes_requested_str = ""
    selection_succeeded, notes_text = _request_chat_content(
        url, headers, selection_payload, "Note Selection", "Note selection"
    )
    if selection_succeeded:
        if notes_text is not None:
            requested_notes = _parse_requested_notes(notes_text, available_notes)
            notes_requested_str = notes_text

        # Always include Discharge Summary if available and not already requested
        if available_notes['Discharge Summary'] and 'Discharge Summary' not in requested_notes:
            requested_notes.append('Discharge Summary')
            if notes_requested_str:
                notes_requested_str += ", Discharge Summary (auto-added)"
            else:
                notes_requested_str = "Discharge Summary (auto-added)"

    # Fallback when no usable selection was obtained.
    if not requested_notes and available_notes['Discharge Summary']:
        requested_notes = ['Discharge Summary']
        notes_requested_str = "Failed to get selections. Defaulted to: Discharge Summary"
    elif not requested_notes:
        # No Discharge Summary: fall back to the first two available note types.
        available_notes_list = [note for note, available in available_notes.items() if available][:2]
        requested_notes = available_notes_list
        notes_requested_str = f"Failed to get selections. Defaulted to: {', '.join(available_notes_list)}"

    # Step 2: assemble the text of the selected notes, each capped in length.
    notes_content = ""
    for note_type in requested_notes:
        column_name = note_type_to_column.get(note_type)
        if column_name and not pd.isna(row.get(column_name)):
            notes_content += f"\n\n{note_type}:\n{_truncate_text(row[column_name])}"

    # Step 3: ask for the disposition prediction (same message format as step 1).
    prediction_payload = {
        "messages": [
            {"role": "user", "content": f"You are an experienced Emergency Department (ED) physician tasked with predicting the most likely disposition for a patient based on their presentation and physical, chief complaint, and available past medical information. Based on the patient's chief complaint, age, sex, and available clinical information, predict the most likely ED disposition from the following choices: 'Discharge', 'Admit', 'Eloped', 'Transfer to Another Facility', 'AMA', 'OR Admit', 'LWBS after Triage', 'Send to L&D', 'Expired','Dismissed - Never Arrived', 'Observation', 'None'-- ONLY RESPOND WITH THESE OPTIONS, no explanations.\n\n"
                                      f"CURRENT Chief Complaint: {chief_complaint}\n"
                                      f"Age: {age}\n"
                                      f"Sex: {sex}\n"
                                      f"CURRENT ED Presentation: {presentation}\n"
                                      f"PAST MEDICAL HISTORY: {notes_content}"}
        ]
    }

    prediction_succeeded, prediction = _request_chat_content(
        url, headers, prediction_payload, "Disposition Prediction", "Prediction"
    )
    if prediction_succeeded and prediction is None:
        print("Warning: Could not extract prediction from API response")

    # prediction is None when every attempt failed or nothing could be extracted.
    return prediction, notes_requested_str

def process_in_batches(df, batch_size=20, output_file="o3_disposition_predictions.csv"):
    """
    Process dataframe in batches, saving progress after each batch

    The frame is modified in place: the columns 'Predicted_Disposition' and
    'Requested_Notes' are added/filled. If ``output_file`` already exists and has
    the same number of rows, its non-empty predictions are restored by row position
    (the checkpoint is written without the index) and only the remaining rows are sent
    to ``predict_disposition_with_dynamic_notes``.

    Args:
        df: DataFrame to process
        batch_size: Number of samples to process before saving (must be >= 1)
        output_file: Where to save the results

    Returns:
        Updated DataFrame with predictions (the same object as ``df``)

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    result_columns = ['Predicted_Disposition', 'Requested_Notes']

    # Check if output file exists to resume processing
    if os.path.exists(output_file):
        print(f"Found existing output file {output_file}, resuming from there...")
        try:
            existing_df = pd.read_csv(output_file)
        except pd.errors.EmptyDataError:
            # A zero-byte checkpoint is treated like a non-matching one.
            existing_df = pd.DataFrame()

        has_predictions = 'Predicted_Disposition' in existing_df.columns
        if has_predictions:
            processed_count = int(existing_df['Predicted_Disposition'].notna().sum())
            print(f"Found {processed_count} already processed samples out of {len(existing_df)}")

        if has_predictions and len(existing_df) == len(df):
            # Positional mask: the checkpoint has a fresh 0..n-1 index, so aligning
            # on labels would fail whenever df carries any other index.
            already_processed = existing_df['Predicted_Disposition'].notna().to_numpy()

            for column in result_columns:
                if column not in df.columns:
                    df[column] = None
                else:
                    # Make sure string results can be stored in the column.
                    df[column] = df[column].astype(object)
                if column in existing_df.columns and already_processed.any():
                    df.loc[already_processed, column] = existing_df[column].to_numpy()[already_processed]

            print(f"Restored {int(already_processed.sum())} existing predictions")
        else:
            # If file exists but structure doesn't match, initialize columns
            print("Output file structure doesn't match or is empty. Starting fresh.")
            for column in result_columns:
                df[column] = None
    else:
        # Initialize prediction columns if starting fresh
        print("Starting new prediction process...")
        for column in result_columns:
            df[column] = None

    # Get indices of rows that still need processing
    rows_to_process = df.index[df['Predicted_Disposition'].isna()].tolist()
    total_rows = len(rows_to_process)

    print(f"Processing {total_rows} remaining samples in batches of {batch_size}")

    # Process in batches
    for batch_count, start in enumerate(range(0, total_rows, batch_size), start=1):
        batch_indices = rows_to_process[start:start + batch_size]

        print(f"Processing batch {batch_count}, samples {start+1}-{min(start+batch_size, total_rows)} of {total_rows}")

        # Process each sample in the batch
        for position, idx in enumerate(batch_indices, start=1):
            row = df.loc[idx]

            try:
                prediction, notes_requested = predict_disposition_with_dynamic_notes(row)

                # The predictor returns (None, None) for rows it skips, so the
                # preview must tolerate a missing notes string.
                notes_preview = (notes_requested or "")[:30]
                print(f"Sample {idx}: Prediction = '{prediction}', Notes = '{notes_preview}...'")

                # Store results in dataframe
                df.at[idx, 'Predicted_Disposition'] = prediction
                df.at[idx, 'Requested_Notes'] = notes_requested

                # Print progress for every 5 samples
                if position % 5 == 0 or position == len(batch_indices):
                    print(f"  Processed {position}/{len(batch_indices)} samples in current batch")
            except Exception as e:
                # Best effort: report and continue with the next sample; the row
                # stays empty and is picked up again on the next resume.
                print(f"Error processing sample {idx}: {e}")

        # Save after each batch. Write to a temporary file and swap it in so an
        # interruption mid-write cannot leave a truncated checkpoint behind.
        print(f"Saving progress after batch {batch_count}...")
        temp_file = f"{output_file}.tmp"
        df.to_csv(temp_file, index=False)
        os.replace(temp_file, output_file)

        # Calculate completion percentage
        completed = start + len(batch_indices)
        print(f"Overall progress: {completed}/{total_rows} ({completed/total_rows*100:.1f}%)")

    return df

# Main execution
if __name__ == "__main__":
    # Checkpointing parameters: rows per save and the CSV that receives the results.
    BATCH_SIZE = 20
    OUTPUT_FILE = "new_o3_disposition_predictions.csv"

    print("Loading data...")
    # Nothing is read here: the run uses the `df` already present in the notebook.
    # To use another source, load it first, e.g. df = pd.read_csv("your_data.csv")

    # Run the predictions batch by batch; progress is saved after every batch.
    df = process_in_batches(df, output_file=OUTPUT_FILE, batch_size=BATCH_SIZE)

    print(f"Processing complete! Results saved to {OUTPUT_FILE}")

In [9]:
# NOTE(review): `results` is not created in any cell visible here - presumably the
# saved predictions loaded back into a DataFrame; confirm where it comes from.

# Flag the rows whose predicted disposition is exactly equal to the recorded one.
results["Prediction_Correct"] = results["Predicted_Disposition"].eq(results["eddisposition"])

# Overall accuracy in percent: correct rows over all rows of the flag column.
correct_flags = results["Prediction_Correct"]
accuracy = (correct_flags.sum() / correct_flags.count()) * 100
print(f"Overall disposition prediction accuracy: {accuracy:.2f}%")

Overall disposition prediction accuracy: 63.08%
