In [1]:
### ED disposition prediction experiment for GPT-3.5 (gpt-35-turbo-16k)

In [2]:
import pandas as pd
# Lift every pandas display limit so frames are shown in full:
# all rows, all columns, untruncated cell contents, and no width cap.
for display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.max_colwidth',
    'display.width',
):
    pd.set_option(display_option, None)

In [3]:
# Load the input table from results.csv (path is relative to the notebook's working directory).
df = pd.read_csv('results.csv')

In [4]:
import re
def extract_until_medical_decision(text):
    """Return the part of an ED provider note that precedes "Medical Decision".

    The heading is matched case-insensitively on word boundaries. Only the
    first occurrence matters: everything from that heading onward is dropped.

    Args:
        text: Note text, or a missing value (None / NaN).

    Returns:
        The leading part of the note with surrounding whitespace stripped.
        This is the whole (stripped) note when the heading never occurs and an
        empty string when the note starts with the heading. Returns None when
        ``text`` is missing.
    """
    if pd.isna(text):
        return None
    # maxsplit=1 stops scanning at the first heading instead of splitting the
    # whole note. re.split always returns at least one element, so indexing
    # [0] is safe and no empty-list fallback is needed.
    head = re.split(r'\bMedical Decision\b', text, maxsplit=1, flags=re.IGNORECASE)[0]
    return head.strip()

# Derive the 'ED_Presentations' column: each provider note cut off at its first
# "Medical Decision" heading (rows with a missing note get None).
df['ED_Presentations'] = df['ED_Provider_Notes_Text'].apply(extract_until_medical_decision)


In [16]:
import pandas as pd
import os
import re
import json
import base64
import requests
import time
import urllib.parse

pd.set_option('display.max_rows', None)  # Display all rows
pd.set_option('display.max_columns', None)  # Display all columns
pd.set_option('display.max_colwidth', None)  # Set max column width to None
pd.set_option('display.width', None)  # Set width to None

# The Mulesoft Azure API key is read from the environment so that it is never
# stored in the notebook or its version history. Export it before starting
# Jupyter, e.g.  export AZURE_OPENAI_API_KEY="..."
# SECURITY: a key was previously hard-coded on this line; treat it as exposed
# and rotate it.
API_KEY = os.environ.get('AZURE_OPENAI_API_KEY', '')
if not API_KEY:
    print("WARNING: AZURE_OPENAI_API_KEY is not set; requests to the API will be rejected until it is exported.")
API_VERSION = '2024-06-01'  # For the most recent production release: https://learn.microsoft.com/en-us/azure/ai-services/openai/api-version-deprecation#latest-ga-api-release
RESOURCE_ENDPOINT = 'https://unified-api.ucsf.edu/general'  # no trailing slash--this is used by libraries as a partial URL
DEPLOYMENT_NAME = "gpt-35-turbo-16k"

In [None]:
def _truncate_text(text, max_chars=3000):
    """Cap a note at ``max_chars`` characters (appending "...") to manage token limits."""
    if not text:
        return text
    if not isinstance(text, str):
        text = str(text)
    if len(text) > max_chars:
        return text[:max_chars] + "..."
    return text


def _retry_after_seconds(response, default=60):
    """Read the Retry-After header as whole seconds, falling back to ``default``.

    The fallback covers a missing header and a value that is not an integer.
    """
    try:
        return max(0, int(response.headers.get('Retry-After', default)))
    except (TypeError, ValueError):
        return default


def _parse_requested_notes(notes_text, available_notes):
    """Map the model's free-text reply onto the available canonical note names.

    Accepts comma-, semicolon- or newline-separated names, ignores case, list
    bullets, quotes and a trailing ": Available" style suffix, drops names that
    are unknown or unavailable, and removes duplicates (first mention wins).
    """
    canonical_names = {name.lower(): name for name in available_notes}
    selected = []
    for token in notes_text.replace('\n', ',').replace(';', ',').split(','):
        key = token.split(':', 1)[0].strip(" \t\r-*.'\"").lower()
        name = canonical_names.get(key)
        if name is not None and available_notes[name] and name not in selected:
            selected.append(name)
    return selected


def _post_chat_completion(url, headers, payload, backoff_label, failure_label,
                          max_retries=5, timeout=120):
    """POST one chat-completion payload with retries; return the reply text or None.

    Retries on transport/HTTP errors, on HTTP 429 and on responses that carry
    no usable message content. Returns the stripped message content, or None
    once ``max_retries`` attempts have been used up.
    """
    retries = 0
    waited_for_rate_limit = False
    while retries < max_retries:
        try:
            # Exponential backoff with jitter, skipped when we have just slept
            # for the server-provided Retry-After interval.
            if retries > 0 and not waited_for_rate_limit:
                delay = (2 ** retries) + (random.random() * 2)
                print(f"{backoff_label}: Backing off for {delay:.2f} seconds before retry {retries+1}/{max_retries}...")
                time.sleep(delay)
            waited_for_rate_limit = False

            # The timeout keeps one stalled connection from hanging the whole run;
            # timeout errors are RequestExceptions and are retried below.
            response = requests.post(url, headers=headers, json=payload, timeout=timeout)

            if response.status_code == 429:  # Too Many Requests
                retry_after = _retry_after_seconds(response)
                print(f"Rate limited. Waiting {retry_after} seconds as instructed by API...")
                time.sleep(retry_after)
                waited_for_rate_limit = True
                retries += 1
                continue

            response.raise_for_status()

            content = response.json()["choices"][0]["message"]["content"]
            if not isinstance(content, str):
                raise ValueError("response contained no message content")
            return content.strip()

        except requests.exceptions.RequestException as e:
            print(f"{failure_label} failed: {e}. Retrying {retries+1}/{max_retries}...")
            retries += 1
        except (KeyError, IndexError, TypeError, ValueError) as e:
            # Unexpected body shape or missing content: retry instead of
            # aborting the whole batch run.
            print(f"{failure_label} returned an unusable response: {e!r}. Retrying {retries+1}/{max_retries}...")
            retries += 1
    return None


def predict_disposition_with_dynamic_notes(row):
    """Predict the ED disposition for one row using a two-step chat exchange.

    Step 1 asks the model which of the available note types it wants to read.
    Step 2 sends the chief complaint, age, sex, ED presentation and the chosen
    notes (each truncated to 3000 characters) and asks for a disposition.
    The Discharge Summary is always included when the row has one. If no usable
    selection is obtained, it falls back to the Discharge Summary, or else to
    the first two available note types.

    Reads the module-level RESOURCE_ENDPOINT, DEPLOYMENT_NAME, API_VERSION and
    API_KEY settings. Prompt texts are intentionally unchanged from the
    original experiment.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            'primarychiefcomplaintname', 'sex', 'Age', 'ED_Presentations' and,
            optionally, the ``*_Text`` note columns listed below.

    Returns:
        A ``(prediction, requested_notes)`` tuple. ``prediction`` is the model's
        stripped reply, or None when every attempt failed. ``requested_notes``
        is a string describing which notes were requested or defaulted to.
        Returns ``(None, None)`` when the chief complaint is missing.
    """
    # Extract all available fields
    chief_complaint = row['primarychiefcomplaintname']
    sex = row['sex']
    age = row['Age']
    presentation = row['ED_Presentations']

    # Skip rows without a chief complaint
    if pd.isna(chief_complaint):
        return None, None

    # Single source of truth for note types and the columns that hold them
    note_type_to_column = {
        'Discharge Summary': 'Discharge_Summary_Text',
        'Progress Notes': 'Progress_Note_Text',
        'H&P': 'HP_Note_Text',
        'Echo': 'Echo_Text',
        'Imaging': 'Imaging_Text',
        'Consult': 'Consult_Text',
        'ECG': 'ECG_Text',
    }
    available_notes = {
        note_type: not pd.isna(row.get(column))
        for note_type, column in note_type_to_column.items()
    }

    url = f"{RESOURCE_ENDPOINT}/openai/deployments/{DEPLOYMENT_NAME}/chat/completions?api-version={API_VERSION}"
    headers = {
        "Content-Type": "application/json",
        "api-key": API_KEY
    }

    # Step 1: ask the model which notes it wants to see
    availability_lines = "".join(
        f"- {note_type}: {'Available' if is_available else 'Not available'}\n"
        for note_type, is_available in available_notes.items()
    )
    selection_payload = {
        "messages": [
            {"role": "system", "content": "You are an experienced Emergency Department (ED) physician. Your task is to decide which medical notes you need to read to predict the patient's ED disposition based on the chief complaint, PMH, physical exam findings, age, and sex."},
            {"role": "user", "content": f"Patient basic info: {age}yo {sex} with chief complaint: {chief_complaint}\n\n"
                                      f"Available notes (respond ONLY with the names of notes you want to see, separated by commas):\n"
                                      f"{availability_lines}"
                                      f"Based on the chief complaint, list ONLY the note types you need to review (comma-separated, no explanation). Always include Discharge Summary if available:"}
        ],
        "temperature": 0.1,
        "max_tokens": 1000
    }

    requested_notes = []
    notes_requested_str = ""  # What ends up in the tracking column
    notes_text = _post_chat_completion(
        url, headers, selection_payload, "Selection request", "Note selection request"
    )
    if notes_text is not None:
        # Keep the model's raw reply for the tracking column
        notes_requested_str = notes_text
        requested_notes = _parse_requested_notes(notes_text, available_notes)

        # Always include Discharge Summary if available and not already requested
        if available_notes['Discharge Summary'] and 'Discharge Summary' not in requested_notes:
            requested_notes.append('Discharge Summary')
            if notes_requested_str:
                notes_requested_str += ", Discharge Summary (auto-added)"
            else:
                notes_requested_str = "Discharge Summary (auto-added)"

    if not requested_notes and available_notes['Discharge Summary']:
        # Default to discharge summary if selection failed but it's available
        requested_notes = ['Discharge Summary']
        notes_requested_str = "Failed to get selections. Defaulted to: Discharge Summary"
    elif not requested_notes:
        # No usable selection and no discharge summary: use the first two available note types
        available_notes_list = [note for note, available in available_notes.items() if available][:2]
        requested_notes = available_notes_list
        notes_requested_str = f"Failed to get selections. Defaulted to: {', '.join(available_notes_list)}"

    # Step 2: assemble the text of the chosen notes, truncated per note
    notes_content = ""
    for note_type in requested_notes:
        column_name = note_type_to_column.get(note_type)
        if column_name and not pd.isna(row.get(column_name)):
            notes_content += f"\n\n{note_type}:\n{_truncate_text(row[column_name])}"

    # Step 3: ask for the disposition
    prediction_payload = {
        "messages": [
            {"role": "system", "content": "You are an experienced Emergency Department (ED) physician tasked with predicting the most likely disposition for a patient based on their presentation and physicial, chief complaint, and available past medical information."},
            {"role": "user", "content": f"Based on the patient's chief complaint, age, sex, and available clinical information, predict the most likely ED disposition from the following choices: 'Discharge', 'Admit', 'Eloped', 'Transfer to Another Facility', 'AMA', 'OR Admit', 'LWBS after Triage', 'Send to L&D', 'Expired','Dismissed - Never Arrived', 'Observation', 'None'-- ONLY RESPOND WITH THESE OPTIONS, no explanations.\n\n"
                                      f"Chief Complaint: {chief_complaint}\n"
                                      f"Age: {age}\n"
                                      f"Sex: {sex}\n"
                                      f"Current ED Presentation: {presentation}\n"
                                      f"{notes_content}"}
        ],
        "temperature": 0.1,
        "max_tokens": 4096
    }

    prediction = _post_chat_completion(
        url, headers, prediction_payload, "Prediction request", "Prediction request"
    )
    # prediction is None when all retries failed; the notes string is still returned
    return prediction, notes_requested_str

# Import required additional libraries
import time
import random

# Process in batches with rate limit handling
def process_in_batches(df, batch_size=10, pause_between_batches=30):
    """Run the disposition prediction over ``df`` in batches, pausing between them.

    After every batch, a full snapshot of ``df`` plus the results gathered so
    far is written to a timestamped backup CSV in the working directory. Rows
    not yet processed hold None in the two result columns. ``df`` itself is
    not modified.

    Args:
        df: Input frame; each row is passed to
            ``predict_disposition_with_dynamic_notes``.
        batch_size: Number of rows per batch; must be at least 1.
        pause_between_batches: Seconds to sleep after every batch except the last.

    Returns:
        A list with one ``(prediction, requested_notes)`` tuple per row, in row order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    import datetime

    # Fail early and clearly: 0 would raise an opaque range() error and a
    # negative size would silently process nothing.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    all_predictions = []
    all_requested_notes = []
    total_rows = len(df)
    total_batches = (total_rows - 1) // batch_size + 1

    # Create backup file path with timestamp
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_path = f"ed_disposition_backup_{timestamp}.csv"

    # Copy the frame once; each batch only refreshes the two result columns.
    backup_df = df.copy()

    for batch_number, start_idx in enumerate(range(0, total_rows, batch_size), start=1):
        print(f"Processing batch {batch_number} of {total_batches}...")

        # Process one batch row by row
        end_idx = min(start_idx + batch_size, total_rows)
        for _, row in df.iloc[start_idx:end_idx].iterrows():
            prediction, requested_notes = predict_disposition_with_dynamic_notes(row)
            all_predictions.append(prediction)
            all_requested_notes.append(requested_notes)

        # Save progress after each batch, padding unprocessed rows with None
        remaining = total_rows - len(all_predictions)
        backup_df["Predicted_Disposition"] = all_predictions + [None] * remaining
        backup_df["Requested_Notes"] = all_requested_notes + [None] * remaining
        backup_df.to_csv(backup_path, index=False)
        print(f"Progress saved to {backup_path}")

        # Pause between batches to avoid rate limits (unless it's the last batch)
        if end_idx < total_rows:
            print(f"Pausing for {pause_between_batches} seconds to avoid rate limits...")
            time.sleep(pause_between_batches)

    # Return results as a list of tuples to match the expected format
    return list(zip(all_predictions, all_requested_notes))

# Run the two-step prediction over the whole frame: five rows per batch,
# with a 60-second pause between batches
results = process_in_batches(df, batch_size=5, pause_between_batches=60)

# Unzip the (prediction, requested notes) pairs into their own columns
df["Predicted_Disposition"] = [prediction for prediction, _ in results]
df["Requested_Notes"] = [requested for _, requested in results]

# Score the predictions only when the recorded disposition column is present
if "eddisposition" in df.columns:
    # A prediction counts as correct on an exact string match
    df["Prediction_Correct"] = df["Predicted_Disposition"].eq(df["eddisposition"])

    # Overall accuracy as a percentage of all rows
    accuracy = (df["Prediction_Correct"].sum() / df["Prediction_Correct"].count()) * 100
    print(f"Overall disposition prediction accuracy: {accuracy:.2f}%")

    # Per-disposition row count and number of exact matches
    disposition_accuracy = df.groupby("eddisposition").agg(
        total_count=("eddisposition", "count"),
        correct_predictions=("Prediction_Correct", "sum"),
    )

    # Percentage accuracy within each disposition type
    disposition_accuracy["accuracy_pct"] = (
        disposition_accuracy["correct_predictions"]
        .div(disposition_accuracy["total_count"])
        .mul(100)
    )

    # Most frequent dispositions first
    print("\nDispositions by frequency with accuracy:")
    print(disposition_accuracy.sort_values("total_count", ascending=False))

# Persist the scored frame
df.to_csv("35_ed_dispo_predictions.csv", index=False)
print("Results saved to 35_ed_dispo_predictions.csv")

Processing batch 1 of 797...
Progress saved to ed_disposition_backup_20250430_170031.csv
Pausing for 60 seconds to avoid rate limits...
Processing batch 2 of 797...
Progress saved to ed_disposition_backup_20250430_170031.csv
Pausing for 60 seconds to avoid rate limits...
Processing batch 3 of 797...
Progress saved to ed_disposition_backup_20250430_170031.csv
Pausing for 60 seconds to avoid rate limits...
Processing batch 4 of 797...
Progress saved to ed_disposition_backup_20250430_170031.csv
Pausing for 60 seconds to avoid rate limits...
Processing batch 5 of 797...
Progress saved to ed_disposition_backup_20250430_170031.csv
Pausing for 60 seconds to avoid rate limits...
Processing batch 6 of 797...
Progress saved to ed_disposition_backup_20250430_170031.csv
Pausing for 60 seconds to avoid rate limits...
Processing batch 7 of 797...
Progress saved to ed_disposition_backup_20250430_170031.csv
Pausing for 60 seconds to avoid rate limits...
Processing batch 8 of 797...
Progress saved to e

In [20]:
import pandas as pd
# Reload the saved predictions from disk.
# NOTE(review): this rebinds `results`, which the prediction cell used for a
# list of (prediction, requested_notes) tuples; from here on it is a DataFrame.
results = pd.read_csv('35_ed_dispo_predictions.csv')

In [21]:
# (rows, columns) of the reloaded predictions table
results.shape

(3984, 49)

In [22]:
# Flag each row whose predicted disposition exactly matches the recorded one
results["Prediction_Correct"] = results["Predicted_Disposition"].eq(results["eddisposition"])

# Overall accuracy: exact matches as a percentage of all rows
accuracy = (results["Prediction_Correct"].sum() / results["Prediction_Correct"].count()) * 100
print(f"Overall disposition prediction accuracy: {accuracy:.2f}%")

Overall disposition prediction accuracy: 57.08%
