## ACUITY PREDICTION FOR GPT SERIES: 3.5 AND 4o

In [2]:
import pandas as pd
# Configure pandas so DataFrames are printed in full instead of being truncated
pd.set_option('display.max_rows', None)  # Display all rows
pd.set_option('display.max_columns', None)  # Display all columns
pd.set_option('display.max_colwidth', None)  # Do not truncate long cell contents
pd.set_option('display.width', None)  # Set width to None
# Load the input records; `df` is the module-level frame that the main
# prediction cell later passes to process_with_checkpoints()
df = pd.read_csv('results.csv')

In [5]:
import pandas as pd
import os
import re
import json
import base64
import requests
import time
import urllib.parse

# Configure pandas so DataFrames are printed in full instead of being truncated
pd.set_option('display.max_rows', None)  # Display all rows
pd.set_option('display.max_columns', None)  # Display all columns
pd.set_option('display.max_colwidth', None)  # Do not truncate long cell contents
pd.set_option('display.width', None)  # Set width to None

# --- Azure OpenAI connection settings (read by predict_acuity_with_all_info) ---
# NOTE(review): avoid committing a real key; 'x' is only a placeholder.
API_KEY = 'x'  ##### Paste your API key between the quotes #####
API_VERSION = '2024-06-01'  # For the most recent production release: https://learn.microsoft.com/en-us/azure/ai-services/openai/api-version-deprecation#latest-ga-api-release
# Base URL of the resource. The request URL is built as
# f"{RESOURCE_ENDPOINT}/openai/deployments/...", so do not end it with a slash.
RESOURCE_ENDPOINT = 'x'  ##### Paste your resource endpoint between the quotes #####
# Change this to the gpt-4o deployment name and rerun the cells to see the
# 4o acuity prediction results.
# NOTE: the checkpoint/result file names used further down are hard-coded with
# a '35_' prefix; an existing checkpoint file is resumed, so rename or remove
# it (or pass another checkpoint_file) before switching deployments.
DEPLOYMENT_NAME = 'gpt-35-turbo'

In [None]:
import random
from tqdm import tqdm

# Acuity labels in the order they are searched for in the model reply.
# 'Less Urgent' and 'Non-Urgent' both contain the substring 'Urgent', so they
# must be tested before it; otherwise they could never be returned.
_ACUITY_MATCH_ORDER = ('Less Urgent', 'Non-Urgent', 'Immediate', 'Emergent', 'Urgent')


def _match_acuity_label(prediction):
    """Return the acuity label contained in *prediction*, else *prediction* itself.

    Matching is case-insensitive and treats hyphens and runs of whitespace as
    a single space, so 'non-urgent', 'Non urgent' and 'NON-URGENT.' all map to
    'Non-Urgent'.
    """
    normalized = ' '.join(prediction.lower().replace('-', ' ').split())
    for acuity in _ACUITY_MATCH_ORDER:
        if acuity.lower().replace('-', ' ') in normalized:
            return acuity
    # No known label found: hand back the raw reply for debugging
    return prediction


def _build_acuity_prompt(row):
    """Assemble the user prompt from the chief complaint plus any optional fields."""
    parts = [
        "Predict the emergency department acuity level for this patient.\n\n",
        f"Chief Complaint: {row['primarychiefcomplaintname']}\n",
    ]

    # Optional demographic fields are added only when present and non-null
    for label, field in (('Age', 'age'), ('Sex', 'sex'), ('Race', 'firstrace')):
        if field in row and not pd.isna(row[field]):
            parts.append(f"{label}: {row[field]}\n")

    if 'Vital_Signs' in row and not pd.isna(row['Vital_Signs']):
        # Limit vital signs to 300 characters to reduce payload size
        vital_signs = str(row['Vital_Signs'])
        if len(vital_signs) > 300:
            vital_signs = vital_signs[:300] + "..."
        parts.append(f"Vital Signs: {vital_signs}\n")

    # Instructions for the output format
    parts.append("\nSelect the most appropriate acuity level from the following options ONLY:\n")
    parts.append("'Immediate', 'Emergent', 'Urgent', 'Less Urgent', 'Non-Urgent'\n\n")
    parts.append("Respond with ONLY ONE of these five options.")
    return "".join(parts)


# Function to predict acuity with robust error handling
def predict_acuity_with_all_info(row, max_retries=5):
    """
    Predict the acuity level for one record using all available information.

    Args:
        row: Mapping-like record (e.g. a pandas Series or dict). Must contain
            a non-null 'primarychiefcomplaintname'; 'age', 'sex', 'firstrace'
            and 'Vital_Signs' are added to the prompt when present.
        max_retries: Number of retries after the first failed request. The
            wait starts at 2 seconds and doubles each time, plus up to one
            second of random jitter.

    Returns:
        None if the chief complaint is missing; one of 'Immediate',
        'Emergent', 'Urgent', 'Less Urgent', 'Non-Urgent' when the reply
        contains such a label; the raw reply text when it does not; or the
        string "Prediction failed" when all attempts fail or the response
        does not have the expected choices/message/content structure.
    """
    # Skip if any critical field is missing
    required_fields = ['primarychiefcomplaintname']
    for field in required_fields:
        if field not in row or pd.isna(row[field]):
            return None

    prompt = _build_acuity_prompt(row)

    # The request is identical on every attempt, so build it once
    url = f"{RESOURCE_ENDPOINT}/openai/deployments/{DEPLOYMENT_NAME}/chat/completions?api-version={API_VERSION}"
    headers = {
        "Content-Type": "application/json",
        "api-key": API_KEY,
    }
    payload = {
        "messages": [
            {"role": "system", "content": "You are an experienced Emergency Department triage nurse."},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.1,
        "max_tokens": 50,
    }

    backoff_time = 2  # Initial backoff in seconds

    # One initial attempt plus up to max_retries retries
    for attempt in range(max_retries + 1):
        try:
            # Use increased timeout
            response = requests.post(url, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            prediction = response.json()["choices"][0]["message"]["content"].strip()
        except requests.exceptions.RequestException as e:
            if attempt >= max_retries:
                print(f"Failed after {max_retries} retries: {e}")
                return "Prediction failed"

            # Log the error and retry info
            print(f"Request failed: {e}. Retrying {attempt + 1}/{max_retries} after {backoff_time} seconds...")

            # Exponential backoff with jitter
            time.sleep(backoff_time + random.uniform(0, 1))
            backoff_time *= 2
        except (KeyError, IndexError, TypeError, AttributeError, ValueError) as e:
            # Successful HTTP status but no usable message content (missing
            # keys, empty choices, null content, undecodable body). Record a
            # failure for this row instead of aborting the whole run.
            print(f"Unexpected response format: {e!r}")
            return "Prediction failed"
        else:
            return _match_acuity_label(prediction)

    return "Prediction failed"

# Process data in batches with checkpointing
def process_with_checkpoints(df, batch_size=20, checkpoint_file="35_acuity_predictions_all_info.csv"):
    """Predict acuity for every record in *df*, checkpointing to CSV after each batch.

    If *checkpoint_file* already exists it is loaded and its records are
    skipped. Records are identified by the 'encounterkey' column when
    available; otherwise by their position in *df*, in which case the
    checkpoint is assumed to hold the first ``len(checkpoint)`` rows of *df*
    in order.

    Every attempted record is marked as processed, including those whose
    prediction is None or "Prediction failed", so they are not retried when
    resuming.

    Args:
        df: DataFrame of records to score.
        batch_size: Number of records per batch (and per checkpoint write).
        checkpoint_file: Path of the CSV used to persist progress.

    Returns:
        DataFrame containing the checkpointed rows plus the newly processed
        rows, each with a 'predicted_acuity' column.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Check if checkpoint exists
    if os.path.exists(checkpoint_file):
        processed_df = pd.read_csv(checkpoint_file)
        # Create a set of already processed encounter IDs for faster lookups
        if 'encounterkey' in processed_df.columns:
            processed_ids = set(processed_df['encounterkey'].tolist())
        else:
            # If no encounterkey column, use row positions
            processed_ids = set(range(len(processed_df)))
        print(f"Resuming from checkpoint with {len(processed_ids)} already processed records")
    else:
        processed_df = pd.DataFrame(columns=df.columns.tolist() + ['predicted_acuity'])
        processed_ids = set()

    use_keys = 'encounterkey' in df.columns

    # Calculate total batches
    total_records = len(df)
    total_batches = (total_records + batch_size - 1) // batch_size

    # Track overall progress; session_count only counts work done in this
    # call so the speed/ETA figures are not inflated by checkpointed records
    processed_count = len(processed_ids)
    session_count = 0
    start_time = time.time()

    # Process in batches
    for batch_num, start_idx in enumerate(range(0, total_records, batch_size)):
        end_idx = min(start_idx + batch_size, total_records)

        # Get current batch
        batch = df.iloc[start_idx:end_idx]

        # Filter out already processed records
        if use_keys:
            batch = batch[~batch['encounterkey'].isin(processed_ids)]
            record_ids = batch['encounterkey'].tolist()
        else:
            # Identify rows by absolute position in df; convert to positions
            # relative to the batch slice before indexing into it
            record_ids = [pos for pos in range(start_idx, end_idx) if pos not in processed_ids]
            batch = batch.iloc[[pos - start_idx for pos in record_ids]]

        if len(batch) == 0:
            continue

        print(f"Processing batch {batch_num+1}/{total_batches}, records {start_idx}-{end_idx}")

        # Process each record in batch, collecting rows so the results frame
        # is extended once per batch rather than once per record
        new_rows = []
        for record_id, (_, row) in tqdm(zip(record_ids, batch.iterrows()), total=len(batch)):
            row_copy = row.copy()
            row_copy['predicted_acuity'] = predict_acuity_with_all_info(row)
            new_rows.append(row_copy)

            # Mark as processed
            processed_ids.add(record_id)
            processed_count += 1
            session_count += 1

        batch_df = pd.DataFrame(new_rows)
        if processed_df.empty:
            processed_df = batch_df.reset_index(drop=True)
        else:
            processed_df = pd.concat([processed_df, batch_df], ignore_index=True)

        # Save checkpoint after each batch
        processed_df.to_csv(checkpoint_file, index=False)

        # Calculate and display progress statistics
        elapsed_time = time.time() - start_time
        records_per_second = session_count / elapsed_time if elapsed_time > 0 else 0
        remaining = max(total_records - processed_count, 0)
        estimated_remaining = remaining / records_per_second if records_per_second > 0 else float('inf')

        print(f"Progress: {processed_count}/{total_records} records ({processed_count/total_records:.1%})")
        print(f"Speed: {records_per_second:.2f} records/second")
        print(f"Est. time remaining: {estimated_remaining/60:.1f} minutes")
        print(f"Checkpoint saved at: {checkpoint_file}")

        # Add a short pause between batches to be nice to the API
        time.sleep(2)

    return processed_df

# Calculate accuracy
def calculate_accuracy(df):
    """Print overall and per-level accuracy of 'predicted_acuity' vs 'acuitylevel'.

    Only rows whose prediction is one of the five recognised acuity labels are
    scored; any other prediction value is left out of both the numerator and
    the denominator. Returns the overall accuracy (0 when no row is scorable).
    """
    acuity_levels = ['Immediate', 'Emergent', 'Urgent', 'Less Urgent', 'Non-Urgent']

    # Keep only rows carrying a recognised label
    scored = df.loc[df['predicted_acuity'].isin(acuity_levels)]
    n_scored = len(scored)

    # Overall agreement between prediction and recorded acuity
    hits = (scored['predicted_acuity'] == scored['acuitylevel']).sum()
    if n_scored > 0:
        overall = hits / n_scored
    else:
        overall = 0
    print(f"Accuracy: {hits}/{n_scored} = {overall:.4f} ({overall:.2%})")

    # Breakdown by the recorded (true) acuity level
    print("\nAccuracy by acuity level:")
    for level in acuity_levels:
        level_rows = scored.loc[scored['acuitylevel'] == level]
        level_total = len(level_rows)
        if level_total == 0:
            # Levels absent from the scored rows are not reported
            continue
        level_hits = (level_rows['predicted_acuity'] == level_rows['acuitylevel']).sum()
        level_rate = level_hits / level_total
        print(f"{level}: {level_hits}/{level_total} = {level_rate:.4f} ({level_rate:.2%})")

    return overall

# Main execution: run predictions over `df`, report accuracy, save the results
if __name__ == "__main__":
    # NOTE: nothing is loaded here. `df` must already exist at module level
    # (it is read from 'results.csv' earlier in this file); running this
    # section without it raises NameError.
    print("Loading data...")
    
    print(f"Loaded {len(df)} records")
    print("Starting prediction process with all information...")
    
    # Run the prediction with all information (uses the default batch size
    # and default checkpoint file of process_with_checkpoints)
    results_df = process_with_checkpoints(df)
    
    # Calculate and display accuracy (prints overall and per-level figures)
    accuracy = calculate_accuracy(results_df)
    
    # Save final results to a separate file from the checkpoint
    final_output = "35_acuity_prediction_results_all_info.csv"
    results_df.to_csv(final_output, index=False)
    print(f"Final results saved to {final_output}")