In [1]:
## Predict what patients are readmitted for using past medical history (using final diagnosis tab col)

In [2]:
import pandas as pd
pd.set_option('display.max_rows', None)  # Display all rows
pd.set_option('display.max_columns', None)  # Display all columns
pd.set_option('display.max_colwidth', None)  # Set max column width to None
pd.set_option('display.width', None)  # Set width to None

In [3]:
df = pd.read_csv('filtered_final_dataset.csv')

In [4]:
df.shape, df.columns

((3984, 43),
 Index(['Unnamed: 0', 'Unnamed: 0.1', 'patientdurablekey', 'encounterkey',
        'ArrivalDateKey', 'DepartureDateKeyValue', 'DepartureDateKey',
        'DispositionDateKeyValue', 'primarychiefcomplaintname',
        'primaryeddiagnosisname', 'sex', 'birthdate', 'firstrace',
        'preferredlanguage', 'highestlevelofeducation', 'maritalstatus', 'Age',
        'Discharge_Summary_Date', 'Discharge_Summary_Note_Key',
        'Progress_Note_Date', 'Progress_Note_Key', 'HP_Note_Date',
        'HP_Note_Key', 'Echo_Date', 'Echo_Key', 'Imaging_Date', 'Imaging_Key',
        'Consult_Date', 'Consult_Key', 'ED_Provider_Notes_Date',
        'ED_Provider_Notes_Key', 'ECG_Date', 'ECG_Key',
        'Discharge_Summary_Text', 'Progress_Note_Text', 'HP_Note_Text',
        'Echo_Text', 'Imaging_Text', 'Consult_Text', 'ECG_Text',
        'ED_Provider_Notes_Text', 'One_Sentence_Extracted', 'note_count'],
       dtype='object'))

In [5]:
df['primaryeddiagnosisname'].nunique()

1554

In [15]:
for val in df['primaryeddiagnosisname'].unique():
    print(val)


CAD in native artery
Vesicular rash
SBO (small bowel obstruction) (CMS code)
Gastroduodenitis
Nausea vomiting and diarrhea
Abdominal pain
Acute abdominal pain
Chronic cholecystitis
Generalized abdominal pain
Cellulitis, unspecified cellulitis site
Cholelithiasis without cholecystitis
Complicated UTI (urinary tract infection)
Abdominal pain, right lower quadrant
Strangulated hernia of abdominal wall
Generalized weakness
Chronic pancreatitis, unspecified pancreatitis type (CMS code)
Sepsis, due to unspecified organism, unspecified whether acute organ dysfunction present (CMS code)
Diverticulitis of colon with perforation
Altered mental status, unspecified altered mental status type
Urinary retention
Chronic abdominal pain
Lower abdominal pain
Abdominal pain, right upper quadrant
Obstructive hyperbilirubinemia (CMS code)
Thrombosis of ovarian vein
Chest pain, unspecified type
Pericarditis with effusion
Pyelonephritis
Abdominal pain, left lower quadrant
Diverticulitis
Colostomy in place (C

In [11]:
## Inputs: Chosen PMH, Age, Sex, Physical Exam (ED_presentation column), chief complaint
## Output: predicted diagnosis 

In [6]:
import re
def extract_until_medical_decision(text):
    if pd.isna(text):
        return None
    # Split at first occurrence of "Medical Decision" (case-insensitive)
    parts = re.split(r'\bMedical Decision\b', text, flags=re.IGNORECASE)
    return parts[0].strip() if parts else None

# Apply it to the column
df['ED_Presentations'] = df['ED_Provider_Notes_Text'].apply(extract_until_medical_decision)


In [8]:
import pandas as pd
import os
import re
import json
import base64
import requests
import time
import urllib.parse

pd.set_option('display.max_rows', None)  # Display all rows
pd.set_option('display.max_columns', None)  # Display all columns
pd.set_option('display.max_colwidth', None)  # Set max column width to None
pd.set_option('display.width', None)  # Set width to None
# First, let's test by hard coding your Mulesoft Azure API key into the next line
API_KEY = 'x'  ##### Paste your API key between the quotes #####
API_VERSION = '2024-06-01'  # For the most recent production release: https://learn.microsoft.com/en-us/azure/ai-services/openai/api-version-deprecation#latest-ga-api-release
RESOURCE_ENDPOINT = 'https://unified-api.ucsf.edu/general'  # no trailing slash--this is used by libraries as a partial URL
DEPLOYMENT_NAME = "gpt-4o-2024-08-06" 

In [9]:
def predict_diagnosis_with_dynamic_notes(row):
    # Extract all available fields
    chief_complaint = row['primarychiefcomplaintname']
    sex = row['sex']
    age = row['Age']
    presentation = row['ED_Presentations']
    
    # Check if necessary basic fields exist
    if pd.isna(chief_complaint):
        return None, None  # Skip if chief complaint is missing, return None for both prediction and requested notes
    
    # Initialize available note types with their existence status
    available_notes = {
        'Discharge Summary': not pd.isna(row.get('Discharge_Summary_Text')),
        'Progress Notes': not pd.isna(row.get('Progress_Note_Text')),
        'H&P': not pd.isna(row.get('HP_Note_Text')),
        'Echo': not pd.isna(row.get('Echo_Text')),
        'Imaging': not pd.isna(row.get('Imaging_Text')),
        'Consult': not pd.isna(row.get('Consult_Text')),
        'ECG': not pd.isna(row.get('ECG_Text')),
    }
    
    # Step 1: Ask the model which notes it wants to see (always include Discharge Summary if available)
    url = f"{RESOURCE_ENDPOINT}/openai/deployments/{DEPLOYMENT_NAME}/chat/completions?api-version={API_VERSION}"
    
    headers = {
        "Content-Type": "application/json",
        "api-key": API_KEY
    }
    
    # First message to decide which notes to read
    selection_payload = {
        "messages": [
            {"role": "system", "content": "You are an experienced Emergency Department (ED) physician. Your task is to decide which medical notes you need to read to predict the patient's ED diagnosis based on the chief complaint, PMH, physical exam findings, age, and sex."},
            {"role": "user", "content": f"Patient basic info: {age}yo {sex} with chief complaint: {chief_complaint}\n\n"
                                      f"Available notes (respond ONLY with the names of notes you want to see, separated by commas):\n"
                                      f"- Discharge Summary: {'Available' if available_notes['Discharge Summary'] else 'Not available'}\n"
                                      f"- Progress Notes: {'Available' if available_notes['Progress Notes'] else 'Not available'}\n"
                                      f"- H&P: {'Available' if available_notes['H&P'] else 'Not available'}\n"
                                      f"- Echo: {'Available' if available_notes['Echo'] else 'Not available'}\n"
                                      f"- Imaging: {'Available' if available_notes['Imaging'] else 'Not available'}\n"
                                      f"- Consult: {'Available' if available_notes['Consult'] else 'Not available'}\n"
                                      f"- ECG: {'Available' if available_notes['ECG'] else 'Not available'}\n"
                                      f"Based on the chief complaint, list ONLY the note types you need to review (comma-separated, no explanation). Always include Discharge Summary if available:"}
        ],
        "temperature": 0.1,
        "max_tokens": 1000
    }
    
    # Request note selection
    retries = 0
    requested_notes = []
    notes_requested_str = ""  # String to track requested notes
    
    while retries < 3:
        try:
            selection_response = requests.post(url, headers=headers, json=selection_payload)
            selection_response.raise_for_status()
            
            # Get requested note types
            notes_text = selection_response.json()["choices"][0]["message"]["content"].strip()
            requested_notes = [note.strip() for note in notes_text.split(',')]
            
            # Save the original request for the tracking column
            notes_requested_str = notes_text
            
            # Filter out unavailable notes
            requested_notes = [note for note in requested_notes 
                              if note in available_notes.keys() and available_notes[note]]
            
            # Always include Discharge Summary if available and not already requested
            if available_notes['Discharge Summary'] and 'Discharge Summary' not in requested_notes:
                requested_notes.append('Discharge Summary')
                if notes_requested_str:
                    notes_requested_str += ", Discharge Summary (auto-added)"
                else:
                    notes_requested_str = "Discharge Summary (auto-added)"
            
            break
        except requests.exceptions.RequestException as e:
            print(f"Note selection request failed: {e}. Retrying {retries+1}/3...")
            time.sleep(5)
            retries += 1
    
    if not requested_notes and available_notes['Discharge Summary']:
        # Default to discharge summary if selection failed but it's available
        requested_notes = ['Discharge Summary']
        notes_requested_str = "Failed to get selections. Defaulted to: Discharge Summary"
    elif not requested_notes:
        # If selection failed and no discharge summary, use whatever is available
        available_notes_list = [note for note, available in available_notes.items() if available][:2]
        requested_notes = available_notes_list
        notes_requested_str = f"Failed to get selections. Defaulted to: {', '.join(available_notes_list)}"
    
    # Step 2: Prepare the actual notes content
    notes_content = ""
    
    # Map note types to DataFrame column names
    note_type_to_column = {
        'Discharge Summary': 'Discharge_Summary_Text',
        'Progress Notes': 'Progress_Note_Text',
        'H&P': 'HP_Note_Text',
        'Echo': 'Echo_Text',
        'Imaging': 'Imaging_Text',
        'Consult': 'Consult_Text',
        'ECG': 'ECG_Text',
    }
    
    # Function to truncate text to manage token limits
    def truncate_text(text, max_chars=3000):
        if text and len(text) > max_chars:
            return text[:max_chars] + "..."
        return text
    
    # Add requested notes to content, with truncation
    for note_type in requested_notes:
        column_name = note_type_to_column.get(note_type)
        if column_name and not pd.isna(row.get(column_name)):
            notes_content += f"\n\n{note_type}:\n{truncate_text(row[column_name])}"
    
    # Step 3: Generate the diagnosis prediction
    prediction_payload = {
        "messages": [
            {"role": "system", "content": "You are an experienced Emergency Department (ED) physician tasked with predicting the most likely diagnosis for a patient based on their presentation, chief complaint, and available medical information."},
            {"role": "user", "content": f"Based on the patient's chief complaint, age, sex, and available clinical information, predict the most likely ED diagnosis. Provide a single diagnosis for the CURRENT ED Visit from the standardized CMS codes without explanation but with the word, not the code.\n\n"
                                      f"Chief Complaint: {chief_complaint}\n"
                                      f"Age: {age}\n"
                                      f"Sex: {sex}\n"
                                      f"Current ED Presentation: {presentation}\n"
                                      f"{notes_content}"}
        ],
        "temperature": 0.1,
        "max_tokens": 6000
    }
    
    # Request the diagnosis prediction
    retries = 0
    while retries < 3:
        try:
            prediction_response = requests.post(url, headers=headers, json=prediction_payload)
            prediction_response.raise_for_status()
            return prediction_response.json()["choices"][0]["message"]["content"].strip(), notes_requested_str
        except requests.exceptions.RequestException as e:
            print(f"Prediction request failed: {e}. Retrying {retries+1}/3...")
            time.sleep(5)
            retries += 1
    
    return None, notes_requested_str  # Return None for prediction and the requested notes string if all retries fail

# Apply function to DataFrame - now returns two values (prediction and requested notes)
results = df.apply(lambda row: predict_diagnosis_with_dynamic_notes(row), axis=1)

# Split the results into two columns
df["Predicted_Diagnosis"] = [result[0] for result in results]
df["Requested_Notes"] = [result[1] for result in results]

# Add column to track accuracy of prediction
df["Prediction_Correct"] = df["Predicted_Diagnosis"] == df["primaryeddiagnosisname"]

# Calculate overall accuracy
accuracy = (df["Prediction_Correct"].sum() / df["Prediction_Correct"].count()) * 100
print(f"Overall prediction accuracy: {accuracy:.2f}%")

# Save to CSV and Display
df.to_csv("ed_diagnosis_predictions.csv", index=False)


Overall prediction accuracy: 4.67%


In [33]:
df.loc[:,['Age','primaryeddiagnosisname','Predicted_Diagnosis']]

Unnamed: 0,Age,primaryeddiagnosisname,Predicted_Diagnosis
0,88,CAD in native artery,Abdominal Aortic Aneurysm
1,71,Vesicular rash,Ascites
2,49,SBO (small bowel obstruction) (CMS code),Small bowel obstruction
3,58,Gastroduodenitis,Gastric ulcer
4,55,Nausea vomiting and diarrhea,Afferent loop syndrome
5,82,Abdominal pain,Cirrhosis
6,69,Acute abdominal pain,Colitis
7,48,Chronic cholecystitis,Cholecystitis
8,57,Abdominal pain,Hepatic encephalopathy
9,56,Acute abdominal pain,Carcinomatosis
