In [1]:
# Install pandas for data manipulation
!pip install pandas

# Install OpenAI library for API interaction
!pip install openai


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting openai
  Using cached openai-1.97.1-py3-none-any.whl.metadata (29 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Using cached jiter-0.10.0-cp312-cp312-win_amd64.whl.metadata (5.3 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Using cached httpcore-1.0.9-py3-none-any.whl.metadata (21 kB)
Collecting h11>=0.16 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Using cached h11-0.16.0-py3-none-any.whl.metadata (8.3 kB)
Using cached openai-1.97.1-py3-none-any.whl (764 kB)
Using cached httpx-0.28.1-py3-none-any.whl (73 kB)
Using cached httpcore-1.0.9-py3-none-any.whl (78 kB)
Using cached jiter-0.10.0-cp312-cp312-win_amd64.whl (206 kB)
Using cached h11-0.16.0-py3-none-any.whl (37 kB)
Installing collected package

In [15]:
import json
import os
from datetime import datetime

import pandas as pd

# Destination of the JSON file written at the end of this cell (one entry per sampled patient).
PROCESSED_DATA_FILE = 'processed_data/patient_data.json'

def load_and_preprocess_data():
    """Load the five source CSV files and prepare them for per-patient lookups.

    Reads ``patients.csv``, ``conditions.csv``, ``encounters.csv``,
    ``allergies.csv`` and ``immunizations.csv`` from the ``data/`` directory
    (relative to the current working directory), parses their date columns,
    adds an integer ``AGE`` column to the patients frame and sorts the four
    event frames newest-first.

    Returns:
        tuple: ``(patients_df, conditions_df, encounters_df, allergies_df,
        immunizations_df)``.

    Raises:
        Any exception raised by ``pd.read_csv`` / ``pd.to_datetime`` (for
        example a missing file) propagates to the caller; this function never
        returns ``None``.
    """
    patients_df = pd.read_csv('data/patients.csv')
    conditions_df = pd.read_csv('data/conditions.csv')
    encounters_df = pd.read_csv('data/encounters.csv')
    allergies_df = pd.read_csv('data/allergies.csv')
    immunizations_df = pd.read_csv('data/immunizations.csv')

    print("patients.csv, conditions.csv, encounters.csv, allergies.csv, immunizations.csv")

    # Calculate age in completed calendar years. Dividing the day count by 365
    # ignores leap days and reports the next age a few days before the birthday.
    # NOTE(review): age is measured against today's date for every patient,
    # including any whose record ends earlier - confirm this is intended.
    patients_df['BIRTHDATE'] = pd.to_datetime(patients_df['BIRTHDATE'])
    current_date = datetime.now()
    birth = patients_df['BIRTHDATE']
    birthday_pending = (birth.dt.month > current_date.month) | (
        (birth.dt.month == current_date.month) & (birth.dt.day > current_date.day)
    )
    patients_df['AGE'] = current_date.year - birth.dt.year - birthday_pending.astype(int)

    # Convert date columns to datetime
    date_columns = (
        (conditions_df, ('START', 'STOP')),
        (encounters_df, ('START', 'STOP')),
        (immunizations_df, ('DATE',)),
        (allergies_df, ('START',)),
    )
    for frame, columns in date_columns:
        for column in columns:
            frame[column] = pd.to_datetime(frame[column])

    # Newest records first, so that ``.head(n)`` yields the most recent ones.
    conditions_df = conditions_df.sort_values(by='START', ascending=False)
    encounters_df = encounters_df.sort_values(by='START', ascending=False)
    immunizations_df = immunizations_df.sort_values(by='DATE', ascending=False)
    allergies_df = allergies_df.sort_values(by='START', ascending=False)

    return patients_df, conditions_df, encounters_df, allergies_df, immunizations_df


def _format_record_date(value):
    """Format a timestamp as ``YYYY-MM-DD``; return ``'Unknown'`` for a missing value (NaT/NaN/None)."""
    return value.strftime('%Y-%m-%d') if pd.notna(value) else 'Unknown'


def _distinct_descriptions(frame):
    """Return the distinct, non-null ``DESCRIPTION`` values of ``frame`` in first-seen order."""
    return frame['DESCRIPTION'].dropna().unique().tolist()


def get_patient_data_text(patient_id, patients_df, conditions_df, encounters_df, allergies_df, immunizations_df):
    """Gather one patient's records and format them as a structured text summary.

    Args:
        patient_id: Value matched against ``patients_df['Id']`` and against the
            ``PATIENT`` column of the other four frames.
        patients_df: Frame with columns ``Id``, ``GENDER``, ``AGE``, ``FIRST``,
            ``LAST``, ``MARITAL``, ``RACE`` and ``ETHNICITY``.
        conditions_df: Frame with columns ``PATIENT`` and ``DESCRIPTION``.
        encounters_df: Frame with columns ``PATIENT``, ``START`` (datetime),
            ``DESCRIPTION`` and ``REASONDESCRIPTION``.
        allergies_df: Frame with columns ``PATIENT`` and ``DESCRIPTION``.
        immunizations_df: Frame with columns ``PATIENT``, ``DATE`` (datetime)
            and ``DESCRIPTION``.

    Returns:
        tuple: ``(patient_text, actual_conditions)`` where ``patient_text`` is
        the formatted summary (demographics, up to three most recent
        encounters, allergies, up to three most recent immunizations) and
        ``actual_conditions`` is the list of the patient's distinct, non-null
        condition descriptions. The conditions are deliberately NOT included
        in ``patient_text``.

    Raises:
        IndexError: if ``patient_id`` has no row in ``patients_df``.
    """
    matching_patients = patients_df[patients_df['Id'] == patient_id]
    if matching_patients.empty:
        # Same exception type the bare ``.iloc[0]`` lookup raised, but with an actionable message.
        raise IndexError(f"Patient ID {patient_id!r} not found in patients_df")
    patient_info = matching_patients.iloc[0]

    patient_conditions = conditions_df[conditions_df['PATIENT'] == patient_id]
    patient_encounters = encounters_df[encounters_df['PATIENT'] == patient_id]
    patient_allergies = allergies_df[allergies_df['PATIENT'] == patient_id]
    patient_immunizations = immunizations_df[immunizations_df['PATIENT'] == patient_id]

    marital_status = patient_info['MARITAL'] if pd.notna(patient_info['MARITAL']) else 'Unknown'

    # Collect the pieces and join once at the end instead of repeated string concatenation.
    parts = [
        f"Patient ID: {patient_id}\n",
        f"Name: {patient_info['FIRST']} {patient_info['LAST']}\n",
        f"Age: {patient_info['AGE']}\n",
        f"Gender: {patient_info['GENDER']}\n",
        f"Marital Status: {marital_status}\n",
        f"Race: {patient_info['RACE']}\n",
        f"Ethnicity: {patient_info['ETHNICITY']}\n",
    ]

    if not patient_encounters.empty:
        parts.append("\nRecent Encounters (up to 3 most recent):\n")
        # Sort here so "most recent" holds even if the caller did not pre-sort.
        # The stable sort leaves already-sorted input in its existing order.
        recent_encounters = patient_encounters.sort_values(by='START', ascending=False, kind='stable').head(3)
        for _, encounter in recent_encounters.iterrows():
            reason_desc = encounter['REASONDESCRIPTION'] if pd.notna(encounter['REASONDESCRIPTION']) else 'N/A'
            parts.append(
                f"- Date: {_format_record_date(encounter['START'])}, "
                f"Type: {encounter['DESCRIPTION']}, Reason: {reason_desc}\n"
            )
    else:
        parts.append("\nRecent Encounters: None\n")

    # Missing descriptions are dropped: they would otherwise break ", ".join().
    allergy_names = _distinct_descriptions(patient_allergies)
    if allergy_names:
        parts.append("\nKnown Allergies:\n")
        parts.append(f"- {', '.join(map(str, allergy_names))}\n")
    else:
        parts.append("\nKnown Allergies: None\n")

    if not patient_immunizations.empty:
        parts.append("\nRecent Immunizations (up to 3 most recent):\n")
        recent_immunizations = patient_immunizations.sort_values(by='DATE', ascending=False, kind='stable').head(3)
        for _, immunization in recent_immunizations.iterrows():
            parts.append(
                f"- Date: {_format_record_date(immunization['DATE'])}, Type: {immunization['DESCRIPTION']}\n"
            )
    else:
        parts.append("\nRecent Immunizations: None\n")

    # Non-null only, so the list can be JSON-serialised and joined as text later.
    actual_conditions = _distinct_descriptions(patient_conditions)

    return "".join(parts), actual_conditions

# Number of patients (those with the most condition records) to process.
SAMPLE_SIZE = 5

# load_and_preprocess_data() either returns five DataFrames or raises, so there
# is no "returned None" case to check for here.
patients_df, conditions_df, encounters_df, allergies_df, immunizations_df = load_and_preprocess_data()

# Rank patients by how many condition rows they have, most first. Only IDs that
# also appear in patients_df are kept, because get_patient_data_text() needs the
# demographic row of every sampled patient.
patient_condition_counts = conditions_df.groupby('PATIENT').size().sort_values(ascending=False)
known_patient_counts = patient_condition_counts[patient_condition_counts.index.isin(patients_df['Id'])]
sample_patient_ids = known_patient_counts.index.tolist()[:SAMPLE_SIZE]

print(f"\nSelected Sample Patient IDs for processing: {sample_patient_ids}")

# patient_id -> {'text_data': prompt-ready summary, 'actual_conditions': ground-truth list}
processed_patient_data = {}
for patient_id in sample_patient_ids:
    text_data, actual_conds = get_patient_data_text(
        patient_id, patients_df, conditions_df, encounters_df, allergies_df, immunizations_df
    )
    processed_patient_data[patient_id] = {
        'text_data': text_data,
        'actual_conditions': actual_conds
    }
    print(f"\nProcessed Data for Patient ID: {patient_id}")
    print(text_data)

# Create the output directory on first run; open(..., 'w') does not create it.
output_dir = os.path.dirname(PROCESSED_DATA_FILE)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

with open(PROCESSED_DATA_FILE, 'w') as f:
    json.dump(processed_patient_data, f, indent=4)
print(f"\nProcessed patient data saved to {PROCESSED_DATA_FILE}")

patients.csv, conditions.csv, encounters.csv, allergies.csv, immunizations.csv

Selected Sample Patient IDs for processing: ['10342f7e-5793-1888-ee52-e1dac9ceae10', '88d1d936-6df2-66a2-04ef-b5ed5a2feceb', '2d11128a-82e5-3e38-a0b5-2e7bf7d5e378', 'bb090f3b-7bac-01f7-e262-6b026db13df8', 'cfa38fb8-2a40-b805-0e72-a9a10a7369b7']

Processed Data for Patient ID: 10342f7e-5793-1888-ee52-e1dac9ceae10
Patient ID: 10342f7e-5793-1888-ee52-e1dac9ceae10
Name: Lauren941 Nikolaus26
Age: 72
Gender: M
Marital Status: M
Race: asian
Ethnicity: nonhispanic

Recent Encounters (up to 3 most recent):
- Date: 2024-06-12, Type: Death Certification, Reason: End-stage renal disease (disorder)
- Date: 2024-05-22, Type: Admission to hospice (procedure), Reason: Chronic congestive heart failure (disorder)
- Date: 2024-05-22, Type: Encounter for problem (procedure), Reason: End-stage renal disease (disorder)

Known Allergies: None

Recent Immunizations (up to 3 most recent):
- Date: 2024-05-08, Type: Td (adult)  5 Lf 

In [17]:
import openai
import json
import os
import time 

# Client is built with no explicit arguments, so no credentials are hard-coded here.
# NOTE(review): presumably the API key comes from the environment - confirm.
client = openai.Client()

# Chat model used by the prompting cells below.
OPENAI_MODEL = "gpt-3.5-turbo"

# Same JSON file that the preprocessing cell writes.
PROCESSED_DATA_FILE = 'processed_data/patient_data.json'

with open(PROCESSED_DATA_FILE) as f:
    processed_patient_data = json.loads(f.read())

# The first patient stored in the file is the subject of every single-patient demo.
patient_id_for_single_run = list(processed_patient_data)[0]
single_run_record = processed_patient_data[patient_id_for_single_run]
patient_data_single = single_run_record['text_data']
actual_conditions_single = single_run_record['actual_conditions']

actual_conditions_joined = ', '.join(actual_conditions_single)
print(f"\nDemonstrating LLM Prompting Methods for Patient ID: {patient_id_for_single_run}")
print(f"Actual Conditions: {actual_conditions_joined}\n")




Demonstrating LLM Prompting Methods for Patient ID: 10342f7e-5793-1888-ee52-e1dac9ceae10
Actual Conditions: Medication review due (situation), Gingivitis (disorder), Stress (finding), Full-time employment (finding), Not in labor force (finding), Dental filling lost (finding), Reports of violence in the environment (finding), Primary dental caries (disorder), Limited social contact (finding), Acute infective cystitis (disorder), Part-time employment (finding), Victim of intimate partner abuse (finding), Social isolation (finding), Chronic congestive heart failure (disorder), Injury of knee (disorder), Injury of medial collateral ligament of knee (disorder), Unemployed (finding), Viral sinusitis (disorder), Gingival disease (disorder), End-stage renal disease (disorder), Chronic kidney disease stage 4 (disorder), Awaiting transplantation of kidney (situation), Unhealthy alcohol drinking behavior (finding), Loose dental filling (finding), Misuses drugs (finding), Metabolic syndrome X (di

In [18]:
# 1. Zero-Shot Prompting
# The model receives only a one-line instruction plus the patient summary:
# no examples and no reasoning scaffold.
instruction_zs = "Based on the patient's health information, what are the most likely medical conditions this patient might have? List them concisely."
input_context_zs = patient_data_single

messages_zs = [
    {"role": "system", "content": "You are a helpful medical assistant."},
    {"role": "user", "content": f"{instruction_zs}\n\nPatient Data:\n{input_context_zs}"}
]

# Call through the `client` created in the setup cell instead of the implicit
# module-level client, so all requests share the one explicitly built client.
response = client.chat.completions.create(
    model=OPENAI_MODEL,
    messages=messages_zs,
    max_tokens=500,
    temperature=0.7,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
)
# `content` can be None; fall back to an empty string so .strip() cannot fail.
zero_shot_response = (response.choices[0].message.content or "").strip()


print("\nLLM Response (Zero-Shot):\n", zero_shot_response)


LLM Response (Zero-Shot):
 Based on the patient's health information, the most likely medical conditions this patient might have are:
1. End-stage renal disease (disorder)
2. Chronic congestive heart failure (disorder)


In [23]:
# 2. Chain-of-Thought (CoT) Prompting
# Pull the age and gender back out of the formatted summary ("Age: <n>\n",
# "Gender: <g>\n") so they can be quoted explicitly in the instruction.
age_str = patient_data_single.split('Age: ')[1].split('\n')[0].strip()
gender_str = patient_data_single.split('Gender: ')[1].split('\n')[0].strip()

instruction_cot = f"""As a medical diagnostician, analyze the following patient's health information step-by-step to determine the most likely medical conditions. Provide your analysis at each step:

1.  **Demographics and General Health Context:** What can be inferred about the patient based on their age ({age_str}), gender ({gender_str}), and general examination encounters?
2.  **Allergies and Immunizations:** Are there any risk factors or protective factors evident from allergies or recent immunizations?
3.  **Encounter Reasons/Trends:** Are there any implications from the types of recent encounters (General examination, check-up, dental referral) or their stated reasons?
4.  **Synthesize Findings:** Combine these points. What are 1-3 broad areas of health concern or common conditions for someone with this profile?
5.  **Most Likely Conditions:** Based on your synthesis, what are the most likely specific medical conditions this patient could have, or conditions they should be screened for? Explain your reasoning briefly for each."""
input_context_cot = patient_data_single

messages_cot = [
    {"role": "system", "content": "You are a helpful medical diagnostician."},
    {"role": "user", "content": f"{instruction_cot}\n\nPatient Data:\n{input_context_cot}"}
]

# Call through the `client` created in the setup cell instead of the implicit
# module-level client, so all requests share the one explicitly built client.
response = client.chat.completions.create(
    model=OPENAI_MODEL,
    messages=messages_cot,
    max_tokens=500,
    temperature=0.7,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
)
# `content` can be None; fall back to an empty string so .strip() cannot fail.
cot_response = (response.choices[0].message.content or "").strip()


print("\nLLM Response (CoT):\n", cot_response)

# Ground truth is printed next to the answer for a manual comparison.
print(f"Compare predicted conditions with actual conditions: {', '.join(actual_conditions_single)}")



LLM Response (CoT):
 1. **Demographics and General Health Context:** The patient is a 72-year-old Asian male who has recent encounters related to end-stage renal disease and chronic congestive heart failure. These encounters suggest complex and potentially serious chronic health issues.

2. **Allergies and Immunizations:** The patient has no known allergies. Recent immunizations include Td (tetanus) and influenza vaccines, indicating some level of protection against these specific diseases.

3. **Encounter Reasons/Trends:** The recent encounters for death certification related to end-stage renal disease and admission to hospice for chronic congestive heart failure highlight significant health challenges, particularly related to renal and cardiac function.

4. **Synthesize Findings:** Based on the patient's age, recent encounters for end-stage renal disease and chronic congestive heart failure, and the absence of allergies, the broad areas of health concern include renal function, card

In [24]:
# 3. Tree-of-Thought (ToT) Prompting
# Reuses `age_str` and `gender_str` computed in the Chain-of-Thought cell, so
# that cell must have been run first.
instruction_tot = f"""As a medical diagnostician, consider this patient's health profile. Let's explore multiple diagnostic possibilities by following these steps:

1.  **Brainstorm broad categories of potential health issues** relevant to a {age_str}-year-old {gender_str} with general health encounters (e.g., Metabolic, Cardiovascular, Mental Health, Oral Health, etc.). List at least 3 categories.
2.  **For each category, propose 1-2 specific conditions** that are plausible, even if not directly symptomatic in the provided data, explaining the initial thought process for each.
3.  **Critically evaluate each proposed condition against the patient's data.** For each condition, explicitly state any supporting evidence (even if indirect inference) and any contradicting evidence from the provided records.
4.  **Based on your evaluation, identify the top 2-3 most likely conditions** and justify why they are stronger candidates compared to others."""
input_context_tot = patient_data_single

messages_tot = [
    {"role": "system", "content": "You are a helpful medical diagnostician."},
    {"role": "user", "content": f"{instruction_tot}\n\nPatient Data:\n{input_context_tot}"}
]

# Call through the `client` created in the setup cell instead of the implicit
# module-level client, so all requests share the one explicitly built client.
response = client.chat.completions.create(
    model=OPENAI_MODEL,
    messages=messages_tot,
    max_tokens=500,
    temperature=0.7,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
)
# `content` can be None; fall back to an empty string so .strip() cannot fail.
tot_response = (response.choices[0].message.content or "").strip()


print("\nLLM Response (ToT):\n", tot_response)
print("\nEvaluation (ToT)")

# Ground truth is printed next to the answer for a manual comparison.
print(f"Compare predicted conditions with actual conditions: {', '.join(actual_conditions_single)}")



LLM Response (ToT):
 1. **Cardiovascular**
   - **Hypertension**: Given the patient's age and the common occurrence of hypertension in older adults, it could be a contributing factor to the chronic congestive heart failure.
   - **Atherosclerosis**: Atherosclerosis is a common underlying cause of heart failure, especially in older individuals with multiple comorbidities like end-stage renal disease.

2. **Renal**
   - **Chronic Kidney Disease**: End-stage renal disease is the final stage of chronic kidney disease. The patient's history of renal disease indicates a potential long-term issue with the kidneys.
   - **Renal Hypertension**: Chronic kidney disease can lead to secondary hypertension, which could exacerbate the cardiovascular issues present in this case.

3. **Metabolic**
   - **Diabetes**: Diabetes is a common comorbidity with both chronic kidney disease and heart failure. The connection between these conditions could suggest an underlying metabolic issue.
   - **Hyperlipide

In [25]:
# 4. In-Context Learning with CoT 
# One worked example (second patient in the file) is shown to the model as a
# prior user/assistant exchange before the real question about the first patient.

example_patient_id = list(processed_patient_data.keys())[1]
example_patient_data = processed_patient_data[example_patient_id]['text_data']
example_actual_conditions = processed_patient_data[example_patient_id]['actual_conditions']

# Extract the example patient's age and gender OUTSIDE the f-string, splitting on
# a real newline. Written inline as split('\\n') the expression split on a
# literal backslash-n, which never occurs, so the whole remaining record was
# injected into the prompt in place of the age/gender.
example_age_str = example_patient_data.split('Age: ')[1].split('\n')[0].strip()
example_gender_str = example_patient_data.split('Gender: ')[1].split('\n')[0].strip()

# NOTE(review): this "ideal" answer is hand-written with fixed demographics
# (48-year-old female, essential hypertension); it is not derived from
# example_patient_data - confirm it matches the example patient.
ideal_example_cot_response = """1.  **Demographics and General Health Context:** Patient is a 48-year-old female. Her recent encounters show follow-ups for Essential Hypertension, indicating a pre-existing chronic condition.
2.  **Allergies and Immunizations:** No known allergies. Regular influenza immunizations suggest proactive health management.
3.  **Encounter Reasons/Trends:** Recurring follow-up encounters specifically for "Essential hypertension" is a strong indicator of this active condition. General check-ups also occur.
4.  **Synthesize Findings:** The primary focus is clearly on the management of existing hypertension. Given chronic hypertension, related conditions like chronic kidney disease are common comorbidities. Also, general check-ups may reveal common age-related issues.
5.  **Most Likely Conditions:**
    * **Essential Hypertension:** Explicitly mentioned in encounter reasons, indicating ongoing management.
    * **Chronic Kidney Disease (related to hypertension or diabetes):** A common complication of long-standing hypertension.
    * **Gingivitis/Dental Issues:** Often identified during general check-ups or if a referral is made, although not explicitly stated for this patient, it's a common finding.
    * **Stress/Anxiety:** Common non-specific finding during general exams.
"""

instruction_few_shot_example = f"""As a medical diagnostician, analyze the following patient's health information step-by-step to determine the most likely medical conditions. Provide your analysis at each step:

1.  **Demographics and General Health Context:** What can be inferred about the patient based on their age ({example_age_str}), gender ({example_gender_str}), and general examination encounters?
2.  **Allergies and Immunizations:** Are there any risk factors or protective factors evident from allergies or recent immunizations?
3.  **Encounter Reasons/Trends:** Are there any implications from the types of recent encounters (General examination, check-up, dental referral) or their stated reasons?
4.  **Synthesize Findings:** Combine these points. What are 1-3 broad areas of health concern or common conditions for someone with this profile?
5.  **Most Likely Conditions:** Based on your synthesis, what are the most likely specific medical conditions this patient could have, or conditions they should be screened for? Explain your reasoning briefly for each."""

# Example exchange first, then the real question. `instruction_cot` comes from
# the Chain-of-Thought cell, so that cell must have been run first.
messages_few_shot_cot = [
    {"role": "system", "content": "You are a helpful medical diagnostician. Follow the example provided."},
    {"role": "user", "content": f"{instruction_few_shot_example}\n\nPatient Data:\n{example_patient_data}"},
    {"role": "assistant", "content": ideal_example_cot_response},
    {"role": "user", "content": f"{instruction_cot}\n\nPatient Data:\n{patient_data_single}"} 
]


# Call through the `client` created in the setup cell instead of the implicit
# module-level client, so all requests share the one explicitly built client.
response = client.chat.completions.create(
    model=OPENAI_MODEL,
    messages=messages_few_shot_cot,
    max_tokens=500,
    temperature=0.7,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
)
# `content` can be None; fall back to an empty string so .strip() cannot fail.
few_shot_response = (response.choices[0].message.content or "").strip()


print("\nLLM Response (Few-Shot CoT):\n", few_shot_response)

# Ground truth is printed next to the answer for a manual comparison.
print(f"Compare predicted conditions with actual conditions: {', '.join(actual_conditions_single)}")


LLM Response (Few-Shot CoT):
 1.  **Demographics and General Health Context:** Patient is a 72-year-old male. Recent encounters indicate advanced stages of chronic conditions like end-stage renal disease and chronic congestive heart failure.
2.  **Allergies and Immunizations:** No known allergies. Recent immunizations include tetanus and influenza vaccines, which are in line with recommended adult vaccinations for this age group.
3.  **Encounter Reasons/Trends:** The recent encounters for end-stage renal disease and chronic congestive heart failure are concerning for advanced and severe health issues. Hospice admission for heart failure and death certification for renal disease suggest significant disease progression.
4.  **Synthesize Findings:** The patient is dealing with severe end-stage renal disease and chronic congestive heart failure, both of which are life-threatening conditions. The need for hospice care indicates a terminal stage of heart failure.
5.  **Most Likely Condition