In [1]:
import os
from dotenv import load_dotenv
from pathlib import Path
import json 
from typing import List, Dict, Union, Optional, Tuple, Literal
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
from pydantic import BaseModel, Field
from prompt import *
# Load environment variables from the `.env` file located in the user's home directory.
# NOTE(review): presumably this supplies the API key picked up by `OpenAI()` — confirm.
env_path = Path.home()
load_dotenv(dotenv_path=env_path / ".env")

True

In [6]:
# Pydantic schema wrapping a flat list of rule strings.
# NOTE(review): no other cell visible in this notebook references this model — confirm it is still needed.
class KnowledgeRules(BaseModel):
    rules: List[str]  # the rules, one string per entry
 
# Module-level OpenAI client shared by the `get_response` helpers in this notebook.
client = OpenAI()

In [3]:
def get_response(prompt):
    """Request a structured staging answer for one fully formatted prompt.

    A fixed system message plus `prompt` (as the user turn) is sent to
    `gpt-4o-2024-08-06` at temperature 0 through the structured-output
    `parse` call, with `Response_t14` as the response schema.

    Args:
        prompt: The user prompt text, already formatted.

    Returns:
        The parsed message payload (`choices[0].message.parsed`), or `None`
        if anything raised; the error is printed rather than re-raised.
    """
    system_turn = {
        "role": "system",
        "content": "You are a medical professional reviewing a pathology report to determine the cancer stage.",
    }
    user_turn = {"role": "user", "content": prompt}

    try:
        completion = client.beta.chat.completions.parse(
            model="gpt-4o-2024-08-06",
            messages=[system_turn, user_turn],
            temperature=0,
            response_format=Response_t14,
        )
        # Kept inside the try so that a malformed completion is also reported as a failure.
        return completion.choices[0].message.parsed
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    
def get_tnm_stage(df, prompt_method, prompt, stage_type, context = ""):
    """Query the model once per report in `df` and print prediction vs. label.

    Rows for which `get_response` yields no parsed answer (`None`) are counted
    as parsing errors and skipped instead of crashing on `response.stage`.

    Args:
        df: DataFrame with a 'text' column (the report inserted into the
            prompt) and a `stage_type` column (the reference label printed
            next to the prediction).
        prompt_method: Label of the prompting strategy. Currently unused: it
            was only needed by the result-recording code that is disabled.
        prompt: Template string with a `{report}` placeholder and, when
            `context` is non-empty, a `{context}` placeholder.
        stage_type: Name of the label column in `df` (e.g. "t").
        context: Optional extra text substituted for `{context}`.

    Returns:
        None. Predictions are only printed; `df` is not modified.
    """
    parsing_error = 0

    # Wrapping the iterator lets tqdm advance once per row and close the bar
    # when the loop finishes (the hand-managed bar was never updated or closed).
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        report = row['text']
        if context:
            formatted_prompt = prompt.format(report=report, context=context)
        else:
            formatted_prompt = prompt.format(report=report)

        response = get_response(formatted_prompt)
        if response is None:
            # get_response already printed the error; keep going with the next report.
            parsing_error += 1
            continue

        print("pred: ", response.stage)
        print("answer: ", row[stage_type])

    print(f"Total parsing errors: {parsing_error}")
    # TODO: re-enable writing predictions back into per-method columns and returning the frame.

In [4]:
# Load the BRCA summary, drop rows whose "n" label is -1, keep the four
# columns used downstream, and limit the run to the first 30 remaining rows.
# NOTE(review): rows are filtered on "n" while the run below scores "t" — confirm this is intended.
brca_df = (
    pd.read_csv("/secure/shared_data/rag_tnm_results/summary/5_folds_summary/brca_df.csv")
    .loc[lambda frame: frame["n"] != -1, ['patient_filename', 't', 'text', 'n']]
    .iloc[:30]
)

# Run the "zscot" prompt against the "t" label column, without extra context.
get_tnm_stage(df=brca_df, prompt_method="zscot", prompt=zscot_t14, stage_type="t", context="")

  0%|          | 0/30 [00:00<?, ?it/s]

pred:  T2
answer:  1
pred:  T2
answer:  1
pred:  T2
answer:  1
pred:  T1
answer:  1
pred:  T1
answer:  1
pred:  T1
answer:  0
pred:  T2
answer:  1
pred:  T2
answer:  1
pred:  T2
answer:  1
pred:  T2
answer:  1
pred:  T3
answer:  2
pred:  T1
answer:  0
pred:  T2
answer:  1
pred:  T2
answer:  1
pred:  T2
answer:  1
pred:  T1
answer:  2
pred:  T3
answer:  2
pred:  T3
answer:  2
pred:  T3
answer:  2
pred:  T4
answer:  3
pred:  T2
answer:  1
pred:  T2
answer:  1
pred:  T2
answer:  1
pred:  T2
answer:  1
pred:  T1
answer:  1
pred:  T1
answer:  0
pred:  T2
answer:  1
pred:  T1
answer:  0
pred:  T4
answer:  1


  0%|          | 0/30 [02:04<?, ?it/s]

pred:  T2
answer:  1





In [7]:
# Task-3 splits; the trailing numbers are the row counts noted by the author.
a_train_df = pd.read_csv("/home/yl3427/cylab/rag_tnm/Task-3_Train.csv")  # 603
a_eval_df = pd.read_csv("/home/yl3427/cylab/rag_tnm/Task-3_Eval.csv")  # 75
a_test_df = pd.read_csv("/home/yl3427/cylab/rag_tnm/Task-3_Test.csv")  # 87

# BioNLP2023-1A files: the train file (765 rows) includes all three Task-3 splits above.
as_train_df = pd.read_csv("/home/yl3427/cylab/rag_tnm/BioNLP2023-1A-Train.csv")
as_test_df = pd.read_csv("/home/yl3427/cylab/rag_tnm/BioNLP2023-1A-Test.csv")  # for shots

# Keep the rows of the combined train file whose File ID is in the Task-3 test
# split or is one of three extra, explicitly listed files.
test_file_ids = set(a_test_df['File ID']) | {'190862.txt', '109943.txt', '195790.txt'}
filtered_df = as_train_df.loc[as_train_df['File ID'].isin(test_file_ids)]
filtered_df.shape[0]  # 90 - 1 = 89

89

In [14]:
def get_response(prompt):
    """Send one progress-note prompt to the chat model and return its reply message.

    This reuses the name of the pathology-staging `get_response` defined
    earlier in the notebook, so running this cell replaces that helper.

    Args:
        prompt: The user prompt text, already formatted.

    Returns:
        The message object of the first choice; read `.content` for the text.
        Exceptions raised by the API call are not caught here.
    """
    completion = client.chat.completions.create(
        model="gpt-4o-2024-08-06",
        messages=[
            {
                "role": "system",
                "content": "You are a medical professional reviewing a progress note to understand a patient's condition.",
            },
            {"role": "user", "content": prompt},
        ],
        temperature=0,
    )
    reply = completion.choices[0].message
    return reply


In [22]:
# Zero-shot prompt: task instructions followed by one hard-coded progress note
# (Subjective / Objective / Assessment sections). It is sent verbatim, with no
# placeholders to fill in.
query = """
You are provided with a patient's medical information from a progress note formatted in the SOAP structure, containing only the Subjective, Objective, and Assessment sections. Your task is to generate a summary that lists the patient's medical problems and diagnoses, including both direct and indirect problems (a past medical problem or consequence from the primary diagnosis). Present your response as a concatenated list of diagnoses separated by semicolons without any additional text or formatting.

Patient Information:
<Subjective>
TITLE: Chief Complaint: 24 Hour Events: Allergies: No Known Drug Allergies
</Subjective>

<Objective>
Last dose of Antibiotics: Infusions: Other ICU medications: Other medications: Changes to medical and family history: Review of systems is unchanged from admission except as noted below Review of systems: Flowsheet Data as of   07:07 AM Vital signs Hemodynamic monitoring Fluid balance 24 hours Since 12 AM Tmax: 36.6 C (97.9 Tcurrent: 36.3 C (97.4 HR: 54 (42 - 76) bpm BP: 142/54(75) (114/32(56) - 147/76(90)) mmHg RR: 17 (12 - 19) insp/min SpO2: 99% Heart rhythm: SR (Sinus Rhythm) Height: 76 Inch Total In: 900 mL PO: 900 mL TF: IVF: Blood products: Total out: 680 mL 980 mL Urine: 680 mL 980 mL NG: Stool: Drains: Balance: -680 mL -80 mL Respiratory support SpO2: 99% ABG: ///27/ Physical Examination GENERAL: Alert, interactive, comfortable, NAD. HEENT: Enlarged 1cm (approx) uvula with erythema and swelling of left side of soft palate/arch. CARDIAC: RRR, normal S1, S2. No m/r/g. LUNGS: Resp unlabored, no accessory muscle use. CTAB, no crackles, wheezes or rhonchi. ABDOMEN: Soft, NTND. EXTREMITIES: No c/c/e. Labs / Radiology 240 K/uL 13.9 g/dL 150 mg/dL 1.0 mg/dL 27 mEq/L 4.5 mEq/L 13 mg/dL 104 mEq/L 139 mEq/L 42.3 % 6.1 K/uL [image002.jpg]   04:01 AM WBC 6.1 Hct 42.3 Plt 240 Cr 1.0 Glucose 150 Other labs: Differential-Neuts:83.6 %, Lymph:10.6 %, Mono:5.1 %, Eos:0.3 %, Ca++:9.8 mg/dL, Mg++:1.8 mg/dL, PO4:3.7 mg/dL
</Objective>

<Assessment>
Mr. [**Known lastname 8748**] is a 19 year old gentleman with history of AVNRT s/p nodal\n   ablation on [**2182-11-7**] with post procedural swelling of uvula.
</Assessment>

"""
# Send the prompt and keep the returned message object for inspection.
# NOTE(review): the saved output of this cell is a 404 for model `o1-preview`, which does
# not match the model named in the current `get_response` — the output looks stale; re-run.
answ = get_response(query)

NotFoundError: Error code: 404 - {'error': {'message': 'The model `o1-preview` does not exist or you do not have access to it.', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}

In [13]:
# Show the text of the model reply.
# NOTE(review): presumably `answ` here comes from an earlier successful run — confirm after a fresh Run All.
answ.content # gpt-4o-2024-08-06

'AVNRT; post procedural swelling of uvula'

In [24]:
# One-shot prompt template: task instructions, one worked example (progress note
# plus its expected "Summary:" line), then the note to be summarised.
# Fill it with str.format using the placeholders {subjective_section},
# {objective_section} and {assessment_section}. Any literal brace added to the
# template text itself would have to be doubled ({{ }}) for str.format.
soap_oneshot = """
You are provided with a patient's medical information from a progress note formatted in the SOAP structure, containing only the Subjective, Objective, and Assessment sections. Your task is to generate a summary that lists the patient's medical problems and diagnoses, including both direct and indirect problems (a past medical problem or consequence from the primary diagnosis). Use the example provided to guide your response. Present your response as a concatenated list of diagnoses separated by semicolons without any additional text or formatting.

-----
Example

Patient Information:
<Subjective>
TITLE: Chief Complaint: 24 Hour Events: Allergies: No Known Drug Allergies
</Subjective>

<Objective>
Last dose of Antibiotics: Infusions: Other ICU medications: Other medications: Changes to medical and family history: Review of systems is unchanged from admission except as noted below Review of systems: Flowsheet Data as of   07:07 AM Vital signs Hemodynamic monitoring Fluid balance 24 hours Since 12 AM Tmax: 36.6 C (97.9 Tcurrent: 36.3 C (97.4 HR: 54 (42 - 76) bpm BP: 142/54(75) (114/32(56) - 147/76(90)) mmHg RR: 17 (12 - 19) insp/min SpO2: 99% Heart rhythm: SR (Sinus Rhythm) Height: 76 Inch Total In: 900 mL PO: 900 mL TF: IVF: Blood products: Total out: 680 mL 980 mL Urine: 680 mL 980 mL NG: Stool: Drains: Balance: -680 mL -80 mL Respiratory support SpO2: 99% ABG: ///27/ Physical Examination GENERAL: Alert, interactive, comfortable, NAD. HEENT: Enlarged 1cm (approx) uvula with erythema and swelling of left side of soft palate/arch. CARDIAC: RRR, normal S1, S2. No m/r/g. LUNGS: Resp unlabored, no accessory muscle use. CTAB, no crackles, wheezes or rhonchi. ABDOMEN: Soft, NTND. EXTREMITIES: No c/c/e. Labs / Radiology 240 K/uL 13.9 g/dL 150 mg/dL 1.0 mg/dL 27 mEq/L 4.5 mEq/L 13 mg/dL 104 mEq/L 139 mEq/L 42.3 % 6.1 K/uL [image002.jpg]   04:01 AM WBC 6.1 Hct 42.3 Plt 240 Cr 1.0 Glucose 150 Other labs: Differential-Neuts:83.6 %, Lymph:10.6 %, Mono:5.1 %, Eos:0.3 %, Ca++:9.8 mg/dL, Mg++:1.8 mg/dL, PO4:3.7 mg/dL
</Objective>

<Assessment>
Mr. [**Known lastname 8748**] is a 19 year old gentleman with history of AVNRT s/p nodal\n   ablation on [**2182-11-7**] with post procedural swelling of uvula.
</Assessment>

Summary:
Uvula swelling; AVNRT
-----

Now, please perform the same task on the following patient information:

Patient Information:
<Subjective>
{subjective_section}
</Subjective> 

<Objective>
{objective_section}
</Objective>

<Assessment>
{assessment_section}
</Assessment>

"""

In [28]:
# Smoke-check the template with dummy values, including one that itself contains
# braces ("test{80%}"): str.format inserts substituted values literally.
soap_oneshot.format(subjective_section = "test", objective_section = "test", assessment_section = "test{80%}")

"\nYou are provided with a patient's medical information from a progress note formatted in the SOAP structure, containing only the Subjective, Objective, and Assessment sections. Your task is to generate a summary that lists the patient's medical problems and diagnoses, including both direct and indirect problems (a past medical problem or consequence from the primary diagnosis). Use the example provided to guide your response. Present your response as a concatenated list of diagnoses separated by semicolons without any additional text or formatting.\n\n-----\nExample\n\nPatient Information:\n<Subjective>\nTITLE: Chief Complaint: 24 Hour Events: Allergies: No Known Drug Allergies\n</Subjective>\n\n<Objective>\nLast dose of Antibiotics: Infusions: Other ICU medications: Other medications: Changes to medical and family history: Review of systems is unchanged from admission except as noted below Review of systems: Flowsheet Data as of   07:07 AM Vital signs Hemodynamic monitoring Fluid ba