In [1]:
from langchain_openai import AzureChatOpenAI
from langchain.prompts import ChatPromptTemplate
import os

import truststore
truststore.inject_into_ssl()
import numpy as np
import pandas as pd
from tqdm import tqdm
import time
import seaborn as sns
import numpy as np
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
# Credentials for the IDAM token generator must be supplied through the
# process environment (shell export, .env loader, secrets manager) and never
# written into the notebook, where they end up in version control and exports.
# NOTE(review): the secret that used to be hardcoded here should be treated as
# compromised and rotated.
# Presumably `llm_idam_token_generator` reads these two variables -- confirm
# against that package's documentation.
_missing_credentials = [
    name
    for name in ("APP_CLIENT_ID", "APP_CLIENT_SECRET")
    if not os.environ.get(name)
]
if _missing_credentials:
    # Fail here with a clear message instead of later inside the token call.
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_credentials)
        + ". Set them before starting the notebook kernel."
    )
from llm_idam_token_generator.idam_token_generator import get_idam_token
from langchain_community.callbacks import get_openai_callback
from langchain_openai import OpenAI
import tiktoken
import time

In [2]:
# Gateway endpoints that can be targeted; the active one is picked by index below.
end_point_list = [
    "https://lmaas-beta.ai.gehealthcare.com",
    "https://openai-llm-frontdoor-hma7evbthrd4cugn.a01.azurefd.net",
]

# Deployment names that can be targeted; the active one is picked by index below.
model_list = ["gpt-35-turbo-16k", "gpt-4-32k-beta", "gpt-4o"]

# API keys must not be stored in the notebook, not even commented out;
# load them from the environment if one is ever needed.

# Active configuration: first endpoint, first model.
OPENAI_ENDPOINT, OPENAI_DEPLOYMENT_MODEL = end_point_list[0], model_list[0]
OPENAI_AZURE_API_VERSION = "2023-12-01-preview"  # alternative previously tried: '2024-02-01'
OPENAI_TYPE = "azure"

In [3]:
# help(AzureChatOpenAI)

# The bearer token is fetched once, when this cell runs, and stored in the
# client's default headers.
# NOTE(review): nothing here refreshes the token; if it can expire during a
# long run, later requests would presumably fail -- confirm token lifetime.
gateway_headers = {
    'Authorization': f'Bearer {get_idam_token()}',
    'Content-Type': 'application/json',
}

# Chat client shared by every chain built later in the notebook.
# NOTE(review): n=2 asks for two completions per request, while the inference
# loop only reads `result.content` -- confirm the second completion is wanted.
llm = AzureChatOpenAI(
    # Placeholder only: the SDK requires a value, authentication is done
    # through the Authorization header above.
    api_key="xxx",
    azure_endpoint=OPENAI_ENDPOINT,
    deployment_name=OPENAI_DEPLOYMENT_MODEL,
    openai_api_version=OPENAI_AZURE_API_VERSION,
    n=2,
    default_headers=gateway_headers,
)

PID:19143 INFO llm_idam_token_generator.idam_token_generator - Client ID: zizhang-chen-research-app - Generating new token.
PID:19143 INFO llm_idam_token_generator.idam_token_generator - Client ID: zizhang-chen-research-app - All required environment variables are present.
PID:19143 INFO llm_idam_token_generator.idam_token_generator - Client ID: zizhang-chen-research-app - IDAM Access Token is generated
PID:19143 INFO llm_idam_token_generator.idam_token_generator - Client ID: zizhang-chen-research-app - IDAM Exchange Access Token is generated


In [4]:
def generate_prompt_task(path):
    """Read the text file at ``path`` and return its lines as a list.

    Line terminators are kept on each element, as with ``file.readlines()``.

    NOTE(review): a second function with this same name is defined further
    down in this cell; once the cell has run, that later definition replaces
    this one, so this version is effectively unreachable.
    """
    with open(path) as prompt_file:
        return list(prompt_file)

def split_string_into_two_parts(text):
    """Split ``text`` after its second line.

    Returns a ``(head, tail)`` tuple of strings: ``head`` is the first two
    lines joined by a newline, ``tail`` is everything after them (an empty
    string when ``text`` has two lines or fewer).
    """
    # Splitting at most twice leaves the remainder untouched in the last piece.
    pieces = text.split('\n', 2)
    head = '\n'.join(pieces[:2])
    tail = pieces[2] if len(pieces) > 2 else ''
    return head, tail


def combin_prompt_no_cot(general_prompt, person_info, medical_event):
    """Assemble the two pieces of a direct-answer (no chain-of-thought) prompt.

    Args:
        general_prompt: Task instructions, placed under the ``**Task:**`` header.
        person_info: Age/demographic text; the phrase 'at the prediction time'
            is rewritten to 'at the discharge time'.
        medical_event: Medical event text, placed under ``**Medical Events:**``.

    Returns:
        A ``(template, events_text)`` tuple. ``template`` is the task section
        followed by a literal ``{events}`` placeholder; ``events_text`` holds
        the patient section, the events section and the Yes/No answer
        requirement, i.e. the value meant to fill that placeholder.
    """
    task_template = '\n'.join(['**Task:**', general_prompt, '', '{events}'])

    discharge_info = person_info.replace('at the prediction time', 'at the discharge time')
    sections = [
        '**Patient age and demographic information:**',
        discharge_info,
        '',  # blank line between sections
        '**Medical Events:**',
        medical_event,
        '',  # blank line between sections
        "**Answer requirement:**\nPlease only answer with 'Yes' or 'No'",
    ]
    return task_template, '\n'.join(sections)

def calculate_row_entropy(df):
    """Compute, for every row of ``df``, the base-2 entropy of its values.

    Each row's values are turned into relative frequencies and combined as
    ``-sum(p * log2(p + eps))``, with machine epsilon added inside the
    logarithm. Returns the result of ``df.apply(..., axis=1)``, one entry
    per row.
    """
    tiny = np.finfo(float).eps

    def _row_entropy(row):
        # Relative frequency of each distinct value within this row.
        freqs = row.value_counts(normalize=True)
        weighted_logs = freqs * np.log2(freqs + tiny)
        return -np.sum(weighted_logs)

    return df.apply(_row_entropy, axis=1)

def num_tokens_from_string(string, encoding_model = 'gpt-3.5-turbo') -> int:
    """Return how many tokens ``string`` encodes to.

    The encoding is looked up with ``tiktoken.encoding_for_model`` using
    ``encoding_model`` (default ``'gpt-3.5-turbo'``).
    """
    tokenizer = tiktoken.encoding_for_model(encoding_model)
    return len(tokenizer.encode(string))

def generate_prompt_task(new_diagnose, new_diagnose_prediction):
    """Build the task instruction text for one prediction target.

    Args:
        new_diagnose: Identifier of the target. It is not used to build the
            text; the parameter is kept so existing keyword calls still work.
        new_diagnose_prediction: Sentence describing what to predict; it is
            inserted verbatim on its own line of the instruction.

    Returns:
        The instruction string asking for a 'Yes' / 'No' answer.
    """
    # The previous version computed new_diagnose.split('_')[1] and discarded
    # it, which raised IndexError for any identifier without an underscore.
    task = f"You are an experienced doctor. Based on the provided patient age, demographic information and medical events, use your medical knowledge and reasoning to:\n{new_diagnose_prediction}\nYou are given the medical events up to the patient's discharges. Please respond 'Yes' if it is plausible and 'No' otherwise."
    return task

In [5]:
# Patient table: one free-text description per row plus one label column per task.
df = pd.read_csv('data/new_diagnose_patient_description.csv')
description_list = df['description'].values

# Earlier six-task configuration, kept for reference:
# new_diagnose_list = [i.split('value_')[1] for i in df.columns[2:8]]
# new_diagnose_prediction = [
#     "Predict whether the patient will have first diagnosis of essential hypertension within the next year after discharge from hospital.",
#     "Predict whether the patient will have first diagnosis of hyperlipidemia within the next year after discharge from hospital.",
#     "Predict whether the patient will have first diagnosis of pancreatic cancer within the next year after discharge from hospital.",
#     "Predict whether the patient will have first diagnosis of celiac disease within the next year after discharge from hospital.",
#     "Predict whether the patient will have first diagnosis of lupus within the next year after discharge from hospital.",
#     "Predict whether the patient will have first diagnosis of an acute myocardial infarction within the next year after discharge from hospital."
# ]

# Label columns to evaluate; new_diagnose_prediction[k] is the prompt sentence
# for new_diagnose_list[k], so the two lists must stay aligned.
new_diagnose_list = [
    "value_new_hypertension", "value_new_hyperlipidemia", "value_new_acutemi"
]

new_diagnose_prediction = [
        "Predict whether the patient will have first diagnosis of: 'Hypertension', within the next year after discharge from hospital.",
        "Predict whether the patient will have first diagnosis of: 'Hyperlipidemia', within the next year after discharge from hospital.",
        "Predict whether the patient will have first diagnosis of disease: 'Acute Myocardial Infarction', within the next year after discharge from hospital."
    ]

# Fail fast on configuration mistakes before any LLM request is issued.
assert len(new_diagnose_list) == len(new_diagnose_prediction), (
    "new_diagnose_list and new_diagnose_prediction must have the same length"
)
missing_label_columns = [col for col in new_diagnose_list if col not in df.columns]
assert not missing_label_columns, (
    f"label columns missing from the patient table: {missing_label_columns}"
)

In [6]:
# Number of independent LLM answers collected per patient and task.
N_REPEATS = 5

# Destination of the per-task prediction CSVs. Earlier runs wrote to:
#   results/gpt35/baseline_new_diagnose/
#   results/gpt4/baseline_new_diagnose/
#   results/gpt4/baseline_new_diagnose_v1/
# NOTE(review): the directory name encodes the model by hand; keep it in sync
# with OPENAI_DEPLOYMENT_MODEL when switching models.
output_dir = 'results/gpt35/baseline_new_diagnose_v2/'
# Create the directory up front so a long run is not lost at the final to_csv.
os.makedirs(output_dir, exist_ok=True)

# `cb` is rebound for every task, so on its own it only keeps the last task's
# usage; this dict keeps the callback of each task by label column.
usage_by_task = {}

for idx in range(0, len(new_diagnose_list)):
    out_csv_names = new_diagnose_list[idx] + '.csv'
    general_prompt = generate_prompt_task(
        new_diagnose=new_diagnose_list[idx],
        new_diagnose_prediction=new_diagnose_prediction[idx])
    gt_list = df[new_diagnose_list[idx]].values

    with get_openai_callback() as cb:

        answer_list_all = []

        for i in tqdm(range(len(df))):
            # The prompt depends only on the patient and the task, so it is
            # built once here and reused for all repeats.
            current_description = description_list[i]
            person_info, medical_events = split_string_into_two_parts(
                current_description)
            c, e = combin_prompt_no_cot(general_prompt, person_info,
                                        medical_events)
            prompt = ChatPromptTemplate.from_template(c)
            chain = prompt | llm

            answer_list_5 = []
            for _ in range(N_REPEATS):
                result = chain.invoke({'events': e})
                answer_list_5.append(result.content)
            answer_list_all.append(answer_list_5)

        # One row per patient, one column per repeat, plus the ground truth.
        df_pred = pd.DataFrame(answer_list_all)
        df_pred.columns = [f'pred_{k}' for k in range(1, N_REPEATS + 1)]
        df_pred['gt'] = list(gt_list)
    usage_by_task[new_diagnose_list[idx]] = cb
    df_pred.to_csv(os.path.join(output_dir, out_csv_names))

  0%|          | 0/100 [00:00<?, ?it/s]

PID:19143 INFO httpx - Client ID: zizhang-chen-research-app - HTTP Request: POST https://lmaas-beta.ai.gehealthcare.com/openai/deployments/gpt-35-turbo-16k/chat/completions?api-version=2023-12-01-preview "HTTP/1.1 200 OK"
PID:19143 INFO httpx - Client ID: zizhang-chen-research-app - HTTP Request: POST https://lmaas-beta.ai.gehealthcare.com/openai/deployments/gpt-35-turbo-16k/chat/completions?api-version=2023-12-01-preview "HTTP/1.1 200 OK"
PID:19143 INFO httpx - Client ID: zizhang-chen-research-app - HTTP Request: POST https://lmaas-beta.ai.gehealthcare.com/openai/deployments/gpt-35-turbo-16k/chat/completions?api-version=2023-12-01-preview "HTTP/1.1 200 OK"
PID:19143 INFO httpx - Client ID: zizhang-chen-research-app - HTTP Request: POST https://lmaas-beta.ai.gehealthcare.com/openai/deployments/gpt-35-turbo-16k/chat/completions?api-version=2023-12-01-preview "HTTP/1.1 200 OK"
PID:19143 INFO httpx - Client ID: zizhang-chen-research-app - HTTP Request: POST https://lmaas-beta.ai.gehealthc

In [7]:
# Show token usage and cost from the OpenAI callback. `cb` is rebound on every
# task iteration of the loop above, so this reflects the last task only.
cb

Tokens Used: 2195855
	Prompt Tokens: 2194850
	Completion Tokens: 1005
Successful Requests: 500
Total Cost (USD): $6.588569999999996

In [8]:
# NOTE(review): same `cb` display as the previous cell; appears redundant.
cb

Tokens Used: 2195855
	Prompt Tokens: 2194850
	Completion Tokens: 1005
Successful Requests: 500
Total Cost (USD): $6.588569999999996

In [9]:
# Inspect the last filled-in prompt body left over from the inference loop
# (`e` is the second value returned by combin_prompt_no_cot).
# NOTE(review): the output contains patient-level records -- confirm the data
# is de-identified before sharing this notebook with outputs.
print(e)

**Patient age and demographic information:**
The patient was 36 years old at the discharge time.
The patient has the following demographic information: Asian,FEMALE,Not Hispanic or Latino.

**Medical Events:**
At June 12, 2014:
1 events:'oxycodone hydrochloride 5 MG Oral Tablet' recorded 
1 events:'Acetaminophen 10 MG/ML Injectable Solution' recorded 
1 events:'1 ML ketorolac tromethamine 15 MG/ML Injection' recorded 
46 events:'Heart rate' recorded with values: 83.0, 84.0, 89.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 103.0, 104.0, 105.0, 106.0, 108.0, 109.0, 117.0. 
29 events:'Systolic blood pressure' recorded with values: 129.0, 131.0, 132.0, 133.0, 134.0, 136.0, 137.0, 138.0, 141.0, 147.0, 148.0, 115.0, 117.0, 118.0, 119.0, 120.0, 121.0, 122.0, 124.0, 125.0, 127.0. 
26 events:'Diastolic blood pressure' recorded with values: 64.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 73.0, 75.0, 76.0, 79.0, 81.0, 50.0, 51.0, 83.0, 55.0, 56.0, 59.0. 
1 events:'Inhaled oxyge

In [10]:
# NOTE(review): another repeat of the `cb` usage display; appears redundant.
cb

Tokens Used: 2195855
	Prompt Tokens: 2194850
	Completion Tokens: 1005
Successful Requests: 500
Total Cost (USD): $6.588569999999996