# Load OpenAI key

In [None]:
# Notebook setup: extend the import path, initialise the GUI toolkit, and
# configure the OpenAI client from an environment variable.
import os
import openai
import sys
# Make the directory two levels above the working directory importable.
sys.path.append('../..')

import panel as pn  # GUI
pn.extension()

from dotenv import load_dotenv, find_dotenv
# Load variables from the nearest .env file into the process environment
# (presumably this is where OPENAI_API_KEY is defined - confirm locally).
_ = load_dotenv(find_dotenv()) # read local .env file

# Direct indexing raises KeyError when OPENAI_API_KEY is not set.
openai.api_key  = os.environ['OPENAI_API_KEY']

# Load Data

In [None]:
# Download the training split plus the condition and evidence definitions.
# The URLs are quoted so the shell never treats the '?' of the query string
# as a glob pattern (zsh aborts with "no matches found" otherwise).
!wget -O train.csv "https://huggingface.co/datasets/aai530-group6/ddxplus/resolve/main/train.csv?download=true"
!wget -O release_conditions.json "https://huggingface.co/datasets/aai530-group6/ddxplus/resolve/main/release_conditions.json?download=true"
!wget -O release_evidences.json "https://huggingface.co/datasets/aai530-group6/ddxplus/resolve/main/release_evidences.json?download=true"

# Format cases according to our requirements

In [None]:
import json
import pandas as pd
import numpy as np

def read_release_evidences():
    """Load the evidence definitions from ``release_evidences.json``.

    The file is looked up in the current working directory and decoded
    explicitly as UTF-8, so the result does not depend on the platform's
    default text encoding.

    Returns:
        The parsed JSON content of the file.

    Raises:
        FileNotFoundError: If ``release_evidences.json`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open('release_evidences.json', encoding='utf-8') as f:
        return json.load(f)

def read_release_conditions():
    """Load the condition definitions from ``release_conditions.json``.

    The file is looked up in the current working directory and decoded
    explicitly as UTF-8, so the result does not depend on the platform's
    default text encoding.

    Returns:
        The parsed JSON content of the file.

    Raises:
        FileNotFoundError: If ``release_conditions.json`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open('release_conditions.json', encoding='utf-8') as f:
        return json.load(f)

import ast


def _describe_evidence(token, evidences):
    """Turn one entry of a row's EVIDENCES list into a question/answer pair.

    Three token shapes are recognised:

    * ``<code>_@_V<suffix>``: the answer is the English text stored under
      ``value_meaning['V<suffix>']`` of the evidence.
    * ``<code>_@_<value>``: the value is reported as-is and the question is
      prefixed with "On a scale of 0 to 10, ".
    * ``<code>``: the answer is the evidence's ``default_value`` (0 and 1
      are rendered as "No" and "Yes").

    Args:
        token: One evidence string, without surrounding quotes.
        evidences: Mapping of evidence code to its definition.

    Returns:
        ``(question, answer, is_antecedent)`` with ``answer`` as a string,
        or ``None`` when a bare code is not defined in ``evidences``.

    Raises:
        KeyError: If a valued token refers to an unknown code or value.
    """
    if '_@_V' in token:
        code, _, suffix = token.partition('_@_V')
        entry = evidences[code]
        question = entry['question_en']
        answer = entry['value_meaning']['V' + suffix]['en']
    elif '_@_' in token:
        code, _, answer = token.partition('_@_')
        entry = evidences[code]
        question = 'On a scale of 0 to 10, ' + entry['question_en']
    else:
        entry = evidences.get(token)
        if entry is None:
            # Unknown code: skip it instead of re-using the question and
            # answer of whichever evidence happened to be processed before.
            return None
        question = entry['question_en']
        # NOTE(review): the answer comes from default_value. If EVIDENCES
        # only lists findings the patient actually reported, "Yes" may be the
        # intended answer here - confirm against the dataset documentation.
        answer = entry['default_value']
        if answer == 0:
            answer = 'No'
        elif answer == 1:
            answer = 'Yes'
    # str() keeps ', '.join() working for non-string default values.
    return question, str(answer), entry['is_antecedent']


def _build_case(row, evidences):
    """Render one row of the training data as a plain-text patient case.

    Args:
        row: Row providing AGE, SEX, PATHOLOGY, INITIAL_EVIDENCE, EVIDENCES
            (a string holding a Python list literal) and
            DIFFERENTIAL_DIAGNOSIS.
        evidences: Mapping of evidence code to its definition.

    Returns:
        The formatted case text.
    """
    parts = ['Age: ' + str(row['AGE']) + ', ']
    sex = row['SEX']
    if sex == 'M':
        parts.append('Sex: Male')
    elif sex == 'F':
        parts.append('Sex: Female')

    parts.append('\nGround truth pathology: ' + row['PATHOLOGY'] + '\n')
    parts.append(
        '\nInitial Evidence: '
        + evidences[row['INITIAL_EVIDENCE']]['question_en']
        + ': Yes\n\n'
    )

    # Group answers by question, keeping symptoms and antecedents apart.
    symp = {}
    ant = {}
    # literal_eval parses the list literal properly instead of stripping
    # brackets and quotes by character position.
    for token in ast.literal_eval(row['EVIDENCES']):
        described = _describe_evidence(token.strip(), evidences)
        if described is None:
            continue
        question, answer, is_antecedent = described
        bucket = ant if is_antecedent else symp
        bucket.setdefault(question, []).append(answer)

    parts.append('Symptoms: \n')
    for question, answers in symp.items():
        parts.append(question + ': ' + ', '.join(answers) + '\n')

    # Answers are stored as lists, so test membership; comparing the list
    # itself to 'Yes' is always False and the section was never written.
    confirmed = {q: a for q, a in ant.items() if 'Yes' in a}
    if confirmed:
        parts.append('\n\nAntecedents: \n')
        for question, answers in confirmed.items():
            parts.append(question + ': ' + ', '.join(answers) + '\n')

    parts.append(
        '\nDifferential Diagnosis: `' + str(row['DIFFERENTIAL_DIAGNOSIS']) + '`.'
    )
    return ''.join(parts)


conditions = read_release_conditions()
evidences = read_release_evidences()

df = pd.read_csv('train.csv')

# Cases are numbered from 1 in row order; json.dump turns the integer keys
# into strings in cases.json.
cases = {}
c = 0
for index, row in df.iterrows():
    case = _build_case(row, evidences)

    print(case)
    print('\n🐻\n')
    c += 1

    cases[c] = case

with open('cases.json', 'w') as f:
    json.dump(cases, f, indent=4)

# Get Responses

In [None]:
import time
import json
from langchain.chat_models import ChatOpenAI
import pandas as pd
llm_name = "gpt-3.5-turbo"

llm = ChatOpenAI(model_name=llm_name, temperature=0)
# One-off call before the batch starts; the reply is discarded.
llm.predict("Hello world!")
# open cases.json and read each case
# for each case, generate a conversation between a doctor and patient

with open('cases.json', 'r') as f:
    cases = json.load(f)

# The prompt is the fixed header, then the case text, then the fixed
# instructions. Both fixed parts are loop-invariant, so build them once.
PROMPT_HEADER = "You will have a conversation between a doctor (must be named Assistant) and patient (must be named Human), focusing on differential diagnosis based on the following patient history."
PROMPT_INSTRUCTIONS = "Assistant: Ask 1 question about main symptom details, covering nature, location, severity, and onset (use details verbatim from symptoms section) Ask 1 question about relevant medical history (use antecedents section) State the top 2 potential diagnoses from the given differential diagnosis list, along with their associated probabilities Ask 1 follow-up question related to a symptom detail that could help differentiate between the two potential diagnoses  Recommend 1 next step for diagnosis (e.g. test, examination, etc.)  Patient: Respond with complete statements from the symptoms and antecedents sections verbatim If asked about any missing details, do not provide any new information The conversation should strictly utilize the information provided in the patient history sections. No new clinical details should be introduced. The dialog must end with the Assistant ' The dialogue must be in the following format: ###Assistant: What are your symptoms? ### Human: I have chest pain even at rest. The pain is a knife stroke in my upper chest, breast (R) and breast (L). On a scale of 0-10, the pain intensity is 7 and the pace it appeared is 9.### Assistant: Any relevant medical history?### Human: I have had a spontaneous pneumothorax. I smoke cigarettes. I have a chronic obstructive pulmonary disease. Some family members have had a pneumothorax.### Assistant: Based on your history, the two most likely diagnoses are unstable angina at 26.2% probability, and spontaneous pneumothorax at 6% probability. Does the pain increase or decrease with physical exertion?### Human: I have symptoms that increase with physical exertion but alleviate with rest.### Assistant: Let's get a chest X-ray to examine for a potential pneumothorax."

# Zero-based positions of the cases to process; case numbers are position + 1.
CASE_START = 450
CASE_STOP = 900

# initialize empty dataframe
df = pd.DataFrame(columns=['case', 'conversation'])

try:
    for i in range(CASE_START, CASE_STOP):
        PROMPT = PROMPT_HEADER + cases[str(i+1)] + PROMPT_INSTRUCTIONS

        response = llm.predict(PROMPT)
        conv = str(response)
        # add case and conversation to dataframe, don't use append()
        df.loc[i] = [i+1, conv]
        print(i)
finally:
    # Write whatever was generated even when a request (or a missing case
    # number) aborts the loop, so earlier responses are not lost.
    # write dataframe to generated_data.csv
    df.to_csv('generated_data.csv', index=False)