In [None]:
from glob import glob
import random,os
import pandas as pd
import time

In [None]:
# ADE
# Evaluation inputs: every BIO file of the test split. `dataset` is used by
# run() to name the output directory.
dataset = 'ADE'
# glob() returns paths in file-system order, which differs between machines;
# sorting makes the processing order (and the optional seeded subsample
# below) reproducible.
files = sorted(glob('./data/Vaccine-AE-recognition/test/*.bio'))
random.seed(42)
# Uncomment to run on a fixed 100-file subsample for a quick trial.
#files = random.sample(files,k=100)
len(files)

In [None]:
# Load the raw text of every training BIO file; few-shot examples are sampled
# from these. The list is sorted because glob() order depends on the file
# system, and the seeded random.sample() below is only reproducible when the
# population order is fixed.
training_files = sorted(glob('./data/Vaccine-AE-recognition/train/*.bio'))
random.seed(42)

all_sentences = []
for sample in training_files:
    with open(sample) as f_sample:
        all_sentences.append(f_sample.read())

def _render_bio_sentence(sentence):
    """Turn one BIO-formatted text into (plain sentence, span-marked sentence).

    Every line is split on a single space; field 0 is the token and field 1,
    when present, is the tag. Each token (including the empty token of a blank
    line) is followed by one space in both outputs.
    """
    rows = [line.split(' ') for line in sentence.split('\n')]
    plain_parts = []
    marked_parts = []
    pos = 0
    while pos < len(rows):
        row = rows[pos]
        tag = row[1] if len(row) > 1 else ''
        if tag.startswith('B'):
            entity_type = tag.split('-')[-1]
            # Extend the entity over the following I-tagged lines. Lines
            # without a tag field (e.g. the blank line after a trailing
            # newline) or the end of the text simply close the entity.
            end = pos + 1
            while end < len(rows) and len(rows[end]) > 1 and rows[end][1].startswith('I'):
                end += 1
            words = [r[0] for r in rows[pos:end]]
            plain_parts.extend(words)
            marked_parts.append(f'<span class="{entity_type}">' + ' '.join(words) + '</span>')
            pos = end
        else:
            plain_parts.append(row[0])
            marked_parts.append(row[0])
            pos += 1
    plain = ''.join(part + ' ' for part in plain_parts)
    marked = ''.join(part + ' ' for part in marked_parts)
    return plain, marked


def get_few_shot_examples_merged(all_sentences):
    """Sample five BIO texts and render them as few-shot prompt examples.

    Args:
        all_sentences: list of strings, each holding ``token tag`` lines
            (space separated, one token per line).

    Returns:
        ``(one_shot_sentences, examples)``: two parallel lists of five
        strings. The first holds the space-joined tokens, the second the same
        text with every entity wrapped as
        ``<span class="TYPE">entity text</span>``.

    Raises:
        ValueError: from ``random.sample`` when fewer than five texts are given.

    The markup is built token by token from the tags, so an entity at the very
    end of the text, repeated mentions of the same entity text, and entity
    text that also occurs inside another word are all marked at the right
    position.
    """
    # Re-seed so the same five texts are chosen on every call.
    random.seed(42)
    samples = random.sample(all_sentences,k=5)

    one_shot_sentences = []
    examples = []
    for sentence in samples:
        plain, marked = _render_bio_sentence(sentence)
        one_shot_sentences.append(plain)
        examples.append(marked)

    return one_shot_sentences,examples

In [None]:
# Keep only the training texts that contain at least one "B-" tag, i.e. at
# least one entity start, so each sampled example shows some markup.
all_sentences = list(filter(lambda text: 'B-' in text, all_sentences))

In [None]:
# Plain sentences and their span-marked counterparts for the sampled texts.
one_shot_sentences, examples = get_few_shot_examples_merged(all_sentences)

In [None]:
# Spot-check: first sampled sentence, then the same sentence with markup.
print(one_shot_sentences[0])
print(examples[0])

In [None]:
# List every plain sentence, a blank separator line, then every marked-up
# version, in the same order.
for sentence in one_shot_sentences:
    print(sentence)
print()
for example in examples:
    print(example)

# official ChatGPT API

In [None]:
import openai
from glob import glob

In [None]:
def get_output(prompt,GPT):
    """Send ``prompt`` as a single user message and return the model's reply.

    Args:
        prompt: Complete prompt text.
        GPT: Model selector, ``3.5`` (gpt-3.5-turbo-0301) or ``4`` (gpt-4-0314).

    Returns:
        The ``content`` string of the first choice in the chat completion.

    Raises:
        ValueError: if ``GPT`` is not one of the supported selectors.
    """
    # Pinned model snapshots, keyed by the selector used throughout the notebook.
    models = {3.5: 'gpt-3.5-turbo-0301', 4: 'gpt-4-0314'}
    if GPT not in models:
        raise ValueError(f'Unsupported GPT version {GPT!r}; expected one of {sorted(models)}')
    model = models[GPT]

    # Take the key from the environment instead of a literal in the notebook.
    # When the variable is not set, whatever key is already configured on the
    # openai module is left untouched.
    api_key = os.environ.get('OPENAI_API_KEY')
    if api_key:
        openai.api_key = api_key

    message = openai.ChatCompletion.create(
      model=model,
      temperature=0,
      messages=[
            {"role": "user", "content": prompt}
        ]
    )
    return message['choices'][0]['message']['content']

In [None]:
def run(GPT,prompt,prompt_type,max_retries=None,retry_wait=1.0):
    """Query the model for every path in ``files`` and save each reply as HTML.

    Reads the module-level ``files`` (input paths) and ``dataset`` (used in the
    output directory name).

    Args:
        GPT: Model selector passed through to ``get_output``.
        prompt: Template with one ``{}`` slot that receives the file's text.
        prompt_type: Name of the output sub-directory for this prompt variant.
        max_retries: Maximum number of retries per file after a failed model
            call; ``None`` (default) keeps retrying until the call succeeds.
        retry_wait: Seconds to wait before the first retry; the wait doubles
            after each failure, capped at 60 seconds.

    Output goes to ``./GPT{GPT}_output/{dataset}/temperature0/{prompt_type}/``,
    one ``<file name>.html`` per input file. This is the directory that is
    created here, so the write can no longer target a path that does not exist.
    """
    dir_path = f'./GPT{GPT}_output/{dataset}/temperature0/{prompt_type}/'
    os.makedirs(dir_path, exist_ok=True)

    for file in files:
        with open(file,'r') as f_read:
            tokens = []
            for line in f_read.read().splitlines():
                # Keep only the first whitespace-separated field (the token).
                # Splitting on tabs alone would pass the tag column to the
                # model if the file is space separated, which is how the
                # training loader in this notebook parses the same format.
                fields = line.split()
                tokens.append(fields[0] if fields else '')
            text = ' '.join(tokens)
        file_name = os.path.basename(file).split('.')[0]

        # Built once, outside the retry loop: a formatting error is not transient.
        request = prompt.format(text)

        attempt = 0
        while True:
            try:
                output = get_output(request,GPT)
                break
            except Exception as e:
                attempt += 1
                print (e)
                if max_retries is not None and attempt > max_retries:
                    raise
                # Back off so rate-limit errors are not hammered in a tight loop.
                time.sleep(min(retry_wait * 2 ** (attempt - 1), 60))

        # Written outside the retry loop so an I/O failure surfaces
        # immediately instead of triggering further (billed) model calls.
        with open(os.path.join(dir_path, f'{file_name}.html'),'w') as f_write:
            f_write.write(output)

# GPT-3.5

### zero-shot

In [None]:
# Zero-shot prompt: task statement plus the span-class markup guide only.
# The single `{}` slot receives each file's text via prompt.format(text) in run().
prompt = '''### Task
Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: 'investigations', 'nervous adverse events', 'other adverse events', and 'procedures'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.

### Entity Markup Guide
Use <span class="investigation"> to denote an investigation.
Use <span class="nervous_AE"> to denote a nervous adverse event.
Use <span class="other_AE"> to denote an other adverse event.
Use <span class="procedure"> to denote a procedure.
If no entity found, leave the text as it is.

### Input Text: {}
### Output Text:
'''
# GPT = 3.5 selects 'gpt-3.5-turbo-0301' in get_output; prompt_type names the
# output sub-directory used by run().
GPT = 3.5
prompt_type = 'merged_prompt'
run(GPT,prompt,prompt_type)

### zero-shot with definition

In [None]:
# Zero-shot prompt with entity definitions: task statement, markup guide and a
# one-sentence definition of each entity type.
# The single `{}` slot receives each file's text via prompt.format(text) in run().
prompt = '''### Task
Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: 'investigations', 'nervous adverse events', 'other adverse events', and 'procedures'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.

### Entity Markup Guide
Use <span class="investigation"> to denote an investigation.
Use <span class="nervous_AE"> to denote a nervous adverse event.
Use <span class="other_AE"> to denote an other adverse event.
Use <span class="procedure"> to denote a procedure.
If no entity found, leave the text as it is.

### Entity Definitions
Investigation includes typical lab tests or examinations in the report, such as physical examination, oxygen saturation, electromyogram, etc.
Nervous adverse event includes typically nervous system-related problems, such as guillain-barré syndrome, ataxia, areflexia, hypoaesthesia, paraesthesia, dizziness, headache and other nervous system disorders.
Other adverse event includes medical problems that are assigned to other MedDRA SOCs, including gastrointestinal disorders, cardiac disorders, psychiatric disorders, musculoskeletal and connective tissue disorders, etc.
Procedure includes non-medical problem events such as immunization, surgeries such as catheter placement, hospitalization, emergence care, intubation, etc.

### Input Text: {}
### Output Text:
'''

# GPT = 3.5 selects 'gpt-3.5-turbo-0301' in get_output; prompt_type names the
# output sub-directory used by run().
GPT = 3.5
prompt_type = 'merged_prompt_definition'
run(GPT,prompt,prompt_type)

### zero-shot with definition and guidelines

In [None]:
# Zero-shot prompt with entity definitions and annotation guidelines.
# The single `{}` slot receives each file's text via prompt.format(text) in run().
prompt = '''### Task
Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: 'investigations', 'nervous adverse events', 'other adverse events', and 'procedures'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.

### Entity Markup Guide
Use <span class="investigation"> to denote an investigation.
Use <span class="nervous_AE"> to denote a nervous adverse event.
Use <span class="other_AE"> to denote an other adverse event.
Use <span class="procedure"> to denote a procedure.
If no entity found, leave the text as it is.

### Entity Definitions
Investigation includes typical lab tests or examinations in the report, such as physical examination, oxygen saturation, electromyogram, etc.
Nervous adverse event includes typically nervous system-related problems, such as guillain-barré syndrome, ataxia, areflexia, hypoaesthesia, paraesthesia, dizziness, headache and other nervous system disorders.
Other adverse event includes medical problems that are assigned to other MedDRA SOCs, including gastrointestinal disorders, cardiac disorders, psychiatric disorders, musculoskeletal and connective tissue disorders, etc.
Procedure includes non-medical problem events such as immunization, surgeries such as catheter placement, hospitalization, emergence care, intubation, etc.

### Annotation Guidelines
Only annotate events that already occurred (i.e., occurred before the diagnosis of GBS).
When annotating events related to Flu-GBS, do not include prepositions including modifiers of the event.
Separate events in discontinuous segments.
When annotating events, more generalized events should not be annotated.
When annotating events related to symptom improvement / progress or negation events, the following guideline should be used. In the case where the patient reported a specific adverse event first, and then reported improvement / progress of the adverse event, we should annotate it as an improved symptom. However, we do NOT need to annotate the negation of a symptom which the patient never reported before.
Events reported as history (events that did not happen to the reporting patient) should be annotated. Family history is important for risk prediction and may be included as a baseline information (e.g., for statistical analysis).
Some VAERS reports have duplicate events reported. For example, the same events / text are repeated twice in the report. The case we are interested in, is the recurrence of some adverse event, i.e., it requires the adverse event appears, then disappear, and then come back. In this case it should definitely be annotated twice. Additionally, we need to annotate the relief/improvement of the event if it is mentioned in the report. When no such information to decide whether it is a recurrence, the principle is that if there are multiple time stamps of the same event, we annotate it twice, if not, we can just keep one record.

### Input Text: {}
### Output Text:
'''
# GPT = 3.5 selects 'gpt-3.5-turbo-0301' in get_output; prompt_type names the
# output sub-directory used by run().
GPT = 3.5
prompt_type = 'merged_prompt_definition_guidelines'
run(GPT,prompt,prompt_type)

### 1-shot

In [None]:
# 1-shot prompt: task statement, markup guide and one worked input/output pair.
# NOTE(review): the example pair looks like hand-pasted output of
# get_few_shot_examples_merged (same token spacing) - confirm.
# The single `{}` slot receives each file's text via prompt.format(text) in run().
prompt = '''### Task
Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: 'investigations', 'nervous adverse events', 'other adverse events', and 'procedures'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.

### Entity Markup Guide
Use <span class="investigation"> to denote an investigation.
Use <span class="nervous_AE"> to denote a nervous adverse event.
Use <span class="other_AE"> to denote an other adverse event.
Use <span class="procedure"> to denote a procedure.
If no entity found, leave the text as it is.

### Examples 
Example Input1: she said the doctor diagnosed her husband with Gillian Barre related to flu vaccine . Seen in ER for C / O numbness in both hands ( fingertips ) and both feet with c / o abd pain ongoing times 5 days .  
Example Output1: she said the doctor diagnosed her husband with <span class="nervous_AE">Gillian Barre</span> related to <span class="procedure">flu vaccine</span> . Seen in <span class="procedure">ER</span> for C / O <span class="nervous_AE">numbness in both hands ( fingertips ) and both feet</span> with c / o <span class="other_AE">abd pain</span> ongoing times 5 days .  

### Input Text: {}
### Output Text:
'''
# GPT = 3.5 selects 'gpt-3.5-turbo-0301' in get_output; prompt_type names the
# output sub-directory used by run().
GPT = 3.5
prompt_type = 'merged_prompt_oneshot'
run(GPT,prompt,prompt_type)

### 1-shot with definition

In [None]:
# 1-shot prompt with entity definitions: task statement, markup guide,
# definitions and one worked input/output pair.
# The single `{}` slot receives each file's text via prompt.format(text) in run().
prompt = '''### Task
Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: 'investigations', 'nervous adverse events', 'other adverse events', and 'procedures'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.

### Entity Markup Guide
Use <span class="investigation"> to denote an investigation.
Use <span class="nervous_AE"> to denote a nervous adverse event.
Use <span class="other_AE"> to denote an other adverse event.
Use <span class="procedure"> to denote a procedure.
If no entity found, leave the text as it is.

### Entity Definitions
Investigation includes typical lab tests or examinations in the report, such as physical examination, oxygen saturation, electromyogram, etc.
Nervous adverse event includes typically nervous system-related problems, such as guillain-barré syndrome, ataxia, areflexia, hypoaesthesia, paraesthesia, dizziness, headache and other nervous system disorders.
Other adverse event includes medical problems that are assigned to other MedDRA SOCs, including gastrointestinal disorders, cardiac disorders, psychiatric disorders, musculoskeletal and connective tissue disorders, etc.
Procedure includes non-medical problem events such as immunization, surgeries such as catheter placement, hospitalization, emergence care, intubation, etc.

### Examples 
Example Input1: she said the doctor diagnosed her husband with Gillian Barre related to flu vaccine . Seen in ER for C / O numbness in both hands ( fingertips ) and both feet with c / o abd pain ongoing times 5 days .  
Example Output1: she said the doctor diagnosed her husband with <span class="nervous_AE">Gillian Barre</span> related to <span class="procedure">flu vaccine</span> . Seen in <span class="procedure">ER</span> for C / O <span class="nervous_AE">numbness in both hands ( fingertips ) and both feet</span> with c / o <span class="other_AE">abd pain</span> ongoing times 5 days .  

### Input Text: {}
### Output Text:
'''
# GPT = 3.5 selects 'gpt-3.5-turbo-0301' in get_output; prompt_type names the
# output sub-directory used by run().
GPT = 3.5
prompt_type = 'merged_prompt_definition_oneshot'
run(GPT,prompt,prompt_type)

### 1-shot with definition and guidelines

In [None]:
# 1-shot prompt with entity definitions and annotation guidelines.
# The single `{}` slot receives each file's text via prompt.format(text) in run().
prompt = '''### Task
Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: 'investigations', 'nervous adverse events', 'other adverse events', and 'procedures'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.

### Entity Markup Guide
Use <span class="investigation"> to denote an investigation.
Use <span class="nervous_AE"> to denote a nervous adverse event.
Use <span class="other_AE"> to denote an other adverse event.
Use <span class="procedure"> to denote a procedure.
If no entity found, leave the text as it is.

### Entity Definitions
Investigation includes typical lab tests or examinations in the report, such as physical examination, oxygen saturation, electromyogram, etc.
Nervous adverse event includes typically nervous system-related problems, such as guillain-barré syndrome, ataxia, areflexia, hypoaesthesia, paraesthesia, dizziness, headache and other nervous system disorders.
Other adverse event includes medical problems that are assigned to other MedDRA SOCs, including gastrointestinal disorders, cardiac disorders, psychiatric disorders, musculoskeletal and connective tissue disorders, etc.
Procedure includes non-medical problem events such as immunization, surgeries such as catheter placement, hospitalization, emergence care, intubation, etc.

### Annotation Guidelines
Only annotate events that already occurred (i.e., occurred before the diagnosis of GBS).
When annotating events related to Flu-GBS, do not include prepositions including modifiers of the event.
Separate events in discontinuous segments.
When annotating events, more generalized events should not be annotated.
When annotating events related to symptom improvement / progress or negation events, the following guideline should be used. In the case where the patient reported a specific adverse event first, and then reported improvement / progress of the adverse event, we should annotate it as an improved symptom. However, we do NOT need to annotate the negation of a symptom which the patient never reported before.
Events reported as history (events that did not happen to the reporting patient) should be annotated. Family history is important for risk prediction and may be included as a baseline information (e.g., for statistical analysis).
Some VAERS reports have duplicate events reported. For example, the same events / text are repeated twice in the report. The case we are interested in, is the recurrence of some adverse event, i.e., it requires the adverse event appears, then disappear, and then come back. In this case it should definitely be annotated twice. Additionally, we need to annotate the relief/improvement of the event if it is mentioned in the report. When no such information to decide whether it is a recurrence, the principle is that if there are multiple time stamps of the same event, we annotate it twice, if not, we can just keep one record.

### Examples 
Example Input1: she said the doctor diagnosed her husband with Gillian Barre related to flu vaccine . Seen in ER for C / O numbness in both hands ( fingertips ) and both feet with c / o abd pain ongoing times 5 days .  
Example Output1: she said the doctor diagnosed her husband with <span class="nervous_AE">Gillian Barre</span> related to <span class="procedure">flu vaccine</span> . Seen in <span class="procedure">ER</span> for C / O <span class="nervous_AE">numbness in both hands ( fingertips ) and both feet</span> with c / o <span class="other_AE">abd pain</span> ongoing times 5 days .  

### Input Text: {}
### Output Text:
'''
# GPT = 3.5 selects 'gpt-3.5-turbo-0301' in get_output; prompt_type names the
# output sub-directory used by run().
GPT = 3.5
prompt_type = 'merged_prompt_definition_guidelines_oneshot'
run(GPT,prompt,prompt_type)

### 5-shot

In [None]:
# 5-shot prompt: task statement, markup guide and five worked input/output pairs.
# The single `{}` slot receives each file's text via prompt.format(text) in run().
prompt = '''### Task
Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: 'investigations', 'nervous adverse events', 'other adverse events', and 'procedures'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.

### Entity Markup Guide
Use <span class="investigation"> to denote an investigation.
Use <span class="nervous_AE"> to denote a nervous adverse event.
Use <span class="other_AE"> to denote an other adverse event.
Use <span class="procedure"> to denote a procedure.
If no entity found, leave the text as it is.

### Examples 
Example Input1: she said the doctor diagnosed her husband with Gillian Barre related to flu vaccine . Seen in ER for C / O numbness in both hands ( fingertips ) and both feet with c / o abd pain ongoing times 5 days .  
Example Output1: she said the doctor diagnosed her husband with <span class="nervous_AE">Gillian Barre</span> related to <span class="procedure">flu vaccine</span> . Seen in <span class="procedure">ER</span> for C / O <span class="nervous_AE">numbness in both hands ( fingertips ) and both feet</span> with c / o <span class="other_AE">abd pain</span> ongoing times 5 days .  
Example Input2: He was seen in an emergency room the following day and was admitted to the hospital because he could not walk .  
Example Output2: He was seen in an <span class="procedure">emergency room</span> the following day and was admitted to the <span class="procedure">hospital</span> because he <span class="nervous_AE">could not walk</span> .  
Example Input3: On 20 September 2009 the patient was diagnosed with GUILLAIN BARRE Syndrome .  
Example Output3: On 20 September 2009 the patient was diagnosed with <span class="nervous_AE">GUILLAIN BARRE Syndrome</span> .  
Example Input4: No improvement as of yet 4 days of IVIG beginning 1 / 15 / 09 ( Hospitalized ) .  
Example Output4: No improvement as of yet 4 days of <span class="procedure">IVIG</span> beginning 1 / 15 / 09 ( <span class="procedure">Hospitalized</span> ) .  
Example Input5: Tx with plasmapheresis .  
Example Output5: Tx with <span class="procedure">plasmapheresis</span> .  

### Input Text: {}
### Output Text:
'''
# GPT = 3.5 selects 'gpt-3.5-turbo-0301' in get_output; prompt_type names the
# output sub-directory used by run().
GPT = 3.5
prompt_type = 'merged_prompt_fiveshot'
run(GPT,prompt,prompt_type)

### 5-shot with definition

In [None]:
# 5-shot prompt with entity definitions: task statement, markup guide,
# definitions and five worked input/output pairs.
# The single `{}` slot receives each file's text via prompt.format(text) in run().
prompt = '''### Task
Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: 'investigations', 'nervous adverse events', 'other adverse events', and 'procedures'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.

### Entity Markup Guide
Use <span class="investigation"> to denote an investigation.
Use <span class="nervous_AE"> to denote a nervous adverse event.
Use <span class="other_AE"> to denote an other adverse event.
Use <span class="procedure"> to denote a procedure.
If no entity found, leave the text as it is.

### Entity Definitions
Investigation includes typical lab tests or examinations in the report, such as physical examination, oxygen saturation, electromyogram, etc.
Nervous adverse event includes typically nervous system-related problems, such as guillain-barré syndrome, ataxia, areflexia, hypoaesthesia, paraesthesia, dizziness, headache and other nervous system disorders.
Other adverse event includes medical problems that are assigned to other MedDRA SOCs, including gastrointestinal disorders, cardiac disorders, psychiatric disorders, musculoskeletal and connective tissue disorders, etc.
Procedure includes non-medical problem events such as immunization, surgeries such as catheter placement, hospitalization, emergence care, intubation, etc.

### Examples 
Example Input1: she said the doctor diagnosed her husband with Gillian Barre related to flu vaccine . Seen in ER for C / O numbness in both hands ( fingertips ) and both feet with c / o abd pain ongoing times 5 days .  
Example Output1: she said the doctor diagnosed her husband with <span class="nervous_AE">Gillian Barre</span> related to <span class="procedure">flu vaccine</span> . Seen in <span class="procedure">ER</span> for C / O <span class="nervous_AE">numbness in both hands ( fingertips ) and both feet</span> with c / o <span class="other_AE">abd pain</span> ongoing times 5 days .  
Example Input2: He was seen in an emergency room the following day and was admitted to the hospital because he could not walk .  
Example Output2: He was seen in an <span class="procedure">emergency room</span> the following day and was admitted to the <span class="procedure">hospital</span> because he <span class="nervous_AE">could not walk</span> .  
Example Input3: On 20 September 2009 the patient was diagnosed with GUILLAIN BARRE Syndrome .  
Example Output3: On 20 September 2009 the patient was diagnosed with <span class="nervous_AE">GUILLAIN BARRE Syndrome</span> .  
Example Input4: No improvement as of yet 4 days of IVIG beginning 1 / 15 / 09 ( Hospitalized ) .  
Example Output4: No improvement as of yet 4 days of <span class="procedure">IVIG</span> beginning 1 / 15 / 09 ( <span class="procedure">Hospitalized</span> ) .  
Example Input5: Tx with plasmapheresis .  
Example Output5: Tx with <span class="procedure">plasmapheresis</span> .  

### Input Text: {}
### Output Text:
'''
# GPT = 3.5 selects 'gpt-3.5-turbo-0301' in get_output; prompt_type names the
# output sub-directory used by run().
GPT = 3.5
prompt_type = 'merged_prompt_definition_fiveshot'
run(GPT,prompt,prompt_type)

### 5-shot with definition and guidelines

In [None]:
# 5-shot prompt with entity definitions and annotation guidelines (the fullest
# GPT-3.5 variant in this notebook).
# The single `{}` slot receives each file's text via prompt.format(text) in run().
prompt = '''### Task
Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: 'investigations', 'nervous adverse events', 'other adverse events', and 'procedures'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.

### Entity Markup Guide
Use <span class="investigation"> to denote an investigation.
Use <span class="nervous_AE"> to denote a nervous adverse event.
Use <span class="other_AE"> to denote an other adverse event.
Use <span class="procedure"> to denote a procedure.
If no entity found, leave the text as it is.

### Entity Definitions
Investigation includes typical lab tests or examinations in the report, such as physical examination, oxygen saturation, electromyogram, etc.
Nervous adverse event includes typically nervous system-related problems, such as guillain-barré syndrome, ataxia, areflexia, hypoaesthesia, paraesthesia, dizziness, headache and other nervous system disorders.
Other adverse event includes medical problems that are assigned to other MedDRA SOCs, including gastrointestinal disorders, cardiac disorders, psychiatric disorders, musculoskeletal and connective tissue disorders, etc.
Procedure includes non-medical problem events such as immunization, surgeries such as catheter placement, hospitalization, emergence care, intubation, etc.

### Annotation Guidelines
Only annotate events that already occurred (i.e., occurred before the diagnosis of GBS).
When annotating events related to Flu-GBS, do not include prepositions including modifiers of the event.
Separate events in discontinuous segments.
When annotating events, more generalized events should not be annotated.
When annotating events related to symptom improvement / progress or negation events, the following guideline should be used. In the case where the patient reported a specific adverse event first, and then reported improvement / progress of the adverse event, we should annotate it as an improved symptom. However, we do NOT need to annotate the negation of a symptom which the patient never reported before.
Events reported as history (events that did not happen to the reporting patient) should be annotated. Family history is important for risk prediction and may be included as a baseline information (e.g., for statistical analysis).
Some VAERS reports have duplicate events reported. For example, the same events / text are repeated twice in the report. The case we are interested in, is the recurrence of some adverse event, i.e., it requires the adverse event appears, then disappear, and then come back. In this case it should definitely be annotated twice. Additionally, we need to annotate the relief/improvement of the event if it is mentioned in the report. When no such information to decide whether it is a recurrence, the principle is that if there are multiple time stamps of the same event, we annotate it twice, if not, we can just keep one record.

### Examples 
Example Input1: she said the doctor diagnosed her husband with Gillian Barre related to flu vaccine . Seen in ER for C / O numbness in both hands ( fingertips ) and both feet with c / o abd pain ongoing times 5 days .  
Example Output1: she said the doctor diagnosed her husband with <span class="nervous_AE">Gillian Barre</span> related to <span class="procedure">flu vaccine</span> . Seen in <span class="procedure">ER</span> for C / O <span class="nervous_AE">numbness in both hands ( fingertips ) and both feet</span> with c / o <span class="other_AE">abd pain</span> ongoing times 5 days .  
Example Input2: He was seen in an emergency room the following day and was admitted to the hospital because he could not walk .  
Example Output2: He was seen in an <span class="procedure">emergency room</span> the following day and was admitted to the <span class="procedure">hospital</span> because he <span class="nervous_AE">could not walk</span> .  
Example Input3: On 20 September 2009 the patient was diagnosed with GUILLAIN BARRE Syndrome .  
Example Output3: On 20 September 2009 the patient was diagnosed with <span class="nervous_AE">GUILLAIN BARRE Syndrome</span> .  
Example Input4: No improvement as of yet 4 days of IVIG beginning 1 / 15 / 09 ( Hospitalized ) .  
Example Output4: No improvement as of yet 4 days of <span class="procedure">IVIG</span> beginning 1 / 15 / 09 ( <span class="procedure">Hospitalized</span> ) .  
Example Input5: Tx with plasmapheresis .  
Example Output5: Tx with <span class="procedure">plasmapheresis</span> .  

### Input Text: {}
### Output Text:
'''
# GPT = 3.5 selects 'gpt-3.5-turbo-0301' in get_output; prompt_type names the
# output sub-directory used by run().
GPT = 3.5
prompt_type = 'merged_prompt_definition_guidelines_fiveshot'
run(GPT,prompt,prompt_type)

# GPT-4

### zero-shot

In [None]:
# Zero-shot prompt for the GPT-4 run: task statement plus the span-class
# markup guide only.
# The single `{}` slot receives each file's text via prompt.format(text) in run().
prompt = '''### Task
Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: 'investigations', 'nervous adverse events', 'other adverse events', and 'procedures'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.

### Entity Markup Guide
Use <span class="investigation"> to denote an investigation.
Use <span class="nervous_AE"> to denote a nervous adverse event.
Use <span class="other_AE"> to denote an other adverse event.
Use <span class="procedure"> to denote a procedure.
If no entity found, leave the text as it is.

### Input Text: {}
### Output Text:
'''
# GPT = 4 selects 'gpt-4-0314' in get_output; prompt_type names the output
# sub-directory used by run().
GPT = 4
prompt_type = 'merged_prompt'
run(GPT,prompt,prompt_type)

### zero-shot with definition

In [None]:
# Zero-shot prompt with entity definitions for the GPT-4 run.
# The single `{}` slot receives each file's text via prompt.format(text) in run().
prompt = '''### Task
Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: 'investigations', 'nervous adverse events', 'other adverse events', and 'procedures'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.

### Entity Markup Guide
Use <span class="investigation"> to denote an investigation.
Use <span class="nervous_AE"> to denote a nervous adverse event.
Use <span class="other_AE"> to denote an other adverse event.
Use <span class="procedure"> to denote a procedure.
If no entity found, leave the text as it is.

### Entity Definitions
Investigation includes typical lab tests or examinations in the report, such as physical examination, oxygen saturation, electromyogram, etc.
Nervous adverse event includes typically nervous system-related problems, such as guillain-barré syndrome, ataxia, areflexia, hypoaesthesia, paraesthesia, dizziness, headache and other nervous system disorders.
Other adverse event includes medical problems that are assigned to other MedDRA SOCs, including gastrointestinal disorders, cardiac disorders, psychiatric disorders, musculoskeletal and connective tissue disorders, etc.
Procedure includes non-medical problem events such as immunization, surgeries such as catheter placement, hospitalization, emergence care, intubation, etc.

### Input Text: {}
### Output Text:
'''

# GPT = 4 selects 'gpt-4-0314' in get_output; prompt_type names the output
# sub-directory used by run().
GPT = 4
prompt_type = 'merged_prompt_definition'
run(GPT,prompt,prompt_type)

### zero-shot with definition and guidelines

In [None]:
prompt = '''### Task
Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: 'investigations', 'nervous adverse events', 'other adverse events', and 'procedures'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.

### Entity Markup Guide
Use <span class="investigation"> to denote an investigation.
Use <span class="nervous_AE"> to denote a nervous adverse event.
Use <span class="other_AE"> to denote an other adverse event.
Use <span class="procedure"> to denote a procedure.
If no entity found, leave the text as it is.

### Entity Definitions
Investigation includes typical lab tests or examinations in the report, such as physical examination, oxygen saturation, electromyogram, etc.
Nervous adverse event includes typically nervous system-related problems, such as guillain-barré syndrome, ataxia, areflexia, hypoaesthesia, paraesthesia, dizziness, headache and other nervous system disorders.
Other adverse event includes medical problems that are assigned to other MedDRA SOCs, including gastrointestinal disorders, cardiac disorders, psychiatric disorders, musculoskeletal and connective tissue disorders, etc.
Procedure includes non-medical problem events such as immunization, surgeries such as catheter placement, hospitalization, emergence care, intubation, etc.

### Annotation Guidelines
Only annotate events that already occurred (i.e., occurred before the diagnosis of GBS).
When annotating events related to Flu-GBS, do not include prepositions including modifiers of the event.
Separate events in discontinuous segments.
When annotating events, more generalized events should not be annotated.
When annotating events related to symptom improvement / progress or negation events, the following guideline should be used. In the case where the patient reported a specific adverse event first, and then reported improvement / progress of the adverse event, we should annotate it as an improved symptom. However, we do NOT need to annotate the negation of a symptom which the patient never reported before.
Events reported as history (events that did not happen to the reporting patient) should be annotated. Family history is important for risk prediction and may be included as a baseline information (e.g., for statistical analysis).
Some VAERS reports have duplicate events reported. For example, the same events / text are repeated twice in the report. The case we are interested in, is the recurrence of some adverse event, i.e., it requires the adverse event appears, then disappear, and then come back. In this case it should definitely be annotated twice. Additionally, we need to annotate the relief/improvement of the event if it is mentioned in the report. When no such information to decide whether it is a recurrence, the principle is that if there are multiple time stamps of the same event, we annotate it twice, if not, we can just keep one record.

### Input Text: {}
### Output Text:
'''
# Submit the prompt defined above with GPT=4 under the
# 'merged_prompt_definition_guidelines' label (`run` is defined earlier in the notebook).
GPT, prompt_type = 4, 'merged_prompt_definition_guidelines'
run(GPT, prompt, prompt_type)

### 1-shot

In [None]:
prompt = '''### Task
Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: 'investigations', 'nervous adverse events', 'other adverse events', and 'procedures'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.

### Entity Markup Guide
Use <span class="investigation"> to denote an investigation.
Use <span class="nervous_AE"> to denote a nervous adverse event.
Use <span class="other_AE"> to denote an other adverse event.
Use <span class="procedure"> to denote a procedure.
If no entity found, leave the text as it is.

### Examples 
Example Input1: she said the doctor diagnosed her husband with Gillian Barre related to flu vaccine . Seen in ER for C / O numbness in both hands ( fingertips ) and both feet with c / o abd pain ongoing times 5 days .  
Example Output1: she said the doctor diagnosed her husband with <span class="nervous_AE">Gillian Barre</span> related to <span class="procedure">flu vaccine</span> . Seen in <span class="procedure">ER</span> for C / O <span class="nervous_AE">numbness in both hands ( fingertips ) and both feet</span> with c / o <span class="other_AE">abd pain</span> ongoing times 5 days .  

### Input Text: {}
### Output Text:
'''
# Submit the 1-shot prompt defined above with GPT=4 under the
# 'merged_prompt_oneshot' label (`run` is defined earlier in the notebook).
GPT, prompt_type = 4, 'merged_prompt_oneshot'
run(GPT, prompt, prompt_type)

### 1-shot with definition

In [None]:
prompt = '''### Task
Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: 'investigations', 'nervous adverse events', 'other adverse events', and 'procedures'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.

### Entity Markup Guide
Use <span class="investigation"> to denote an investigation.
Use <span class="nervous_AE"> to denote a nervous adverse event.
Use <span class="other_AE"> to denote an other adverse event.
Use <span class="procedure"> to denote a procedure.
If no entity found, leave the text as it is.

### Entity Definitions
Investigation includes typical lab tests or examinations in the report, such as physical examination, oxygen saturation, electromyogram, etc.
Nervous adverse event includes typically nervous system-related problems, such as guillain-barré syndrome, ataxia, areflexia, hypoaesthesia, paraesthesia, dizziness, headache and other nervous system disorders.
Other adverse event includes medical problems that are assigned to other MedDRA SOCs, including gastrointestinal disorders, cardiac disorders, psychiatric disorders, musculoskeletal and connective tissue disorders, etc.
Procedure includes non-medical problem events such as immunization, surgeries such as catheter placement, hospitalization, emergence care, intubation, etc.

### Examples 
Example Input1: she said the doctor diagnosed her husband with Gillian Barre related to flu vaccine . Seen in ER for C / O numbness in both hands ( fingertips ) and both feet with c / o abd pain ongoing times 5 days .  
Example Output1: she said the doctor diagnosed her husband with <span class="nervous_AE">Gillian Barre</span> related to <span class="procedure">flu vaccine</span> . Seen in <span class="procedure">ER</span> for C / O <span class="nervous_AE">numbness in both hands ( fingertips ) and both feet</span> with c / o <span class="other_AE">abd pain</span> ongoing times 5 days .  

### Input Text: {}
### Output Text:
'''
# Submit the 1-shot + definitions prompt defined above with GPT=4 under the
# 'merged_prompt_definition_oneshot' label (`run` is defined earlier in the notebook).
GPT, prompt_type = 4, 'merged_prompt_definition_oneshot'
run(GPT, prompt, prompt_type)

### 1-shot with definition and guidelines

In [None]:
prompt = '''### Task
Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: 'investigations', 'nervous adverse events', 'other adverse events', and 'procedures'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.

### Entity Markup Guide
Use <span class="investigation"> to denote an investigation.
Use <span class="nervous_AE"> to denote a nervous adverse event.
Use <span class="other_AE"> to denote an other adverse event.
Use <span class="procedure"> to denote a procedure.
If no entity found, leave the text as it is.

### Entity Definitions
Investigation includes typical lab tests or examinations in the report, such as physical examination, oxygen saturation, electromyogram, etc.
Nervous adverse event includes typically nervous system-related problems, such as guillain-barré syndrome, ataxia, areflexia, hypoaesthesia, paraesthesia, dizziness, headache and other nervous system disorders.
Other adverse event includes medical problems that are assigned to other MedDRA SOCs, including gastrointestinal disorders, cardiac disorders, psychiatric disorders, musculoskeletal and connective tissue disorders, etc.
Procedure includes non-medical problem events such as immunization, surgeries such as catheter placement, hospitalization, emergence care, intubation, etc.

### Annotation Guidelines
Only annotate events that already occurred (i.e., occurred before the diagnosis of GBS).
When annotating events related to Flu-GBS, do not include prepositions including modifiers of the event.
Separate events in discontinuous segments.
When annotating events, more generalized events should not be annotated.
When annotating events related to symptom improvement / progress or negation events, the following guideline should be used. In the case where the patient reported a specific adverse event first, and then reported improvement / progress of the adverse event, we should annotate it as an improved symptom. However, we do NOT need to annotate the negation of a symptom which the patient never reported before.
Events reported as history (events that did not happen to the reporting patient) should be annotated. Family history is important for risk prediction and may be included as a baseline information (e.g., for statistical analysis).
Some VAERS reports have duplicate events reported. For example, the same events / text are repeated twice in the report. The case we are interested in, is the recurrence of some adverse event, i.e., it requires the adverse event appears, then disappear, and then come back. In this case it should definitely be annotated twice. Additionally, we need to annotate the relief/improvement of the event if it is mentioned in the report. When no such information to decide whether it is a recurrence, the principle is that if there are multiple time stamps of the same event, we annotate it twice, if not, we can just keep one record.

### Examples 
Example Input1: she said the doctor diagnosed her husband with Gillian Barre related to flu vaccine . Seen in ER for C / O numbness in both hands ( fingertips ) and both feet with c / o abd pain ongoing times 5 days .  
Example Output1: she said the doctor diagnosed her husband with <span class="nervous_AE">Gillian Barre</span> related to <span class="procedure">flu vaccine</span> . Seen in <span class="procedure">ER</span> for C / O <span class="nervous_AE">numbness in both hands ( fingertips ) and both feet</span> with c / o <span class="other_AE">abd pain</span> ongoing times 5 days .  

### Input Text: {}
### Output Text:
'''
# Submit the 1-shot + definitions + guidelines prompt defined above with GPT=4 under the
# 'merged_prompt_definition_guidelines_oneshot' label (`run` is defined earlier in the notebook).
GPT, prompt_type = 4, 'merged_prompt_definition_guidelines_oneshot'
run(GPT, prompt, prompt_type)

### 5-shot

In [None]:
prompt = '''### Task
Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: 'investigations', 'nervous adverse events', 'other adverse events', and 'procedures'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.

### Entity Markup Guide
Use <span class="investigation"> to denote an investigation.
Use <span class="nervous_AE"> to denote a nervous adverse event.
Use <span class="other_AE"> to denote an other adverse event.
Use <span class="procedure"> to denote a procedure.
If no entity found, leave the text as it is.

### Examples 
Example Input1: she said the doctor diagnosed her husband with Gillian Barre related to flu vaccine . Seen in ER for C / O numbness in both hands ( fingertips ) and both feet with c / o abd pain ongoing times 5 days .  
Example Output1: she said the doctor diagnosed her husband with <span class="nervous_AE">Gillian Barre</span> related to <span class="procedure">flu vaccine</span> . Seen in <span class="procedure">ER</span> for C / O <span class="nervous_AE">numbness in both hands ( fingertips ) and both feet</span> with c / o <span class="other_AE">abd pain</span> ongoing times 5 days .  
Example Input2: He was seen in an emergency room the following day and was admitted to the hospital because he could not walk .  
Example Output2: He was seen in an <span class="procedure">emergency room</span> the following day and was admitted to the <span class="procedure">hospital</span> because he <span class="nervous_AE">could not walk</span> .  
Example Input3: On 20 September 2009 the patient was diagnosed with GUILLAIN BARRE Syndrome .  
Example Output3: On 20 September 2009 the patient was diagnosed with <span class="nervous_AE">GUILLAIN BARRE Syndrome</span> .  
Example Input4: No improvement as of yet 4 days of IVIG beginning 1 / 15 / 09 ( Hospitalized ) .  
Example Output4: No improvement as of yet 4 days of <span class="procedure">IVIG</span> beginning 1 / 15 / 09 ( <span class="procedure">Hospitalized</span> ) .  
Example Input5: Tx with plasmapheresis .  
Example Output5: Tx with <span class="procedure">plasmapheresis</span> .  

### Input Text: {}
### Output Text:
'''
# Submit the 5-shot prompt defined above with GPT=4 under the
# 'merged_prompt_fiveshot' label (`run` is defined earlier in the notebook).
GPT, prompt_type = 4, 'merged_prompt_fiveshot'
run(GPT, prompt, prompt_type)

### 5-shot with definition

In [None]:
prompt = '''### Task
Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: 'investigations', 'nervous adverse events', 'other adverse events', and 'procedures'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.

### Entity Markup Guide
Use <span class="investigation"> to denote an investigation.
Use <span class="nervous_AE"> to denote a nervous adverse event.
Use <span class="other_AE"> to denote an other adverse event.
Use <span class="procedure"> to denote a procedure.
If no entity found, leave the text as it is.

### Entity Definitions
Investigation includes typical lab tests or examinations in the report, such as physical examination, oxygen saturation, electromyogram, etc.
Nervous adverse event includes typically nervous system-related problems, such as guillain-barré syndrome, ataxia, areflexia, hypoaesthesia, paraesthesia, dizziness, headache and other nervous system disorders.
Other adverse event includes medical problems that are assigned to other MedDRA SOCs, including gastrointestinal disorders, cardiac disorders, psychiatric disorders, musculoskeletal and connective tissue disorders, etc.
Procedure includes non-medical problem events such as immunization, surgeries such as catheter placement, hospitalization, emergence care, intubation, etc.

### Examples 
Example Input1: she said the doctor diagnosed her husband with Gillian Barre related to flu vaccine . Seen in ER for C / O numbness in both hands ( fingertips ) and both feet with c / o abd pain ongoing times 5 days .  
Example Output1: she said the doctor diagnosed her husband with <span class="nervous_AE">Gillian Barre</span> related to <span class="procedure">flu vaccine</span> . Seen in <span class="procedure">ER</span> for C / O <span class="nervous_AE">numbness in both hands ( fingertips ) and both feet</span> with c / o <span class="other_AE">abd pain</span> ongoing times 5 days .  
Example Input2: He was seen in an emergency room the following day and was admitted to the hospital because he could not walk .  
Example Output2: He was seen in an <span class="procedure">emergency room</span> the following day and was admitted to the <span class="procedure">hospital</span> because he <span class="nervous_AE">could not walk</span> .  
Example Input3: On 20 September 2009 the patient was diagnosed with GUILLAIN BARRE Syndrome .  
Example Output3: On 20 September 2009 the patient was diagnosed with <span class="nervous_AE">GUILLAIN BARRE Syndrome</span> .  
Example Input4: No improvement as of yet 4 days of IVIG beginning 1 / 15 / 09 ( Hospitalized ) .  
Example Output4: No improvement as of yet 4 days of <span class="procedure">IVIG</span> beginning 1 / 15 / 09 ( <span class="procedure">Hospitalized</span> ) .  
Example Input5: Tx with plasmapheresis .  
Example Output5: Tx with <span class="procedure">plasmapheresis</span> .  

### Input Text: {}
### Output Text:
'''
# Submit the 5-shot + definitions prompt defined above with GPT=4 under the
# 'merged_prompt_definition_fiveshot' label (`run` is defined earlier in the notebook).
GPT, prompt_type = 4, 'merged_prompt_definition_fiveshot'
run(GPT, prompt, prompt_type)

### 5-shot with definition and guidelines

In [None]:
prompt = '''### Task
Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: 'investigations', 'nervous adverse events', 'other adverse events', and 'procedures'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.

### Entity Markup Guide
Use <span class="investigation"> to denote an investigation.
Use <span class="nervous_AE"> to denote a nervous adverse event.
Use <span class="other_AE"> to denote an other adverse event.
Use <span class="procedure"> to denote a procedure.
If no entity found, leave the text as it is.

### Entity Definitions
Investigation includes typical lab tests or examinations in the report, such as physical examination, oxygen saturation, electromyogram, etc.
Nervous adverse event includes typically nervous system-related problems, such as guillain-barré syndrome, ataxia, areflexia, hypoaesthesia, paraesthesia, dizziness, headache and other nervous system disorders.
Other adverse event includes medical problems that are assigned to other MedDRA SOCs, including gastrointestinal disorders, cardiac disorders, psychiatric disorders, musculoskeletal and connective tissue disorders, etc.
Procedure includes non-medical problem events such as immunization, surgeries such as catheter placement, hospitalization, emergence care, intubation, etc.

### Annotation Guidelines
Only annotate events that already occurred (i.e., occurred before the diagnosis of GBS).
When annotating events related to Flu-GBS, do not include prepositions including modifiers of the event.
Separate events in discontinuous segments.
When annotating events, more generalized events should not be annotated.
When annotating events related to symptom improvement / progress or negation events, the following guideline should be used. In the case where the patient reported a specific adverse event first, and then reported improvement / progress of the adverse event, we should annotate it as an improved symptom. However, we do NOT need to annotate the negation of a symptom which the patient never reported before.
Events reported as history (events that did not happen to the reporting patient) should be annotated. Family history is important for risk prediction and may be included as a baseline information (e.g., for statistical analysis).
Some VAERS reports have duplicate events reported. For example, the same events / text are repeated twice in the report. The case we are interested in, is the recurrence of some adverse event, i.e., it requires the adverse event appears, then disappear, and then come back. In this case it should definitely be annotated twice. Additionally, we need to annotate the relief/improvement of the event if it is mentioned in the report. When no such information to decide whether it is a recurrence, the principle is that if there are multiple time stamps of the same event, we annotate it twice, if not, we can just keep one record.

### Examples 
Example Input1: she said the doctor diagnosed her husband with Gillian Barre related to flu vaccine . Seen in ER for C / O numbness in both hands ( fingertips ) and both feet with c / o abd pain ongoing times 5 days .  
Example Output1: she said the doctor diagnosed her husband with <span class="nervous_AE">Gillian Barre</span> related to <span class="procedure">flu vaccine</span> . Seen in <span class="procedure">ER</span> for C / O <span class="nervous_AE">numbness in both hands ( fingertips ) and both feet</span> with c / o <span class="other_AE">abd pain</span> ongoing times 5 days .  
Example Input2: He was seen in an emergency room the following day and was admitted to the hospital because he could not walk .  
Example Output2: He was seen in an <span class="procedure">emergency room</span> the following day and was admitted to the <span class="procedure">hospital</span> because he <span class="nervous_AE">could not walk</span> .  
Example Input3: On 20 September 2009 the patient was diagnosed with GUILLAIN BARRE Syndrome .  
Example Output3: On 20 September 2009 the patient was diagnosed with <span class="nervous_AE">GUILLAIN BARRE Syndrome</span> .  
Example Input4: No improvement as of yet 4 days of IVIG beginning 1 / 15 / 09 ( Hospitalized ) .  
Example Output4: No improvement as of yet 4 days of <span class="procedure">IVIG</span> beginning 1 / 15 / 09 ( <span class="procedure">Hospitalized</span> ) .  
Example Input5: Tx with plasmapheresis .  
Example Output5: Tx with <span class="procedure">plasmapheresis</span> .  

### Input Text: {}
### Output Text:
'''
# Submit the 5-shot + definitions + guidelines prompt defined above with GPT=4 under the
# 'merged_prompt_definition_guidelines_fiveshot' label (`run` is defined earlier in the notebook).
GPT, prompt_type = 4, 'merged_prompt_definition_guidelines_fiveshot'
run(GPT, prompt, prompt_type)

### after error analysis

In [None]:
prompt = '''### Task
Your task is to generate an HTML version of an input text, marking up specific entities related to healthcare. The entities to be identified are: 'investigations', 'nervous adverse events', 'other adverse events', and 'procedures'. Use HTML <span> tags to highlight these entities. Each <span> should have a class attribute indicating the type of the entity.

### Entity Markup Guide
Use <span class="investigation"> to denote an investigation.
Use <span class="nervous_AE"> to denote a nervous adverse event.
Use <span class="other_AE"> to denote an other adverse event.
Use <span class="procedure"> to denote a procedure.
If no entity found, leave the text as it is.

### Entity Definitions
Investigation includes typical lab tests or examinations in the report, such as physical examination, oxygen saturation, electromyogram, etc.
Nervous adverse event includes typically nervous system-related problems, such as guillain-barré syndrome, ataxia, areflexia, hypoaesthesia, paraesthesia, dizziness, headache and other nervous system disorders.
Other adverse event includes medical problems that are assigned to other MedDRA SOCs, including gastrointestinal disorders, cardiac disorders, psychiatric disorders, musculoskeletal and connective tissue disorders, etc.
Procedure includes non-medical problem events such as individual immunization complications or related medical events (each immunization should be marked separately), surgeries such as catheter placement, hospitalization, emergence care, intubation, etc. A procedure refers to a specific medical or surgical activity carried out to diagnose, treat, or monitor a condition. Routine care activities or general healthcare administration such as 'sick call', 'doctor's visit', 'general checkup', etc. without a specific associated procedure or event should not be considered as a procedure. Note that 'vaccines administered' in absence of any complications or related medical events should not be considered a procedure.
Please note that in the case of negation where a certain adverse event, investigation, or procedure is clearly indicated NOT to have occurred (e.g., 'No bowel or bladder symptoms'), do not mark the entity.

### Annotation Guidelines
Only annotate events that already occurred (i.e., occurred before the diagnosis of GBS).
When annotating events related to Flu-GBS, do not include prepositions including modifiers of the event.
Separate events in discontinuous segments.
When annotating events, more generalized events should not be annotated.
When annotating events related to symptom improvement / progress or negation events, the following guideline should be used. In the case where the patient reported a specific adverse event first, and then reported improvement / progress of the adverse event, we should annotate it as an improved symptom. However, we do NOT need to annotate the negation of a symptom which the patient never reported before.
Events reported as history (events that did not happen to the reporting patient) should be annotated. Family history is important for risk prediction and may be included as a baseline information (e.g., for statistical analysis).
Some VAERS reports have duplicate events reported. For example, the same events / text are repeated twice in the report. The case we are interested in, is the recurrence of some adverse event, i.e., it requires the adverse event appears, then disappear, and then come back. In this case it should definitely be annotated twice. Additionally, we need to annotate the relief/improvement of the event if it is mentioned in the report. When no such information to decide whether it is a recurrence, the principle is that if there are multiple time stamps of the same event, we annotate it twice, if not, we can just keep one record.
When annotating 'vaccine administration' as a procedure, ensure it is specifically related to an adverse event or mentioned in the context of a medical procedure. Simply mentioning 'vaccine administration' without any adverse reactions or complications should not be considered a procedure.
Negations: Do not mark any entity that is mentioned in a context that clearly indicates it did not occur or was not present or was denied.

### Examples 
Example Input1: she said the doctor diagnosed her husband with Gillian Barre related to flu vaccine . Seen in ER for C / O numbness in both hands ( fingertips ) and both feet with c / o abd pain ongoing times 5 days .  
Example Output1: she said the doctor diagnosed her husband with <span class="nervous_AE">Gillian Barre</span> related to <span class="procedure">flu vaccine</span> . Seen in <span class="procedure">ER</span> for C / O <span class="nervous_AE">numbness in both hands ( fingertips ) and both feet</span> with c / o <span class="other_AE">abd pain</span> ongoing times 5 days .  
Example Input2: He was seen in an emergency room the following day and was admitted to the hospital because he could not walk .  
Example Output2: He was seen in an <span class="procedure">emergency room</span> the following day and was admitted to the <span class="procedure">hospital</span> because he <span class="nervous_AE">could not walk</span> .  
Example Input3: On 20 September 2009 the patient was diagnosed with GUILLAIN BARRE Syndrome .  
Example Output3: On 20 September 2009 the patient was diagnosed with <span class="nervous_AE">GUILLAIN BARRE Syndrome</span> .  
Example Input4: No improvement as of yet 4 days of IVIG beginning 1 / 15 / 09 ( Hospitalized ) .  
Example Output4: No improvement as of yet 4 days of <span class="procedure">IVIG</span> beginning 1 / 15 / 09 ( <span class="procedure">Hospitalized</span> ) .  
Example Input5: Tx with plasmapheresis .  
Example Output5: Tx with <span class="procedure">plasmapheresis</span> .  

### Input Text: {}
### Output Text:
'''
# Submit the post-error-analysis prompt defined above with GPT=3.5 under the
# '..._after_error_analysis' label (`run` is defined earlier in the notebook).
GPT, prompt_type = 3.5, 'merged_prompt_definition_guidelines_fiveshot_after_error_analysis'
run(GPT, prompt, prompt_type)

In [None]:
# Same label as the previous cell, now with GPT=4. This cell does not assign
# `prompt`; it relies on the value left in the kernel by the preceding cell.
GPT, prompt_type = 4, 'merged_prompt_definition_guidelines_fiveshot_after_error_analysis'
run(GPT, prompt, prompt_type)

# evaluation

In [None]:
from bs4 import BeautifulSoup as bs
from bs4 import NavigableString, Tag
from glob import glob
import spacy
py_nlp = spacy.load ("en_core_web_lg")

In [None]:
def html2bio(html_path):
    """Convert a model-generated HTML prediction file into BIO-tagged lines.

    The file content is first trimmed: everything up to (and one character
    past) each known preamble marker is dropped, then the content is narrowed
    to the inside of ``<body>`` and ``<p>`` if those tags are present. The
    remainder is parsed with BeautifulSoup and its top-level nodes are walked:

    * plain text is split on whitespace and every word is tagged ``O``;
    * a tag whose first CSS class is one of the four known entity types is
      tokenized with the module-level spaCy pipeline ``py_nlp`` and emitted as
      ``B-<type>`` followed by ``I-<type>``;
    * any other tag is tokenized the same way and tagged ``O``.

    Args:
        html_path: Path of the HTML prediction file to read.

    Returns:
        A list of strings of the form ``"<token>\\t<tag>\\n"``.
    """
    entity_types = ('investigation', 'nervous_AE', 'other_AE', 'procedure')
    # Markers are matched against the lowercased text, so they must be
    # lowercase themselves; a mixed-case marker can never match.
    preamble_markers = (
        '***output***',
        'output:',
        'output text',
        '***highlighted text***',
    )

    with open(html_path) as f:
        html = f.read()

    # Applied in sequence: each marker is looked up in whatever is left after
    # the previous one was stripped. The extra +1 also drops the character
    # right after the marker (presumably a separator such as ':' or a newline).
    for marker in preamble_markers:
        lowered = html.lower()
        if marker in lowered:
            html = html[lowered.index(marker) + len(marker) + 1:]

    # Narrow to the element content. A missing closing tag (e.g. truncated
    # model output) keeps everything after the opening tag instead of raising.
    for open_tag, close_tag in (('<body>', '</body>'), ('<p>', '</p>')):
        if open_tag in html:
            start = html.index(open_tag) + len(open_tag)
            end = html.find(close_tag, start)
            html = html[start:end] if end != -1 else html[start:]

    soup = bs(html, "html.parser")

    bio_format = []
    for child in soup.children:
        if isinstance(child, NavigableString):
            bio_format.extend(f"{word}\tO\n" for word in child.split())
        elif isinstance(child, Tag):
            words = py_nlp(child.get_text())
            if len(words) == 0:
                continue
            # A missing or empty class attribute means "not an entity".
            classes = child.attrs.get('class') or ['O']
            entity = classes[0]
            if entity in entity_types:
                bio_format.append(f"{words[0]}\tB-{entity}\n")
                bio_format.extend(f"{word}\tI-{entity}\n" for word in words[1:])
            else:
                bio_format.extend(f"{word}\tO\n" for word in words)
    return bio_format

In [None]:
def get_performance(GPT,prompt):
    all_tags = []
    all_tokens = []
    gold_tags = []

    for file in files:
        file_name = file.split('/')[-1].split('.')[0]
        with open(file) as f_gold:
            lines = f_gold.readlines()
            tokens = [line.strip().split(' ')[0] for line in lines]
            tags = [line.strip().split(' ')[-1] for line in lines]

            prediction = f'/data/yhu5/chatgpt/GPT{GPT}_output/{dataset}/temperature0/{prompt}/{file_name}.html'

            bio_2 = html2bio(prediction)
            all_tokens += tokens

            for i,token in enumerate(tokens):
                if token != '':
                    match = False
                    for i2 in range(i,-1,-1):
                        try:
                            token_2,tag_2 = bio_2[i2].strip().split('\t')
                        except:
                            token_2,tag_2 = None, None
                        if token_2!=None:
                            if token in token_2 or token_2 in token:
                                match = True
                                break

                    if not match:
                        tag_2 = 'O'
                else:
                    tag_2 = ''
                #print (token,tags[i],tag_2)

                gold_tags.append(tags[i])
                all_tags.append(tag_2)
    with open('./merged_gold_pre.bio','w') as fg:
        for i,(token,gold_tag,all_tag) in enumerate(zip(all_tokens,gold_tags,all_tags)):
            if token!='':

                fg.write(f'{token}\t{all_tag}\t{gold_tag}\n')
            else:
                fg.write(f'\n')
    !python /data/yhu5/CLAMP/melaxdev-deepmeddocker_tf-28b7a60e460b/dockersimple/evaluate_jianfu_new.py -lf /data/yhu5/chatgpt/merged_gold_pre.bio

In [None]:
# Score the three zero-shot prompt variants for GPT-3.5.
GPT = 3.5
for prompt in (
    'merged_prompt',
    'merged_prompt_definition',
    'merged_prompt_definition_guidelines',
):
    get_performance(GPT, prompt)

In [None]:
# Score the three 1-shot prompt variants for GPT-3.5.
GPT = 3.5
for prompt in (
    'merged_prompt_oneshot',
    'merged_prompt_definition_oneshot',
    'merged_prompt_definition_guidelines_oneshot',
):
    get_performance(GPT, prompt)

In [None]:
# Score the three 5-shot prompt variants for GPT-3.5.
GPT = 3.5
for prompt in (
    'merged_prompt_fiveshot',
    'merged_prompt_definition_fiveshot',
    'merged_prompt_definition_guidelines_fiveshot',
):
    get_performance(GPT, prompt)

In [None]:
# Score the three zero-shot prompt variants for GPT-4.
GPT = 4
for prompt in (
    'merged_prompt',
    'merged_prompt_definition',
    'merged_prompt_definition_guidelines',
):
    get_performance(GPT, prompt)

In [None]:
# Score the three 1-shot prompt variants for GPT-4.
GPT = 4
for prompt in (
    'merged_prompt_oneshot',
    'merged_prompt_definition_oneshot',
    'merged_prompt_definition_guidelines_oneshot',
):
    get_performance(GPT, prompt)

In [None]:
# Score the three 5-shot prompt variants for GPT-4.
GPT = 4
for prompt in (
    'merged_prompt_fiveshot',
    'merged_prompt_definition_fiveshot',
    'merged_prompt_definition_guidelines_fiveshot',
):
    get_performance(GPT, prompt)

In [None]:
# Score the post-error-analysis prompt for both model versions, printing the
# prompt label before each report.
prompt = 'merged_prompt_definition_guidelines_fiveshot_after_error_analysis'
for GPT in (3.5, 4):
    print ('\n',prompt)
    get_performance(GPT, prompt)