In [11]:
import os
import random
from glob import glob
from shutil import copyfile

import pandas as pd

### Separate sentences (split each note's BIO file into one BIO file per sentence)

In [22]:
files = glob(f'./notes/*.bio') ## your dir of bio files
i = 0
for file in files:
    with open(file) as f:
        text = f.read()
        lines = text.strip().split('\n')

        sentences = []
        current_sentence = []

        for line in lines:
            parts = line.split('\t')
            if len(parts) == 2:
                word, label = parts
                current_sentence.append(f"{word}\t{label}")
            elif len(parts) == 1 and not parts[0].strip():
                # Empty line, indicating the end of a sentence
                sentences.append('\n'.join(current_sentence))
                current_sentence = []

        # If there are any remaining sentences, add them
        if current_sentence:
            sentences.append('\n'.join(current_sentence))

        # Join the sentences with '\n' and print
        for sentence in sentences:
            with open(f'./sentences/{i}.bio','w') as f:
                f.write(sentence)
            i+=1
    

### BIO TO INSTRUCT

In [25]:
def load_unprocessed_text(file):
    with open(file,'r') as f_read:
        text = ' '.join([line.split('\t')[0] for line in f_read.read().splitlines()])
    return text

def load_processed_text(file):
    with open(file,'r') as f_read:
        lines = f_read.readlines()
    
    processed_text = ''
    for i, line in enumerate(lines):
        token, e_type = line.strip().split('\t')
        if e_type == 'O':
            processed_text += token+' '
            
        if e_type.startswith('B-'):
            if i <= len(lines)-2:
                if lines[i+1]=='\n' or lines[i+1].strip().split('\t')[1]=='O' or lines[i+1].strip().split('\t')[1].startswith('B-'):
                    processed_text += f'<span class="{e_type[2:]}">'+token+'</span> '
                else:
                    processed_text += f'<span class="{e_type[2:]}">'+token+' '
            else:
                processed_text += f'<span class="{e_type[2:]}">'+token+'</span> '
            
        if e_type.startswith('I-'):
            if i <= len(lines)-2:
                if lines[i+1]=='\n' or lines[i+1].strip().split('\t')[1]=='O' or lines[i+1].strip().split('\t')[1].startswith('B-'):
                    processed_text += token+'</span> '
                else:
                    processed_text += token+' '
            else:
                processed_text += token+'</span> '
    processed_text+='<EOS>'
    return processed_text 

In [None]:
prompt = '''### Task:
Your task is to generate an HTML version of an input text, using HTML <span> tags to mark up specific entities.

### Entity Markup Guides:
Use <span class="problem"> to denote a medical problem.
Use <span class="treatment"> to denote a treatment.
Use <span class="test"> to denote a test.
Use <span class="drug"> to denote a drug.

### Entity Definitions:
Medical Problem: The abnormal condition that happens physically or mentally to a patient.
Treatment: The procedures, interventions, and substances given to a patient for treating a problem.
Drug: Generic or brand name of a single medication or a collective name of a group of medication.
Test: A medical procedure performed (i) to detect or diagnose a problem, (ii) to monitor diseases, disease processes, and susceptibility, or (iii) to determine a course of treatment.

### Input Text: {} <EOS>
### Output Text:'''

files = glob(f'./sentences/*.bio')
#files = glob(f'./after_split/{split}/*.bio')
random.seed(42)
random.shuffle(files)
len(files)

df = pd.DataFrame(columns=['unprocessed', 'processed'])

i = 0
length = len(files)

unprocessed = []
processed = []
for file in files:               
    unprocessed_tmp = load_unprocessed_text(file)
    processed_tmp = load_processed_text(file)

    processed.append(processed_tmp)
    unprocessed.append(prompt.format(unprocessed_tmp))

    length_list.append(len(unprocessed_tmp.split(' '))+len(processed_tmp.split(' ')))

df = pd.concat([df, pd.DataFrame({'unprocessed': unprocessed, 'processed': processed})], ignore_index=True)

#df.to_csv(f'document_level_main_{split}.csv', index=False)
df.to_csv(f'NER.csv', index=False)