In [1]:
import os
import pandas as pd
import spacy
from glob import glob
import re

In [2]:
def follows_pattern(s):
    # Define the regex pattern
    pattern = r'^[TR]\d+'
    
    # Use re.match to check if the string matches the pattern
    if re.match(pattern, s):
        return True
    else:
        return False

In [3]:
def reorg_annfile(file):
    new_file=[]
    with open(file) as f:
        lines = f.readlines()
    skip = False
    for i,line in enumerate(lines):
        if i <= len(lines)-2:
            if not follows_pattern(lines[i+1]):
                new_file.append(line.strip()+' '+lines[i+1].strip()+'\n')
                skip = True
            else:
                if not skip:
                    new_file.append(line)
                else:
                    skip = False
                    pass
                
        else:
            if not skip:
                new_file.append(line)
            else:
                skip = False
                pass
    return new_file

In [4]:
def read_brat_files(txt_path, ann_path):

    with open(txt_path, "r") as txt_file:
        text = txt_file.read()
    
    entities = {}
    relations = []
    
    new_lines = reorg_annfile(ann_path)
    for line in new_lines:
        try:
            parts = line.strip().split('\t')
            if parts[0].startswith('T'):
                try:
                    ent_id, ent_info = parts[0], parts[1]
                    ent_type, start, end = ent_info.split(' ')
                    ent_type = ent_type.lower()
                    entities[ent_id] = (ent_type, int(start), int(end))
                except:

                    print (new_lines)

                    raise
            elif parts[0].startswith('R'):
                rel_id, rel_type, arg1, arg2 = [parts[0]]+parts[1].split(' ')
                relations.append((rel_type, entities[arg1.split(':')[1]], entities[arg2.split(':')[1]]))
        except:
            continue
    return text, entities, relations


In [5]:
rel_dict = {'hasAttr':0}

def replace_entities_with_types(sent, ent1, ent2):
    sent_text = str(sent)
    replacements = []
    
    if ent1[1]>ent2[1]:
        ent1,ent2 = ent2,ent1
        
    if ent2[1]>ent1[2]:
        original_len = len(sent_text)
        ent_type, start, end =ent1
        #sent_text = sent_text[:start - sent.start_char]+f"@{ent_type}$"+sent_text[end - sent.start_char:]
        sent_text = sent_text[:start - sent.start_char]+f"@{sent_text[start - sent.start_char:end - sent.start_char]}$"+sent_text[end - sent.start_char:]
        modified_len = len(sent_text)

        len_diff = original_len - modified_len
        ent_type, start, end =ent2
        #sent_text = sent_text[:start - sent.start_char - len_diff]+f"@{ent_type}$"+sent_text[end - sent.start_char- len_diff: ]
        sent_text = sent_text[:start - sent.start_char - len_diff]+f"@{sent_text[start - sent.start_char - len_diff:end - sent.start_char- len_diff]}$"+sent_text[end - sent.start_char- len_diff: ]
    else:

        original_len = len(sent_text)
        ent_type, start, end =ent1
        #sent_text = sent_text[:start - sent.start_char]+f"@{ent_type}$"+sent_text[end - sent.start_char:]
        sent_text = sent_text[:start - sent.start_char]+f"@{sent_text[start - sent.start_char:end - sent.start_char]}$"+sent_text[end - sent.start_char:]
        modified_len = len(sent_text)

        len_diff = original_len - modified_len +1
        ent_type, start, end =ent2
        #sent_text = sent_text[:start - sent.start_char - len_diff]+f"@{ent_type}$"+sent_text[end - sent.start_char- len_diff: ]
        sent_text = sent_text[:start - sent.start_char - len_diff]+f"@{sent_text[start - sent.start_char - len_diff:end - sent.start_char- len_diff]}$"+sent_text[end - sent.start_char- len_diff: ]

    return sent_text

def sentence_relations(text,entities, relations, nlp):
    all_entity_pairs = {'test': ['temporal', 'labvalue', 'reference_range', 'negation'],
     'treatment': ['temporal', 'negation'],
     'problem': ['bodyloc',
      'negation',
      'temporal',
      'subject',
      'severity',
      'course',
      'condition',
      'uncertain'],
     'drug': ['temporal',
      'route',
      'form',
      'strength',
      'frequency',
      'dosage',
      'duration',
      'negation']}
    doc = nlp(text)
    sentences = list(doc.sents)
    sentence_relations = []

    for sent in sentences:
        sent_relations = set()
        sent_entities = []

        # Check if entities are in the current sentence
        for ent_id, ent in entities.items():
            if ent[1] >= sent.start_char and ent[2] <= sent.end_char:
                sent_entities.append(ent)
                
        # Check for existing relations in the current sentence
        for rel in relations:
            rel_type, ent1, ent2 = rel
            if ent1 in sent_entities and ent2 in sent_entities:
                sent_relations.add((ent1, ent2))
                # Replace entity mentions with their types
                sent_text = replace_entities_with_types(sent, ent1, ent2)
                sentence_relations.append((ent1[0], ent2[0],text[ent1[1]:ent1[2]].replace('\n',' '),text[ent2[1]:ent2[2]].replace('\n',' '), sent_text.strip().replace('\n',' '), 0))

        #print (sent_relations)
        # Check for negative relations in the current sentence
        for ent1 in sent_entities:
            for ent2 in sent_entities:
                if ent1 != ent2 and (ent1, ent2) not in sent_relations and  (ent2, ent1) not in sent_relations:
                    #print ((ent1, ent2))
                    for e in all_entity_pairs.keys():
                        if ent1[0] == e:
                            if ent2[0] in all_entity_pairs[e]:
                                #print (file)
                                sent_text = replace_entities_with_types(sent, ent1, ent2)
                                sentence_relations.append((ent1[0], ent2[0],text[ent1[1]:ent1[2]].replace('\n',' '),text[ent2[1]:ent2[2]].replace('\n',' '),  sent_text.strip().replace('\n',' '), f"1"))

    return sentence_relations


In [37]:
rel_dict = {'hasAttr':0}

def group_relations(tuples_set):
    grouped_tuples = {}
    for pair in tuples_set:
        key = pair[0]  # The first tuple of the pair
        if key in grouped_tuples:
            grouped_tuples[key].append(pair[1])
        else:
            grouped_tuples[key] = [pair[1]]
            
    # Sorting the values in each group in descending order based on the second value of the tuple
    sorted_grouped_tuples = {}
    for key, values in grouped_tuples.items():
        sorted_grouped_tuples[key] = sorted(values, key=lambda x: x[1], reverse=True)

    return sorted_grouped_tuples

def replace_entities_with_types(sent, entities):
    sent_text = str(sent)
    if isinstance(entities, list):
        for e in entities:
            ent_type, start, end =e
            sent_text = sent_text[:start - sent.start_char]+f'<span class="{ent_type}">{sent_text[start - sent.start_char:end - sent.start_char]}</span>'+sent_text[end - sent.start_char:] 
    else:
        ent_type, start, end =entities
        sent_text = sent_text[:start - sent.start_char]+f'<span class="{ent_type}">{sent_text[start - sent.start_char:end - sent.start_char]}</span>'+sent_text[end - sent.start_char:] 
    return sent_text

def sentence_relations(text,entities, relations, nlp):
    test_prompt = '''### Task:
    Your task is to mark up modifier entities related to the entity marked with <span> tag in the input text.
    
    ### Entity Markup Guide:
    Use <span class="labvalue"> to denote a numeric value or a normal description of the result of a lab test.
    Use <span class="reference_range"> to denote the range or interval of values that are deemed as normal for a test in a healthy person.
    Use <span class="negation"> to denote the phrase that indicates the absence of an entity.
    Use <span class="temporal"> to denote a calendar date, time, or duration related to a test.

    ### Input Text: {} <EOS>
    ### Output Text:'''
    
    drug_prompt = '''### Task:
    Your task is to mark up modifier entities related to the entity marked with <span> tag in the input text.

    ### Entity Markup Guide:
    Use <span class="form"> to denote the form of drug.
    Use <span class="frequency"> to denote the frequency of taking a drug.
    Use <span class="dosage"> to denote the amount of active ingredient from the number of drugs prescribed.
    Use <span class="duration"> to denote the time period a patient should take a drug.
    Use <span class="strength"> to denote the amount of active ingredient in a given dosage form.
    Use <span class="route"> to denote the way by which a drug, fluid, poison, or other substance is taken into the body.
    Use <span class="negation"> to denote the phrase that indicates the absence of an entity.
    Use <span class="temporal"> to denote a calendar date, time, or duration related to a drug.

    ### Input Text: {} <EOS>
    ### Output Text:'''

    problem_prompt = '''### Task:
    Your task is to mark up modifier entities related to the entity marked with <span> tag in the input text.

    ### Entity Markup Guide:
    Use <span class="uncertain"> to denote a measure of doubt.
    Use <span class="condition"> to denote a phrase that indicates the problems existing in a certain situation.
    Use <span class="subject"> to denote the person entity who is experiencing the disorder.
    Use <span class="negation"> to denote the phrase that indicates the absence of an entity.
    Use <span class="bodyloc"> to denote the location on the body where the observation is present.
    Use <span class="severity"> to denote the degree of intensity of a clinical condition.
    Use <span class="temporal"> to denote a calendar date, time, or duration related to a problem.
    Use <span class="course"> to denote the development or alteration of a problem.

    ### Input Text: {} <EOS>
    ### Output Text:'''
    
    treatment_prompt = '''### Task:
    Your task is to mark up modifier entities related to the entity marked with <span> tag in the input text.

    ### Entity Markup Guide:
    Use <span class="temporal"> to denote a calendar date, time, or duration related to a treatment.
    Use <span class="negation"> to denote the phrase that indicates the absence of an entity.

    ### Input Text: {} <EOS>
    ### Output Text:'''
    df = pd.DataFrame(columns=['unprocessed', 'processed'])
    
    doc = nlp(text)
    sentences = list(doc.sents)
    sentence_relations = []

    unprocessed = []
    processed = []
    for sent in sentences:
        sent_relations = set()
        sent_entities = []

        # Check if entities are in the current sentence
        for ent_id, ent in entities.items():
            if ent[1] >= sent.start_char and ent[2] <= sent.end_char:
                sent_entities.append(ent)
                                
        # Check for existing relations in the current sentence
        for rel in relations:
            rel_type, ent1, ent2 = rel
            if ent1 in sent_entities and ent2 in sent_entities:
                sent_relations.add((ent1, ent2))
        sent_relations = group_relations(sent_relations)
        
        for main_entity, modifier_entities in sent_relations.items():
            # Replace entity mentions with their types
            modifier_sent_text = replace_entities_with_types(sent, modifier_entities).replace('\n',' ')
            main_entity_sent_text = replace_entities_with_types(sent, main_entity).replace('\n',' ')
            
            if main_entity[0] == 'problem':
                unprocessed.append(problem_prompt.format(main_entity_sent_text))
                processed.append(modifier_sent_text+' <EOS>')
            if main_entity[0] == 'drug':
                unprocessed.append(drug_prompt.format(main_entity_sent_text))
                processed.append(modifier_sent_text+' <EOS>')
            if main_entity[0] == 'treatment':
                unprocessed.append(treatment_prompt.format(main_entity_sent_text))
                processed.append(modifier_sent_text+' <EOS>')
            if main_entity[0] == 'test':
                unprocessed.append(test_prompt.format(main_entity_sent_text))
                processed.append(modifier_sent_text+' <EOS>')
        # check for non-existing relations in current sentence
        for entity in sent_entities:
            if entity not in sent_relations and entity[0] in ['problem','treatment','test','drug']:
                main_entity_sent_text = replace_entities_with_types(sent, entity).replace('\n',' ')
                sent_text = str(sent).replace('\n',' ')
                if entity[0] == 'problem':
                    unprocessed.append(problem_prompt.format(main_entity_sent_text))
                    processed.append(sent_text+' <EOS>')
                if entity[0] == 'drug':
                    unprocessed.append(drug_prompt.format(main_entity_sent_text))
                    processed.append(sent_text+' <EOS>')
                if entity[0] == 'treatment':
                    unprocessed.append(treatment_prompt.format(main_entity_sent_text))
                    processed.append(sent_text+' <EOS>')
                if entity[0] == 'test':
                    unprocessed.append(test_prompt.format(main_entity_sent_text))
                    processed.append(sent_text+' <EOS>')
    df = pd.concat([df, pd.DataFrame({'unprocessed': unprocessed, 'processed': processed})], ignore_index=True)
    return df

In [21]:
def main():
    # Load the SpaCy model for sentence tokenization
    nlp = spacy.load('en_core_web_lg')
    
    txt = glob(f'./splitted_dataset_RE/dev/*.txt')

    relation_types = []
    entity_pairs = []
    !mkdir /data/yhu5/LLAMA2/data/CLAMP_data_reorg/splitted_dataset_RE/LLAMA2/dev/
    df = pd.DataFrame(columns=['unprocessed', 'processed'])
    for txt_path in txt:
        filename = txt_path.split('/')[-1].split('.')[0]
        ann_path = f'./splitted_dataset_RE/dev/{filename}.ann'
        output_file = f'./splitted_dataset_RE/LLAMA2/dev/{filename}.csv'

        # Read the Brat files and extract the entities and relations
        text, entities, relations = read_brat_files(txt_path, ann_path)
        
        #for relation in relations:
        #    print (relation)

        # Extract sentence relations
        df = pd.concat([df, sentence_relations(text, entities, relations, nlp)])
        
    # Save the table to a CSV file
    df.to_csv('./RE_dev.csv', index=False)

    
if __name__ == "__main__":
    main()

mkdir: cannot create directory ‘/data/yhu5/LLAMA2/data/CLAMP_data_reorg/splitted_dataset_RE/LLAMA2/dev/’: File exists


In [112]:
def sentence_relations(filename, text,entities, relations, nlp):
    test_prompt = '''### Task:
    Your task is to mark up modifier entities related to the entity marked with <span> tag in the input text.
    
    ### Entity Markup Guide:
    Use <span class="labvalue"> to denote a numeric value or a normal description of the result of a lab test.
    Use <span class="reference_range"> to denote the range or interval of values that are deemed as normal for a test in a healthy person.
    Use <span class="negation"> to denote the phrase that indicates the absence of an entity.
    Use <span class="temporal"> to denote a calendar date, time, or duration related to a test.

    ### Input Text: {} <EOS>
    ### Output Text:'''
    
    drug_prompt = '''### Task:
    Your task is to mark up modifier entities related to the entity marked with <span> tag in the input text.

    ### Entity Markup Guide:
    Use <span class="form"> to denote the form of drug.
    Use <span class="frequency"> to denote the frequency of taking a drug.
    Use <span class="dosage"> to denote the amount of active ingredient from the number of drugs prescribed.
    Use <span class="duration"> to denote the time period a patient should take a drug.
    Use <span class="strength"> to denote the amount of active ingredient in a given dosage form.
    Use <span class="route"> to denote the way by which a drug, fluid, poison, or other substance is taken into the body.
    Use <span class="negation"> to denote the phrase that indicates the absence of an entity.
    Use <span class="temporal"> to denote a calendar date, time, or duration related to a drug.

    ### Input Text: {} <EOS>
    ### Output Text:'''

    problem_prompt = '''### Task:
    Your task is to mark up modifier entities related to the entity marked with <span> tag in the input text.

    ### Entity Markup Guide:
    Use <span class="uncertain"> to denote a measure of doubt.
    Use <span class="condition"> to denote a phrase that indicates the problems existing in a certain situation.
    Use <span class="subject"> to denote the person entity who is experiencing the disorder.
    Use <span class="negation"> to denote the phrase that indicates the absence of an entity.
    Use <span class="bodyloc"> to denote the location on the body where the observation is present.
    Use <span class="severity"> to denote the degree of intensity of a clinical condition.
    Use <span class="temporal"> to denote a calendar date, time, or duration related to a problem.
    Use <span class="course"> to denote the development or alteration of a problem.

    ### Input Text: {} <EOS>
    ### Output Text:'''
    
    treatment_prompt = '''### Task:
    Your task is to mark up modifier entities related to the entity marked with <span> tag in the input text.

    ### Entity Markup Guide:
    Use <span class="temporal"> to denote a calendar date, time, or duration related to a treatment.
    Use <span class="negation"> to denote the phrase that indicates the absence of an entity.

    ### Input Text: {} <EOS>
    ### Output Text:'''
    df = pd.DataFrame(columns=['file_name','sentence_idx','main_entity','unprocessed', 'processed'])
    
    doc = nlp(text)
    sentences = list(doc.sents)
    sentence_relations = []

    unprocessed = []
    processed = []
    main_entities = []
    index = []
    for i,sent in enumerate(sentences):
        #print (i,sent)
        sent_relations = set()
        sent_entities = []

        # Check if entities are in the current sentence
        for ent_id, ent in entities.items():
            if ent[1] >= sent.start_char and ent[2] <= sent.end_char:
                sent_entities.append(ent)
                                
        # Check for existing relations in the current sentence
        for rel in relations:
            rel_type, ent1, ent2 = rel
            if ent1 in sent_entities and ent2 in sent_entities:
                sent_relations.add((ent1, ent2))
        sent_relations = group_relations(sent_relations)
        
        for main_entity, modifier_entities in sent_relations.items():
            #print (main_entity[0])
            main_entities.append(main_entity[0])
            # Replace entity mentions with their types
            modifier_sent_text = replace_entities_with_types(sent, modifier_entities).replace('\n',' ')
            main_entity_sent_text = replace_entities_with_types(sent, main_entity).replace('\n',' ')
            
            if main_entity[0] == 'problem':
                index.append(i)
                unprocessed.append(problem_prompt.format(main_entity_sent_text))
                processed.append(modifier_sent_text+' <EOS>')
            if main_entity[0] == 'drug':
                index.append(i)
                unprocessed.append(drug_prompt.format(main_entity_sent_text))
                processed.append(modifier_sent_text+' <EOS>')
            if main_entity[0] == 'treatment':
                index.append(i)
                unprocessed.append(treatment_prompt.format(main_entity_sent_text))
                processed.append(modifier_sent_text+' <EOS>')
            if main_entity[0] == 'test':
                index.append(i)
                unprocessed.append(test_prompt.format(main_entity_sent_text))
                processed.append(modifier_sent_text+' <EOS>')
        # check for non-existing relations in current sentence
        for entity in sent_entities:
            if entity not in sent_relations and entity[0] in ['problem','treatment','test','drug']:
                #print (entity[0])
                main_entities.append(entity[0])
                main_entity_sent_text = replace_entities_with_types(sent, entity).replace('\n',' ')
                sent_text = str(sent).replace('\n',' ')
                if entity[0] == 'problem':
                    index.append(i)
                    unprocessed.append(problem_prompt.format(main_entity_sent_text))
                    processed.append(sent_text+' <EOS>')
                if entity[0] == 'drug':
                    index.append(i)
                    unprocessed.append(drug_prompt.format(main_entity_sent_text))
                    processed.append(sent_text+' <EOS>')
                if entity[0] == 'treatment':
                    index.append(i)
                    unprocessed.append(treatment_prompt.format(main_entity_sent_text))
                    processed.append(sent_text+' <EOS>')
                if entity[0] == 'test':
                    index.append(i)
                    unprocessed.append(test_prompt.format(main_entity_sent_text))
                    processed.append(sent_text+' <EOS>')
    df = pd.concat([df, pd.DataFrame({'file_name':filename,'sentence_idx':index,'main_entity':main_entities,'unprocessed': unprocessed, 'processed': processed})], ignore_index=True)
    return df

In [139]:
def main():
    # Load the SpaCy model for sentence tokenization
    nlp = spacy.load('en_core_web_lg')
    
    for dataset in ['i2b2_test','MTSample_test','MIMIC3_test','UTP_test']:
        if dataset=='MIMIC3_test':
            txt = glob(f'./reorg_datasets/test/{dataset}/brat/*.txt')
            print (txt)
            relation_types = []
            entity_pairs = []
            df = pd.DataFrame(columns=['file_name','sentence_idx','main_entity','unprocessed', 'processed'])
            for txt_path in txt:
                filename = txt_path.split('/')[-1].split('.')[0]
                txt_path = f'./individual_tests/mimic3_test/updated_50_brat/{filename}.txt'
                ann_path = f'./individual_tests/mimic3_test/updated_50_brat/{filename}.ann'

                # Read the Brat files and extract the entities and relations
                text, entities, relations = read_brat_files(txt_path, ann_path)

                # Extract sentence relations
                df = pd.concat([df, sentence_relations(filename, text, entities, relations, nlp)])

            os.system(f'mkdir ./splitted_dataset_RE/test')
            os.system(f'mkdir ./splitted_dataset_RE/test/{dataset}')
            os.system(f'mkdir ./splitted_dataset_RE/test/{dataset}/html/')
            os.system(f'mkdir ./splitted_dataset_RE/test/{dataset}/html/test')
            os.system(f'mkdir ./splitted_dataset_RE/test/{dataset}/html/problem')
            os.system(f'mkdir ./splitted_dataset_RE/test/{dataset}/html/treatment')
            os.system(f'mkdir ./splitted_dataset_RE/test/{dataset}/html/drug')

            print (len(df))
            for i, (index, row) in enumerate(df.iterrows()):
                main_entity = row['main_entity']
                #print (row)
                with open(f'./splitted_dataset_RE/test/{dataset}/html/{main_entity}/{i}.html','w') as f:
                    processed = row['processed'].split('<EOS>')[0]
                    f.write(processed)

            df.to_csv(f'./RE_{dataset}.csv',index=False)
        else:
            txt = glob(f'./reorg_datasets/test/{dataset}/brat/*.txt')
            print (txt)
            relation_types = []
            entity_pairs = []
            df = pd.DataFrame(columns=['file_name','sentence_idx','main_entity','unprocessed', 'processed'])
            for txt_path in txt:
                filename = txt_path.split('/')[-1].split('.')[0]
                ann_path = f'./reorg_datasets/test/{dataset}/brat/{filename}.ann'

                # Read the Brat files and extract the entities and relations
                text, entities, relations = read_brat_files(txt_path, ann_path)

                # Extract sentence relations
                df = pd.concat([df, sentence_relations(filename, text, entities, relations, nlp)])

            os.system(f'mkdir ./splitted_dataset_RE/test')
            os.system(f'mkdir ./splitted_dataset_RE/test/{dataset}')
            os.system(f'mkdir ./splitted_dataset_RE/test/{dataset}/html/')
            os.system(f'mkdir ./splitted_dataset_RE/test/{dataset}/html/test')
            os.system(f'mkdir ./splitted_dataset_RE/test/{dataset}/html/problem')
            os.system(f'mkdir ./splitted_dataset_RE/test/{dataset}/html/treatment')
            os.system(f'mkdir ./splitted_dataset_RE/test/{dataset}/html/drug')

            print (len(df))
            for i, (index, row) in enumerate(df.iterrows()):
                main_entity = row['main_entity']
                #print (row)
                with open(f'./splitted_dataset_RE/test/{dataset}/html/{main_entity}/{i}.html','w') as f:
                    processed = row['processed'].split('<EOS>')[0]
                    f.write(processed)

            df.to_csv(f'./RE_{dataset}.csv',index=False)
if __name__ == "__main__":
    main()

['./reorg_datasets/test/i2b2_test/brat/discharge81.txt', './reorg_datasets/test/i2b2_test/brat/0457.txt', './reorg_datasets/test/i2b2_test/brat/0408.txt', './reorg_datasets/test/i2b2_test/brat/0360.txt', './reorg_datasets/test/i2b2_test/brat/progress111.txt', './reorg_datasets/test/i2b2_test/brat/progress78.txt', './reorg_datasets/test/i2b2_test/brat/progress117.txt', './reorg_datasets/test/i2b2_test/brat/0007.txt', './reorg_datasets/test/i2b2_test/brat/discharge68.txt', './reorg_datasets/test/i2b2_test/brat/progress28.txt', './reorg_datasets/test/i2b2_test/brat/progress53.txt', './reorg_datasets/test/i2b2_test/brat/discharge77.txt', './reorg_datasets/test/i2b2_test/brat/record-177.txt', './reorg_datasets/test/i2b2_test/brat/discharge95.txt', './reorg_datasets/test/i2b2_test/brat/progress105.txt', './reorg_datasets/test/i2b2_test/brat/progress87.txt', './reorg_datasets/test/i2b2_test/brat/0469.txt', './reorg_datasets/test/i2b2_test/brat/discharge50.txt', './reorg_datasets/test/i2b2_tes

mkdir: cannot create directory ‘./splitted_dataset_RE/test’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/i2b2_test’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/i2b2_test/html/’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/i2b2_test/html/test’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/i2b2_test/html/problem’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/i2b2_test/html/treatment’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/i2b2_test/html/drug’: File exists


3847
['./reorg_datasets/test/MTSample_test/brat/166_20210112211715.txt', './reorg_datasets/test/MTSample_test/brat/sample_2757.txt', './reorg_datasets/test/MTSample_test/brat/sample_1295.txt', './reorg_datasets/test/MTSample_test/brat/sample_963.txt', './reorg_datasets/test/MTSample_test/brat/160_20210112211715.txt', './reorg_datasets/test/MTSample_test/brat/sample_364.txt', './reorg_datasets/test/MTSample_test/brat/sample_2755.txt', './reorg_datasets/test/MTSample_test/brat/sample_2790.txt', './reorg_datasets/test/MTSample_test/brat/sample_648.txt', './reorg_datasets/test/MTSample_test/brat/sample_207.txt', './reorg_datasets/test/MTSample_test/brat/sample_1246.txt', './reorg_datasets/test/MTSample_test/brat/sample_52.txt', './reorg_datasets/test/MTSample_test/brat/80_20210112211705.txt', './reorg_datasets/test/MTSample_test/brat/sample_1714.txt', './reorg_datasets/test/MTSample_test/brat/sample_402.txt', './reorg_datasets/test/MTSample_test/brat/85_20210112211705.txt', './reorg_datase

mkdir: cannot create directory ‘./splitted_dataset_RE/test’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/MTSample_test’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/MTSample_test/html/’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/MTSample_test/html/test’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/MTSample_test/html/problem’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/MTSample_test/html/treatment’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/MTSample_test/html/drug’: File exists


3882
['./reorg_datasets/test/MIMIC3_test/brat/397.txt', './reorg_datasets/test/MIMIC3_test/brat/351.txt', './reorg_datasets/test/MIMIC3_test/brat/283.txt', './reorg_datasets/test/MIMIC3_test/brat/356.txt', './reorg_datasets/test/MIMIC3_test/brat/304.txt', './reorg_datasets/test/MIMIC3_test/brat/668.txt', './reorg_datasets/test/MIMIC3_test/brat/340.txt', './reorg_datasets/test/MIMIC3_test/brat/298.txt', './reorg_datasets/test/MIMIC3_test/brat/294.txt', './reorg_datasets/test/MIMIC3_test/brat/256.txt', './reorg_datasets/test/MIMIC3_test/brat/636.txt', './reorg_datasets/test/MIMIC3_test/brat/654.txt', './reorg_datasets/test/MIMIC3_test/brat/632.txt', './reorg_datasets/test/MIMIC3_test/brat/365.txt', './reorg_datasets/test/MIMIC3_test/brat/267.txt', './reorg_datasets/test/MIMIC3_test/brat/406.txt', './reorg_datasets/test/MIMIC3_test/brat/375.txt', './reorg_datasets/test/MIMIC3_test/brat/622.txt', './reorg_datasets/test/MIMIC3_test/brat/615.txt', './reorg_datasets/test/MIMIC3_test/brat/389.

mkdir: cannot create directory ‘./splitted_dataset_RE/test’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/MIMIC3_test’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/MIMIC3_test/html/’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/MIMIC3_test/html/test’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/MIMIC3_test/html/problem’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/MIMIC3_test/html/treatment’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/MIMIC3_test/html/drug’: File exists


6446
['./reorg_datasets/test/UTP_test/brat/83599508.txt', './reorg_datasets/test/UTP_test/brat/88977525.txt', './reorg_datasets/test/UTP_test/brat/82924358.txt', './reorg_datasets/test/UTP_test/brat/90721596.txt', './reorg_datasets/test/UTP_test/brat/56200121.txt', './reorg_datasets/test/UTP_test/brat/65560435.txt', './reorg_datasets/test/UTP_test/brat/70175786.txt', './reorg_datasets/test/UTP_test/brat/68024227.txt', './reorg_datasets/test/UTP_test/brat/56486908.txt', './reorg_datasets/test/UTP_test/brat/68761371.txt', './reorg_datasets/test/UTP_test/brat/66361476.txt', './reorg_datasets/test/UTP_test/brat/76279635.txt', './reorg_datasets/test/UTP_test/brat/59781389.txt', './reorg_datasets/test/UTP_test/brat/80721428.txt', './reorg_datasets/test/UTP_test/brat/84564650.txt', './reorg_datasets/test/UTP_test/brat/56630897.txt', './reorg_datasets/test/UTP_test/brat/91096314.txt', './reorg_datasets/test/UTP_test/brat/65852842.txt', './reorg_datasets/test/UTP_test/brat/81006564.txt', './reo

mkdir: cannot create directory ‘./splitted_dataset_RE/test’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/UTP_test’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/UTP_test/html/’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/UTP_test/html/test’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/UTP_test/html/problem’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/UTP_test/html/treatment’: File exists
mkdir: cannot create directory ‘./splitted_dataset_RE/test/UTP_test/html/drug’: File exists


6465
