In [45]:
# libraries to read BRAT and conll files
import glob
import os
from collections import Counter

import pandas as pd

In [21]:
def read_brat(folder_location):
    """Collect ``[entity_type, entity_text]`` pairs from the ``.ann`` files of a folder.

    Every line that does not start with ``#`` is split on tabs. The first
    space-separated word of the second column is taken as the entity type and
    the third column, up to its newline, as the entity text.

    Lines with fewer than three tab-separated columns (blank lines, or
    annotations that carry no text column) are skipped instead of raising
    ``IndexError``.

    Args:
        folder_location: Directory holding the ``.ann`` files. A trailing path
            separator is accepted but not required.

    Returns:
        A list of ``[entity_type, entity_text]`` lists, ordered by file name and
        then by line order inside each file. Empty if no ``.ann`` file is found.
    """
    # Sorted so the result does not depend on the filesystem's listing order.
    ann_files = sorted(glob.glob(os.path.join(folder_location, "*.ann")))

    ann_lines_split = []
    for ann_file in ann_files:
        # Decode explicitly instead of relying on the platform default encoding;
        # assumes the corpus is stored as UTF-8 -- TODO confirm for other corpora.
        with open(ann_file, "r", encoding="utf-8") as f:
            for line in f:
                if line.startswith("#"):
                    continue
                columns = line.split("\t")
                if len(columns) < 3:
                    # No text column on this line: nothing to extract.
                    continue
                entity_type = columns[1].split(" ")[0]
                entity = columns[2].split("\n")[0]
                ann_lines_split.append([entity_type, entity])
    return ann_lines_split

def read_conll(folder_location):
    """Unfinished placeholder; presumably meant to read CoNLL files (TODO confirm).

    The body only creates an empty local list and then falls off the end, so
    the call returns ``None`` whatever ``folder_location`` is.
    """
    # TODO: parse the files under folder_location and return the result.
    lines = []
    

In [41]:
def process_annotations_brat(ann_lines_split):
    """Compute summary counts for a list of ``[entity_type, entity_text]`` pairs.

    Tokens are the pieces obtained by splitting the entity text on single
    spaces. Empty pieces (from consecutive spaces or an empty entity text) are
    not counted as tokens.

    Args:
        ann_lines_split: Iterable of sequences whose item 0 is the entity type
            and item 1 the entity text.

    Returns:
        A 5-tuple ``(tokens, entities, unique_entities, entity_types,
        entity_types_count)``:
        total number of tokens, total number of entities, number of distinct
        entity texts, number of distinct entity types, and a plain ``dict``
        mapping each entity type to its number of entities.
    """
    entity_counts = Counter()
    token_counts = Counter()
    type_counts = Counter()

    for line in ann_lines_split:
        entity_type, entity = line[0], line[1]
        entity_counts[entity] += 1
        type_counts[entity_type] += 1
        # Drop empty strings so repeated spaces do not inflate the token total.
        token_counts.update(token for token in entity.split(" ") if token)

    return (
        sum(token_counts.values()),
        sum(entity_counts.values()),
        len(entity_counts),
        len(type_counts),
        # Converted back to a plain dict so callers get the same type as before.
        dict(type_counts),
    )

In [42]:
# Corpus layout: common path prefix plus the three axes iterated over below.
folder_base = "./QUAERO_FrenchMed_TP2021/QUAERO_FrenchMed"
formats = ["conll", "brat"]
splits = ["train", "dev", "test"]
subsets = ["EMEA", "MEDLINE"]

# Explicit BRAT folder for each split/subset pair.
train_EMEA_folder_location = "./QUAERO_FrenchMed_TP2021/QUAERO_FrenchMed_BRAT/corpus/train/EMEA/"
train_MEDLINE_folder_location = "./QUAERO_FrenchMed_TP2021/QUAERO_FrenchMed_BRAT/corpus/train/MEDLINE/"

dev_EMEA_folder_location = "./QUAERO_FrenchMed_TP2021/QUAERO_FrenchMed_BRAT/corpus/dev/EMEA/"
dev_MEDLINE_folder_location = "./QUAERO_FrenchMed_TP2021/QUAERO_FrenchMed_BRAT/corpus/dev/MEDLINE/"

test_EMEA_folder_location = "./QUAERO_FrenchMed_TP2021/QUAERO_FrenchMed_BRAT/corpus/test/EMEA/"
test_MEDLINE_folder_location = "./QUAERO_FrenchMed_TP2021/QUAERO_FrenchMed_BRAT/corpus/test/MEDLINE/"

# Collect, for every (format, split, subset) combination, the number of tokens,
# entities, unique entities and entity types plus the per-type entity counts.
# The loop variables are `fmt` and `split` so they neither shadow the builtin
# `format` nor rebind the `splits` list while it is being iterated.
columns = ["format", "type", "subset", "tokens", "entities", "unique_entities", "entity_types", "entity_types_count"]

# Folder suffix and reader function for each supported annotation format.
readers = {
    "conll": ("_conll", read_conll),
    "brat": ("_BRAT", read_brat),
}

records = []
for fmt in formats:
    folder_suffix, reader = readers[fmt]
    for split in splits:
        for subset in subsets:
            folder = folder_base + folder_suffix + "/corpus/" + split + "/" + subset + "/"
            ann_lines_split = reader(folder)
            if ann_lines_split is None:
                # The reader returned nothing to analyse (read_conll is still a
                # stub that returns None); skip rather than crash on iteration.
                continue
            tokens, entities, unique_entities, entity_types, entity_types_count = process_annotations_brat(ann_lines_split)
            records.append({
                "format": fmt,
                # "type" records the split (train/dev/test) so rows can be told apart.
                "type": split,
                "subset": subset,
                "tokens": tokens,
                "entities": entities,
                "unique_entities": unique_entities,
                "entity_types": entity_types,
                "entity_types_count": entity_types_count,
            })

# Build the frame once from the collected rows: DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row copies it on every iteration.
df = pd.DataFrame(records, columns=columns)

In [43]:
# Statistics for the BRAT annotations found under train_EMEA_folder_location.
train_EMEA_ann_lines_split = read_brat(train_EMEA_folder_location)

(
    number_of_tokens,
    number_of_entities,
    number_of_unique_entities,
    number_of_entity_types,
    count_entity_types,
) = process_annotations_brat(train_EMEA_ann_lines_split)


In [44]:
# One value per line: tokens, entities, unique entities, entity types.
print(
    number_of_tokens,
    number_of_entities,
    number_of_unique_entities,
    number_of_entity_types,
    sep="\n",
)

3579
2695
923
10
