In [1]:
# libraries to read BRAT and conll files
import glob
import pandas as pd

In [2]:
def read_brat(folder_location):
    ann_files = glob.glob(folder_location + "*.ann")

    #read ann files and ignore lines starting with #
    ann_lines = []
    for ann_file in ann_files:
        with open(ann_file, 'r') as f:
            for line in f:
                if not line.startswith("#"):
                    ann_lines.append(line)

    #split lines into things by tab, we want the second column as the type, third column as the entity
    ann_lines_split = []
    for line in ann_lines:
        splits = line.split("\t")
        ann_lines_split.append([splits[1].split(" ")[0], splits[2].split("\n")[0]])
    return ann_lines_split

def read_conll(file_location):
    conll_files = glob.glob(file_location + "*.conll")
    if len(conll_files) > 1:
        raise Exception("More than one conll file found in folder")
    elif len(conll_files) == 0:
        raise Exception("No conll file found in folder")
    else:
        file_location = conll_files[0]
    lines = []
    with open(file_location, 'r') as f:
        for line in f:
            lines.append(line)

    # each line represents a token, where the first tab is the type and the fourth is the token
    lines_split = []
    last_type = None
    for line in lines:
        if line == "\n":
            last_type = None
            continue
        splits = line.split("\t")
        entity_type, token = splits[0], splits[3].split("\n")[0]
        if entity_type != "O":
            if (last_type != entity_type):
                lines_split.append([entity_type, token.split("\n")[0]])
            else:
                lines_split[-1][1] += " " + token

        last_type = entity_type
    return lines_split

In [12]:
def process_annotations_brat(ann_lines_split):
    #process the annotations into a dictionary of lists
    #we want to get a dictionary count of tokens (entites split by space), a dictionary count of entities, and a dictionary count of entity types
    count_entities = {}
    count_tokens = {}
    count_entity_types = {}

    for line in ann_lines_split:
        entity = line[1]
        entity_type = line[0]
        tokens = entity.split(" ")
        if entity in count_entities:
            count_entities[entity] += 1
        else:
            count_entities[entity] = 1
        for token in tokens:
            if token in count_tokens:
                count_tokens[token] += 1
            else:
                count_tokens[token] = 1
        if entity_type in count_entity_types:
            count_entity_types[entity_type] += 1
        else:
            count_entity_types[entity_type] = 1

    # return total of tokens, total of entitities, total of unique entities, how many types we have and a dictionary of tokens per type count
    return sum(count_tokens.values()), sum(count_entities.values()), len(count_entities), len(count_entity_types), count_entity_types, count_entities, count_tokens, count_entity_types

In [10]:
# read BRAT corpus EMEA trainset
train_EMEA_folder_location = "./QUAERO_FrenchMed_TP2021/QUAERO_FrenchMed_BRAT/corpus/train/EMEA/"
train_MEDLINE_folder_location = "./QUAERO_FrenchMed_TP2021/QUAERO_FrenchMed_BRAT/corpus/train/MEDLINE/"
dev_EMEA_folder_location = "./QUAERO_FrenchMed_TP2021/QUAERO_FrenchMed_BRAT/corpus/dev/EMEA/"
dev_MEDLINE_folder_location = "./QUAERO_FrenchMed_TP2021/QUAERO_FrenchMed_BRAT/corpus/dev/MEDLINE/"
test_EMEA_folder_location = "./QUAERO_FrenchMed_TP2021/QUAERO_FrenchMed_BRAT/corpus/test/EMEA/"
test_MEDLINE_folder_location = "./QUAERO_FrenchMed_TP2021/QUAERO_FrenchMed_BRAT/corpus/test/MEDLINE/"

folder_base = "./QUAERO_FrenchMed_TP2021/QUAERO_FrenchMed"

formats = ["conll", "brat"]
splits = ["train", "dev", "test"]
subsets = ["EMEA", "MEDLINE"]

# create pandas dataframe to store results of number of tokens, number of entities, number of unique entities, number of entity types, and dictionary of entity types
df = pd.DataFrame(columns=["format", "split", "subset", "tokens", "entities", "unique_entities", "entity_types"])
df_entity_types = pd.DataFrame(columns=["format", "split", "subset", "entity_type", "entity_types_count"])
# add results to df
for format in formats:
    for split in splits:
        for subset in subsets:
            if format == "conll":
                folder = folder_base + "_conll/corpus/" + split + "/" + subset + "/"
                ann_lines_split = read_conll(folder)
            elif format == "brat":
                folder = folder_base + "_BRAT/corpus/" + split + "/" + subset + "/"
                ann_lines_split = read_brat(folder)
            tokens, entities, unique_entities, entity_types, entity_types_count, _, _, _ = process_annotations_brat(ann_lines_split)
            output = pd.DataFrame({"format": format, "split": split, "subset": subset, "tokens": tokens, "entities": entities, "unique_entities": unique_entities, "entity_types": entity_types}, index=[0])
            df = pd.concat([df, output], ignore_index=True)
            output_per_type = pd.DataFrame([{"format": format, "split": split, "subset": subset, "entity_type": entity_type, "entity_types_count": entity_type_count} for entity_type, entity_type_count in entity_types_count.items()])
            df_entity_types = pd.concat([df_entity_types, output_per_type], ignore_index=True)

In [13]:
folder = "./QUAERO_FrenchMed_TP2021/QUAERO_FrenchMed_conll/corpus/train/EMEA/"
ann_lines_split = read_conll(folder)
tokens, entities, unique_entities, entity_types, entity_types_count, count_entities, count_tokens, count_entity_types = process_annotations_brat(ann_lines_split)

In [16]:
tokens, entities, unique_entities, entity_types, entity_types_count#, count_entities, count_tokens, count_entity_types

(3198,
 2841,
 898,
 20,
 {'CHEM-B': 676,
  'PROC-B': 390,
  'CHEM-I': 114,
  'DISO-B': 618,
  'DISO-I': 246,
  'LIVB-B': 253,
  'PROC-I': 61,
  'ANAT-B': 124,
  'ANAT-I': 28,
  'DEVI-B': 48,
  'OBJC-B': 70,
  'LIVB-I': 26,
  'PHEN-B': 19,
  'PHYS-B': 107,
  'PHYS-I': 29,
  'PHEN-I': 2,
  'GEOG-B': 21,
  'GEOG-I': 7,
  'OBJC-I': 1,
  'DEVI-I': 1})

In [14]:
count_entity_types

{'CHEM-B': 676,
 'PROC-B': 390,
 'CHEM-I': 114,
 'DISO-B': 618,
 'DISO-I': 246,
 'LIVB-B': 253,
 'PROC-I': 61,
 'ANAT-B': 124,
 'ANAT-I': 28,
 'DEVI-B': 48,
 'OBJC-B': 70,
 'LIVB-I': 26,
 'PHEN-B': 19,
 'PHYS-B': 107,
 'PHYS-I': 29,
 'PHEN-I': 2,
 'GEOG-B': 21,
 'GEOG-I': 7,
 'OBJC-I': 1,
 'DEVI-I': 1}

In [5]:
# # write results to csv
# df.to_csv("results.csv")
# df_entity_types.to_csv("results_entity_types.csv")