# Build test datasets

Builds a three-column dataframe:
 - term: term from either the DEFT dataset or the MeSH terminology
 - label: MeSH code C**
 - term source: "DEFT-train", "DEFT-val", "Mesh-FR", "Mesh-EN"

In [2]:
from os.path import isfile, isdir, join
from os import listdir
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import collections
import math

# Directory scanned for the BRAT annotation (.ann) files.
IN_BRAT_DIR = '/export/home/cse200093/brat_data/CRH_VAL_PHENO_nlstruct'

# Extracted resources: all terms before classification, with sex information.
OUT_DF = './resources/15_02_2022_CRH_VAL_PHENO_pred.csv'
# Sex / gender rows only, saved in a separate table.
OUT_Gender = './resources/15_02_2022_CRH_VAL_PHENO_Gender.csv'

# Fail early with an explicit message when the input directory is missing.
# An explicit raise is used instead of ``assert`` so the check is not
# stripped when Python runs with -O.
if not isdir(IN_BRAT_DIR):
    raise NotADirectoryError(f'BRAT input directory not found: {IN_BRAT_DIR}')


In [3]:
def find_offsets_relaxed_match(label_infos, left_offset, right_offset):
    """Return the entries of ``label_infos`` whose span overlaps a given span.

    Every key of ``label_infos`` is an offset string made of space-separated
    fields. Only the first and the last field are read; they are converted
    to ``int`` and used as the start and end of the stored span.

    Args:
        label_infos: Mapping from offset string to the value stored for it.
        left_offset: Start of the span to compare against.
        right_offset: End of the span to compare against.

    Returns:
        A list of ``(offset_key, value)`` tuples, in the iteration order of
        ``label_infos``, for every stored span that has at least one
        boundary inside the queried span, or that contains one of the
        queried boundaries.
    """

    def _touches(offset_key):
        # Only the outermost boundaries of the stored span are considered.
        fields = offset_key.split(' ')
        start, end = int(fields[0]), int(fields[-1])
        return (
            start <= left_offset <= end
            or start <= right_offset <= end
            or left_offset <= start <= right_offset
            or left_offset <= end <= right_offset
        )

    return [(key, value) for key, value in label_infos.items() if _touches(key)]

In [4]:
# One BRAT entity line: "<T-id>\t<label> <offsets>\t<covered text>".
# Raw string so that ``\d`` is not an invalid string escape; ``\t`` is then
# interpreted by the regex engine and still matches a literal tab.
ENTITY_REGEX = re.compile(r'^(T\d+)\t([^ ]+) ([^\t]+)\t(.*)$')

tags = []  # only referenced by the disabled second pass further down
data = []  # one [term, source] row per retained annotation

labels = ["sosydiso"]
gender = ['F', 'M']

# Column names matching the rows appended to ``data``. Defined once, before
# the loop, so the name exists even when no .ann file is found.
columns = ['term', 'source']

# listdir() returns entries in arbitrary order; sorting makes the row order
# of ``data`` reproducible from one run to the next.
ann_files = sorted(
    f for f in listdir(IN_BRAT_DIR)
    if f.endswith('.ann') and isfile(join(IN_BRAT_DIR, f))
)


for ann_file in ann_files:
    source = ann_file
    ann_path = join(IN_BRAT_DIR, ann_file)
    # Read ann file
    with open(ann_path, 'r', encoding='utf-8') as f_in:
        lines = f_in.readlines()

    # First pass -> collect the annotations whose label is in ``labels``,
    # grouped by their offset string, plus the gender labels.
    label_infos = {}
    other_infos = {}  # not used by the active code
    for line in lines:
        entity_match = ENTITY_REGEX.match(line.strip())
        if entity_match is None:
            # Not an entity line.
            continue
        ann_id, label, offsets, span = entity_match.groups()
        if label in labels:
            label_infos.setdefault(offsets, []).append((ann_id, label, offsets, span))
            data.append([span, source])
        if label in gender:
            # For gender annotations the label itself ('F' / 'M') is stored
            # in the term column, not the covered text.
            data.append([label, source])

# NOTE: the triple-quoted string below is not a docstring. It holds a disabled
# second pass (relaxed offset matching) as a no-op string expression; none of
# it is executed.
"""
        # Find other mentions with the same offsets
        # restrict to pathology and sosy
        for line in lines:
            entity_match = ENTITY_REGEX.match(line.strip())
            if entity_match is not None:
                ann_id = entity_match.group(1)
                label = entity_match.group(2)
                offsets = entity_match.group(3)
                offset_list = offsets.split(' ')
                leftmost_offset = int(offset_list[0])
                rightmost_offset = int(offset_list[-1])
                span = entity_match.group(4)
                if label not in labels:
                    if label in ('pathologie', 'sosy','substance','examen'):
                        res = find_offsets_relaxed_match(label_infos, leftmost_offset, rightmost_offset)
                        if len(res):
                            for r in res:
                                new_offsets, mesh_tags = r
                                for i in mesh_tags:
                                    assert i[1] in labels
                                    data.append([span, i[1], source])
                                #tags.append([ann_file, ann_id, new_offsets, span, label, mesh_tags])
                        #else:                        
                        #    tags.append([ann_file, ann_id, offsets, span, label, None])

    #    for offsets in label_infos.keys():
    #        offset_df = [t for t in tags if t[2] == offsets]
    #        if not len(offset_df):
    #            offset_list = offsets.split(' ')
    #            leftmost_offset = int(offset_list[0])
    #            rightmost_offset = int(offset_list[-1])
    #            span = text[leftmost_offset:rightmost_offset]
    #            tags.append([ann_file, ann_id, offsets, span, 'null', ', '.join([i[1] for i in label_infos[offsets]])])
    #     
#tag_df = pd.DataFrame(data=tags, columns=['ann_file', 'ann_id', 'offsets', 'span', 'label', 'tag'])
"""
# Number of [term, source] rows collected over all .ann files.
len(data)

18818

In [5]:
#data

In [17]:
# Assemble every collected row into a dataframe, one column per entry of
# ``columns``. De-duplication is left disabled here, so repeated rows are kept.
dataset_df = pd.DataFrame(data, columns=columns)
# Show 30 random rows to eyeball the content.
dataset_df.sample(n=30)

Unnamed: 0,term,source
15991,anévrysme de l’artère fémorale commune,CRH_val_sample_239.ann
9021,Dyskinésies,CRH_val_sample_380.ann
11998,limitation de l'ouverture buccale,CRH_val_sample_179.ann
13854,CAPS,CRH_val_sample_382.ann
7104,RCF pathologique,CRH_val_sample_364.ann
7530,F,NEW_CRH_val_sample_9.ann
16449,glomérulonéphrite de classe I,NEW_CRH_val_sample_61.ann
13432,Syndrome de Raynaud,CRH_val_sample_193.ann
8199,Vessie faiblement remplie,NEW_CRH_val_sample_35.ann
3207,densité liquidienne et non hématique,CRH_val_sample_357.ann


In [48]:
# Rows whose term is exactly the lowercase string 'taka'.
dataset_df.loc[dataset_df['term'].eq('taka')]

Unnamed: 0,term,source


In [43]:
# Lupus mentions in the validation dataset, one row per distinct
# (term, source) pair. Matching is exact and case-sensitive, hence the
# explicit list of surface forms.
# NOTE(review): a source file that contains several of these forms yields
# several rows; if a per-file count is wanted, use
# df_lupus['source'].nunique() -- confirm the intended counting unit.
lupus_terms = ['lupus', 'Lupus', 'lupique', 'LES']
df_lupus = dataset_df[dataset_df['term'].isin(lupus_terms)]
df_lupus = df_lupus.drop_duplicates().reset_index(drop=True)
len(df_lupus)

100

In [44]:
# 'SAPL' mentions in the validation dataset, one row per distinct
# (term, source) pair (exact, case-sensitive match on the term).
df_sapl = dataset_df[dataset_df['term'] == 'SAPL']
df_sapl = df_sapl.drop_duplicates().reset_index(drop=True)
len(df_sapl)

51

In [45]:
# Scleroderma mentions in the validation dataset, one row per distinct
# (term, source) pair. Matching is exact and case-sensitive, hence the
# explicit list of surface forms.
scleroderma_terms = [
    'sclérodermie',
    'Sclérodermie',
    'sclérodermie systémique',
    'Sclérodermie systémique',
    'sclérodermie diffuse',
]
df_scl = dataset_df[dataset_df['term'].isin(scleroderma_terms)]
df_scl = df_scl.drop_duplicates().reset_index(drop=True)
len(df_scl)

88

In [46]:
# Takayasu mentions in the validation dataset, one row per distinct
# (term, source) pair (exact, case-sensitive match on the term).
takayasu_terms = ['Takayasu', 'maladie de Takayasu']
df_taka = dataset_df[dataset_df['term'].isin(takayasu_terms)]
df_taka = df_taka.drop_duplicates().reset_index(drop=True)
len(df_taka)

18

In [47]:
# Total number of rows over the four disease tables.
sum(len(frame) for frame in (df_lupus, df_sapl, df_scl, df_taka))

257

In [15]:
# Keep the gender rows ('F' / 'M') separately in another table.
# Selection is done on the term text only.
is_gender = dataset_df['term'].isin(['F', 'M'])
df_gender = dataset_df[is_gender].drop_duplicates().reset_index(drop=True)
# Show 5 random rows of the gender table.
df_gender.sample(5)

Unnamed: 0,term,source
218,F,CRH_val_sample_9.ann
228,F,CRH_val_sample_70.ann
82,F,CRH_val_sample_199.ann
15,F,CRH_val_sample_3.ann
47,M,NEW_CRH_val_sample_34.ann


In [16]:
# Write the full term table and the separate gender table to their CSV
# files, using the pandas defaults for both.
for frame, out_path in ((dataset_df, OUT_DF), (df_gender, OUT_Gender)):
    frame.to_csv(out_path)