### Data Preparation

* This notebook helps to read "sample.ann" files

-------

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from time import time

In [3]:
% matplotlib inline
#import scikitplot
#import scikitplot.plotters as skplt

### Read node data from Files

In [4]:
def sum_items(items):
    st =""
    for item in items:
        st = st+" "+item
    return st

In [5]:
def df_generator(input_file):
    
    C = []
    
    with open(input_file) as f:
        for line in f:
            C.append(line.split())

    L = []
    for item in C:
        if item[0][0] =="T":
            t_key = item[0]
            t_type = item[1]
            all_items = item[4:]
            L.append([t_key,t_type, sum_items(all_items)])
            
            
    LL = pd.DataFrame(L, columns = ["t_key", "t_type", "t_values"], index = None)
    return LL

---------

### Get  node data

In [6]:
import glob
ann_files =  glob.glob("data/*.ann")
len(ann_files)

200

In [7]:
all_df = pd.DataFrame(columns = ["t_key","t_type","t_values"], index = None)
for file in ann_files:
    df = df_generator(file) 
    all_df = pd.concat([all_df,df])   

In [8]:
alldf = all_df
lbn = LabelEncoder()
lbn.fit(alldf["t_type"])
lbn.transform(alldf["t_type"])
alldf['t_label'] = lbn.transform(alldf["t_type"])

In [9]:
alldf.head(10)

Unnamed: 0,t_key,t_type,t_values,t_label
0,T1,Age,28-year-old,2
1,T2,History,previously healthy,20
2,T3,Gender,man,18
3,T4,Activity,presented,0
4,T5,Sign_symptom,healthy,31
5,T6,Duration,6-week,14
6,T7,Sign_symptom,palpitations,31
7,T8,Coreference,symptoms,7
8,T9,Activity,rest,0
9,T10,Frequency,2–3 times per week,17


* Unique Terms

In [10]:
uterms = set(alldf["t_type"])

In [11]:
data_dict = {}
for tm in uterms:
    key = tm
    value = list(alldf[alldf["t_type"]== tm]["t_values"])
    data_dict.update({key:value})

In [12]:
data_dict.keys()

dict_keys(['Dosage', 'Shape', 'Activity', 'Qualitative_concept', 'Other_event', 'Date', 'Quantitative_concept', 'Biological_attribute', 'Severity', 'Weight', 'Family_history', 'Frequency', 'Other_entity', 'Age', 'Subject', 'Volume', 'Administration', 'Nonbiological_location', 'Medication', 'Lab_value', 'Height', 'Ethnicity', 'Area', 'Therapeutic_procedure', 'Sign_symptom', 'History', 'Coreference', 'Diagnostic_procedure', 'Duration', 'Distance', 'Texture', 'Biological_structure', 'Color', 'Disease_disorder', 'Time', 'Gender', 'Detailed_description', 'Occupation'])

---------

In [13]:
alldf.to_csv("alldf.csv")

In [14]:
import json
with open('data_dict.json', 'w') as fp:
    json.dump(data_dict, fp)


---------

### Node and Edge data dictionary

In [15]:
input_file = ann_files[0]
alldict = {}


C = []
with open(input_file) as f:
        for line in f:
            C.append(line.split())
            

t_dict = {}           
for item in C:
        if item[0][0] =="T":
            t_key = item[0]
            t_type = item[1]
            t_value = sum_items(item[4:])
            t_dict.update({t_key:{t_type:t_value}})
            alldict.update({t_key:{t_type:t_value}})         
            
            
e_dict = {}
for item in C:
    if item[0][0] =="E":
        #print(item)
        t_key = item[1].split(':')[1]
        e_key = item[0]
        e_type = item[1].split(':')[0]
        e_val = t_dict[t_key][e_type]
        e_dict.update({e_key:{e_type: t_key+ ":" + e_val}})
        alldict.update({e_key:{e_type:e_val}})   
        
        
r_dict = {}
for item in C:
    if item[0][0] =="R":
        #print(item)
        r_key = item[0]
        r_relation = item[1]
        r_from = item[2]
        r_to = item[3]
        r_dict.update({r_key:{r_relation,r_from,r_to}})

In [18]:
list(r_dict.values()) 

[{'Arg1:E34', 'Arg2:E35', 'BEFORE'},
 {'Arg1:T49', 'Arg2:E25', 'MODIFY'},
 {'Arg1:T61', 'Arg2:E32', 'MODIFY'},
 {'Arg1:E8', 'Arg2:E7', 'BEFORE'},
 {'Arg1:T40', 'Arg2:E20', 'MODIFY'},
 {'Arg1:E18', 'Arg2:E17', 'MODIFY'},
 {'Arg1:T14', 'Arg2:E7', 'MODIFY'},
 {'Arg1:T17', 'Arg2:E7', 'MODIFY'},
 {'Arg1:T35', 'Arg2:E19', 'MODIFY'},
 {'Arg1:T16', 'Arg2:E7', 'MODIFY'},
 {'Arg1:E29', 'Arg2:E28', 'SUB_PROCEDURE'},
 {'Arg1:T42', 'Arg2:E21', 'MODIFY'},
 {'Arg1:E25', 'Arg2:E26', 'BEFORE'},
 {'Arg1:T45', 'Arg2:E22', 'MODIFY'},
 {'Arg1:T65', 'Arg2:E34', 'MODIFY'},
 {'Arg1:E27', 'Arg2:E28', 'BEFORE'},
 {'Arg1:E18', 'Arg2:T31', 'IDENTICAL'},
 {'AFTER', 'Arg1:E1', 'Arg2:E3'},
 {'Arg1:T11', 'Arg2:E5', 'MODIFY'},
 {'Arg1:T44', 'Arg2:E22', 'MODIFY'},
 {'Arg1:E28', 'Arg2:E30', 'BEFORE'},
 {'Arg1:T27', 'Arg2:E14', 'MODIFY'},
 {'Arg1:E5', 'Arg2:E4', 'IDENTICAL'},
 {'Arg1:T10', 'Arg2:E5', 'MODIFY'},
 {'Arg1:E6', 'Arg2:E5', 'MODIFY'},
 {'Arg1:T24', 'Arg2:E12', 'MODIFY'},
 {'Arg1:T54', 'Arg2:E28', 'MODIFY'},
 {

In [40]:
e_dict

{'E1': {'Activity': 'T4: presented'},
 'E10': {'Diagnostic_procedure': 'T21: electrocardiogram'},
 'E11': {'Diagnostic_procedure': 'T22: ECG'},
 'E12': {'Diagnostic_procedure': 'T23: sinus rhythm'},
 'E13': {'Sign_symptom': 'T25: Wolff– Parkinson– White pre-excitation pattern'},
 'E14': {'Disease_disorder': 'T26: accessory pathway'},
 'E15': {'Diagnostic_procedure': 'T28: echocardiography'},
 'E16': {'Disease_disorder': "T30: Ebstein's anomaly"},
 'E17': {'Sign_symptom': 'T32: apical displacement'},
 'E18': {'Coreference': 'T33: valve'},
 'E19': {'Disease_disorder': 'T34: atrialized'},
 'E2': {'Sign_symptom': 'T5: healthy'},
 'E20': {'Sign_symptom': 'T39: elongated'},
 'E21': {'Sign_symptom': 'T41: rudimentary'},
 'E22': {'Diagnostic_procedure': 'T43: echocardiography'},
 'E23': {'Disease_disorder': 'T46: patent foramen ovale'},
 'E24': {'Sign_symptom': 'T47: right-to-left shunting'},
 'E25': {'Sign_symptom': 'T48: bubbles'},
 'E26': {'Diagnostic_procedure': 'T50: electrophysiologic st

In [41]:
import json
with open('e_dict.json', 'w') as fp:
    json.dump(e_dict, fp)


In [42]:
t_dict

{'T1': {'Age': ' 28-year-old'},
 'T10': {'Frequency': ' 2–3 times per week'},
 'T11': {'Detailed_description': ' up to 30 minutes at a time'},
 'T12': {'Sign_symptom': ' dyspnea'},
 'T13': {'Sign_symptom': ' regurgitation murmur'},
 'T14': {'Biological_structure': ' tricuspid'},
 'T15': {'Detailed_description': ' holosystolic'},
 'T16': {'Qualitative_concept': ' grade 2/6'},
 'T17': {'Biological_structure': ' left sternal border'},
 'T18': {'Detailed_description': ' inspiratory accentuation'},
 'T19': {'Diagnostic_procedure': ' physical examination'},
 'T2': {'History': ' previously healthy'},
 'T20': {'Qualitative_concept': ' unremarkable'},
 'T21': {'Diagnostic_procedure': ' electrocardiogram'},
 'T22': {'Diagnostic_procedure': ' ECG'},
 'T23': {'Diagnostic_procedure': ' sinus rhythm'},
 'T24': {'Qualitative_concept': ' normal'},
 'T25': {'Sign_symptom': ' Wolff– Parkinson– White pre-excitation pattern'},
 'T26': {'Disease_disorder': ' accessory pathway'},
 'T27': {'Detailed_descript

------------

In [45]:
alldict

{'E1': {'Activity': ' presented'},
 'E10': {'Diagnostic_procedure': ' electrocardiogram'},
 'E11': {'Diagnostic_procedure': ' ECG'},
 'E12': {'Diagnostic_procedure': ' sinus rhythm'},
 'E13': {'Sign_symptom': ' Wolff– Parkinson– White pre-excitation pattern'},
 'E14': {'Disease_disorder': ' accessory pathway'},
 'E15': {'Diagnostic_procedure': ' echocardiography'},
 'E16': {'Disease_disorder': " Ebstein's anomaly"},
 'E17': {'Sign_symptom': ' apical displacement'},
 'E18': {'Coreference': ' valve'},
 'E19': {'Disease_disorder': ' atrialized'},
 'E2': {'Sign_symptom': ' healthy'},
 'E20': {'Sign_symptom': ' elongated'},
 'E21': {'Sign_symptom': ' rudimentary'},
 'E22': {'Diagnostic_procedure': ' echocardiography'},
 'E23': {'Disease_disorder': ' patent foramen ovale'},
 'E24': {'Sign_symptom': ' right-to-left shunting'},
 'E25': {'Sign_symptom': ' bubbles'},
 'E26': {'Diagnostic_procedure': ' electrophysiologic study'},
 'E27': {'Diagnostic_procedure': ' mapping'},
 'E28': {'Therapeutic

In [46]:
import json
with open('alldict.json', 'w') as fp:
    json.dump(alldict, fp)