## functions process umass18 data to svm training data and testing data

In [290]:
#go to config file
import re, os, sys, itertools, copy
import pickle as pkl

bioc_dir = "annotations/"
map_file_dir = "corpus_sent/"
sent_file_dir = "corpus_sent/"
rel_file_dir = "corpus_rel/"
train_file_dir = "corpus/"
bioc_suffix = ".bioc.xml"
map_suffix = ".wmap.txt"
sent_suffix = ".sent.txt"
rel_suffix = ".rel.txt"


def extract_sent_map(map_file):
    sent_boundry_map = dict()
    sents = []
    with open(map_file, "r") as f:
        for idx, sec in enumerate(f.read().strip().split("\n\n")):
            sent = []        
            ss = sec.split("\n")
            for i, line in enumerate(ss):
                info = line.split("\t")
                info[3] = int(info[3])
                info[4] = int(info[4])
                #record boundary for each sentence
                if len(ss) == 1:
                    sent_boundry_map[idx] = [info[3], info[4]]
                elif i == 0:
                    sent_boundry_map[idx] = [info[3]]
                elif i == (len(sec.split("\n"))-1):
                    sent_boundry_map[idx].append(info[4])
                #idx of sentence
                info.append(idx)
                #append sentences together in one list
                sent.append(info)
            sents.append(sent)
    return sents, sent_boundry_map


def process_bioc(bioc_file):
    with open(bioc_file, "r") as f:
        txt = f.read().strip()
        txt = re.sub('\n',' ',txt)
        txt = re.sub('[ ]+',' ',txt)
    return txt


def extract_entities_from_bioc(txt):
    #<annotation id="1"> <infon key="type">Drug</infon> <location length="11" offset="859"/> <text>doxorubicin</text> </annotation>
    annos=re.findall('<annotation id="(\d+)"> <infon key="type">(\w+)</infon> <location length="(\d+)" offset="(\d+)"/> <text>(.*?)</text> </annotation>',txt)
    # process annos
    if len(annos) == 0:
        return [],{}
    annos=sorted(annos, key=lambda x: int(x[3]))
    # print(annos)
    id2idx = dict()
    nannos = []
    for i, each in enumerate(annos):
        fs = int(each[3])
        fe = fs + int(each[2])
        id2idx[each[0]] = i
        nannos.append([each[0], each[1], fs, fe, each[-1].replace("\\n", "")])
    return nannos, id2idx


def extract_relation_from_bioc(txt):
    trels = []
    rels = re.findall('<relation id="(\d+)"> <infon key="type">(\w+|\w+/\w+)</infon> <node refid="(\d+)" role="(annotation \d+)"/> <node refid="(\d+)" role="(annotation \d+)"/> </relation>', txt)  
    for each in rels:
        trels.append((each[0], each[1], each[2], each[4]))
    return trels


def label_tokens_in_sent(anno, sents):
    nsents = []
    for sent in sents:
        se = len(sent) - 1
        for i, e in enumerate(sent):
            if i == 0:
                e.insert(-1, 's')
            elif i == se:
                e.insert(-1, 'e')
            else:
                e.insert(-1, 'm')
            nsents.append(e)
    
    tid2sidx = dict() #token id to token start and end idx in the sentence (easy to get unigram feature)

    idx = 0
    tlen = len(nsents)
    for en in anno:
        ensb = en[2] #start boundary
        eneb = en[3] #end boundary
        enid = en[0] #entity id
        while idx < tlen and nsents[idx][4] <= ensb:
            idx += 1
        if nsents[idx][3] != ensb:
            print(en, 'start boundary not match - check corpus')
        tid2sidx[enid] = [idx]
        if nsents[idx][4] < eneb:
            idx += 1
            while idx < tlen and nsents[idx][4] < eneb and nsents[idx][4] >= 0:
                idx += 1
            tid2sidx[enid].append(idx)
            idx += 1
        elif nsents[idx][4] == eneb:
            #signle word
            tid2sidx[enid].append(idx)
            idx += 1
        else:
            #overlap case (only handle two overlap if three overlap, error will be created, toekn will be lost)
            if eneb < nsents[idx][3]:
                idx = 0
                while idx < tlen and nsents[idx][4] <= ensb:
                    idx += 1
                if nsents[idx][3] != ensb:
                    print(en, 'start boundary not match - check corpus') #change to log later for debug perposes
                tid2sidx[enid] = [idx]
#                 print(1, idx)
                if nsents[idx][4] < eneb:
                    idx += 1
                    while idx < tlen and nsents[idx][4] < eneb and nsents[idx][4] >= 0:
                        idx += 1
                    tid2sidx[enid].append(idx)
#                     print(2, idx)
                    idx += 1
#                 elif nsents[idx][4] == eneb:
                else:
                    #signle word
#                     print(3, idx)
                    tid2sidx[enid].append(idx)
                    idx += 1
            else:
            #not overlap case (single word)
                tid2sidx[enid].append(idx)
                idx += 1
    if len(tid2sidx) != len(anno):
        print('lose entities due to label process')
    return nsents, tid2sidx


def extract_features_of_relations(rels, annos, nsents, aid2idx, tid2idx):
    label_cat = set()
    rel_fea_in_sent = []
    rel_pair = set()
    discard_rel = []
    
    for each in rels:
        rtype = each[1]
        ann1_id = each[2]
        ann2_id = each[3]
        if ann1_id == ann2_id:
            discard_rel.append(each)
            print(each[0], " has two entities with same id ", ann1_id)
            continue
        
        #the label category and relation pairs list will have extra terms due to some cross_sentence relations have been discard
        label_cat.add(rtype)
        rel_pair.add((ann1_id, ann2_id))
        
        #text of entities in the relation
        tk1 = annos[aid2idx[ann1_id]]
        tk2 = annos[aid2idx[ann2_id]]
         
        #using the sentence idx of the first token in an entity to represent the entity sentence idx
        pos1 = tid2idx[ann1_id]
        pos2 = tid2idx[ann2_id]
        tk1_sent_idx = nsents[pos1[0]][-1]
        tk2_sent_idx = nsents[pos2[0]][-1]
        
        #check if entity 1 and entity 2 are in same sentence
        #if two entities are in the same sentence, the sent idx difference should 0
        if tk1_sent_idx - tk2_sent_idx == 0:
            label = rtype
            #entity text
            fea1 = "" + tk1[-1].upper()           
            fea2 = "" + tk2[-1].upper()         
            #entity type
            fea3 = "" + tk1[1]            
            fea4 = "" + tk2[1]           
            #entities distance measured by diference of the start position before mapping
            fea5 = tk1[2] - tk2[2]   
            #entities distance measered by position in sentence as seperated token numbers 
            fea6 = pos1[0] - pos2[0]
            #ngram features: window size = 3
            fea_ngram_en1 = extract_ngram_feature(nsents, pos1)
            fea_ngram_en2 = extract_ngram_feature(nsents, pos2)
            feas = [label, fea1, fea2, fea3, fea4, fea5, fea6] + fea_ngram_en1 + fea_ngram_en2
#             rel_fea_in_sent.append([fea for fea in feas if fea != ""])   
            rel_fea_in_sent.append([fea for fea in feas])     
    
    return rel_fea_in_sent, rel_pair, label_cat, discard_rel
   
    
def extract_ngram_feature(nsents, pos):
#     case = nsents[pos[0]][-2] + nsents[pos[1]][-2]
#     if case == "sm":
#         return get_post_feature(nsents, pos)
#     elif case == "me":
#         return get_pre_feature(nsents, pos)
#     elif case == "mm":
#         return get_pre_feature(nsents, pos) + get_post_feature(nsents, pos)
#     else:
#         return []
    
    case = nsents[pos[0]][-2] + nsents[pos[1]][-2]
    if case == "sm":
        return ["Nan"]*3 + get_post_feature(nsents, pos)
    elif case == "me":
        return get_pre_feature(nsents, pos) + ["Nan"]*3
    elif case == "mm":
        return get_pre_feature(nsents, pos) + get_post_feature(nsents, pos)
    else:
        return ["Nan"]*6

    
def get_pre_feature(nsents, pos):
    features = []                                                             
    for i in range(1, 4):
        try:
            nens = [e[0] for e in nsents[pos[0]-i: pos[0]]]
            features.append(" ".join(nens))
        except Exception as  e:
            print("get_pre_feature exception ", e)
            break
    
    for _ in range(len(features), 3):
        features.append("Nan")
        
    return features


def get_post_feature(nsents, pos):
    features = []
    for i in range(1, 4):
        try:
            nens = [e[0] for e in nsents[pos[1]+1: pos[1]+i+1]]
            features.append(" ".join(nens))
        except Exception as  e:
            print("get_post_feature exception ", e)
            break
    
    for _ in range(len(features), 3):
        features.append("Nan")
    
    return features


def _group_annotaion_by_sent(anns, sent_boundry_map):
    idx = 0
    ann_in_sent = dict()
    for k, v in sent_boundry_map.items():
        ann_in_sent[k] = []
        while idx < len(anns) and anns[idx][2] >= v[0] and anns[idx][3] <= v[1]:
            ann_in_sent[k].append(idx)
            idx += 1
        if idx < len(anns) and anns[idx][3] > v[1] and anns[idx][2] < v[1]:
            print(anns[idx], " is in two sentences. Assign to the first sentence")
            ann_in_sent[k].append(idx) # assgin shifted entities to current sentence
            print("entity index is ", idx)
            idx += 1
#     print(ann_in_sent)
    return ann_in_sent


def generate_relations_based_on_entities_in_sentence(anns, sent_boundry_map, true_rels=[], fake=False):
    '''
        fake flag is set to True which means all the relations generated are for negative sample training
        to generate samples for prediction, set the fake flag to False (default)
    '''
    ann_in_sent = _group_annotaion_by_sent(anns, sent_boundry_map)
    
    #the generated relation must have its entities with types as defined below
    #all these are know pre-hand by analysis the all the relations
    entity_type_rule_set = {
        ('Drug', 'ADE'), ('SSLIF', 'ADE'), ('Drug', 'Dose'), 
        ('Drug', 'Duration'), ('Drug', 'Frequency'), ('Drug', 'Indication'), 
        ('SSLIF', 'Severity'), ('ADE', 'Severity'), ('Indication', 'Severity'), 
        ('Drug', 'Route')
    }
    
    #permutation entities in each sentence to create relation pairs
    rels = []
    for k, v in ann_in_sent.items():
        for e1, e2 in itertools.permutations(v, 2):
            #make some rules to eliminate some of the relation combinations
            rel_pair = (anns[e1][0], anns[e2][0])
            rel_type_pair = (anns[e1][1], anns[e2][1])
            
            #fake flag is used to control whether the generated relations are for negative training sample or prediction
            #TODO ugly code here but logic is clear
            if fake:
                if rel_pair not in true_rels and rel_type_pair in entity_type_rule_set:  
                    rels.append((-1, "NEGATIVE", rel_pair[0], rel_pair[1]))
            else:
                #TODO need to perserve original entities's id
                if rel_type_pair in entity_type_rule_set:
                    rels.append((-2, "PREDICT", rel_pair[0], rel_pair[1]))    

    return rels 
        

#save generated relations from the current file
#features are sperated by '\t'
#input data must be list
def output_relations(file_name, data):
    with open(file_name, "w") as  fw:
        for each in data:
            print("\t".join([str(e) for e in each]), file=fw, end="\n")

            
def generate_SVM_training_data(file_dir):
    total_rel_fea_in_sent = []
    label_cat_set = set()
    total_true_rel_num = 0
    total_neg_rel_num = 0
    
    for file_id in os.listdir(file_dir):
        bioc_file = bioc_dir + file_id + bioc_suffix
        sent_file = sent_file_dir + file_id + sent_suffix
        map_file = map_file_dir + file_id + map_suffix
        rel_file = rel_file_dir + file_id + rel_suffix
        
        print("********" + file_id + "********")
        #read sentence mapping file
        sents, sent_boundry_map = extract_sent_map(map_file_dir + file_id + map_suffix)
        #read bioc file
        txt = process_bioc(bioc_dir + file_id + bioc_suffix)
        #get all entities from bioc
        anns, aid2idx = extract_entities_from_bioc(txt)
        #get all relations from bioc
        nsents, tid2idx = label_tokens_in_sent(anns, sents)
        t_rels = extract_relation_from_bioc(txt)
        t_rel_fea_in_sent, t_rel_pair, t_label_cat, discard_rel_id = extract_features_of_relations(t_rels, anns, nsents, aid2idx, tid2idx)
        total_true_rel_num += len(t_rel_fea_in_sent)
        #f_rels is generated for negative training sample (all relations are not true label and relations overlapped with t_rels are removed)
        f_rels = generate_relations_based_on_entities_in_sentence(anns, sent_boundry_map, t_rel_pair, True)
        total_neg_rel_num += len(f_rels)
        #extract features for f_rels and p_rels
        f_rel_fea_in_sent, _, f_label, _ = extract_features_of_relations(f_rels, anns, nsents, aid2idx, tid2idx)
        #update label category set
        label_cat_set.update(t_label_cat)
        label_cat_set.update(f_label)
        #add true and negative relations to the total relations list
        t_rel_fea_in_sent.extend(f_rel_fea_in_sent)
        total_rel_fea_in_sent.extend(t_rel_fea_in_sent)
        #output current file relations 
        output_relations(rel_file ,t_rel_fea_in_sent)
        print()
    
    print("The number of true relations in corpus: ", total_true_rel_num)
    print("The number of negative relations in corpus ", total_neg_rel_num)
    
    return total_rel_fea_in_sent, label_cat_set


def feature2code(fea,fea_dic,fea_lt):
    if fea in fea_dic:
        return fea_dic[fea]
    else:
        fea_lt.append(fea)
        fea_dic[fea]=len(fea_lt)
        return fea_dic[fea]

    
def label2idx(labels):
    ls = copy.deepcopy(labels)
    l2idx = dict()
    #make sure the negtive samples are generated
    assert "NEGATIVE" in ls, "no negative sample is generated for training."
    #force label NAG
    l2idx['NEGATIVE'] = 0
    ls.remove('NEGATIVE')
    for i, l in enumerate(ls):
        l2idx[l] = i+1
    return l2idx


def dump_feature_dict(data, file_name):
    with open(file_name, "wb") as fw:
        pkl.dump(data, fw)

        
def load_feature_dict(file_name):
    with open(file_name, "rb") as fr:
        feature_dict = pkl.load(fr)
    return feature_dict


def output_SVM_training_data(file_name, data, labels):
    '''
        the file_name param should be just file name with directory but not suffix, e.g. data/umass18
        two files will be created one as svm input umass18.svm; another is the features table umass18_features.fea
    '''
    feature_dict = dict()
    feature_list = []
    
    svm_file = file_name + ".svm"
    fea_file = file_name + "_features_dict.pkl"
    idx2l_file = file_name + "_index2label.pkl"
    
    l2idx = label2idx(labels)
    idx2l = {v:k for k, v in l2idx.items()}
    
    with open(idx2l_file, "wb") as f:
        pkl.dump(idx2l, f)
    print("label dictionary is store in {}".format(idx2l_file))
    
    with open(svm_file, "w") as f1:
        for idx, info in enumerate(data):
            #debug purposes
#             if idx < 20:
#                 print(info)
            #['NAGTIVE', 'NEAR SYNCOPE', 'SEVERE', 'ADE', 'Severity', -76, -12, 'a', 'was a', 'Event was a', 'thought', 'thought due', 'thought due to', 'of', 'setting of', 'in setting of', ',', ', progressive', ', progressive pulmonary'], ['NAGTIVE', 'INADEQUATE RV FILLING PRESSURES', 'NEAR SYNCOPE', 'SSLIF', 'ADE', 30, 5, 'to', 'due to', 'thought due to', 'in', 'in setting', 'in setting of', 'a', 'was a', 'Event was a', 'thought', 'thought due', 'thought due to']
            #covert features to code
            #add label
            stt = l2idx[info[0]]
            #numeric features can be directly covert to svm features use default value
            code_dis = feature2code("e1e2dis", feature_dict, feature_list)
            dis = info.pop(5)
            stt = "{} {}:{}".format(stt, code_dis, dis)
            code_pos = feature2code("e1e2pos", feature_dict, feature_list)
            #the distance value has been removed from list, so the index for pos is changed from 6 to 5
            pos = info.pop(5)
            stt = "{} {}:{}".format(stt, code_pos, pos)
            #string features must be encoded with a distinct numeric id
            tset = set()
            for k, each in enumerate(info[1:]):
#                 print(each) #debug 
                code = feature2code(each, feature_dict, feature_list)
                tset.add(code)
            #libsvm requires input feature code in an ascending order
            tset = sorted(list(tset))
#             temp = -1
            for each in tset:
#                 assert temp != each, "duplicate feature " + each
#                 temp = each
                #categorical label associated value just need to set to 1 means true
                stt = "{} {}:{}".format(stt, each, 1) 
            print(stt, file=f1, end="\n")
    
    #perserve feature files for prediction feature look up
    dump_feature_dict(feature_dict, fea_file)
    print("The training data is in file " + svm_file)
    print("The features and associated code is in file " + fea_file)

## test functions on 7_760

In [158]:
#test
file_id = "7_670"
txt = process_bioc(bioc_dir + file_id + bioc_suffix)
sents, sent_boundry_map = extract_sent_map(map_file_dir + file_id + map_suffix)

anns, aid2idx = extract_entities_from_bioc(txt)
nsents, tid2idx = label_tokens_in_sent(anns, sents)

#t_rels is generated for true training sample (all the relations are true label)
t_rels = extract_relation_from_bioc(txt)
t_rel_fea_in_sent, t_rel_pair, t_label_cat, discard_rel_id = extract_features_of_relations(t_rels, anns, nsents, aid2idx, tid2idx)

#f_rels is generated for negative training sample (all relations are not true label and relations overlapped with t_rels are removed)
f_rels = generate_relations_based_on_entities_in_sentence(anns, sent_boundry_map, t_rel_pair, True)
#p_rels is generated for prediction
p_rels = generate_relations_based_on_entities_in_sentence(anns, sent_boundry_map)
#extract features for f_rels and p_rels
f_rel_fea_in_sent, _, f_label, _ = extract_features_of_relations(f_rels, anns, nsents, aid2idx, tid2idx)
p_rel_fea_in_sent, _, _, _ = extract_features_of_relations(p_rels, anns, nsents, aid2idx, tid2idx)

ann_in_sent = _group_annotaion_by_sent(anns, sent_boundry_map)

['69917', 'Severity', 2893, 2901, 'severely'] start boundary not match - check corpus
['70075', 'Drug', 7065, 7085, 'POLYETHYLENE GLYCOL'] start boundary not match - check corpus
['70207', 'Severity', 10747, 10753, 'mildly'] start boundary not match - check corpus
['70287', 'SSLIF', 5924, 5939, 'discharge (L)']  is in two sentences. Assign to the first sentence
entity index is  159
['70067', 'SSLIF', 6228, 6254, 'L Dorsalis pedis reduced']  is in two sentences. Assign to the first sentence
entity index is  167
['69972', 'SSLIF', 8549, 8568, 'warm IgG antibody']  is in two sentences. Assign to the first sentence
entity index is  267
['70287', 'SSLIF', 5924, 5939, 'discharge (L)']  is in two sentences. Assign to the first sentence
entity index is  159
['70067', 'SSLIF', 6228, 6254, 'L Dorsalis pedis reduced']  is in two sentences. Assign to the first sentence
entity index is  167
['69972', 'SSLIF', 8549, 8568, 'warm IgG antibody']  is in two sentences. Assign to the first sentence
entity

In [38]:
sent_boundry_map

{0: [0, 426],
 1: [428, 581],
 2: [582, 774],
 3: [780, 880],
 4: [886, 921],
 5: [927, 1152],
 6: [1153, 1264],
 7: [1265, 1407],
 8: [1408, 1486],
 9: [1487, 1527],
 10: [1529, 1561],
 11: [1562, 1695],
 12: [1696, 1942],
 13: [1943, 2023],
 14: [2025, 2064],
 15: [2065, 2132],
 16: [2135, 2248],
 17: [2249, 2294],
 18: [2297, 2411],
 19: [2414, 2444],
 20: [2445, 2522],
 21: [2523, 2583],
 22: [2589, 2671],
 23: [2672, 2748],
 24: [2749, 2820],
 25: [2821, 2961],
 26: [2962, 3080],
 27: [3081, 3183],
 28: [3185, 3241],
 29: [3250, 3511],
 30: [3514, 3590],
 31: [3593, 3748],
 32: [3751, 3968],
 33: [3969, 4032],
 34: [4033, 4044],
 35: [4045, 4063],
 36: [4064, 4104],
 37: [4105, 4161],
 38: [4162, 4308],
 39: [4309, 4346],
 40: [4346, 4378],
 41: [4381, 4557],
 42: [4558, 4621],
 43: [4628, 4638],
 44: [4639, 4679],
 45: [4681, 4851],
 46: [4857, 5416],
 47: [5422, 5476],
 48: [5477, 5556],
 49: [5562, 5648],
 50: [5651, 5713],
 51: [5716, 5933],
 52: [5936, 6012],
 53: [6015, 6021

In [11]:
anns[:10]

[['69881', 'SSLIF', 277, 285, 'Lymphoma'],
 ['69882', 'SSLIF', 303, 327, 'warm IgG auto antibodies'],
 ['69883', 'SSLIF', 333, 350, 'refractory anemia'],
 ['69884', 'Indication', 352, 354, 'PE'],
 ['69885', 'Drug', 375, 383, 'coumadin'],
 ['69936', 'SSLIF', 403, 425, 'compressions fractures'],
 ['69887', 'SSLIF', 477, 481, 'fall'],
 ['69888', 'SSLIF', 483, 519, 'left nondisplaced olecranon fracture'],
 ['69889', 'SSLIF', 550, 557, 'hypoxic'],
 ['69893', 'ADE', 594, 608, 'near syncope']]

In [10]:
id2idx

{'69881': 0,
 '69882': 1,
 '69883': 2,
 '69884': 3,
 '69885': 4,
 '69936': 5,
 '69887': 6,
 '69888': 7,
 '69889': 8,
 '69893': 9,
 '69890': 10,
 '69892': 11,
 '69891': 12,
 '69937': 13,
 '69938': 14,
 '69939': 15,
 '69894': 16,
 '70279': 17,
 '70280': 18,
 '69896': 19,
 '69895': 20,
 '69940': 21,
 '69897': 22,
 '69899': 23,
 '69902': 24,
 '69941': 25,
 '69900': 26,
 '69942': 27,
 '70219': 28,
 '69901': 29,
 '69943': 30,
 '69903': 31,
 '69904': 32,
 '69905': 33,
 '69944': 34,
 '69906': 35,
 '69945': 36,
 '69946': 37,
 '69907': 38,
 '69908': 39,
 '69947': 40,
 '69909': 41,
 '69910': 42,
 '70281': 43,
 '70282': 44,
 '70283': 45,
 '69948': 46,
 '70220': 47,
 '69911': 48,
 '69912': 49,
 '69949': 50,
 '70294': 51,
 '70284': 52,
 '69913': 53,
 '69915': 54,
 '69950': 55,
 '69916': 56,
 '69917': 57,
 '69918': 58,
 '69920': 59,
 '69919': 60,
 '69921': 61,
 '69995': 62,
 '70013': 63,
 '69922': 64,
 '69996': 65,
 '70014': 66,
 '69923': 67,
 '69997': 68,
 '70015': 69,
 '69924': 70,
 '69998': 71,
 '

In [21]:
nssents

[['Resident', '0', '8', 0, 8, 's', 0],
 ['-', '9', '10', 9, 10, 'm', 0],
 ['Admission', '11', '20', 11, 20, 'm', 0],
 ['Note', '21', '25', 21, 25, 'm', 0],
 ['PERMANENT', '26', '35', 28, 37, 'm', 0],
 ['Date', '36', '40', 44, 48, 'm', 0],
 ['16', '41', '43', 53, 55, 'm', 0],
 [':', '44', '45', 55, 56, 'm', 0],
 ['11', '46', '48', 56, 58, 'm', 0],
 ['Hospital', '49', '57', 68, 76, 'm', 0],
 ['Hospital', '58', '66', 87, 95, 'm', 0],
 ['-', '67', '68', 100, 101, 'm', 0],
 ['CCU', '69', '72', 102, 105, 'm', 0],
 ['Hospital', '73', '81', 110, 118, 'm', 0],
 ['Name', '82', '86', 132, 136, 'm', 0],
 [',', '87', '88', 140, 141, 'm', 0],
 ['Name', '89', '93', 146, 150, 'm', 0],
 ['Date', '94', '98', 160, 164, 'm', 0],
 ['of', '99', '101', 165, 167, 'm', 0],
 ['Service', '102', '109', 168, 175, 'm', 0],
 ['Date', '110', '114', 180, 184, 'm', 0],
 ['16', '115', '117', 189, 191, 'm', 0],
 [':', '118', '119', 191, 192, 'm', 0],
 ['11', '120', '122', 192, 194, 'm', 0],
 ['Patient', '123', '130', 200

In [22]:
tid2idx

{'69881': [35, 35],
 '69882': [41, 44],
 '69883': [46, 47],
 '69884': [49, 49],
 '69885': [52, 52],
 '69936': [56, 57],
 '69887': [70, 70],
 '69888': [72, 75],
 '69889': [83, 83],
 '69893': [92, 93],
 '69890': [97, 100],
 '69892': [104, 104],
 '69891': [107, 108],
 '69937': [120, 120],
 '69938': [130, 130],
 '69939': [132, 134],
 '69894': [162, 162],
 '70279': [168, 171],
 '70280': [173, 174],
 '69896': [176, 176],
 '69895': [179, 179],
 '69940': [183, 184],
 '69897': [193, 193],
 '69899': [203, 203],
 '69902': [214, 215],
 '69941': [219, 222],
 '69900': [226, 226],
 '69942': [229, 230],
 '70219': [256, 256],
 '69901': [267, 267],
 '69943': [279, 279],
 '69903': [289, 289],
 '69904': [293, 294],
 '69905': [296, 302],
 '69944': [305, 312],
 '69906': [317, 325],
 '69945': [328, 328],
 '69946': [335, 335],
 '69907': [336, 336],
 '69908': [338, 338],
 '69947': [339, 339],
 '69909': [341, 344],
 '69910': [368, 371],
 '70281': [376, 377],
 '70282': [378, 378],
 '70283': [379, 379],
 '69948':

In [154]:
rels[:10], len(rels)

NameError: name 'rels' is not defined

In [160]:
t_rel_fea_in_sent[:100]

[['reason',
  'COUMADIN',
  'PE',
  'Drug',
  'Indication',
  23,
  3,
  'with',
  'Date with',
  'PE Date with',
  'therapy',
  'therapy ,',
  'therapy , multiple',
  ',',
  'anemia ,',
  'refractory anemia ,',
  'Date',
  'Date with',
  'Date with coumadin'],
 ['severity_type',
  'PULMONARY HYPERTENSION',
  'SEVERE',
  'SSLIF',
  'Severity',
  22,
  3,
  'progressive',
  ', progressive',
  'severe , progressive',
  ',',
  ', which',
  ', which may',
  'of',
  'setting of',
  'in setting of',
  ',',
  ', progressive',
  ', progressive pulmonary'],
 ['reason',
  'COUMADIN',
  'PE',
  'Drug',
  'Indication',
  23,
  3,
  'with',
  'Date with',
  'PE Date with',
  'therapy',
  'therapy ,',
  'therapy , multiple',
  ',',
  'anemia ,',
  'refractory anemia ,',
  'Date',
  'Date with',
  'Date with coumadin'],
 ['severity_type',
  'H/H WAS SEVERELY DECREASED',
  'SEVERELY',
  'SSLIF',
  'Severity',
  -8,
  -4,
  'the',
  'that the',
  'indicated that the',
  'and',
  'and she',
  'and she w

In [85]:
print(f_rel_fea_in_sent[:5])
print(p_rel_fea_in_sent[:5])

[['NAGTIVE', 'NEAR SYNCOPE', 'SEVERE', 'ADE', 'Severity', -76, -12, 'a', 'was a', 'Event was a', 'thought', 'thought due', 'thought due to', 'of', 'setting of', 'in setting of', ',', ', progressive', ', progressive pulmonary'], ['NAGTIVE', 'INADEQUATE RV FILLING PRESSURES', 'NEAR SYNCOPE', 'SSLIF', 'ADE', 30, 5, 'to', 'due to', 'thought due to', 'in', 'in setting', 'in setting of', 'a', 'was a', 'Event was a', 'thought', 'thought due', 'thought due to'], ['NAGTIVE', 'INADEQUATE RV FILLING PRESSURES', 'SEVERE', 'SSLIF', 'Severity', -46, -7, 'to', 'due to', 'thought due to', 'in', 'in setting', 'in setting of', 'of', 'setting of', 'in setting of', ',', ', progressive', ', progressive pulmonary'], ['NAGTIVE', 'PULMONARY HYPERTENSION', 'NEAR SYNCOPE', 'SSLIF', 'ADE', 98, 15, 'progressive', ', progressive', 'severe , progressive', ',', ', which', ', which may', 'a', 'was a', 'Event was a', 'thought', 'thought due', 'thought due to'], ['NAGTIVE', 'NEAR SYNCOPE', 'SEVERE', 'ADE', 'Severity', 

In [78]:
f_rels[:10], len(f_rels)

([(-1, 'NAGTIVE', '69893', '69892'),
  (-1, 'NAGTIVE', '69890', '69893'),
  (-1, 'NAGTIVE', '69890', '69892'),
  (-1, 'NAGTIVE', '69891', '69893'),
  (-1, 'NAGTIVE', '69902', '69900'),
  (-1, 'NAGTIVE', '69941', '69902'),
  (-1, 'NAGTIVE', '69941', '69900'),
  (-1, 'NAGTIVE', '69942', '69902'),
  (-1, 'NAGTIVE', '69946', '69908'),
  (-1, 'NAGTIVE', '69947', '69907')],
 319)

In [79]:
p_rels[:10], len(set(p_rels))

([(-2, 'PREDICT', '69885', '69884'),
  (-2, 'PREDICT', '69893', '69892'),
  (-2, 'PREDICT', '69890', '69893'),
  (-2, 'PREDICT', '69890', '69892'),
  (-2, 'PREDICT', '69891', '69893'),
  (-2, 'PREDICT', '69891', '69892'),
  (-2, 'PREDICT', '69937', '69893'),
  (-2, 'PREDICT', '69895', '69896'),
  (-2, 'PREDICT', '69902', '69900'),
  (-2, 'PREDICT', '69941', '69902')],
 455)

In [56]:
ref = [(each[2], each[3]) for each in p_rels]
ref[:5]

[('69885', '69884'),
 ('69893', '69892'),
 ('69890', '69893'),
 ('69890', '69892'),
 ('69891', '69893')]

In [58]:
# check if the results here are relations in cross sentence
for each in set(t_rels + f_rels):
    t = (each[2], each[3])
    if t not in ref:
        print(each in t_rels)
        print(each)
#manully check: yes they are all cross sentence relations after sentence normalization

True
('16481', 'reason', '70157', '70156')
True
('16489', 'reason', '70196', '70199')
True
('16392', 'adverse', '69943', '69902')
True
('16480', 'reason', '70153', '69970')
True
('16415', 'do', '69993', '70011')
True
('16490', 'reason', '70198', '70200')
True
('16382', 'fr', '69931', '70022')


In [271]:
def output_cross_sent_relations(t_rel, f_rel, p_rel, res_file):
    res = []
    ref = set([(each[2], each[3]) for each in p_rels])
    for each in (t_rel + f_rel):
        t = (each[2], each[3])
        if t not in ref:
            res.append(each)
            if each in t_rel:
                print(each)
    with open(res_file, "w") as f:
        for each in res:
            print("\t".join(each), file=f, end="\n")

In [106]:
t_label_cat.update(f_label)
t_label_cat

{'NEGATIVE', 'adverse', 'do', 'fr', 'manner/route', 'reason', 'severity_type'}

In [109]:
label2idx(t_label_cat)

{'NEGATIVE': 0,
 'adverse': 1,
 'fr': 2,
 'reason': 3,
 'manner/route': 4,
 'do': 5,
 'severity_type': 6}

In [5]:
print(len(anns))
print(len(ann_in_sent))
print(ann_in_sent)

360
149
{0: [0, 1, 2, 3, 4, 5], 1: [6, 7, 8], 2: [9, 10, 11, 12, 13], 3: [14, 15], 4: [], 5: [16, 17, 18, 19, 20, 21], 6: [22, 23], 7: [24, 25, 26, 27], 8: [], 9: [], 10: [28], 11: [29, 30], 12: [31, 32, 33, 34, 35], 13: [36, 37, 38, 39, 40], 14: [41], 15: [], 16: [42, 43, 44, 45], 17: [], 18: [46, 47], 19: [48], 20: [49, 50, 51, 52], 21: [], 22: [], 23: [53], 24: [54], 25: [55, 56, 57], 26: [58, 59, 60], 27: [], 28: [], 29: [61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86], 30: [87, 88, 89, 90, 91, 92, 93, 94], 31: [95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109], 32: [110, 111, 112, 113, 114, 115], 33: [], 34: [], 35: [116], 36: [117, 118], 37: [], 38: [], 39: [], 40: [119, 120], 41: [121, 122, 123, 124, 125], 42: [126, 127], 43: [], 44: [128, 129], 45: [130, 131, 132], 46: [133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149], 47: [150, 151], 48: [152, 153, 154, 155], 49: [],

In [7]:
for i in range(len(ann_in_sent)-1):
    j = i + 1
    print(ann_in_sent[i])
    print(ann_in_sent[j])

[0, 1, 2, 3, 4, 5]
[6, 7, 8]
[6, 7, 8]
[9, 10, 11, 12, 13]
[9, 10, 11, 12, 13]
[14, 15]
[14, 15]
[]
[]
[16, 17, 18, 19, 20, 21]
[16, 17, 18, 19, 20, 21]
[22, 23]
[22, 23]
[24, 25, 26, 27]
[24, 25, 26, 27]
[]
[]
[]
[]
[28]
[28]
[29, 30]
[29, 30]
[31, 32, 33, 34, 35]
[31, 32, 33, 34, 35]
[36, 37, 38, 39, 40]
[36, 37, 38, 39, 40]
[41]
[41]
[]
[]
[42, 43, 44, 45]
[42, 43, 44, 45]
[]
[]
[46, 47]
[46, 47]
[48]
[48]
[49, 50, 51, 52]
[49, 50, 51, 52]
[]
[]
[]
[]
[53]
[53]
[54]
[54]
[55, 56, 57]
[55, 56, 57]
[58, 59, 60]
[58, 59, 60]
[]
[]
[]
[]
[61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86]
[61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86]
[87, 88, 89, 90, 91, 92, 93, 94]
[87, 88, 89, 90, 91, 92, 93, 94]
[95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109]
[95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109]
[110, 111, 112, 113, 114, 115]
[110, 11

## The whole process of SVM model for relation detection model data process

In [272]:
#generate true and negative relationsfor training
total_rel_fea_in_sent, label_cat_set = generate_SVM_training_data(train_file_dir)

********12_123********
['13243', 'Drug', 1078, 1104, 'MULTIVITAMINS (TAB-A-VITE)'] start boundary not match - check corpus

********4_857********
['90589', 'Duration', 4836, 4844, '2 cycles'] start boundary not match - check corpus

********10_988********
['104675', 'Severity', 4169, 4180, 'significant'] start boundary not match - check corpus

********17_839********

********13_513********
['54443', 'Duration', 938, 946, '6 cycles'] start boundary not match - check corpus
['54444', 'Duration', 984, 992, '4 cycles'] start boundary not match - check corpus

********1_985********
['104367', 'Frequency', 2099, 2104, '1 day'] start boundary not match - check corpus
['104377', 'Duration', 2558, 2565, '1 cycle'] start boundary not match - check corpus
['104289', 'Severity', 5775, 5780, 'quite'] start boundary not match - check corpus
['104291', 'ADE', 6456, 6479, 'abnormal renal function'] start boundary not match - check corpus
['104301', 'Duration', 6762, 6763, '4'] start boundary not matc


********14_637********

********10_91********

********10_65********

********17_852********
['89956', 'SSLIF', 4937, 4943, 'tender'] start boundary not match - check corpus

********10_1********

********5_6********

********13_781********

********13_45********

********1_589********
['61842', 'Dose', 4592, 4600, '2 liters'] start boundary not match - check corpus
['61838', 'Route', 5690, 5692, 'IV'] start boundary not match - check corpus

********14_237********
['25300', 'Severity', 1300, 1304, 'some'] start boundary not match - check corpus

********1_1039********
['110380', 'Duration', 7385, 7391, '7 days'] start boundary not match - check corpus

********19_617********
['64492', 'Severity', 2368, 2376, 'slightly'] start boundary not match - check corpus
['64545', 'Dose', 3180, 3195, '5 mg 1-2 tabs']  is in two sentences. Assign to the first sentence
entity index is  37

********1_32********
['3212', 'Severity', 6362, 6375, 'significantly'] start boundary not match - check corpu


********6_322********

********7_712********
['74267', 'Severity', 1157, 1164, "100cc's"] start boundary not match - check corpus
['74265', 'Indication', 6389, 6401, 'Constipation'] start boundary not match - check corpus

********4_771********

********13_209********

********7_47********
['4775', 'Severity', 285, 293, 'slightly'] start boundary not match - check corpus
['4806', 'Severity', 1699, 1704, 'a bit'] start boundary not match - check corpus

********10_691********

********7_573********
['60715', 'ADE', 2587, 2605, 'decreased appetite'] start boundary not match - check corpus

********19_566********
14216  has two entities with same id  59793

********4_776********
19165  has two entities with same id  81454

********5_722********
['75373', 'Severity', 5381, 5389, 'slightly'] start boundary not match - check corpus
['75376', 'SSLIF', 6043, 6055, 'Pancytopenia'] start boundary not match - check corpus

********1_234********
['24851', 'Severity', 5963, 5971, 'slightly'] start


********3_858********

********3_256********

********14_381********

********7_129********

********6_189********
['20636', 'ADE', 1064, 1075, 'mood swings'] start boundary not match - check corpus
['20638', 'ADE', 4101, 4119, 'bone health issues'] start boundary not match - check corpus

********14_147********

********19_336********
['36091', 'Duration', 5530, 5536, '10days'] start boundary not match - check corpus
['36090', 'Duration', 5705, 5711, '10days'] start boundary not match - check corpus

********6_377********

********14_372********

********14_716********
['74703', 'ADE', 3367, 3374, 'erosion'] start boundary not match - check corpus
['74729', 'Dose', 4683, 4684, '4'] start boundary not match - check corpus

********4_925********
['98004', 'SSLIF', 616, 659, 'double heterozygosity for factor V Leiden']  is in two sentences. Assign to the first sentence
entity index is  3

********14_350********
['37192', 'SSLIF', 1491, 1506, 'lightheadedness'] start boundary not match -

['80605', 'Severity', 2005, 2010, 'a bit'] start boundary not match - check corpus

********10_1014********

********1_886********
['93955', 'Indication', 5049, 5070, 'Peripheral neuropathy'] start boundary not match - check corpus

********7_731********

********7_355********
['37622', 'SSLIF', 5689, 5729, 'LDH and reticulocyte count are elevated'] start boundary not match - check corpus
['37746', 'SSLIF', 5749, 5761, 'bony disease'] start boundary not match - check corpus
['37698', 'Severity', 6406, 6411, 'quite'] start boundary not match - check corpus

********9_319********

********1_620********
['64741', 'Drug', 3309, 3319, 'MS Contin'] start boundary not match - check corpus
['64889', 'Drug', 3310, 3319, 'MS Contin'] start boundary not match - check corpus
['64748', 'SSLIF', 3509, 3517, 'lymphoma'] start boundary not match - check corpus
['64769', 'SSLIF', 5014, 5031, 'multiple myeloma'] start boundary not match - check corpus
['64913', 'Severity', 6662, 6670, 'slightly'] start 

['31386', 'Severity', 4531, 4540, 'low-grade'] start boundary not match - check corpus

********13_95********

********12_153********

********1_81********
['8638', 'ADE', 3570, 3593, 'neuropathy in his hands'] start boundary not match - check corpus
['8614', 'Severity', 5475, 5486, 'significant'] start boundary not match - check corpus
['8618', 'Severity', 5779, 5787, 'slightly'] start boundary not match - check corpus
1868  has two entities with same id  8640

********7_810********

********1_330********
['35521', 'Severity', 5319, 5323, 'very'] start boundary not match - check corpus
8405  has two entities with same id  35370
8423  has two entities with same id  35467
8428  has two entities with same id  35481
8443  has two entities with same id  35527

********1_1041********
['110750', 'Severity', 7469, 7481, '40-50 pounds'] start boundary not match - check corpus

********1_798********

********13_66********

********14_879********

********14_71********

********14_613********
['

['16605', 'Severity', 5042, 5046, 'MILD'] start boundary not match - check corpus
['16564', 'SSLIF', 1669, 1678, 'NEC. FASC']  is in two sentences. Assign to the first sentence
entity index is  8
['16565', 'SSLIF', 1703, 1715, 'VIT. D DEFIC']  is in two sentences. Assign to the first sentence
entity index is  9

********1_365********

********1_20********
['1915', 'SSLIF', 3457, 3465, 'fatigued'] start boundary not match - check corpus
['1874', 'Severity', 5400, 5404, 'very'] start boundary not match - check corpus
['1870', 'Drug', 3092, 3103, 'vitamin D']  is in two sentences. Assign to the first sentence
entity index is  71

********14_487********
['51668', 'Drug', 818, 825, 'calcium'] start boundary not match - check corpus

********10_539********

********3_301********
7787  has two entities with same id  32523

********10_14********

********10_162********

********1_739********
['77342', 'ADE', 5249, 5264, 'urinary urgency'] start boundary not match - check corpus
['77388', 'ADE'

['19249', 'Dose', 5510, 5513, 'TAB'] start boundary not match - check corpus

********13_49********

********19_812********
['85509', 'Severity', 1187, 1197, 'moderately'] start boundary not match - check corpus
['85516', 'Severity', 1687, 1692, 'quite'] start boundary not match - check corpus

********1_728********
['76182', 'ADE', 7764, 7783, 'disseminated zoster'] start boundary not match - check corpus

********10_341********

********17_835********

********14_465********
['49405', 'Drug', 829, 836, 'calcium'] start boundary not match - check corpus

********6_460********

********12_913********
['96897', 'Severity', 5459, 5465, 'mildly'] start boundary not match - check corpus

********10_528********

********1_387********
['40900', 'ADE', 2536, 2545, 'skin rash'] start boundary not match - check corpus
['40938', 'SSLIF', 5075, 5112, 'weight 231 pounds, which is increased'] start boundary not match - check corpus
9582  has two entities with same id  40895

********14_805********



********1_636********
['66856', 'SSLIF', 3948, 3960, 'side effects'] start boundary not match - check corpus
['66822', 'Severity', 6462, 6470, 'slightly'] start boundary not match - check corpus
['66855', 'Duration', 7644, 7650, '2 days'] start boundary not match - check corpus

********10_409********
['43381', 'Indication', 4135, 4162, 'Erythematous injection site'] start boundary not match - check corpus
['43511', 'ADE', 4198, 4210, 'erythematous'] start boundary not match - check corpus

********7_515********

********5_744********

********17_777********
['81537', 'SSLIF', 2758, 2766, 'palpable'] start boundary not match - check corpus

********19_951********
23786  has two entities with same id  100893

********7_10********
['839', 'Drug', 2134, 2144, 'prednisone'] start boundary not match - check corpus
['873', 'Severity', 5076, 5085, 'minimally'] start boundary not match - check corpus

********8_93********
['9979', 'Dose', 2953, 2963, 'one tablet'] start boundary not match - c


********14_119********
['12753', 'SSLIF', 4855, 4879, 'C. diff toxin positive']  is in two sentences. Assign to the first sentence
entity index is  83

********1_276********

********1_612********

********7_393********
['41456', 'Severity', 3788, 3792, 'very'] start boundary not match - check corpus

********1_282********
['30810', 'SSLIF', 535, 557, 'IgG kappa paraprotein'] start boundary not match - check corpus
['30725', 'Severity', 7325, 7329, 'very'] start boundary not match - check corpus
['30808', 'SSLIF', 7744, 7753, 'epistaxis'] start boundary not match - check corpus
['30806', 'SSLIF', 8108, 8127, 'renal insufficiency'] start boundary not match - check corpus
['30906', 'SSLIF', 6578, 6614, 'hematocrit of 23.3. This has fallen']  is in two sentences. Assign to the first sentence
entity index is  162

********12_829********

********10_244********
['25718', 'Dose', 685, 698, '20 mg Capsule'] start boundary not match - check corpus

********1_847********
['89267', 'Severity', 


********7_995********

********10_845********

********12_826********
['86924', 'Severity', 516, 525, 'low-grade'] start boundary not match - check corpus

********7_959********
['101607', 'SSLIF', 4581, 4598, 'fluid retention'] start boundary not match - check corpus

********8_519********
['55200', 'ADE', 1902, 1920, 'did not feel right'] start boundary not match - check corpus
['55139', 'Severity', 3647, 3651, '4 mm'] start boundary not match - check corpus

********13_618********
['64663', 'ADE', 1328, 1336, 'numbness'] start boundary not match - check corpus
['64664', 'ADE', 1341, 1361, 'tingling in his legs'] start boundary not match - check corpus
['64665', 'ADE', 1468, 1510, 'gained that weight back and has added some'] start boundary not match - check corpus

********17_755********

********14_796********

********6_563********

********14_306********
['33192', 'Severity', 3074, 3083, '11 pounds'] start boundary not match - check corpus
['33194', 'Severity', 3166, 3175, '20 p


********12_353********
['37438', 'PHI', 6266, 6279, "St. Vincent's"]  is in two sentences. Assign to the first sentence
entity index is  70
['37439', 'PHI', 6633, 6646, "St. Vincent's"]  is in two sentences. Assign to the first sentence
entity index is  71

********17_611********
['63968', 'Dose', 1580, 1587, '1 liter'] start boundary not match - check corpus

********12_505********
['53580', 'Severity', 460, 469, 'low-grade'] start boundary not match - check corpus

********12_737********
['77088', 'PHI', 1677, 1697, 'St. Vincent Hospital']  is in two sentences. Assign to the first sentence
entity index is  15

********5_48********
['4960', 'SSLIF', 4469, 4508, 'supraclavicular or axillary adenopathy'] start boundary not match - check corpus
['4992', 'SSLIF', 6493, 6505, 'pancytopenia'] start boundary not match - check corpus

********6_247********
6187  has two entities with same id  26162
6188  has two entities with same id  26163

********1_130********
['13719', 'Duration', 1894, 


********1_751********
['78605', 'ADE', 5461, 5484, 'bleeding from the wound'] start boundary not match - check corpus
['78634', 'SSLIF', 7561, 7596, 'bleeding from his right leg wound'] start boundary not match - check corpus
['78661', 'ADE', 7695, 7702, 'rebleed'] start boundary not match - check corpus

********1_1044********
['111081', 'Severity', 5711, 5716, 'quite'] start boundary not match - check corpus

********19_467********
['49580', 'Route', 4090, 4094, 'oral'] start boundary not match - check corpus
['49510', 'Frequency', 3938, 3953, 'q.4 h. p.r.n.']  is in two sentences. Assign to the first sentence
entity index is  44

********1_107********
['11769', 'SSLIF', 1725, 1736, 'respiratory'] start boundary not match - check corpus
['11768', 'SSLIF', 4413, 4458, 'peripheral neuropathy in his hands and feet'] start boundary not match - check corpus
['11771', 'Severity', 6602, 6610, 'slightly'] start boundary not match - check corpus

********7_472********
['50166', 'Severity', 1

In [174]:
total_rel_fea_in_sent[:10]

[['1 TABLET',
  'Drug',
  'Dose',
  'Nan',
  'Nan',
  'Nan',
  ')',
  ') 1',
  ') 1 TABLET',
  ')',
  'VITE )',
  '- VITE )',
  '=',
  '= 1',
  '= 1 TABLET'],
 ['do',
  'MULTIVITAMINS (TAB-A-VITE',
  '1 TABLET',
  'Drug',
  'Dose',
  -38,
  -11,
  'Nan',
  'Nan',
  'Nan',
  ')',
  ') 1',
  ') 1 TABLET',
  '=',
  'TABLET =',
  '1 TABLET =',
  'Oral',
  'Oral TABLET',
  'Oral TABLET DAILY'],
 ['fr',
  'MULTIVITAMINS (TAB-A-VITE',
  'DAILY',
  'Drug',
  'Frequency',
  -59,
  -15,
  'Nan',
  'Nan',
  'Nan',
  ')',
  ') 1',
  ') 1 TABLET',
  'TABLET',
  'Oral TABLET',
  'TABLET Oral TABLET',
  'STAT',
  'STAT and',
  'STAT and then'],
 ['manner/route',
  'MULTIVITAMINS (TAB-A-VITE',
  'ORAL',
  'Drug',
  'Route',
  -47,
  -13,
  'Nan',
  'Nan',
  'Nan',
  ')',
  ') 1',
  ') 1 TABLET',
  'TABLET',
  '1 TABLET',
  '= 1 TABLET',
  'TABLET',
  'TABLET DAILY',
  'TABLET DAILY STAT'],
 ['do',
  'ATENOLOL',
  '50 MG TABLET',
  'Drug',
  'Dose',
  -9,
  -1,
  'DAILY',
  'oral DAILY',
  'tablet oral

In [163]:
label_cat_set

{'NEGATIVE',
 'adverse',
 'do',
 'du',
 'fr',
 'manner/route',
 'reason',
 'severity_type'}

In [273]:
def output_training_data_for_random_forest(file_name, fea, lab):
    #create label dict output
    fea_file = file_name + "_features_dict.pkl"
    idx2l_file = file_name + "_index2label.pkl"
    
    l2idx = label2idx(lab)
    idx2l = {v:k for k, v in l2idx.items()}
    
    with open(idx2l_file, "wb") as f:
        pkl.dump(idx2l, f)
    print("label dictionary is store in {}".format(idx2l_file))
    
    rf_file = file_name + ".rf"
    with open(rf_file, "w") as f:
        for each in fea:
            label = str(l2idx[each.pop(0)])
            dis = str(each.pop(4))
            tok_dis = str(each.pop(4))
            line = [label, dis, tok_dis]
            for e in each:
                line.append(e)
            print("\t".join(line), file=f, end="\n")

In [274]:
output_training_data_for_random_forest("umassRF_training_within", total_rel_fea_in_sent, label_cat_set)

label dictionary is store in umassRF_training_within_index2label.pkl


In [165]:
###### SVM #######

In [37]:
#convert relation features to code for LIBSVM data
output_SVM_training_data("umass18SVM_within_sent", total_rel_fea_in_sent, label_cat_set)

The training data is in file umass18SVM_within_sent.svm
The features and associated code is in file umass18SVM_within_sent_features_dict.pkl


In [8]:
print(len(total_rel_fea_in_sent))
print(total_rel_fea_in_sent[:10])

64783
[['do', 'MULTIVITAMINS (TAB-A-VITE', '1 TABLET', 'Drug', 'Dose', -27, -8, ')', ') 1', ') 1 TABLET', ')', 'VITE )', '- VITE )', '=', '= 1', '= 1 TABLET'], ['do', 'MULTIVITAMINS (TAB-A-VITE', '1 TABLET', 'Drug', 'Dose', -38, -11, ')', ') 1', ') 1 TABLET', '=', 'TABLET =', '1 TABLET =', 'Oral', 'Oral TABLET', 'Oral TABLET DAILY'], ['fr', 'MULTIVITAMINS (TAB-A-VITE', 'DAILY', 'Drug', 'Frequency', -59, -15, ')', ') 1', ') 1 TABLET', 'TABLET', 'Oral TABLET', 'TABLET Oral TABLET', 'STAT', 'STAT and', 'STAT and then'], ['manner/route', 'MULTIVITAMINS (TAB-A-VITE', 'ORAL', 'Drug', 'Route', -47, -13, ')', ') 1', ') 1 TABLET', 'TABLET', '1 TABLET', '= 1 TABLET', 'TABLET', 'TABLET DAILY', 'TABLET DAILY STAT'], ['do', 'ATENOLOL', '50 MG TABLET', 'Drug', 'Dose', -9, -1, 'DAILY', 'oral DAILY', 'tablet oral DAILY', '50', '50 mg', '50 mg Tablet', 'atenolol', 'DAILY atenolol', 'oral DAILY atenolol', ',', ', Ordered', ', Ordered By'], ['do', 'ATENOLOL', '1 TABLET', 'Drug', 'Dose', -76, -12, 'DAILY'

In [69]:
label_cat_set

{'NEGATIVE',
 'adverse',
 'do',
 'du',
 'fr',
 'manner/route',
 'reason',
 'severity_type'}

## cross sentence relation extraction

In [291]:
def extract_features_of_relations_cross_sent(rels, anns, nsents, aid2idx, tid2idx, cross=1):
    label_cat = set()
    rel_fea_in_sent = []
    rel_pair = set()

    for each in rels:
        rtype = each[1]
        ann1_id = each[2]
        ann2_id = each[3]
        if ann1_id == ann2_id:
            continue
        
        #the label category and relation pairs list will have extra terms due to some cross_sentence relations have been discard
        label_cat.add(rtype)
        rel_pair.add((ann1_id, ann2_id))
        
        #text of entities in the relation
        tk1 = anns[aid2idx[ann1_id]]
        tk2 = anns[aid2idx[ann2_id]]
         
        #using the sentence idx of the first token in an entity to represent the entity sentence idx
        pos1 = tid2idx[ann1_id]
        pos2 = tid2idx[ann2_id]
        tk1_sent_idx = nsents[pos1[0]][-1]
        tk2_sent_idx = nsents[pos2[0]][-1]
        
        #check if entity 1 and entity 2 are in same sentence
        #if two entities are in the same sentence, the sent idx difference should 0
        if abs(tk1_sent_idx - tk2_sent_idx) == cross:
            label = rtype
            #entity text
            fea1 = "" + tk1[-1].upper()           
            fea2 = "" + tk2[-1].upper()         
            #entity type
            fea3 = "" + tk1[1]            
            fea4 = "" + tk2[1]           
            #entities distance measured by diference of the start position before mapping
            fea5 = tk1[2] - tk2[2] 
            fea6 = pos1[0] - pos2[0]
            #ngram features: window size = 3
            fea_ngram_en1 = extract_ngram_feature(nsents, pos1)
            fea_ngram_en2 = extract_ngram_feature(nsents, pos2)
            feas = [label, fea1, fea2, fea3, fea4, fea5, fea6] + fea_ngram_en1 + fea_ngram_en2
            rel_fea_in_sent.append([fea for fea in feas if fea != ""])     
    
    return rel_fea_in_sent, rel_pair, label_cat


def generate_relations_based_on_entities_cross_sentence(anns, sent_boundry_map, in_sent_rels, true_rels=[], fake=False):
    ann_in_sent = _group_annotaion_by_sent(anns, sent_boundry_map)
    
    #the generated relation must have its entities with types as defined below
    #all these are know pre-hand by analysis the all the relations
    #entity type need to be modified since we only consider reason and adverse relation
    #for fr and do relation, we will set a distance boundary to them
#     entity_type_rule_set = {
#         ('Drug', 'ADE'), ('SSLIF', 'ADE'), ('Drug', 'Dose'), 
#         ('Drug', 'Duration'), ('Drug', 'Frequency'), ('Drug', 'Indication'), 
#         ('SSLIF', 'Severity'), ('ADE', 'Severity'), ('Indication', 'Severity'), 
#         ('Drug', 'Route')
#     }
    entity_type_rule_set = {
        ('Drug', 'ADE'), ('Drug', 'Dose'), ('Drug', 'Frequency'), ('Drug', 'Indication'), ('Drug', 'Route')
    }
    
    rels = []
    #rotate sentences to create all the entity pairs in two adjacent sentence
    for i in range(len(ann_in_sent)-1):
        j = i + 1
        v = ann_in_sent[i] + ann_in_sent[j]
        for e1, e2 in itertools.permutations(v, 2):
            #make some rules to eliminate some of the relation combinations
            rel_pair = (anns[e1][0], anns[e2][0])
            rel_type_pair = (anns[e1][1], anns[e2][1])
            
            #fake flag is used to control whether the generated relations are for negative training sample or prediction
            #TODO ugly code here but logic is clear
            if fake:
                if rel_pair not in true_rels and rel_pair not in in_sent_rels and rel_type_pair in entity_type_rule_set:  
                    rels.append((-1, "NEGATIVE", rel_pair[0], rel_pair[1]))
            else:
                if rel_type_pair in entity_type_rule_set and rel_pair not in in_sent_rels:
                    rels.append((-2, "PREDICT", rel_pair[0], rel_pair[1]))    

    return rels 

In [292]:
t_cross = []
f_cross = []
tt_cross = []
ft_cross = []
labels = set()
train_data = []
for file_id in os.listdir(train_file_dir):
        bioc_file = bioc_dir + file_id + bioc_suffix
        sent_file = sent_file_dir + file_id + sent_suffix
        map_file = map_file_dir + file_id + map_suffix
        rel_file = rel_file_dir + file_id + rel_suffix
        
        print("********" + file_id + "********")
        #read sentence mapping file
        sents, sent_boundry_map = extract_sent_map(map_file_dir + file_id + map_suffix)
        #read bioc file
        txt = process_bioc(bioc_dir + file_id + bioc_suffix)
        #get all entities from bioc
        anns, aid2idx = extract_entities_from_bioc(txt)
        #get all relations from bioc
        nsents, tid2idx = label_tokens_in_sent(anns, sents)
        t_rels = extract_relation_from_bioc(txt)
        #since we wont consider 
        t_rel_fea_in_sent, t_rel_pair, t_label_cat = extract_features_of_relations_cross_sent(t_rels, anns, nsents, aid2idx, tid2idx, 1)
        tt_cross.extend(t_rel_fea_in_sent)
        train_data.extend(t_rel_fea_in_sent)
        
        for each in t_label_cat:
            labels.add(each)
        
        #p_rels is all the relations generated within sentence; in_sent_rels contains all the in-setences relation pairs
        p_rels = generate_relations_based_on_entities_in_sentence(anns, sent_boundry_map)
        in_sent_rels = set([(each[2], each[3]) for each in p_rels])
        
        #generate all relation pairs within two sentences then remove within sentence pairs
        f_rels_cross = generate_relations_based_on_entities_cross_sentence(anns, sent_boundry_map, in_sent_rels, t_rel_pair, True)
        f_cross.extend(f_rels_cross)
        f_rels_fea_cross_sent, _, _ = extract_features_of_relations_cross_sent(f_rels_cross, anns, nsents, aid2idx, tid2idx, 1)
        ft_cross.extend(f_rels_fea_cross_sent)
        train_data.extend(f_rels_fea_cross_sent)
        #all the cross_sentence relations are stored in res_file in directory "cross_sent_rel"
#         p_rels = generate_relations_based_on_entities_in_sentence(anns, sent_boundry_map)
#         f_rels = generate_relations_based_on_entities_in_sentence(anns, sent_boundry_map, t_rel_pair, True)
#         output_cross_sent_relations(t_rels, f_rels, p_rels, res_file)

********12_123********
['13243', 'Drug', 1078, 1104, 'MULTIVITAMINS (TAB-A-VITE)'] start boundary not match - check corpus
********4_857********
['90589', 'Duration', 4836, 4844, '2 cycles'] start boundary not match - check corpus
********10_988********
['104675', 'Severity', 4169, 4180, 'significant'] start boundary not match - check corpus
********17_839********
********13_513********
['54443', 'Duration', 938, 946, '6 cycles'] start boundary not match - check corpus
['54444', 'Duration', 984, 992, '4 cycles'] start boundary not match - check corpus
********1_985********
['104367', 'Frequency', 2099, 2104, '1 day'] start boundary not match - check corpus
['104377', 'Duration', 2558, 2565, '1 cycle'] start boundary not match - check corpus
['104289', 'Severity', 5775, 5780, 'quite'] start boundary not match - check corpus
['104291', 'ADE', 6456, 6479, 'abnormal renal function'] start boundary not match - check corpus
['104301', 'Duration', 6762, 6763, '4'] start boundary not match - c

['54981', 'Severity', 3334, 3339, 'quite'] start boundary not match - check corpus
********1_127********
********16_699********
********6_895********
********17_898********
********10_97********
********1_588********
['61726', 'Severity', 1178, 1183, 'quite'] start boundary not match - check corpus
********13_310********
['33493', 'Severity', 2876, 2882, '6-7/10'] start boundary not match - check corpus
********4_668********
********19_686********
['72051', 'Drug', 769, 773, 'NaCl'] start boundary not match - check corpus
['72050', 'Drug', 876, 881, 'ACD-A'] start boundary not match - check corpus
['72049', 'Dose', 732, 773, '10% solution. 10 ml in 50 ml of 0.9% NaCl']  is in two sentences. Assign to the first sentence
entity index is  4
['72049', 'Dose', 732, 773, '10% solution. 10 ml in 50 ml of 0.9% NaCl']  is in two sentences. Assign to the first sentence
entity index is  4
********7_667********
********5_436********
********6_866********
['91560', 'ADE', 2015, 2023, 'DELIRIUM'] st

********7_545********
********6_347********
********6_949********
['100716', 'Severity', 1659, 1667, 'stage II'] start boundary not match - check corpus
********10_208********
********7_542********
********7_770********
********13_200********
['21597', 'SSLIF', 5188, 5199, 'weight gain'] start boundary not match - check corpus
********14_183********
********14_177********
['19407', 'Drug', 2057, 2064, 'calcium'] start boundary not match - check corpus
['19413', 'Severity', 5504, 5512, 'slightly'] start boundary not match - check corpus
['19399', 'SSLIF', 5671, 5677, 'Anemia'] start boundary not match - check corpus
['19334', 'Frequency', 2944, 2959, 'q.8 h. p.r.n.']  is in two sentences. Assign to the first sentence
entity index is  59
['19334', 'Frequency', 2944, 2959, 'q.8 h. p.r.n.']  is in two sentences. Assign to the first sentence
entity index is  59
********13_435********
********10_254********
********7_348********
['36970', 'Severity', 7533, 7539, 'mildly'] start boundary not 

['9395', 'Route', 6566, 6570, 'oral'] start boundary not match - check corpus
********10_840********
['88279', 'SSLIF', 1784, 1813, 'Thyroid gland is not palpable'] start boundary not match - check corpus
['88261', 'Severity', 2606, 2614, 'slightly'] start boundary not match - check corpus
['88302', 'SSLIF', 3245, 3266, 'Hypogammaglobulinemia'] start boundary not match - check corpus
['88267', 'ADE', 3299, 3324, 'low levels of IgG and IgM'] start boundary not match - check corpus
['88311', 'ADE', 3761, 3782, 'Peripheral neuropathy'] start boundary not match - check corpus
********12_649********
********7_160********
['17457', 'ADE', 594, 608, 'near syncope'] start boundary not match - check corpus
['17458', 'ADE', 1288, 1300, 'near syncope'] start boundary not match - check corpus
['17139', 'Indication', 2509, 2521, 'goal INR 2-3'] start boundary not match - check corpus
['17143', 'Severity', 2893, 2901, 'severely'] start boundary not match - check corpus
['17472', 'Severity', 10747, 1

********5_289********
['31386', 'Severity', 4531, 4540, 'low-grade'] start boundary not match - check corpus
********13_95********
********12_153********
********1_81********
['8638', 'ADE', 3570, 3593, 'neuropathy in his hands'] start boundary not match - check corpus
['8614', 'Severity', 5475, 5486, 'significant'] start boundary not match - check corpus
['8618', 'Severity', 5779, 5787, 'slightly'] start boundary not match - check corpus
********7_810********
********1_330********
['35521', 'Severity', 5319, 5323, 'very'] start boundary not match - check corpus
********1_1041********
['110750', 'Severity', 7469, 7481, '40-50 pounds'] start boundary not match - check corpus
********1_798********
********13_66********
********14_879********
********14_71********
********14_613********
['64065', 'Indication', 1723, 1735, 'constipation'] start boundary not match - check corpus
********1_906********
['96084', 'Severity', 5754, 5762, 'somewhat'] start boundary not match - check corpus
*****

['48619', 'SSLIF', 2376, 2403, 'increased her volume status'] start boundary not match - check corpus
['48546', 'Severity', 4424, 4428, 'mild'] start boundary not match - check corpus
['48564', 'SSLIF', 5366, 5372, 'snored'] start boundary not match - check corpus
['48614', 'Severity', 6152, 6162, 'marginally'] start boundary not match - check corpus
********19_684********
['71984', 'Severity', 2410, 2416, 'mildly'] start boundary not match - check corpus
********14_69********
['7422', 'Indication', 2058, 2086, 'FOLLICULAR B CELL LYMPHOMA']  is in two sentences. Assign to the first sentence
entity index is  6
['7373', 'Drug', 8141, 8170, 'CALCIUM CONTAINING ANTACIDS']  is in two sentences. Assign to the first sentence
entity index is  55
['7422', 'Indication', 2058, 2086, 'FOLLICULAR B CELL LYMPHOMA']  is in two sentences. Assign to the first sentence
entity index is  6
['7373', 'Drug', 8141, 8170, 'CALCIUM CONTAINING ANTACIDS']  is in two sentences. Assign to the first sentence
entity

['55520', 'Severity', 3957, 3969, 'grade III/VI'] start boundary not match - check corpus
['55608', 'Severity', 4075, 4081, 'softly'] start boundary not match - check corpus
['55638', 'Dose', 7436, 7441, '40 mg'] start boundary not match - check corpus
['55612', 'Frequency', 8809, 8815, 'weekly'] start boundary not match - check corpus
********21_212********
['22636', 'Severity', 1488, 1492, 'very'] start boundary not match - check corpus
********13_266********
********14_747********
********14_581********
['61153', 'Severity', 1862, 1870, 'somewhat'] start boundary not match - check corpus
********6_917********
['97327', 'SSLIF', 880, 896, 'Burkitt lymphoma'] start boundary not match - check corpus
['97320', 'ADE', 3238, 3244, 'nausea'] start boundary not match - check corpus
['97323', 'ADE', 3788, 3792, 'rash'] start boundary not match - check corpus
********7_386********
********4_114********
********4_345********
['36789', 'SSLIF', 2514, 2534, 'spleen not palpable'] start boundary 

********17_167********
['18230', 'Indication', 1465, 1472, 'itching'] start boundary not match - check corpus
********12_425********
********7_56********
********13_227********
['24022', 'Duration', 650, 651, '6'] start boundary not match - check corpus
********9_529********
['56157', 'Severity', 1884, 1889, 'a bit'] start boundary not match - check corpus
********3_417********
********10_479********
********1_646********
********7_565********
['59666', 'Severity', 2176, 2184, 'somewhat'] start boundary not match - check corpus
['59693', 'Severity', 3785, 3791, 'slowly'] start boundary not match - check corpus
['59694', 'SSLIF', 3833, 3878, 'hematocrit of 34.6, which has also decreased'] start boundary not match - check corpus
['59732', 'Severity', 4261, 4267, 'slowly'] start boundary not match - check corpus
['59692', 'SSLIF', 3739, 3800, 'platelet count is 53,000 which continues to slowly decrease']  is in two sentences. Assign to the first sentence
entity index is  75
['59692', 'SSL

********10_504********
['53544', 'Indication', 2215, 2221, 'nausea'] start boundary not match - check corpus
********1_25********
********19_252********
********14_827********
********1_732********
********1_164********
********10_390********
['41056', 'Severity', 944, 948, 'some'] start boundary not match - check corpus
********10_700********
['73187', 'Severity', 2076, 2082, 'mildly'] start boundary not match - check corpus
********13_991********
['104843', 'SSLIF', 1929, 1937, 'weakness'] start boundary not match - check corpus
['104855', 'Frequency', 2024, 2029, 'clock'] start boundary not match - check corpus
['104901', 'ADE', 2466, 2477, 'hypotensive'] start boundary not match - check corpus
********6_214********
********21_916********
['97230', 'SSLIF', 652, 677, 'volar wrist ganglion.']  is in two sentences. Assign to the first sentence
entity index is  1
['97230', 'SSLIF', 652, 677, 'volar wrist ganglion.']  is in two sentences. Assign to the first sentence
entity index is  1


********1_392********
['41332', 'Severity', 6532, 6540, 'slightly'] start boundary not match - check corpus
['41350', 'ADE', 7143, 7152, 'skin rash'] start boundary not match - check corpus
********10_159********
********12_308********
********14_828********
********14_642********
['67393', 'Severity', 2804, 2808, '2 cm'] start boundary not match - check corpus
['67395', 'Dose', 4622, 4623, '4'] start boundary not match - check corpus
********21_344********
['36703', 'Severity', 1639, 1649, '1 x 1 cm']  is in two sentences. Assign to the first sentence
entity index is  14
['36703', 'Severity', 1639, 1649, '1 x 1 cm']  is in two sentences. Assign to the first sentence
entity index is  14
********1_590********
********14_629********
['66058', 'SSLIF', 1157, 1182, 'rise in his serum calcium'] start boundary not match - check corpus
['66056', 'ADE', 3755, 3768, 'hypercalcemia'] start boundary not match - check corpus
********17_98********
['10486', 'SSLIF', 2647, 2653, 'tender'] start boun

In [293]:
train_data

[['NEGATIVE',
  'ASPIRIN',
  '5 MG-325 MG TABLET',
  'Drug',
  'Dose',
  -534,
  -97,
  'medications',
  'Discharge medications',
  '. Discharge medications',
  '81',
  '81 mg',
  '81 mg Tablet',
  'acetaminophen',
  '- acetaminophen',
  'HYDROcodone - acetaminophen',
  ',',
  ', Ordered',
  ', Ordered By'],
 ['NEGATIVE',
  'ASPIRIN',
  '1 TABLET',
  'Drug',
  'Dose',
  -607,
  -111,
  'medications',
  'Discharge medications',
  '. Discharge medications',
  '81',
  '81 mg',
  '81 mg Tablet',
  ':',
  'Directions :',
  'Name Directions :',
  'oral',
  'oral every',
  'oral every four'],
 ['NEGATIVE',
  'ASPIRIN',
  'ORAL',
  'Drug',
  'Route',
  -616,
  -113,
  'medications',
  'Discharge medications',
  '. Discharge medications',
  '81',
  '81 mg',
  '81 mg Tablet',
  'tablet',
  '1 tablet',
  ': 1 tablet',
  'every',
  'every four',
  'every four hours'],
 ['NEGATIVE',
  'ASPIRIN',
  'EVERY FOUR HOURS PRN',
  'Drug',
  'Frequency',
  -621,
  -114,
  'medications',
  'Discharge medicat

In [199]:
len(tt_cross)

2814

In [200]:
len(f_cross)

28592

In [201]:
len(ft_cross)

28592

In [295]:
print(tt_cross[:100])
print()
print(ft_cross[:100])

[['reason', 'RITUXIMAB', 'FOLLICULAR SMALL CLEAVED LYMPHOCYTIC LYMPHOMA', 'Drug', 'Indication', 171, 26, 'completed', 'having completed', 'after having completed', ',', ', status', ', status post', 'of', 'history of', 'a history of', ',', ', stage', ', stage II'], ['reason', 'CYCLOPHOSPHAMIDE', 'FOLLICULAR SMALL CLEAVED LYMPHOCYTIC LYMPHOMA', 'Drug', 'Indication', 194, 30, 'post', 'status post', ', status post', ',', ', vincristine', ', vincristine and', 'of', 'history of', 'a history of', ',', ', stage', ', stage II'], ['reason', 'VINCRISTINE', 'FOLLICULAR SMALL CLEAVED LYMPHOCYTIC LYMPHOMA', 'Drug', 'Indication', 212, 32, ',', 'cyclophosphamide ,', 'post cyclophosphamide ,', 'and', 'and prednisone', 'and prednisone ,', 'of', 'history of', 'a history of', ',', ', stage', ', stage II'], ['reason', 'PREDNISONE', 'FOLLICULAR SMALL CLEAVED LYMPHOCYTIC LYMPHOMA', 'Drug', 'Indication', 228, 34, 'and', 'vincristine and', ', vincristine and', ',', ', of', ', of which', 'of', 'history of', 'a 

In [294]:
labels

{'adverse', 'do', 'du', 'fr', 'manner/route', 'reason', 'severity_type'}

In [18]:
d = dict()
for each in tt_cross:
    if each[0] in d:
        d[each[0]] += 1
    else:
        d[each[0]] = 1

In [19]:
d

{'reason': 1257,
 'adverse': 411,
 'fr': 356,
 'du': 67,
 'do': 389,
 'severity_type': 47,
 'manner/route': 287}

In [296]:
labels.add('NEGATIVE')

In [96]:
print(train_data[100:105])
len(train_data)

[['NEGATIVE', 'CLONAZEPAM', 'PANCYTOPENIA', 'Drug', 'ADE', 53, 7, ',', 'Calcium ,', ': Calcium ,', ',', ', diltiazem', ', diltiazem ,', 'profound', 'caused profound', 'which caused profound', '.', '. ADDITIONAL', '. ADDITIONAL MEDICATIONS'], ['NEGATIVE', 'DILTIAZEM', 'PANCYTOPENIA', 'Drug', 'ADE', 65, 9, ',', 'clonazepam ,', ', clonazepam ,', ',', ', doxazosin', ', doxazosin ,', 'profound', 'caused profound', 'which caused profound', '.', '. ADDITIONAL', '. ADDITIONAL MEDICATIONS'], ['NEGATIVE', 'DOXAZOSIN', 'PANCYTOPENIA', 'Drug', 'ADE', 76, 11, ',', 'diltiazem ,', ', diltiazem ,', ',', ', fish', ', fish oil', 'profound', 'caused profound', 'which caused profound', '.', '. ADDITIONAL', '. ADDITIONAL MEDICATIONS'], ['NEGATIVE', 'FISH OIL', 'PANCYTOPENIA', 'Drug', 'ADE', 87, 13, ',', 'doxazosin ,', ', doxazosin ,', ',', ', furosemide', ', furosemide ,', 'profound', 'caused profound', 'which caused profound', '.', '. ADDITIONAL', '. ADDITIONAL MEDICATIONS'], ['NEGATIVE', 'FUROSEMIDE', 'P

26765

In [279]:
output_SVM_training_data("umass18SVM_cross_sent.svm", train_data, labels)

label dictionary is store in umass18SVM_cross_sent.svm_index2label.pkl
The training data is in file umass18SVM_cross_sent.svm.svm
The features and associated code is in file umass18SVM_cross_sent.svm_features_dict.pkl


In [297]:
td = tt_cross + ft_cross
td[:1000]

[['reason',
  'RITUXIMAB',
  'FOLLICULAR SMALL CLEAVED LYMPHOCYTIC LYMPHOMA',
  'Drug',
  'Indication',
  171,
  26,
  'completed',
  'having completed',
  'after having completed',
  ',',
  ', status',
  ', status post',
  'of',
  'history of',
  'a history of',
  ',',
  ', stage',
  ', stage II'],
 ['reason',
  'CYCLOPHOSPHAMIDE',
  'FOLLICULAR SMALL CLEAVED LYMPHOCYTIC LYMPHOMA',
  'Drug',
  'Indication',
  194,
  30,
  'post',
  'status post',
  ', status post',
  ',',
  ', vincristine',
  ', vincristine and',
  'of',
  'history of',
  'a history of',
  ',',
  ', stage',
  ', stage II'],
 ['reason',
  'VINCRISTINE',
  'FOLLICULAR SMALL CLEAVED LYMPHOCYTIC LYMPHOMA',
  'Drug',
  'Indication',
  212,
  32,
  ',',
  'cyclophosphamide ,',
  'post cyclophosphamide ,',
  'and',
  'and prednisone',
  'and prednisone ,',
  'of',
  'history of',
  'a history of',
  ',',
  ', stage',
  ', stage II'],
 ['reason',
  'PREDNISONE',
  'FOLLICULAR SMALL CLEAVED LYMPHOCYTIC LYMPHOMA',
  'Drug',
  '

In [281]:
labels

{'NEGATIVE',
 'adverse',
 'do',
 'du',
 'fr',
 'manner/route',
 'reason',
 'severity_type'}

In [252]:
label_cat_set

{'NEGATIVE',
 'adverse',
 'do',
 'du',
 'fr',
 'manner/route',
 'reason',
 'severity_type'}

In [298]:
output_training_data_for_random_forest("umassRF_training_cross1", td, labels)

label dictionary is store in umassRF_training_cross1_index2label.pkl


## recount number of cross sentence cases

In [21]:
def rel_count(rels, anns, nsents, aid2idx, tid2idx):
    d = {k:0 for k in range(4)}
    for each in rels:
        ann1_id = each[2]
        ann2_id = each[3]
        if ann1_id == ann2_id:
            continue
            
        #text of entities in the relation
        tk1 = anns[aid2idx[ann1_id]]
        tk2 = anns[aid2idx[ann2_id]]
         
        #using the sentence idx of the first token in an entity to represent the entity sentence idx
        pos1 = tid2idx[ann1_id]
        pos2 = tid2idx[ann2_id]
        tk1_sent_idx = nsents[pos1[0]][-1]
        tk2_sent_idx = nsents[pos2[0]][-1]
        
        #check if entity 1 and entity 2 are in same sentence
        #if two entities are in the same sentence, the sent idx difference should 0
        diff = abs(tk1_sent_idx - tk2_sent_idx)
        if diff < 3:
            d[diff] += 1
        else:
            d[3] += 1
            
    return d 

In [22]:
total = dict()
for file_id in os.listdir(train_file_dir):
        bioc_file = bioc_dir + file_id + bioc_suffix
        sent_file = sent_file_dir + file_id + sent_suffix
        map_file = map_file_dir + file_id + map_suffix
        rel_file = rel_file_dir + file_id + rel_suffix
        res_file = "cross_sent_rel/"+ file_id + ".txt"
        
        print("********" + file_id + "********")
        #read sentence mapping file
        sents, sent_boundry_map = extract_sent_map(map_file_dir + file_id + map_suffix)
        #read bioc file
        txt = process_bioc(bioc_dir + file_id + bioc_suffix)
        #get all entities from bioc
        anns, aid2idx = extract_entities_from_bioc(txt)
        #get all relations from bioc
        nsents, tid2idx = label_tokens_in_sent(anns, sents)
        t_rels = extract_relation_from_bioc(txt)
        d = rel_count(t_rels, anns, nsents, aid2idx, tid2idx)
        for k, v in d.items():
            if k in total:
                total[k] += v
            else:
                total[k] = v

********12_123********
['13243', 'Drug', 1078, 1104, 'MULTIVITAMINS (TAB-A-VITE)'] start boundary not match - check corpus
********4_857********
['90589', 'Duration', 4836, 4844, '2 cycles'] start boundary not match - check corpus
********10_988********
['104675', 'Severity', 4169, 4180, 'significant'] start boundary not match - check corpus
********17_839********
********13_513********
['54443', 'Duration', 938, 946, '6 cycles'] start boundary not match - check corpus
['54444', 'Duration', 984, 992, '4 cycles'] start boundary not match - check corpus
********1_985********
['104367', 'Frequency', 2099, 2104, '1 day'] start boundary not match - check corpus
['104377', 'Duration', 2558, 2565, '1 cycle'] start boundary not match - check corpus
['104289', 'Severity', 5775, 5780, 'quite'] start boundary not match - check corpus
['104291', 'ADE', 6456, 6479, 'abnormal renal function'] start boundary not match - check corpus
['104301', 'Duration', 6762, 6763, '4'] start boundary not match - c

********13_86********
['9228', 'Drug', 3495, 3502, 'PainMed'] start boundary not match - check corpus
********1_860********
['91031', 'SSLIF', 3731, 3738, 'Fatigue'] start boundary not match - check corpus
********1_894********
['94514', 'Severity', 7859, 7866, 'inching'] start boundary not match - check corpus
********12_690********
********21_816********
********13_891********
********14_920********
['97644', 'Drug', 1410, 1437, 'calcium containing antacids'] start boundary not match - check corpus
********19_963********
['101856', 'Severity', 1302, 1311, 'partially'] start boundary not match - check corpus
['101866', 'Severity', 3042, 3050, 'severely'] start boundary not match - check corpus
********7_378********
********7_982********
['103860', 'Severity', 5146, 5155, 'minimally'] start boundary not match - check corpus
['103866', 'ADE', 6111, 6132, 'compression fractures'] start boundary not match - check corpus
['103877', 'SSLIF', 6510, 6522, 'bone problem'] start boundary not ma

['105787', 'Dose', 1313, 1314, '1'] start boundary not match - check corpus
['105789', 'Dose', 1646, 1647, '1'] start boundary not match - check corpus
['105801', 'Dose', 2414, 2419, '40 mg'] start boundary not match - check corpus
********14_767********
['80605', 'Severity', 2005, 2010, 'a bit'] start boundary not match - check corpus
********10_1014********
********1_886********
['93955', 'Indication', 5049, 5070, 'Peripheral neuropathy'] start boundary not match - check corpus
********7_731********
********7_355********
['37622', 'SSLIF', 5689, 5729, 'LDH and reticulocyte count are elevated'] start boundary not match - check corpus
['37746', 'SSLIF', 5749, 5761, 'bony disease'] start boundary not match - check corpus
['37698', 'Severity', 6406, 6411, 'quite'] start boundary not match - check corpus
********9_319********
********1_620********
['64741', 'Drug', 3309, 3319, 'MS Contin'] start boundary not match - check corpus
['64889', 'Drug', 3310, 3319, 'MS Contin'] start boundary no

['29548', 'Severity', 2034, 2038, 'mild'] start boundary not match - check corpus
['29595', 'Severity', 6923, 6931, 'slightly'] start boundary not match - check corpus
['29470', 'Severity', 7031, 7039, 'slightly'] start boundary not match - check corpus
********1_157********
['16605', 'Severity', 5042, 5046, 'MILD'] start boundary not match - check corpus
********1_365********
********1_20********
['1915', 'SSLIF', 3457, 3465, 'fatigued'] start boundary not match - check corpus
['1874', 'Severity', 5400, 5404, 'very'] start boundary not match - check corpus
********14_487********
['51668', 'Drug', 818, 825, 'calcium'] start boundary not match - check corpus
********10_539********
********3_301********
********10_14********
********10_162********
********1_739********
['77342', 'ADE', 5249, 5264, 'urinary urgency'] start boundary not match - check corpus
['77388', 'ADE', 9693, 9708, 'urinary urgency'] start boundary not match - check corpus
********7_628********
['65973', 'Severity', 52

['82961', 'Indication', 946, 951, 'fever'] start boundary not match - check corpus
['82841', 'SSLIF', 2488, 2494, 'Sweats'] start boundary not match - check corpus
['82842', 'SSLIF', 2501, 2507, 'Chills'] start boundary not match - check corpus
['82843', 'SSLIF', 2512, 2523, 'Weight Loss'] start boundary not match - check corpus
['82844', 'SSLIF', 2528, 2539, 'Weight Gain'] start boundary not match - check corpus
['82845', 'SSLIF', 2559, 2569, 'Congestion'] start boundary not match - check corpus
['82936', 'SSLIF', 2574, 2585, 'Sore Throat'] start boundary not match - check corpus
['82846', 'SSLIF', 2632, 2639, 'Dysuria'] start boundary not match - check corpus
['82847', 'SSLIF', 2644, 2653, 'Hematuria'] start boundary not match - check corpus
['82848', 'SSLIF', 2665, 2671, 'Rashes'] start boundary not match - check corpus
['82937', 'SSLIF', 2708, 2718, 'Joint Pain'] start boundary not match - check corpus
['82887', 'SSLIF', 2723, 2732, 'Back Pain'] start boundary not match - check cor

********3_466********
********10_602********
********10_292********
['31658', 'ADE', 588, 609, 'Peripheral neuropathy'] start boundary not match - check corpus
['31607', 'SSLIF', 1031, 1039, 'tingling'] start boundary not match - check corpus
['31608', 'SSLIF', 1044, 1064, 'numbness of his feet'] start boundary not match - check corpus
['31609', 'SSLIF', 1069, 1112, 'loss of sensation in the tip of his fingers'] start boundary not match - check corpus
********12_834********
********19_196********
********10_259********
********14_119********
********1_276********
********1_612********
********7_393********
['41456', 'Severity', 3788, 3792, 'very'] start boundary not match - check corpus
********1_282********
['30810', 'SSLIF', 535, 557, 'IgG kappa paraprotein'] start boundary not match - check corpus
['30725', 'Severity', 7325, 7329, 'very'] start boundary not match - check corpus
['30808', 'SSLIF', 7744, 7753, 'epistaxis'] start boundary not match - check corpus
['30806', 'SSLIF', 810

********17_611********
['63968', 'Dose', 1580, 1587, '1 liter'] start boundary not match - check corpus
********12_505********
['53580', 'Severity', 460, 469, 'low-grade'] start boundary not match - check corpus
********12_737********
********5_48********
['4960', 'SSLIF', 4469, 4508, 'supraclavicular or axillary adenopathy'] start boundary not match - check corpus
['4992', 'SSLIF', 6493, 6505, 'pancytopenia'] start boundary not match - check corpus
********6_247********
********1_130********
['13719', 'Duration', 1894, 1899, '1 day'] start boundary not match - check corpus
['13730', 'Duration', 2357, 2364, '1 cycle'] start boundary not match - check corpus
['13613', 'Drug', 2516, 2525, 'vitamin D'] start boundary not match - check corpus
['13774', 'Severity', 5911, 5916, 'quite'] start boundary not match - check corpus
['13802', 'SSLIF', 6938, 6949, 'Neutropenia'] start boundary not match - check corpus
********1_302********
['32803', 'ADE', 7831, 7841, 'neuropathy'] start boundary no

In [23]:
total

{0: 18948, 1: 2814, 2: 615, 3: 691}

## prediction2BioC

#### generate training samples

In [283]:
def generate_data_for_prediction(gid, file_id, anns, sent_boundry_map, map_dict, in_sent_rels=[], in_sent=True):
    ann_in_sent = _group_annotaion_by_sent(anns, sent_boundry_map)
    rels = []
    
    if in_sent:
        entity_type_rule_set = {
            ('Drug', 'ADE'), ('SSLIF', 'ADE'), ('Drug', 'Dose'), 
            ('Drug', 'Duration'), ('Drug', 'Frequency'), ('Drug', 'Indication'), 
            ('SSLIF', 'Severity'), ('ADE', 'Severity'), ('Indication', 'Severity'), 
            ('Drug', 'Route')
        }
        
        for k, v in ann_in_sent.items():
            for e1, e2 in itertools.permutations(v, 2):
                rel_pair = (anns[e1][0], anns[e2][0])
                rel_type_pair = (anns[e1][1], anns[e2][1])
                if rel_type_pair in entity_type_rule_set:  
                    rels.append(rel_pair)
                    map_dict[gid] = [file_id, rel_pair]
                    gid += 1
    else:
        entity_type_rule_set = {
            ('Drug', 'ADE'), ('Drug', 'Dose'), ('Drug', 'Frequency'), 
            ('Drug', 'Indication'), ('Drug', 'Route')
        }
            
        for i in range(len(ann_in_sent)-1):
            j = i + 1
            v = ann_in_sent[i] + ann_in_sent[j]
            for e1, e2 in itertools.permutations(v, 2):
                rel_pair = (anns[e1][0], anns[e2][0])
                rel_type_pair = (anns[e1][1], anns[e2][1])
                if rel_type_pair in entity_type_rule_set and rel_pair not in in_sent_rels:
                    rels.append(rel_pair)    
                    map_dict[gid] = [file_id, rel_pair]
                    gid += 1
    
    return rels, gid


#use this function to load features and idx2label
def load_dict(file_name):
    with open(file_name, "rb") as f:
        d = pkl.load(f)
    return d

def dump_dict(data, file_name):
    with open(file_name, "wb") as f:
        pkl.dump(data, f)


def generate_features_for_prediction(rels, anns, nsents, aid2idx, tid2idx, in_sent=True):
    rel_fea = []
    
    if in_sent:
        cross = 0
    else:
        cross = 1
    
    for each in rels:
        ann1_id = each[0]
        ann2_id = each[1]
        
        #text of entities in the relation
        tk1 = anns[aid2idx[ann1_id]]
        tk2 = anns[aid2idx[ann2_id]]
         
        #using the sentence idx of the first token in an entity to represent the entity sentence idx
        pos1 = tid2idx[ann1_id]
        pos2 = tid2idx[ann2_id]
        tk1_sent_idx = nsents[pos1[0]][-1]
        tk2_sent_idx = nsents[pos2[0]][-1]
        
        #check if entity 1 and entity 2 are in same sentence
        #if two entities are in the same sentence, the sent idx difference should 0
        if abs(tk1_sent_idx - tk2_sent_idx) == cross:
            fea1 = "" + tk1[-1].upper()           
            fea2 = "" + tk2[-1].upper()         
            #entity type
            fea3 = "" + tk1[1]            
            fea4 = "" + tk2[1]           
            #entities distance measured by diference of the start position before mapping
            fea5 = tk1[2] - tk2[2] 
            fea6 = pos1[0] - pos2[0]
            #ngram features: window size = 3
            fea_ngram_en1 = extract_ngram_feature(nsents, pos1)
            fea_ngram_en2 = extract_ngram_feature(nsents, pos2)
            feas = [fea1, fea2, fea3, fea4, fea5, fea6] + fea_ngram_en1 + fea_ngram_en2
            rel_fea.append([fea for fea in feas if fea != ""])     
    
    return rel_fea


def output_SVM_prediction_data(file_name, data, feature_dict_file):
    #read feature dictionary (within and cross sentence data should have different feature dict)
    feature_dict = load_dict(feature_dict_file)

    svm_file = file_name + ".svm"
    with open(svm_file, "w") as  fw:
        for each in data:
            stt = "-1"
            tset = set()
            #dis
            stt = "{} 1:{}".format(stt, each.pop(4))
            if stt == "-1 1:0":
                print(each)
            #pos
            stt = "{} 2:{}".format(stt, each.pop(4))
            #rest features
            for fea in each:
                if fea in feature_dict:
                    code = feature_dict[fea]
                    tset.add(code)    
            #sort the tset
            tset = sorted(list(tset))
            for each in tset:
                stt = "{} {}:{}".format(stt, each, 1)
            print(stt, file=fw, end="\n")
    print("The prediction data is in file " + svm_file)

In [284]:
def output_testing_data_for_random_forest(file_name, fea):
    rf_file = file_name + ".rf"
    with open(rf_file, "w") as f:
        for each in fea:
            dis = str(each.pop(4))
            tok_dis = str(each.pop(4))
            line = [dis, tok_dis]
            for e in each:
                line.append(e)
            print("\t".join(line), file=f, end="\n")

In [72]:
#this is a test
file_id = '7_670'

'''
map_dict:
    {
        idx:
        [file_id, (en1_id, en2_id)]
    }
'''
#define in global
in_map_dict = dict() 
cross_map_dict = dict()
data4pred_in = []
data4pred_cross = []
within_feature_data = []
cross_feature_data = []

in_id = 0
cross_id = 0
in_sent_file = "umass_in_sent_prediction"
cross_sent_file = "umass_cross_sent_prediciton"
in_sent_feature_dict_file = "features_labels/umass18SVM_within_sent_features_dict.pkl"
cross_sent_feature_dict_file = "features_labels/umass18SVM_cross_sent.svm_features_dict.pkl"
in_sent_label_idx_dict_file = "features_labels/umass18SVM_within_sent_index2label.pkl"
cross_sent_label_idx_dict_file = "features_labels/umass18SVM_cross_sent.svm_index2label.pkl"

#for each files
bioc_file = bioc_dir + file_id + bioc_suffix
sent_file = sent_file_dir + file_id + sent_suffix
map_file = map_file_dir + file_id + map_suffix
rel_file = rel_file_dir + file_id + rel_suffix

print("********" + file_id + "********")
#read sentence mapping file
sents, sent_boundry_map = extract_sent_map(map_file_dir + file_id + map_suffix)
#read bioc file
txt = process_bioc(bioc_dir + file_id + bioc_suffix)
#get all entities from bioc
anns, aid2idx = extract_entities_from_bioc(txt)
#get all relations from bioc
nsents, tid2idx = label_tokens_in_sent(anns, sents)

#generate relations within one sentence
within_rels, in_id = generate_data_for_prediction(in_id, file_id, anns, sent_boundry_map, in_map_dict)
data4pred_in.extend(within_rels)
within_fea = generate_features_for_prediction(within_rels, anns, nsents, aid2idx, tid2idx)
within_feature_data.extend(within_fea)
print(len(within_rels))
print(len(within_feature_data))

#generate relations cross sentence
cross_rels, cross_id = generate_data_for_prediction(cross_id, file_id, anns, sent_boundry_map, cross_map_dict, within_rels, False)
data4pred_cross.extend(cross_rels)
cross_fea = generate_features_for_prediction(cross_rels, anns, nsents, aid2idx, tid2idx, False)
cross_feature_data.extend(cross_fea)
print(len(cross_rels))
print(len(cross_feature_data))



#global
#output in sent prediction data
output_SVM_prediction_data(in_sent_file, within_feature_data, in_sent_feature_dict_file, in_sent_label_idx_dict_file)
#output cross sent prediction data
output_SVM_prediction_data(cross_sent_file, cross_feature_data, cross_sent_feature_dict_file, cross_sent_label_idx_dict_file)

********7_670********
['69917', 'Severity', 2893, 2901, 'severely'] start boundary not match - check corpus
['70075', 'Drug', 7065, 7085, 'POLYETHYLENE GLYCOL'] start boundary not match - check corpus
['70207', 'Severity', 10747, 10753, 'mildly'] start boundary not match - check corpus
['70287', 'SSLIF', 5924, 5939, 'discharge (L)']  is in two sentences. Assign to the first sentence
entity index is  159
['70067', 'SSLIF', 6228, 6254, 'L Dorsalis pedis reduced']  is in two sentences. Assign to the first sentence
entity index is  167
['69972', 'SSLIF', 8549, 8568, 'warm IgG antibody']  is in two sentences. Assign to the first sentence
entity index is  267
0
455
455
['70287', 'SSLIF', 5924, 5939, 'discharge (L)']  is in two sentences. Assign to the first sentence
entity index is  159
['70067', 'SSLIF', 6228, 6254, 'L Dorsalis pedis reduced']  is in two sentences. Assign to the first sentence
entity index is  167
['69972', 'SSLIF', 8549, 8568, 'warm IgG antibody']  is in two sentences. Ass

In [305]:
cross_map_dict

{0: ['18_821', ('6642', '6643')],
 1: ['18_821', ('6642', '6646')],
 2: ['18_821', ('6647', '6643')],
 3: ['18_821', ('6647', '6646')],
 4: ['18_821', ('6648', '6643')],
 5: ['18_821', ('6648', '6646')],
 6: ['18_821', ('6649', '6643')],
 7: ['18_821', ('6649', '6646')],
 8: ['18_821', ('6650', '6643')],
 9: ['18_821', ('6650', '6646')],
 10: ['18_821', ('6657', '6662')],
 11: ['18_821', ('6657', '6663')],
 12: ['18_821', ('6657', '6665')],
 13: ['18_821', ('6664', '6658')],
 14: ['18_821', ('6664', '6659')],
 15: ['18_821', ('6664', '6660')],
 16: ['18_821', ('6664', '6667')],
 17: ['18_821', ('6664', '6668')],
 18: ['18_821', ('6664', '6669')],
 19: ['18_821', ('6666', '6662')],
 20: ['18_821', ('6666', '6663')],
 21: ['18_821', ('6666', '6665')],
 22: ['18_821', ('6666', '6671')],
 23: ['18_821', ('6666', '6672')],
 24: ['18_821', ('6666', '6673')],
 25: ['18_821', ('6670', '6667')],
 26: ['18_821', ('6670', '6668')],
 27: ['18_821', ('6670', '6669')],
 28: ['18_821', ('6670', '6675

In [306]:
in_map_dict

{0: ['18_821', ('6644', '6643')],
 1: ['18_821', ('6644', '6646')],
 2: ['18_821', ('6645', '6643')],
 3: ['18_821', ('6645', '6646')],
 4: ['18_821', ('6651', '6653')],
 5: ['18_821', ('6652', '6653')],
 6: ['18_821', ('6654', '6653')],
 7: ['18_821', ('6655', '6656')],
 8: ['18_821', ('6657', '6658')],
 9: ['18_821', ('6657', '6659')],
 10: ['18_821', ('6657', '6660')],
 11: ['18_821', ('6657', '6661')],
 12: ['18_821', ('6664', '6662')],
 13: ['18_821', ('6664', '6663')],
 14: ['18_821', ('6664', '6665')],
 15: ['18_821', ('6666', '6667')],
 16: ['18_821', ('6666', '6668')],
 17: ['18_821', ('6666', '6669')],
 18: ['18_821', ('6670', '6671')],
 19: ['18_821', ('6670', '6672')],
 20: ['18_821', ('6670', '6673')],
 21: ['18_821', ('6674', '6675')],
 22: ['18_821', ('6674', '6676')],
 23: ['18_821', ('6674', '6677')],
 24: ['18_821', ('6678', '6679')],
 25: ['18_821', ('6678', '6680')],
 26: ['18_821', ('6678', '6681')],
 27: ['18_821', ('6684', '6685')],
 28: ['18_821', ('6686', '6685

#### test on whole test set

In [333]:
in_map_dict = dict() 
cross_map_dict = dict()
data4pred_in = []
data4pred_cross = []
within_feature_data = []
cross_feature_data = []

in_id = 0
cross_id = 0
in_sent_file = "umass_testing_within"
cross_sent_file = "umass_testing_cross"
in_sent_feature_dict_file = "features_labels/umass18SVM_within_sent_features_dict.pkl"
cross_sent_feature_dict_file = "features_labels/umass18SVM_cross_sent.svm_features_dict.pkl"
umass_cross_sent_entity2idx_map_file = "features_labels/umass_svm_cross_e2e_entity2idx.pkl"
umass_in_sent_entity2idx_map_file = "features_labels/umass_svm_within_e2e_entity2idx.pkl"


#for each files
bioc_file = bioc_dir + file_id + bioc_suffix
sent_file = sent_file_dir + file_id + sent_suffix
map_file = map_file_dir + file_id + map_suffix
rel_file = rel_file_dir + file_id + rel_suffix

map_file_dir = "made_test_data_full/corpus_sent/"
# bioc_dir = "made_test_data_full/NER_pred_res/" 
bioc_dir = "made_test_data_full/annotations/" 
c_dir = "made_test_data_full/corpus/"

for file_id in os.listdir(c_dir):
    print("********" + file_id + "********")
    #read sentence mapping file
    sents, sent_boundry_map = extract_sent_map(map_file_dir + file_id + map_suffix)
    #read bioc file
    txt = process_bioc(bioc_dir + file_id + bioc_suffix)
    #get all entities from bioc
    anns, aid2idx = extract_entities_from_bioc(txt)
    #get all relations from bioc
    nsents, tid2idx = label_tokens_in_sent(anns, sents)

    #generate relations within one sentence
    within_rels, in_id = generate_data_for_prediction(in_id, file_id, anns, sent_boundry_map, in_map_dict)
    data4pred_in.extend(within_rels)
    within_fea = generate_features_for_prediction(within_rels, anns, nsents, aid2idx, tid2idx)
    within_feature_data.extend(within_fea)
    print(len(within_rels))
    print(len(within_feature_data))

    #generate relations cross sentence
    cross_rels, cross_id = generate_data_for_prediction(cross_id, file_id, anns, sent_boundry_map, cross_map_dict, within_rels, False)
    data4pred_cross.extend(cross_rels)
    cross_fea = generate_features_for_prediction(cross_rels, anns, nsents, aid2idx, tid2idx, False)
    cross_feature_data.extend(cross_fea)
    print(len(cross_rels))
    print(len(cross_feature_data))

#global

#random forest data generation
output_testing_data_for_random_forest("umassRF_testing_within", within_feature_data)
output_testing_data_for_random_forest("umassRF_testing_cross", cross_feature_data)

#output in sent prediction data
output_SVM_prediction_data(in_sent_file, within_feature_data, in_sent_feature_dict_file)
dump_dict(cross_map_dict, umass_cross_sent_entity2idx_map_file)
# #output cross sent prediction data
output_SVM_prediction_data(cross_sent_file, cross_feature_data, cross_sent_feature_dict_file)
dump_dict(in_map_dict, umass_in_sent_entity2idx_map_file)

********18_821********
['86346', 'Severity', 832, 841, 'stage IIA'] start boundary not match - check corpus
['86355', 'SSLIF', 904, 922, 'pulmonary toxicity'] start boundary not match - check corpus
['86335', 'SSLIF', 1952, 1958, 'Nausea'] start boundary not match - check corpus
['86336', 'SSLIF', 1963, 1971, 'Vomiting'] start boundary not match - check corpus
['86337', 'SSLIF', 1976, 1984, 'Diarrhea'] start boundary not match - check corpus
45
45
77
77
********11_742********
2
47
0
77
********11_113********
['12276', 'SSLIF', 4003, 4014, 'cardiotoxic'] start boundary not match - check corpus
['12274', 'ADE', 4214, 4237, 'ventricular dysfunction'] start boundary not match - check corpus
['12275', 'ADE', 4426, 4449, 'ventricular dysfunction'] start boundary not match - check corpus
8
55
2
79
********20_965********
6
61
3
82
********18_60********
17
78
3
85
********11_388********
0
78
0
85
********12_987********
['104542', 'Severity', 504, 513, 'low-grade'] start boundary not match - che

['53192', 'Severity', 542, 550, 'stage 2A'] start boundary not match - check corpus
8
4777
1
1804
********20_574********
6
4783
0
1804
********1_1071********
['113855', 'Severity', 3431, 3446, 'almost entirely'] start boundary not match - check corpus
473
5256
0
1804
********12_994********
['105068', 'Severity', 459, 468, 'low-grade'] start boundary not match - check corpus
73
5329
32
1836
********20_382********
['40304', 'Severity', 1126, 1130, 'some'] start boundary not match - check corpus
['40287', 'ADE', 1220, 1226, 'groggy'] start boundary not match - check corpus
['40277', 'SSLIF', 1361, 1377, 'mind is in a fog'] start boundary not match - check corpus
['40283', 'SSLIF', 3371, 3383, 'fibromyalgia'] start boundary not match - check corpus
29
5358
16
1852
********11_100********
['10904', 'Indication', 1204, 1212, 'nauseous'] start boundary not match - check corpus
['10859', 'SSLIF', 3025, 3029, 'rash'] start boundary not match - check corpus
['10860', 'SSLIF', 3062, 3066, 'rash'] 

['110551', 'Drug', 2446, 2453, 'calcium'] start boundary not match - check corpus
77
9699
10
3690
********20_721********
['75198', 'SSLIF', 565, 577, 'losing words'] start boundary not match - check corpus
['75262', 'SSLIF', 1164, 1174, 'Osteopenia'] start boundary not match - check corpus
['75266', 'SSLIF', 1803, 1818, 'sinuses tender'] start boundary not match - check corpus
17
9716
2
3692
********18_495********
19
9735
23
3715
********18_698********
['73018', 'ADE', 968, 980, 'constipation'] start boundary not match - check corpus
198
9933
83
3798
********11_754********
3
9936
0
3798
********20_945********
['100238', 'Severity', 2255, 2260, 'gross'] start boundary not match - check corpus
6
9942
0
3798
********20_717********
['74752', 'Duration', 477, 485, '2 cycles'] start boundary not match - check corpus
['74753', 'Duration', 493, 501, '2 cycles'] start boundary not match - check corpus
['74754', 'Duration', 515, 523, '2 cycles'] start boundary not match - check corpus
['74736', 

In [318]:
in_map_dict

{0: ['18_821', ('6644', '6643')],
 1: ['18_821', ('6644', '6646')],
 2: ['18_821', ('6645', '6643')],
 3: ['18_821', ('6645', '6646')],
 4: ['18_821', ('6651', '6653')],
 5: ['18_821', ('6652', '6653')],
 6: ['18_821', ('6654', '6653')],
 7: ['18_821', ('6655', '6656')],
 8: ['18_821', ('6657', '6658')],
 9: ['18_821', ('6657', '6659')],
 10: ['18_821', ('6657', '6660')],
 11: ['18_821', ('6657', '6661')],
 12: ['18_821', ('6664', '6662')],
 13: ['18_821', ('6664', '6663')],
 14: ['18_821', ('6664', '6665')],
 15: ['18_821', ('6666', '6667')],
 16: ['18_821', ('6666', '6668')],
 17: ['18_821', ('6666', '6669')],
 18: ['18_821', ('6670', '6671')],
 19: ['18_821', ('6670', '6672')],
 20: ['18_821', ('6670', '6673')],
 21: ['18_821', ('6674', '6675')],
 22: ['18_821', ('6674', '6676')],
 23: ['18_821', ('6674', '6677')],
 24: ['18_821', ('6678', '6679')],
 25: ['18_821', ('6678', '6680')],
 26: ['18_821', ('6678', '6681')],
 27: ['18_821', ('6684', '6685')],
 28: ['18_821', ('6686', '6685

In [319]:
cross_map_dict[0]

['18_821', ('6642', '6643')]

#### convert SVM predication results to BioC format

In [336]:
'''
<relation id="102">
    <infon key="type">reason</infon>
    <node refid="631" role="annotation 1"/>
    <node refid="629" role="annotation 2"/>
</relation>
'''

def make_xml_body_head(file_id):
    return """<?xml version='1.0' encoding='utf-8' standalone='yes'?>
<collection>
  <source></source>
  <date></date>
  <key></key>
  <document>
    <id>%s</id>
    <passage>
      <offset>0</offset>"""%file_id


def make_xml_body_tail():
    return"""
    </passage>
  </document>
</collection>"""


def make_xml_relation(rid, rtype, e1_id, e2_id):
    return """
      <relation id="{}">
        <infon key="type">{}</infon>
        <node refid="{}" role="annotation 1"/>
        <node refid="{}" role="annotation 2"/>
      </relation>""".format(rid, rtype, e1_id, e2_id)


def make_xml_annotation(wid, tag, length, offset, text):
    return """
      <annotation id="{}">
        <infon key="type">{}</infon>
        <location length="{}" offset="{}"/>
        <text>{}</text>
      </annotation>""".format(wid, tag, length, offset, text)


def map_predict_res(pred_res, idx2label_file, idx2entity_file, prob=True):
    idx2label = load_dict(idx2label_file)
    idx2entity = load_dict(idx2entity_file)
    group_by_file = dict()
    with open(pred_res, "r") as f:
        if prob:
            for idx, line in enumerate(f):
                if idx == 0:
                    pass
                else:
                    label = int(line[:-1].split(" ")[0])
                    if label != 0:
                        rtype = idx2label[label]
                        rel_en = idx2entity[idx-1]
                        file_id = rel_en[0]
                        pred_rel = (rtype, rel_en[1])
                        if file_id not in group_by_file:
                            group_by_file[file_id] = [pred_rel]
                        else:
                            group_by_file[file_id].append(pred_rel)
        else:
            for idx, line in enumerate(f):
                if int(line[:-1]) != 0:
                    rtype = idx2label[int(line[:-1])]
                    rel_en = idx2entity[idx]
                    file_id = rel_en[0]
                    pred_rel = (rtype, rel_en[1])
                    if file_id not in group_by_file:
                        group_by_file[file_id] = [pred_rel]
                    else:
                        group_by_file[file_id].append(pred_rel)
    return group_by_file


def merge_pred_results_dict(d1, d2):
    #d1, d2 values are lists
    #d1, d2 should have same keys
    d = copy.deepcopy(d1)
    for k, v in d2.items():
        if k in d:
            d[k].extend(v)
        else:
            d[k] = v
    return d


def output_relation_BioC(rel_dir, grouped_data):
    global_id = 1
    if not os.path.isdir(rel_dir):
        os.mkdir(rel_dir)
    for file_id, rels in grouped_data.items():
        bioc_file = rel_dir + file_id + bioc_suffix
        head = make_xml_body_head(file_id)
        tail = make_xml_body_tail()
        rel = ""
        for each in rels:
            rel += make_xml_relation(global_id, each[0], each[1][0], each[1][1])
            global_id += 1
        with open(bioc_file, "w") as f:
            f.write(head + rel + tail)
            
def fill_not_appeared_file(corpus, pred_dir):
    file_id_list = os.listdir(corpus)
    pred_file_id = set(os.listdir(pred_dir))
    for each in file_id_list:
        file_id = each + bioc_suffix
        if file_id not in pred_file_id:
            head = make_xml_body_head(file_id)
            tail = make_xml_body_tail()
            bioc_file = pred_dir + "/" + file_id
            with open(bioc_file, "w") as f:
                f.write(head + tail)
                

def merge_ann_rel(rel_dir, ann_dir, output_dir):
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    suffix = ".bioc.xml"

    for each in os.listdir(rel_dir):
        fid = each.split(".")[0]
        txt1 = process_bioc(ann_dir+fid+suffix)
        annos = re.findall('<annotation id="(\d+)"> <infon key="type">(\w+)</infon> <location length="(\d+)" offset="(\d+)"/> <text>(.*?)</text> </annotation>',txt1)
        txt2 = process_bioc(rel_dir+fid+suffix)
        rels = re.findall('<relation id="(\d+)"> <infon key="type">(\w+|\w+/\w+)</infon> <node refid="(\d+)" role="(annotation \d+)"/> <node refid="(\d+)" role="(annotation \d+)"/> </relation>', txt2)  

        with open(output_dir+fid+suffix, "w") as f3:
            annss = ""
            relss = ""
            for each in annos:
                annss += make_xml_annotation(each[0], each[1], each[2], each[3], each[4])

            for each in rels:
                relss += make_xml_relation(each[0], each[1], each[2], each[4])

            f3.write(make_xml_body_head(fid)+annss+relss+make_xml_body_tail())

In [113]:
pred_res = "predres/umass18_in_sent_pred_res"
grouped_rels = map_predict_res(pred_res, in_sent_label_idx_dict_file, umass_in_sent_entity2idx_map_file)
output_relation_BioC("umass_relation_prediction/", grouped_rels)

In [116]:
d1 = {1:[1,2,3], 2:[4,5,6], 4:[15,16]}
d2 = {2:[7,8,9], 1:[10,11], 3:[11,12]}
print(merge_pred_results_dict(d1, d2))
print(merge_pred_results_dict(d2, d1))

{1: [1, 2, 3, 10, 11], 2: [4, 5, 6, 7, 8, 9], 4: [15, 16], 3: [11, 12]}
{2: [7, 8, 9, 4, 5, 6], 1: [10, 11, 1, 2, 3], 3: [11, 12], 4: [15, 16]}


In [328]:
pred_res_cross = "umass_predict_res/umass18_cross_sent_pred_res_4096_0.1"
pred_res_within = "umass_predict_res/umass18_in_sent_pred_res_4096_0.15"

# print(umass_cross_sent_entity2idx_map_file)
# print(umass_in_sent_entity2idx_map_file)
in_sent_label_idx_dict_file = "features_labels/umass18SVM_within_sent_index2label.pkl"
cross_sent_label_idx_dict_file = "features_labels/umass18SVM_cross_sent.svm_index2label.pkl"
umass_cross_sent_entity2idx_map_file = "features_labels/umass18SVM_cross_sent.svm_entity2idx.pkl"
umass_in_sent_entity2idx_map_file = "features_labels/umass18SVM_within_sent_entity2idx.pkl"


grouped_rels_cross = map_predict_res(pred_res_cross, cross_sent_label_idx_dict_file, umass_cross_sent_entity2idx_map_file)
grouped_rels_within = map_predict_res(pred_res_within, in_sent_label_idx_dict_file, umass_in_sent_entity2idx_map_file)

results = merge_pred_results_dict(grouped_rels_within, grouped_rels_cross)

output_relation_BioC("umass_relation_prediction/", results)

'''
from the results:
the three sets of prediction results based different parameters yield same output
except change the training strategy significantly, the output prediction will be highly consistant cross the models 

stable!
'''

fill_not_appeared_file("made_test_data_full/corpus", "umass_relation_prediction/")

{'20_394.bioc.xml', '20_1050.bioc.xml', '11_226.bioc.xml', '18_818.bioc.xml', '18_242.bioc.xml', '18_235.bioc.xml', '18_1078.bioc.xml', '20_379.bioc.xml', '7_1060.bioc.xml', '11_305.bioc.xml', '11_582.bioc.xml', '6_1074.bioc.xml', '11_848.bioc.xml', '19_972.bioc.xml', '11_219.bioc.xml', '4_1079.bioc.xml', '11_18.bioc.xml', '8_941.bioc.xml', '14_1072.bioc.xml', '13_1022.bioc.xml', '5_946.bioc.xml', '10_1070.bioc.xml', '4_969.bioc.xml', '17_1042.bioc.xml', '20_188.bioc.xml', '18_57.bioc.xml', '20_717.bioc.xml', '11_909.bioc.xml', '13_1002.bioc.xml', '11_1006.bioc.xml', '20_975.bioc.xml', '20_685.bioc.xml', '11_101.bioc.xml', '20_1033.bioc.xml', '18_60.bioc.xml', '18_820.bioc.xml', '7_1089.bioc.xml', '18_516.bioc.xml', '11_661.bioc.xml', '20_865.bioc.xml', '10_1081.bioc.xml', '18_698.bioc.xml', '11_534.bioc.xml', '18_440.bioc.xml', '18_693.bioc.xml', '11_543.bioc.xml', '1_1056.bioc.xml', '18_414.bioc.xml', '18_514.bioc.xml', '11_1052.bioc.xml', '18_520.bioc.xml', '11_110.bioc.xml', '20_66

### output e2e results (entities is from NER pipeline)

In [330]:
pred_res_cross = "umass_predict_res/umass18_cross_sent_pred_res_e2e_4096_0.1"
pred_res_within = "umass_predict_res/umass18_in_sent_pred_res_e2e_4096_0.15"

in_sent_label_idx_dict_file = "features_labels/umass18SVM_within_sent_index2label.pkl"
cross_sent_label_idx_dict_file = "features_labels/umass18SVM_cross_sent.svm_index2label.pkl"

umass_cross_sent_entity2idx_map_file = "features_labels/umass_svm_cross_e2e_entity2idx.pkl"
umass_in_sent_entity2idx_map_file = "features_labels/umass_svm_within_e2e_entity2idx.pkl"

grouped_rels_cross = map_predict_res(pred_res_cross, cross_sent_label_idx_dict_file, umass_cross_sent_entity2idx_map_file)
grouped_rels_within = map_predict_res(pred_res_within, in_sent_label_idx_dict_file, umass_in_sent_entity2idx_map_file)

results = merge_pred_results_dict(grouped_rels_within, grouped_rels_cross)

output_relation_BioC("umass_relation_prediction_e2e/", results)

'''
from the results:
the three sets of prediction results based different parameters yield same output
except change the training strategy significantly, the output prediction will be highly consistant cross the models 

stable!
'''

fill_not_appeared_file("made_test_data_full/corpus", "umass_relation_prediction_e2e/")

{'20_394.bioc.xml', '11_226.bioc.xml', '18_818.bioc.xml', '18_242.bioc.xml', '18_235.bioc.xml', '18_1078.bioc.xml', '20_379.bioc.xml', '7_1060.bioc.xml', '11_305.bioc.xml', '6_1074.bioc.xml', '11_848.bioc.xml', '19_972.bioc.xml', '11_219.bioc.xml', '11_18.bioc.xml', '8_941.bioc.xml', '14_1072.bioc.xml', '13_1022.bioc.xml', '5_946.bioc.xml', '10_1070.bioc.xml', '4_969.bioc.xml', '17_1042.bioc.xml', '18_57.bioc.xml', '20_188.bioc.xml', '20_717.bioc.xml', '11_909.bioc.xml', '13_1002.bioc.xml', '11_1006.bioc.xml', '20_975.bioc.xml', '11_101.bioc.xml', '20_1033.bioc.xml', '18_60.bioc.xml', '18_516.bioc.xml', '11_661.bioc.xml', '20_865.bioc.xml', '10_1081.bioc.xml', '18_698.bioc.xml', '11_534.bioc.xml', '18_440.bioc.xml', '18_693.bioc.xml', '11_543.bioc.xml', '1_1056.bioc.xml', '18_414.bioc.xml', '18_514.bioc.xml', '11_1052.bioc.xml', '11_110.bioc.xml', '20_665.bioc.xml', '18_787.bioc.xml', '18_579.bioc.xml', '20_112.bioc.xml', '18_572.bioc.xml', '11_396.bioc.xml', '11_423.bioc.xml', '1_1069

#### random forest prediction results to Bioc

##### taks2

In [341]:
pred_res_cross = "umass_predict_res/umassRF_testing_prediction_cross_results.txt"
pred_res_within = "umass_predict_res/umassRF_testing_prediction_within_results.txt"

in_sent_label_idx_dict_file = "features_labels/umass18SVM_within_sent_index2label.pkl"
cross_sent_label_idx_dict_file = "features_labels/umass18SVM_cross_sent.svm_index2label.pkl"

umass_cross_sent_entity2idx_map_file = "features_labels/umass18SVM_cross_sent.svm_entity2idx.pkl"
umass_in_sent_entity2idx_map_file = "features_labels/umass18SVM_within_sent_entity2idx.pkl"

grouped_rels_cross = map_predict_res(pred_res_cross, cross_sent_label_idx_dict_file, umass_cross_sent_entity2idx_map_file)
grouped_rels_within = map_predict_res(pred_res_within, in_sent_label_idx_dict_file, umass_in_sent_entity2idx_map_file)

results = merge_pred_results_dict(grouped_rels_within, grouped_rels_cross)

output_relation_BioC("umass_relation_prediction_RF/", results)

'''
from the results:
the three sets of prediction results based different parameters yield same output
except change the training strategy significantly, the output prediction will be highly consistant cross the models 

stable!
'''

fill_not_appeared_file("made_test_data_full/corpus", "umass_relation_prediction_RF/")

#relation_dir, ann_dir, result_output
merge_ann_rel("umass_relation_prediction_RF/", "made_test_data_full/annotations/", "Task2_RF/")

##### task3

In [346]:
pred_res_cross = "umass_predict_res/umassRF_testing_prediction_cross_e2e_results.txt"
pred_res_within = "umass_predict_res/umassRF_testing_prediction_within_e2e_results.txt"

in_sent_label_idx_dict_file = "features_labels/umass18SVM_within_sent_index2label.pkl"
cross_sent_label_idx_dict_file = "features_labels/umass18SVM_cross_sent.svm_index2label.pkl"

umass_cross_sent_entity2idx_map_file = "features_labels/umass_svm_cross_e2e_entity2idx.pkl"
umass_in_sent_entity2idx_map_file = "features_labels/umass_svm_within_e2e_entity2idx.pkl"

grouped_rels_cross = map_predict_res(pred_res_cross, cross_sent_label_idx_dict_file, umass_cross_sent_entity2idx_map_file)
grouped_rels_within = map_predict_res(pred_res_within, in_sent_label_idx_dict_file, umass_in_sent_entity2idx_map_file)

results = merge_pred_results_dict(grouped_rels_within, grouped_rels_cross)

output_relation_BioC("umass_relation_prediction_RF_e2e/", results)

'''
from the results:
the three sets of prediction results based different parameters yield same output
except change the training strategy significantly, the output prediction will be highly consistant cross the models 

stable!
'''

fill_not_appeared_file("made_test_data_full/corpus", "umass_relation_prediction_RF_e2e/")

#relation_dir, ann_dir, result_output
merge_ann_rel("umass_relation_prediction_RF_e2e/", "made_test_data_full/NER_pred_res/", "Task3_RF/")