In [2]:
from os import listdir

file_path_1 = "C:/Users/Leste/OneDrive - Johns Hopkins/Desktop/BDD data/extracted/training-RiskFactors-Complete-Set1/"
file_path_2 = "C:/Users/Leste/OneDrive - Johns Hopkins/Desktop/BDD data/extracted/training-RiskFactors-Complete-Set2/"
file_path_3 = "C:/Users/Leste/OneDrive - Johns Hopkins/Desktop/BDD data/extracted/testing-RiskFactors-Complete/"
names_1 = [f for f in listdir(file_path_1) if f.endswith('.xml')]
names_2 = [f for f in listdir(file_path_2) if f.endswith('.xml')]
names_3 = [f for f in listdir(file_path_3) if f.endswith('.xml')]

In [3]:
len(names_1), len(names_2), len(names_3)

(521, 269, 514)

In [4]:
import pandas as pd
import numpy as np

all1_df = pd.DataFrame(np.zeros((len(names_1), 3), dtype=object), columns=['text', 'annotation','loc'])
all2_df = pd.DataFrame(np.zeros((len(names_2), 3), dtype=object), columns=['text', 'annotation','loc'])
test_df = pd.DataFrame(np.zeros((len(names_3), 3), dtype=object), columns=['text', 'annotation','loc'])

In [5]:
import re
import xml.etree.ElementTree as ET


def to_df(names,df, file_path,PHI_status = True):
    #get text
    n = 0
    for name in names:
        tree = ET.parse(file_path + name)
        root = tree.getroot()

        ## Get the text
        nt = re.sub('\n',' ',root[0].text)
        nt = re.sub('\t',' ',nt) 
        nt = re.sub('"',"'",nt)
        ## sample 214 has a weird character
        nt = re.sub('>','&gt;',nt) 
        nt = re.sub('<','&lt;',nt)
        ## new wired character
        nt = re.sub('Â','',nt)
        nt = re.sub('â','',nt)
        nt = re.sub('€','',nt)
        nt = re.sub('™','',nt)
        df['text'][n] = nt
        n+=1
    
    #get annotations
    n = 0
    for name in names:
        tree = ET.parse(file_path + name)
        root = tree.getroot()
        ## Get the labels

        tag_list = []
        loc_list = []
        # get PHI labels if there are any
        if PHI_status == True:
            PHI = [root[1][x].tag for x in range(len(root[1]))].index('PHI')
            for k in range(PHI,len(root[1])):
                tag_list.append((root[1][k].attrib['text'],root[1][k].tag))
        
        # get the rest of labels
        for k in range(len(root[1])):
            if root[1][k].tag == 'SMOKER':
                continue
            
            for m in range(len(root[1][k])):
                if root[1][k][m].attrib.keys().__contains__('text') == False:
                    continue
                tag_list.append((root[1][k][m].attrib['text'],root[1][k][m].tag))
                loc_list.append((root[1][k][m].attrib['start'],root[1][k][m].attrib['end']))
        df['annotation'][n] = tag_list
        df['loc'][n] = loc_list
        n+=1
    return df

all_1 = to_df(names_1,all1_df, file_path_1, PHI_status = False)
all_2 = to_df(names_2,all2_df, file_path_2, PHI_status = False)
test_df = to_df(names_3,test_df, file_path_3, PHI_status = False)
train_df = pd.concat([all_1, all_2], ignore_index=True)

In [6]:
def intersection(ls1, ls2):
    index = [i for i, x in enumerate([x == ls2[0] for x in ls1]) if x]
    if index[-1] == len(ls1)-1:
        index = index[:-1]
    cut = 10000
    eva_temp = 10000
    for i in index:
        if (len(ls1) -i) < len(ls2):
            l = len(ls1) -i
        else:
            l = len(ls2)
        eva = sum([ls1[x+i] != ls2[x] for x in range(l)])
        if eva < eva_temp:
            cut = i
            eva_temp = eva
    out = "".join(ls1[0:cut]+ls2)
    return out

def pre_process(input):
    df = input.copy()

    for i in range(df.shape[0]):  
        for j in range(len(df['annotation'][i])):
                ## preprocess tagged text and location
                # remove extra spaces in the beginning and end of the annotation
                if re.search("^ +.*",df['annotation'][i][j][0]) != None or re.search(".* +$",df['annotation'][i][j][0]) != None:
                    front = len(df['annotation'][i][j][0]) - len(re.sub("^ +","",df['annotation'][i][j][0]))
                    end = len(df['annotation'][i][j][0]) - len(re.sub(" +$","",df['annotation'][i][j][0]))
                    df['loc'][i][j] = (str(int(df['loc'][i][j][0])+front),str(int(df['loc'][i][j][1])-end))
                    df['annotation'][i][j] = (re.sub(" +$","", re.sub("^ +","",df['annotation'][i][j][0])),df['annotation'][i][j][1])
                if int(df['loc'][i][j][0]) == df['text'][i].find('Record'):
                    df['loc'][i][j] = ('','')
                    df['annotation'][i][j] = ('','')
                    
    return df

def rm_dup(input):
    df = input.copy()
    for i in range(df.shape[0]):

        for j in range(len(df['annotation'][i])):

            for k in np.arange(j+1,len(df['annotation'][i])):
                # move on if the compared lables are ("","")
                if df['loc'][i][j] == ('',''):
                    break
                if df['loc'][i][k] == ('',''):
                    continue

                # find location contained within each other
                if int(df['loc'][i][j][0]) >= int(df['loc'][i][k][0]) and int(df['loc'][i][j][1]) <= int(df['loc'][i][k][1]):
                    df['loc'][i][j] = ('','')
                    df['annotation'][i][j] = ('','')
                    continue
                
                elif int(df['loc'][i][j][0]) <= int(df['loc'][i][k][0]) and int(df['loc'][i][j][1]) >= int(df['loc'][i][k][1]):
                    df['loc'][i][k] = ('','')
                    df['annotation'][i][k] = ('','')
                    continue

                # find location that overlap
                if int(df['loc'][i][j][0]) < int(df['loc'][i][k][0]) and int(df['loc'][i][j][1]) < int(df['loc'][i][k][1]) and int(df['loc'][i][j][1]) > int(df['loc'][i][k][0]):
                    #print(i,j,k)
                    new_s = intersection(df['annotation'][i][j][0],df['annotation'][i][k][0])
                    df['annotation'][i][j] = (new_s,df['annotation'][i][j][1])
                    df['loc'][i][j] = (df['loc'][i][j][0],df['loc'][i][k][1])
                    df['loc'][i][k] = ('','')
                    df['annotation'][i][k] = ('','')
                    continue
            
                elif int(df['loc'][i][j][0]) > int(df['loc'][i][k][0]) and int(df['loc'][i][j][1]) > int(df['loc'][i][k][1]) and int(df['loc'][i][j][0]) < int(df['loc'][i][k][1]):
                    #print(i,j,k)
                    #the order here matters, the first one should be the left most
                    new_s = intersection(df['annotation'][i][k][0],df['annotation'][i][j][0])
                    df['annotation'][i][k] = (new_s,df['annotation'][i][k][1])
                    df['loc'][i][k] = (df['loc'][i][k][0],df['loc'][i][j][1])
                    df['loc'][i][j] = ('','')
                    df['annotation'][i][j] = ('','')
                    continue

    return df

# preprocess df and run two pass on rm_dup

In [7]:
train_df = pre_process(train_df)
train_df.shape[0]

790

In [8]:
train_n = rm_dup(train_df)
train_n.shape[0]

790

In [9]:
temp_train = train_n.copy()
train_n = rm_dup(train_n)
train_n.shape[0]

790

In [10]:
test_df = pre_process(test_df)
test_df.shape[0]

514

In [11]:
test_n = rm_dup(test_df)
test_n.shape[0]

514

In [12]:
test_n = rm_dup(test_n)
test_n.shape[0]

514

In [13]:
for j in range(train_n.shape[0]):
    train_n['annotation'][j] = [train_n['annotation'][j][i] for i in range(len(train_n['annotation'][j])) if train_n['annotation'][j][i][0] != '']
    train_n['loc'][j] = [train_n['loc'][j][i] for i in range(len(train_n['loc'][j])) if train_n['loc'][j][i][0] != '']
    start = np.array([int(train_n['loc'][j][i][0]) for i in range(len(train_n['loc'][j]))])
    order = start.argsort()
    train_n['annotation'][j] = [train_n['annotation'][j][i] for i in order]
    train_n['loc'][j] = [train_n['loc'][j][i] for i in order]
train = train_n.drop('loc', axis=1)
train.shape[0]

790

In [14]:

for j in range(test_n.shape[0]):
    test_n['annotation'][j] = [test_n['annotation'][j][i] for i in range(len(test_n['annotation'][j])) if test_n['annotation'][j][i][0] != '']
    test_n['loc'][j] = [test_n['loc'][j][i] for i in range(len(test_n['loc'][j])) if test_n['loc'][j][i][0] != '']
    start = np.array([int(test_n['loc'][j][i][0]) for i in range(len(test_n['loc'][j]))])
    order = start.argsort()
    test_n['annotation'][j] = [test_n['annotation'][j][i] for i in order]
    test_n['loc'][j] = [test_n['loc'][j][i] for i in order]
test = test_n.drop('loc', axis=1)
test.shape[0]

514

# modify testing text (change coronary arterary disease to CAD ...)

In [15]:
import nltk
from nltk.corpus import stopwords
sw = stopwords.words('english')
negat = ["no", "nor", "not","don't","didn't","doesn't","isn't","aren't","wasn't","weren't","haven't","hasn't","hadn't","won't","wouldn't","shouldn't","can't","couldn't","mustn","mustn't","mightn't","mightn't","needn't","needn't","oughtn't","shan't","shan't","shouldn't","wasn't","weren't","won't","wouldn't","t","shouldn","wasn","weren","won","wouldn","can","couldn","didn","doesn","hadn","hasn","haven","isn","mightn","mustn","needn","oughtn","shan","shouldn","wasn","weren","won","wouldn"]
sw_n = [w for w in sw if w not in negat]

In [16]:

for i in range(test.shape[0]):
    word_tokens = test['text'][i].strip().split()
    filtered_sentence = [w for w in word_tokens if not w.lower() in sw_n]
    test['text'][i] = ' '.join(filtered_sentence)

In [17]:
test['text'] = test['text'].apply(lambda x: re.sub('coronary artery disease','CAD',x))
test['text'] = test['text'].apply(lambda x: re.sub('Coronary artery disease','CAD',x))
test['text'] = test['text'].apply(lambda x: re.sub('Coronary Artery Disease','CAD',x))
test['text'] = test['text'].apply(lambda x: re.sub('Blood Pressure','BP',x))
test['text'] = test['text'].apply(lambda x: re.sub('blood pressure','BP',x))
test['text'] = test['text'].apply(lambda x: re.sub('Blood pressure','BP',x))
test['text'] = test['text'].apply(lambda x: re.sub('blood Pressure','BP',x))
test['text'] = test['text'].apply(lambda x: re.sub('&#8211','',x))
#test['text'] = test['text'].apply(lambda x: re.sub(' p\.o\. ','per oral',x))
#test['text'] = test['text'].apply(lambda x: re.sub(' h/o ','had',x))
#test['text'] = test['text'].apply(lambda x: x.lower())

In [18]:
for i in range(test.shape[0]):
    for x in range(len(test['annotation'][i])):
        word_tokens = test['annotation'][i][x][0].strip().split()
        # converts the words in word_tokens to lower case and then checks whether 
        #they are present in stop_words or not
        filtered_sentence = [w for w in word_tokens if not w.lower() in sw_n]
        tagged_things = ' '.join(filtered_sentence)
        tagged_things = re.sub('coronary artery disease','CAD',tagged_things)
        tagged_things = re.sub('Coronary artery disease','CAD',tagged_things)
        tagged_things = re.sub('Coronary Artery Disease','CAD',tagged_things)
        tagged_things = re.sub('Blood Pressure','BP',tagged_things)
        tagged_things = re.sub('blood pressure','BP',tagged_things)
        tagged_things = re.sub('Blood pressure','BP',tagged_things)
        tagged_things = re.sub('blood Pressure','BP',tagged_things)
        tagged_things = re.sub('&#8211','',tagged_things)
        #tagged_things = re.sub(' p\.o\. ',' per oral ',tagged_things)
        #tagged_things = re.sub(' h/o ','had',tagged_things)
        test['annotation'][i][x] = (tagged_things,test['annotation'][i][x][1])

In [19]:
# remove stop words in text to match tags
for i in range(train.shape[0]):
    word_tokens = train['text'][i].strip().split()
    filtered_sentence = [w for w in word_tokens if not w.lower() in sw_n]
    train['text'][i] = ' '.join(filtered_sentence)

In [20]:
train['text'] = train['text'].apply(lambda x: re.sub('coronary artery disease','CAD',x))
train['text'] = train['text'].apply(lambda x: re.sub('Coronary artery disease','CAD',x))
train['text'] = train['text'].apply(lambda x: re.sub('Coronary Artery Disease','CAD',x))
train['text'] = train['text'].apply(lambda x: re.sub('Blood Pressure','BP',x))
train['text'] = train['text'].apply(lambda x: re.sub('blood pressure','BP',x))
train['text'] = train['text'].apply(lambda x: re.sub('Blood pressure','BP',x))
train['text'] = train['text'].apply(lambda x: re.sub('blood Pressure','BP',x))
train['text'] = train['text'].apply(lambda x: re.sub('&#8211','',x))
#train['text'] = train['text'].apply(lambda x: re.sub(' p\.o\. ',' per oral ',x))
#train['text'] = train['text'].apply(lambda x: re.sub(' h/o ',' had ',x))
#train['text'] = train['text'].apply(lambda x: x.lower())

In [21]:

for i in range(train.shape[0]):
    for x in range(len(train['annotation'][i])):
        word_tokens = train['annotation'][i][x][0].strip().split()
        # converts the words in word_tokens to lower case and then checks whether 
        #they are present in stop_words or not
        filtered_sentence = [w for w in word_tokens if not w.lower() in sw_n]
        tagged_things = ' '.join(filtered_sentence)
        tagged_things = re.sub('coronary artery disease','CAD',tagged_things)
        tagged_things = re.sub('Coronary artery disease','CAD',tagged_things)
        tagged_things = re.sub('Coronary Artery Disease','CAD',tagged_things)
        tagged_things = re.sub('Blood Pressure','BP',tagged_things)
        tagged_things = re.sub('blood pressure','BP',tagged_things)
        tagged_things = re.sub('Blood pressure','BP',tagged_things)
        tagged_things = re.sub('blood Pressure','BP',tagged_things)
        tagged_things = re.sub('&#8211','',tagged_things)
        #tagged_things = re.sub(' p\.o\. ',' per oral ',tagged_things)
        #tagged_things = re.sub(' h/o ',' had ',tagged_things)
        train['annotation'][i][x] = (tagged_things,train['annotation'][i][x][1])


In [35]:
from tqdm import tqdm
from difflib import SequenceMatcher
import pickle

def matcher(string, pattern):
    '''
    Return the start and end index of any pattern present in the text.
    '''
    match_list = []
    pattern = pattern.strip()
    seqMatch = SequenceMatcher(None, string, pattern, autojunk=False)
    match = seqMatch.find_longest_match(0, len(string), 0, len(pattern))
    if (match.size == len(pattern)):
        start = match.a
        end = match.a + match.size
        match_tup = (start, end)
        string = string.replace(pattern, "X" * len(pattern), 1)
        match_list.append(match_tup)

    return match_list, string


def create_labs(s,match_list):
    labs = ['O' for i in range(len(s.split()))]
    word_dict = pd.DataFrame({'word':s.split(),'label':labs})

    for start, end, e_type in match_list:
        index = len(re.findall(r' +',s[0:start]))
        num_words = len(s[start:end].split())

        if num_words > 1:
            word_dict.loc[index,'label'] = e_type
            for i in range(1,num_words):
                word_dict.loc[index+i,'label'] = e_type
        else:
            word_dict.loc[index,'label'] = e_type
    return word_dict


def to_txt(df, filepath):
    '''
    The function responsible for the creation of data in the said format.
    '''
    with open(filepath , 'w') as f:
        for text, annotation in zip(df.text, df.annotation):
            text_ = text    
            print(df.index[df['text']== text_].tolist())    
            match_list = []
            for i in annotation:
                a,text_= matcher(text_, i[0])
                match_list.append((a[0][0], a[0][1], i[1]))

            d = create_labs(text, match_list)

            for i in range(d.shape[0]):
                f.writelines(d['word'][i] + ' ' + d['label'][i] +'\n')
            f.writelines('\n')
            
def main(input,save_path):

    data = input
    to_txt(data, save_path)
    
#if __name__ == '__main__':
path = 'C:/Users/Leste/OneDrive - Johns Hopkins/Documents/GitHub/nlpsumm/BERT/data_processed/'

#main(train,path+'train.txt')
main(train,path+'train(clean).txt')
#main(test,path+'test(clean).txt')

[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
[30]
[31]
[32]
[33]
[34]
[35]
[36]
[37]
[38]
[39]
[40]
[41]
[42]
[43]
[44]
[45]
[46]
[47]
[48]
[49]
[50]
[51]
[52]
[53]
[54]
[55]
[56]
[57]
[58]
[59]
[60]
[61]
[62]
[63]
[64]
[65]
[66]
[67]
[68]
[69]
[70]
[71]
[72]
[73]
[74]
[75]
[76]
[77]
[78]
[79]
[80]
[81]
[82]
[83]
[84]
[85]
[86]
[87]
[88]
[89]
[90]
[91]
[92]
[93]
[94]
[95]
[96]
[97]
[98]
[99]
[100]
[101]
[102]
[103]
[104]
[105]
[106]
[107]
[108]
[109]
[110]
[111]
[112]
[113]
[114]
[115]
[116]
[117]
[118]
[119]
[120]
[121]
[122]
[123]
[124]
[125]
[126]
[127]
[128]
[129]
[130]
[131]
[132]
[133]
[134]
[135]
[136]
[137]
[138]
[139]
[140]
[141]
[142]
[143]
[144]
[145]
[146]
[147]
[148]
[149]
[150]
[151]
[152]
[153]
[154]
[155]
[156]
[157]
[158]
[159]
[160]
[161]
[162]
[163]
[164]
[165]
[166]
[167]
[168]
[169]
[170]
[171]
[172]
[173]
[174]
[175]
[176]
[177]
[178]
[179]
[180]
[181]
[182]
[183]
[184]


In [None]:
test['annotation'][0]
test_df['text'][0]


In [None]:
#print(train_n['annotation'][[266]].tolist()), print(train_n['loc'][[266]].tolist())

names_3[382]

In [24]:
df = train.copy()
annotation = df.annotation[0]
text_ = df.text[0]

match_list = []
for i in annotation:
    print(i)
    a, text_ = matcher(text_, i[0])
    match_list.append((a[0][0], a[0][1], i[1]))
    d = create_labs(text_, match_list)


('BP 170/80.', 'HYPERTENSION')
('HCTZ', 'MEDICATION')
('Hypertension', 'HYPERTENSION')
('hyperlipidemia', 'HYPERLIPIDEMIA')
('HTN', 'HYPERTENSION')
('known hx CAD', 'CAD')
('CAD', 'CAD')
('s/p ant SEMI + stent LAD 2/67, Dr Oakley', 'CAD')
('NORVASC (AMLODIPINE)', 'MEDICATION')
('PLAVIX (CLOPIDOGREL)', 'MEDICATION')
('ATENOLOL', 'MEDICATION')
('ASA (ACETYLSALICYLIC ACID)', 'MEDICATION')
('ZESTRIL (LISINOPRIL)', 'MEDICATION')
('LIPITOR (ATORVASTATIN)', 'MEDICATION')
('HCTZ (HYDROCHLOROTHIAZIDE)', 'MEDICATION')
('NITROGLYCERIN 1/150 (0.4 MG)', 'MEDICATION')
('150/70 repeat 145/80', 'HYPERTENSION')
('CAD', 'CAD')


In [27]:
match_list


[(104, 114, 'HYPERTENSION'),
 (135, 139, 'MEDICATION'),
 (488, 500, 'HYPERTENSION'),
 (535, 549, 'HYPERLIPIDEMIA'),
 (571, 574, 'HYPERTENSION'),
 (580, 592, 'CAD'),
 (682, 685, 'CAD'),
 (686, 726, 'CAD'),
 (777, 797, 'MEDICATION'),
 (820, 840, 'MEDICATION'),
 (853, 861, 'MEDICATION'),
 (885, 911, 'MEDICATION'),
 (936, 956, 'MEDICATION'),
 (980, 1002, 'MEDICATION'),
 (1026, 1052, 'MEDICATION'),
 (1076, 1104, 'MEDICATION'),
 (1196, 1216, 'HYPERTENSION'),
 (1388, 1391, 'CAD')]

In [33]:
df.text[0][580:592]

'known hx CAD'