## Generate negative pairs by finding similar concepts

In [6]:
import csv
import random
import pandas as pd
import numpy as np
import progressbar
import os

RELA = 'has_contraindicated_drug'
RELA_count = 0

CUI1_set = set()
CUI2_set = set()
rela_dict = dict()

with open('./relas/'+RELA+'.csv') as f:
    reader = csv.reader(f, delimiter=',')
    row1 = next(reader)
    for row in reader:
        RELA_count += 1
        if row[1] not in CUI1_set:
            CUI1_set.add(row[1])
        if row[2] not in CUI2_set:
            CUI2_set.add(row[2])
        if row[1] not in rela_dict.keys():
            rela_dict[row[1]] = [row[2]]
        else:
            rela_dict[row[1]].append(row[2])
            
CUI1_list = list(CUI1_set)
CUI1_list_len = len(CUI1_list)
CUI2_list = list(CUI2_set)     
CUI2_list_len = len(CUI2_list)

def find_similar(target, corpus, n):
    # find top n phrases in corpus that contains the most common words as the target
    target_words = set(target.split())

    top_n_words = [(-1,' ')]*n
    for i in corpus:
        if i==target:
            continue
        sample_words = set(i.split())
        common = len(target_words & sample_words)
        if common > top_n_words[-1][0]:
            top_n_words[-1] = (common, i)
            top_n_words.sort(key=lambda x: x[0], reverse = True)
    return top_n_words


n = 4

data_N = RELA_count//2

cui1_list = []
cui2_list = []

bar = progressbar.ProgressBar(maxval=100, \
    widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
bar.start()

for i in range(data_N):
    perc = int(i/data_N*50)
    bar.update(perc)
    # find similar to CUI1
    index = random.randint(0,len(CUI1_list)-1)
    CUI1_string = CUI1_list[index]
    top_n_words = find_similar(CUI1_string,CUI2_list[:int(0.7*CUI2_list_len)],n)

    false_concepts_pool = []
    for i in top_n_words:
        if(i[1] not in rela_dict[CUI1_string]):
            false_concepts_pool.append(i[1])
    if(len(false_concepts_pool)):
        cui1_list.append(CUI1_string)
        cui2_list.append(random.choice(false_concepts_pool))

for i in range(data_N):
    perc = int(50+i/data_N*50)
    bar.update(perc)
    # find similar to CUI2
    index = random.randint(0,len(CUI2_list)-1)
    CUI2_string = CUI2_list[index]
    top_n_words = find_similar(CUI2_string,CUI1_list[:int(0.7*CUI1_list_len)],n)

    false_concepts_pool = []
    for i in top_n_words:
        if(CUI2_string not in rela_dict[i[1]]):
            false_concepts_pool.append(i[1])
    if(len(false_concepts_pool)):
        cui1_list.append(random.choice(false_concepts_pool))
        cui2_list.append(CUI2_string)

bar.finish()

df2 = pd.DataFrame(np.array([cui1_list,cui2_list]).T,
                   columns=['CUI1', 'CUI2'])

if not os.path.isdir(RELA):
    os.mkdir(RELA)


df2.to_csv('./'+RELA+'/negative_'+RELA+'.csv')



## Generate negative pairs by using similar relations

In [38]:
import csv
import random
import pandas as pd
import numpy as np
import progressbar
import string
import os

RELA = 'disease_excludes_abnormal_cell'
neg_RELA = 'disease_has_abnormal_cell'

if not os.path.isdir(RELA):
    os.mkdir(RELA)
    
lines_count = 0 
with open('./relas/'+RELA+'.csv') as f:
    lines_count = len(f.readlines(  ))
    print('lines_count = ', lines_count)

RELA_count = 0
with open('./relas/'+neg_RELA+'.csv', newline='') as f:
    reader = csv.reader(f, delimiter=',')
    row1 = next(reader)
    with open('./'+RELA+'/negative_'+RELA+'.csv', 'w') as write_f:
        writer = csv.writer(write_f, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(',CUI1, CUI2\n')
        for row in reader:
            writer.writerow(row)
            RELA_count += 1
            if(RELA_count == lines_count):
                print('break!!')
                break

lines_count =  17283


Error: need to escape, but no escapechar set

In [7]:
pos_CUI1_list = []
pos_CUI2_list = []
pos_label_list = []

neg_CUI1_list = []
neg_CUI2_list = []
neg_label_list = []


with open('./'+RELA+'/negative_'+RELA+'.csv', newline='') as f:
    reader = csv.reader(f, delimiter=',')
    row1 = next(reader)
    for row in reader:
        neg_CUI1_list.append(row[1])
        neg_CUI2_list.append(row[2])
        neg_label_list.append(0)

with open('./relas/'+RELA+'.csv', newline='') as f:
    reader = csv.reader(f, delimiter=',')
    row1 = next(reader)
    all_pos = []
    for row in reader:
        all_pos.append(row)

    for i in range(len(all_pos)):
        n = random.randint(0,len(all_pos)-1)
        pos_CUI1_list.append(all_pos[n][1])
        pos_CUI2_list.append(all_pos[n][2])
        pos_label_list.append(1)

train_CUI1 = neg_CUI1_list[:int(0.7*len(neg_CUI1_list))]   +pos_CUI1_list[:int(0.7*len(pos_CUI1_list))]
train_CUI2 = neg_CUI2_list[:int(0.7*len(neg_CUI2_list))]   +pos_CUI2_list[:int(0.7*len(pos_CUI2_list))]
train_label = neg_label_list[:int(0.7*len(neg_label_list))]+pos_label_list[:int(0.7*len(pos_label_list))]

dev_CUI1 = neg_CUI1_list[int(0.7*len(neg_CUI1_list)):int(0.9*len(neg_CUI1_list))]    +pos_CUI1_list[int(0.7*len(pos_CUI1_list)):int(0.9*len(pos_CUI1_list))]
dev_CUI2 = neg_CUI2_list[int(0.7*len(neg_CUI2_list)):int(0.9*len(neg_CUI2_list))]    +pos_CUI2_list[int(0.7*len(pos_CUI2_list)):int(0.9*len(pos_CUI2_list))]
dev_label = neg_label_list[int(0.7*len(neg_label_list)):int(0.9*len(neg_label_list))]+pos_label_list[int(0.7*len(pos_label_list)):int(0.9*len(pos_label_list))]

test_CUI1 = neg_CUI1_list[int(0.9*len(neg_CUI1_list)):]   +pos_CUI1_list[int(0.9*len(pos_CUI1_list)):]
test_CUI2 = neg_CUI2_list[int(0.9*len(neg_CUI2_list)):]   +pos_CUI2_list[int(0.9*len(pos_CUI2_list)):]
test_label = neg_label_list[int(0.9*len(neg_label_list)):]+pos_label_list[int(0.9*len(pos_label_list)):]

df1 = pd.DataFrame(np.array([train_CUI1,train_CUI2,train_label]).T,
                   columns=['CUI1', 'CUI2', 'label'])


df2 = pd.DataFrame(np.array([dev_CUI1,dev_CUI2,dev_label]).T,
                   columns=['CUI1', 'CUI2', 'label'])

df3 = pd.DataFrame(np.array([test_CUI1,test_CUI2,test_label]).T,
                   columns=['CUI1', 'CUI2', 'label'])

df1.to_csv(RELA+'/train.csv')
df2.to_csv(RELA+'/dev.csv')
df3.to_csv(RELA+'/test.csv')

