In [1]:
import os

import numpy as np
from gensim.models import KeyedVectors, Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn.metrics.pairwise import cosine_similarity

from read_files import read_files

In [2]:
def train_w2v_embed(target, data, emb):
    """Train a Word2Vec model and, for corpus training, export its vectors.

    Args:
        target: Mode switch. Only the exact value ``False`` selects corpus
            training; any other value selects the in-memory branch.
        data: When ``target is False`` this is handed to ``LineSentence``
            (presumably a path to a one-sentence-per-line corpus -- confirm
            against callers). Otherwise it is passed to ``Word2Vec`` as-is.
        emb: Destination for the text-format word2vec export. Only used when
            ``target is False``.

    Returns:
        The trained ``Word2Vec`` model.
    """
    # Hyperparameters common to both modes; only min_count differs.
    shared = dict(window=6, vector_size=100, alpha=0.0001, workers=30)

    if target is not False:
        # In-memory sentences: keep every token (min_count=1), nothing is saved.
        return Word2Vec(data, min_count=1, **shared)

    corpus_model = Word2Vec(LineSentence(data), min_count=5, **shared)
    corpus_model.wv.save_word2vec_format(emb, binary=False)
    print(f"save embedding to {emb}")
    return corpus_model

In [3]:
def get_final_test(euphemism_answer, top_words, input_keywords):
    """Build the ground-truth mapping for the candidate words.

    Args:
        euphemism_answer: Mapping from a word to the collection of names it
            stands for.
        top_words: Iterable of candidate words (the AutoPhrase words).
        input_keywords: Iterable of target keywords (formal drug names).

    Returns:
        A dict keyed by every word in ``top_words``. The value is the
        ``euphemism_answer`` entry itself when the word is a known euphemism
        and that entry contains at least one of ``input_keywords``; otherwise
        it is the placeholder list ``['None']``.
    """
    def _names_for(word):
        # Words without an answer entry get the placeholder.
        if word not in euphemism_answer:
            return ['None']
        names = euphemism_answer[word]
        # Keep the entry only if it mentions one of the target keywords.
        for keyword in input_keywords:
            if keyword in names:
                return names
        return ['None']

    return {word: _names_for(word) for word in top_words}

In [4]:
def print_final(fin_out, fin_test, target_n):
    """Print the average rank and the top-k accuracy table.

    Args:
        fin_out: Sequence of ranked lists of target ids, positionally aligned
            with the iteration order of ``fin_test``.
        fin_test: Mapping from a euphemism to the collection of names that
            count as a correct answer for it.
        target_n: Mapping from a target name to its integer id; names sharing
            an id are treated as the same target.

    Returns:
        Always ``0``.

    Notes:
        The rank of a euphemism is the 1-based position of the first id in its
        ranked list whose names overlap the correct answers. If nothing
        matches, the rank equals the length of that ranked list.
        With an empty ``fin_test`` a short notice is printed instead of the
        statistics (which would otherwise divide by zero).
    """
    # Nothing to score: avoid dividing by zero below.
    if not fin_test:
        print('No euphemisms to evaluate.')
        return 0

    # target_n_list[i] holds every target name whose id is i
    # (names with equal ids belong to the same target).
    n_targets = max(target_n.values(), default=-1) + 1
    target_n_list = [
        [name for name in target_n if target_n[name] == idx]
        for idx in range(n_targets)
    ]

    ranking_list = []
    for i, word in enumerate(fin_test):
        pos = 0
        for j in fin_out[i]:  # target ids, best match first
            pos += 1
            if any(e in target_n_list[j] for e in fin_test[word]):
                break
        ranking_list.append(pos)

    print('Average ranking is {:.2f} for {:d} euphemisms.'.format(
        sum(ranking_list) / len(ranking_list), len(ranking_list)))

    # topk_acc[k] is the share of euphemisms ranked within the top k + 1.
    topk_acc = [
        sum(rank <= k + 1 for rank in ranking_list) / len(fin_test)
        for k in range(len(target_n_list))
    ]

    print('[Top-k Accuracy]: ', end='')
    for k in range(len(target_n_list)):
        print('|  {:2d}  '.format(k + 1), end='')
    print()

    # 18 spaces line the second row up under the '[Top-k Accuracy]: ' label.
    print(' ' * 18, end='')
    for k in range(len(target_n_list)):
        print('| {:.2f} '.format(topk_acc[k]), end='')
    print()

    return 0

In [10]:
def w2v_detection():
    """Run the word2vec euphemism detection and identification baseline.

    Steps, all driven by hard-coded paths under ``data/``:
      1. Read the answer key, target keywords and AutoPhrase list with
         ``read_files``.
      2. Train word2vec on the processed corpus (``train_w2v_embed``) and
         reload the first 20000 exported vectors.
      3. Detection: average the vectors of the target keywords found in the
         reloaded vocabulary, take the 1000 nearest words of the trained
         model, drop the keywords themselves and write the rest to a file.
      4. Identification: for every single-word AutoPhrase candidate that has
         a vector and a real answer, rank the target ids by cosine similarity
         and report the scores through ``print_final``.

    Raises:
        ValueError: if none of the target keywords has an embedding.
    """
    euph_file = 'data/euphemism_answer_drug.txt'
    t_file = 'data/target_keywords_drug.txt'
    auto_file = 'data/AutoPhrase.txt'

    answer, drug_formal, target_dict, _ = read_files(euph_file, t_file, auto_file)

    c_file = 'data/output/processed_corpus.txt'
    e_file = 'data/wiki_output_file.txt'

    w2v_model = train_w2v_embed(False, c_file, e_file)

    emb_dict = KeyedVectors.load_word2vec_format(e_file, binary=False, limit=20000)

    # ---- Detection ----
    real_seed = list(target_dict.keys())
    # seq[r] is the index into real_seed of the keyword behind row r of target_vec.
    seq = [idx for idx, seed in enumerate(real_seed) if seed in emb_dict]
    if not seq:
        # Without this guard the average below would be 0/0 and fail obscurely later.
        raise ValueError('None of the target keywords is present in the embedding vocabulary.')

    target_vec = np.array([emb_dict[real_seed[idx]] for idx in seq])  # one row per found keyword
    avg_target_vec = np.sum(target_vec, axis=0) / len(target_vec)

    # Keep the nearest neighbours of the averaged vector that are not keywords themselves.
    seed_set = set(real_seed)
    top_k = [
        word
        for word, _score in w2v_model.wv.similar_by_vector(avg_target_vec, topn=1000)
        if word not in seed_set
    ]

    out_path = 'data/output/w2v/wiki_euphemism_word2vec_embedding.txt'
    # Make sure the output folder exists so a fresh checkout does not fail here.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'w') as fout:
        for word in top_k:
            fout.write(word + '\n')

    # ---- Identification ----
    euph_candidates = []
    with open('data/AutoPhrase_single-word.txt', 'r') as fin:
        for line in fin:
            fields = line.strip().split('\t')
            if len(fields) > 1:  # skip blank or malformed lines instead of raising IndexError
                euph_candidates.append(fields[1])

    # Maps every candidate to its answer names, or ['None'] when it has none.
    fin_test = get_final_test(answer, euph_candidates, drug_formal)

    result = []
    filtered_fin_test = {}

    for cand in euph_candidates:
        if cand in filtered_fin_test:
            # A repeated candidate would add a second ranking but no second key,
            # shifting every later ranking off its word in print_final.
            continue
        if (cand in emb_dict) and (fin_test[cand] != ['None']):
            # argsort is ascending, so reverse it to get the most similar keyword first,
            # then translate row -> keyword index (seq) -> keyword -> target id.
            sims = cosine_similarity([emb_dict[cand]], target_vec)
            order = np.argsort(sims).tolist()[0][::-1]
            result.append([target_dict[real_seed[seq[row]]] for row in order])
            filtered_fin_test[cand] = fin_test[cand]  # only candidates with a real answer

    # Several keywords can share one target id; keep only the first (best) occurrence of each id.
    final_result = [list(dict.fromkeys(ranked_ids)) for ranked_ids in result]

    print(final_result)
    print_final(final_result, filtered_fin_test, target_dict)


w2v_detection()
        

[read_data.py] Reading data with read_files...
save embedding to data/wiki_output_file.txt
[[1, 23, 25, 9, 27, 14, 21, 5, 0, 13, 24, 22, 26, 4, 2, 20, 31, 11, 8, 18, 15, 6, 12, 16, 3, 17], [14, 23, 22, 1, 21, 5, 25, 27, 4, 9, 24, 0, 13, 11, 2, 20, 31, 8, 18, 26, 15, 12, 6, 3, 16, 17], [21, 22, 25, 23, 9, 5, 26, 1, 24, 27, 0, 14, 20, 13, 2, 4, 11, 8, 15, 18, 31, 6, 16, 12, 3, 17], [24, 31, 25, 9, 27, 1, 21, 20, 23, 14, 13, 22, 0, 5, 26, 4, 11, 2, 18, 8, 15, 12, 6, 16, 3, 17], [9, 23, 24, 21, 22, 13, 18, 11, 27, 31, 2, 1, 4, 25, 5, 14, 20, 0, 8, 12, 6, 15, 26, 16, 3, 17], [21, 24, 9, 27, 25, 1, 23, 14, 22, 31, 20, 5, 4, 0, 13, 11, 26, 2, 18, 8, 15, 12, 6, 16, 3, 17], [24, 1, 9, 14, 27, 23, 21, 25, 20, 31, 22, 0, 13, 4, 2, 5, 11, 26, 8, 18, 15, 12, 6, 16, 3, 17], [14, 1, 5, 4, 23, 27, 31, 24, 0, 25, 9, 13, 21, 22, 20, 26, 8, 11, 18, 2, 15, 12, 6, 16, 3, 17], [31, 14, 27, 24, 1, 25, 9, 22, 4, 21, 23, 26, 5, 13, 0, 20, 11, 2, 8, 18, 15, 16, 12, 6, 3, 17], [24, 9, 21, 23, 1, 31, 25, 27, 14, 

In [3]:
# Take the second tab-separated field of every AutoPhrase line ...
with open('data/AutoPhrase.txt', 'r') as fin:
    auto_word = [row.strip().split('\t')[1] for row in fin]

# ... and write those values back out, one per line.
with open('data/output/processed_autophrase.txt', 'w') as fout:
    fout.writelines(f"{entry}\n" for entry in auto_word)

In [11]:
# Baseline part
def eval_w2v_embed(embed_fn, k=10, seeds=None, res_fn=None):
    """List the nearest neighbours of each seed word in a saved embedding.

    For every seed the ``k`` most similar words are looked up and written as
    one comma-separated row (seed first). Every neighbour whose own top-``k``
    list contains the seed again is additionally printed.

    Args:
        embed_fn: Path of a text-format word2vec file.
        k: Number of neighbours to retrieve per word.
        seeds: Iterable of seed words; ``None`` means no seeds.
        res_fn: Path of the result file. When ``None`` nothing is written and
            only the console output is produced.

    Returns:
        None.
    """
    seeds = [] if seeds is None else seeds

    out = open(res_fn, "w") if res_fn is not None else None
    try:
        if out is not None:
            out.write(", ".join(["KeyWord", "Neighbors"]) + "\n")

        wv = KeyedVectors.load_word2vec_format(embed_fn, binary=False)

        for seed in seeds:
            print("seed: {}".format(seed))
            try:
                neb_scores = wv.similar_by_word(seed, topn=k)
            except KeyError:
                # Only a missing vocabulary entry is expected here; any other
                # error should surface instead of being reported as "missing".
                print("not in the dictionary!")
            else:
                nebs = [word for word, _score in neb_scores]
                if out is not None:
                    out.write(", ".join([seed] + nebs) + "\n")

                # Report neighbours that list the seed among their own neighbours.
                for neb in nebs:
                    neb_nebs = {word for word, _score in wv.similar_by_word(neb, topn=k)}
                    if seed in neb_nebs:
                        print("neb: {} for seed: {}".format(neb, seed))
            print("\n")
    finally:
        # Close the result file even when loading or a lookup fails.
        if out is not None:
            out.close()

# Settings for the baseline evaluation: neighbourhood size and file locations.
k = 50
euph_file = 'data/euphemism_answer_drug.txt'
t_file = 'data/target_keywords_drug.txt'
auto_file = 'data/AutoPhrase.txt'
e_file = 'data/wiki_output_file.txt'
fout = f'data/output/w2v/wiki_eval_w2v_embed_top{k}.txt'

# Only the third value returned by read_files is needed here; its keys are the seed words.
_, _, target_dict, _ = read_files(euph_file, t_file, auto_file)
real_seed = [*target_dict]
print(real_seed)

eval_w2v_embed(embed_fn=e_file, k=k, seeds=real_seed, res_fn=fout)

[read_data.py] Reading data with read_files...
['acetaminophen and oxycodone combination', 'percocet', 'alprazolam', 'xanax', 'amphetamine', 'amphetamine and dextroamphetamine combination', 'adderall', 'buprenorphine and naloxone combination', 'suboxone', 'clonazepam', 'klonopin', 'cocaine', 'crack cocaine', 'ecstasy', 'mdma', 'molly', 'fentanyl', 'flunitrazepam', 'rohypnol', 'gamma-hydroxybutyric acid', 'ghb', 'heroin', 'hydrocodone', 'norco', 'vicodin', 'lorcet', 'hydromorphone', 'dilaudid', 'ketamine', 'ketalar', 'khat', 'lysergic acid diethylamide', 'lsd', 'marijuana', 'marijuana concentrates', 'hash oil', 'mescaline', 'peyote', 'methamphetamine', 'methylphenidate', 'ritalin', 'concerta', 'daytrana', 'morphine', 'opium', 'oxaydo', 'oxycodone', 'oxycontin', 'roxicodone', 'phencyclidine', 'pcp', 'promethazine', 'psilocybin mushrooms', 'synthetic cannabinoids', 'synthetic cathinones', 'steroids', 'u-47700']
seed: acetaminophen and oxycodone combination
not in the dictionary!


seed: p