In [8]:
import numpy as np
import pandas as pd
import re
from collections import defaultdict

from gensim.models import KeyedVectors, Word2Vec, FastText
from gensim.models.word2vec import LineSentence
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
def read_files():
    """Load the euphemism answer key, the target keywords and the AutoPhrase candidates.

    Three files are read, relative to the current working directory:

    * ``data/euphemism_answer_drug.txt`` -- one ``formal: casual; casual; ...``
      entry per line (the whole line is lower-cased before parsing).
    * ``data/target_keywords_drug.txt`` -- one tab-separated group of target
      keywords per line; every keyword on a line shares that line's group id.
    * ``data/AutoPhrase.txt`` -- tab-separated fields; the odd-indexed fields
      (2nd, 4th, ...) of each line are collected.

    Blank lines, answer lines without a ``:`` separator and empty /
    whitespace-only tokens are skipped rather than raising ``IndexError`` or
    being registered under the empty string.

    Returns:
        tuple: ``(answer, drug_formal, target_dict, euph)`` where

        * ``answer`` is a ``defaultdict(list)`` mapping casual term -> list of
          formal names,
        * ``drug_formal`` is the sorted list of distinct formal names found in
          ``answer``,
        * ``target_dict`` maps target keyword -> 0-based id of its
          (non-blank) line,
        * ``euph`` is the list of odd-indexed AutoPhrase fields, in file order.
    """
    answer = defaultdict(list)
    pattern = '[;\\n]+'

    with open('data/euphemism_answer_drug.txt') as f:
        for line in f:
            parts = line.lower().split(':')
            if len(parts) < 2:
                # Blank or malformed line: nothing to map.
                continue
            # Strip so the formal name can match the (stripped) target keywords.
            formal = parts[0].strip()
            for casual in re.split(pattern, parts[1]):
                casual = casual.strip()
                if casual:
                    answer[casual].append(formal)  # key = casual, val = formal

    drug_formal = sorted({formal for formals in answer.values() for formal in formals})

    target_dict = {}
    group_id = 0
    with open('data/target_keywords_drug.txt') as f:
        for line in f:
            keywords = [kw.strip() for kw in line.strip().split('\t')]
            keywords = [kw for kw in keywords if kw]
            if not keywords:
                # A blank line must not consume a group id.
                continue
            for kw in keywords:
                target_dict[kw] = group_id
            group_id += 1

    euph = []
    with open('data/AutoPhrase.txt') as f:
        for line in f:
            # Odd-indexed columns only (index 1, 3, ...).
            euph.extend(line.strip().split('\t')[1::2])

    return answer, drug_formal, target_dict, euph

In [10]:
def train_w2v_embed(target, data, emb):
    """Train a Word2Vec model and return it.

    Args:
        target: when this is exactly ``False``, ``data`` is wrapped in
            ``LineSentence`` and the trained vectors are also exported to
            ``emb``; for any other value ``data`` is handed to ``Word2Vec``
            as-is and nothing is written.
        data: corpus source (passed to ``LineSentence`` in the ``False``
            branch, otherwise directly to ``Word2Vec``).
        emb: output path for the text-format word2vec export; only used when
            ``target is False``.

    Returns:
        The trained ``Word2Vec`` model.
    """
    # Hyper-parameters common to both training modes.
    shared = dict(window=6, vector_size=100, alpha=0.0001, workers=30)

    if target is not False:
        return Word2Vec(data, min_count=1, **shared)

    corpus = LineSentence(data)
    model = Word2Vec(corpus, min_count=5, **shared)
    model.wv.save_word2vec_format(emb, binary=False)
    return model

In [11]:
def get_final_test(euphemism_answer, top_words, input_keywords):
    """Attach the known formal names to each candidate word.

    Args:
        euphemism_answer: mapping of casual term -> list of formal names.
        top_words: candidate words (e.g. AutoPhrase phrases) to label.
        input_keywords: formal names of interest.

    Returns:
        dict: candidate word -> the answer list from ``euphemism_answer`` when
        the word is a known casual term and at least one of ``input_keywords``
        appears in that list; otherwise ``['None']``.
    """
    def _label(word):
        # Unknown candidate: no ground truth available.
        if word not in euphemism_answer:
            return ['None']
        formal_names = euphemism_answer[word]
        for keyword in input_keywords:
            if keyword in formal_names:
                # Reuse the answer list itself (not a copy).
                return formal_names
        # Known casual term, but for none of the requested formal names.
        return ['None']

    return {word: _label(word) for word in top_words}

In [12]:
def print_final(fin_out, fin_test, target_n):
    """Print the average rank and top-k accuracy of the predicted target groups.

    Args:
        fin_out: sequence of rankings; ``fin_out[i]`` is the list of target
            group ids predicted for the i-th key of ``fin_test``, best first.
        fin_test: dict mapping euphemism -> list of correct formal names
            (iteration order must line up with ``fin_out``).
        target_n: dict mapping target keyword -> group id (0-based).

    Returns:
        int: always ``0``.

    Notes:
        The rank of a euphemism is the 1-based position of the first group in
        its ranking that contains one of its formal names. If no group matches,
        the rank is the length of the ranking.
        When ``fin_test`` is empty a short notice is printed instead of the
        report, so the average is never computed over zero items.
    """
    if not fin_test:
        print('No euphemisms to evaluate.')
        return 0

    # Group the keywords by id: target_n_list[g] holds every keyword of group g.
    n_groups = max(target_n.values(), default=-1) + 1
    target_n_list = [
        [name for name in target_n if target_n[name] == group]
        for group in range(n_groups)
    ]

    ranking_list = []
    for i, word in enumerate(fin_test):
        pos = 0
        for j in fin_out[i]:
            pos += 1
            if any(e in target_n_list[j] for e in fin_test[word]):
                break
        ranking_list.append(pos)

    print('Average ranking is {:.2f} for {:d} euphemisms.'.format(sum(ranking_list)/len(ranking_list), len(ranking_list)))

    # topk_acc[k] = share of euphemisms whose rank is within the top k+1.
    topk_acc = [sum(x <= k + 1 for x in ranking_list) / len(fin_test) for k in range(len(target_n_list))]
    print('[Top-k Accuracy]: ', end='')

    for k in range(len(target_n_list)):
        print('|  {:2d}  '.format(k + 1), end='')
    print()
    print(' ' * 18, end='')

    for k in range(len(target_n_list)):
        print('| {:.2f} '.format(topk_acc[k]), end='')
    print()

    return 0

In [14]:
def w2v_detection():
    """Train word2vec on the corpus, detect euphemism candidates and score identification.

    Steps:
      1. Load the answer key and target keywords with ``read_files()``.
      2. Train a model on ``data/processed_corpus.txt``, export its vectors in
         text word2vec format and reload the first 20000 of them.
      3. Detection: average the vectors of the target keywords present in the
         reloaded embedding and write the model's 1000 nearest words (minus
         the target keywords themselves) to
         ``data/euphemism_word2vec_embedding.txt``.
      4. Identification: for each distinct AutoPhrase candidate that is in the
         reloaded embedding and has a known formal name, rank the target
         groups by cosine similarity to the candidate, then print the rankings
         and the report produced by ``print_final``.

    Raises:
        ValueError: if none of the target keywords is present in the embedding.
    """
    answer, drug_formal, target_dict, _ = read_files()

    c_file = 'data/processed_corpus.txt'
    # NOTE(review): this path is where the trained vectors are exported and
    # reloaded from; the name looks like a Wikipedia index dump -- confirm it
    # is not overwriting an unrelated file.
    e_file = 'enwiki-20221020-pages-articles-multistream-index.txt'

    w2v_model = train_w2v_embed(False, c_file, e_file)

    emb_dict = KeyedVectors.load_word2vec_format(e_file, binary=False, limit=20000)

    # --- Detection ---
    real_seed = list(target_dict.keys())
    seed_set = set(real_seed)  # O(1) membership when filtering the neighbours

    # Keep only the seeds that have a vector; remember each one's group id so
    # a row index of target_vec maps straight to a target group.
    found_seeds = [seed for seed in real_seed if seed in emb_dict]
    if not found_seeds:
        raise ValueError('None of the target keywords is present in the embedding: ' + e_file)

    target_vec = np.array([emb_dict[seed] for seed in found_seeds])
    seed_groups = [target_dict[seed] for seed in found_seeds]
    avg_target_vec = np.sum(target_vec, axis=0) / len(target_vec)

    top_k = [
        word
        for word, _score in w2v_model.wv.similar_by_vector(avg_target_vec, topn=1000)
        if word not in seed_set
    ]

    with open('data/euphemism_word2vec_embedding.txt', 'w') as fout:
        for word in top_k:
            fout.write(word + '\n')

    # --- Identification ---
    euph_candidates = []
    with open('data/AutoPhrase.txt', 'r') as fin:
        for line in fin:
            fields = line.strip().split('\t')
            if len(fields) > 1:  # skip blank / single-column lines
                euph_candidates.append(fields[1])

    # candidate phrase -> formal drug names (or ['None'])
    fin_test = get_final_test(answer, euph_candidates, drug_formal)

    result = []
    filtered_fin_test = {}

    for cand in euph_candidates:
        if cand in filtered_fin_test:
            # A repeated candidate would add a second ranking row while the
            # dict keeps one key, shifting every later row out of alignment
            # with the keys that print_final enumerates.
            continue
        # Keep candidates that have a vector and refer to a formal drug name.
        if (cand in emb_dict) and (fin_test[cand] != ['None']):
            similarities = cosine_similarity([emb_dict[cand]], target_vec)
            # argsort is ascending; reverse it to get most-similar-first.
            order = np.argsort(similarities).tolist()[0][::-1]
            result.append([seed_groups[x] for x in order])
            filtered_fin_test[cand] = fin_test[cand]

    # Several seeds can share a group: keep each group's first (best) position.
    final_result = [list(dict.fromkeys(groups)) for groups in result]

    print(final_result)
    print_final(final_result, filtered_fin_test, target_dict)


# Run the pipeline: trains the word2vec model, writes the detected words and
# prints the per-candidate rankings followed by the accuracy report.
w2v_detection() 
        

[[14, 1, 5, 0, 4, 23, 25, 27, 24, 21, 13, 22, 9, 26, 11, 20, 2, 8, 31, 18, 15, 6, 16, 12, 3, 17], [25, 14, 1, 0, 5, 23, 27, 13, 22, 9, 21, 24, 26, 4, 20, 11, 2, 8, 31, 18, 15, 12, 6, 16, 3, 17], [14, 23, 25, 24, 22, 1, 5, 9, 31, 21, 0, 13, 27, 26, 4, 11, 2, 20, 8, 18, 15, 12, 6, 16, 3, 17], [14, 21, 23, 1, 25, 9, 24, 5, 27, 13, 0, 22, 26, 4, 31, 11, 20, 2, 8, 18, 15, 6, 12, 16, 3, 17], [9, 24, 21, 31, 27, 20, 18, 11, 22, 1, 14, 23, 25, 4, 2, 0, 5, 8, 26, 12, 13, 6, 15, 16, 3, 17], [9, 24, 21, 27, 25, 1, 23, 22, 31, 14, 20, 11, 0, 5, 4, 2, 13, 18, 26, 8, 15, 12, 6, 16, 3, 17], [27, 25, 9, 24, 1, 23, 21, 0, 13, 14, 11, 5, 20, 22, 4, 2, 31, 26, 18, 8, 16, 15, 12, 6, 3, 17], [20, 24, 9, 31, 27, 4, 21, 11, 22, 23, 18, 13, 1, 25, 14, 8, 5, 0, 2, 12, 26, 15, 6, 16, 3, 17], [9, 31, 24, 27, 21, 1, 23, 14, 25, 22, 0, 20, 5, 11, 2, 4, 13, 26, 18, 8, 16, 12, 15, 6, 3, 17], [9, 24, 25, 21, 27, 1, 23, 14, 22, 0, 5, 13, 31, 20, 11, 4, 26, 2, 18, 8, 15, 12, 6, 16, 3, 17], [24, 23, 4, 9, 27, 22, 25, 1,

In [15]:
def eval_w2v_embed(embed_fn, seeds=None, res_fn=None):
    """Print, for each seed word, which of its nearest neighbours list it back.

    For every seed found in the embedding, its 100 nearest neighbours are
    looked up and (when ``res_fn`` is given) written as one comma-separated
    line. A neighbour is reported on stdout when the seed is in turn among
    that neighbour's own 100 nearest neighbours.

    Args:
        embed_fn: path of a text-format word2vec embedding file.
        seeds: iterable of seed words; ``None`` (the default) means no seeds.
        res_fn: path of the results file to write; when ``None`` no file is
            written and only the console output is produced.
    """
    if seeds is None:
        seeds = []

    fout = open(res_fn, "w") if res_fn is not None else None
    try:
        if fout is not None:
            fout.write(", ".join(["KeyWord", "Neighbors"])+"\n")
        wv = KeyedVectors.load_word2vec_format(embed_fn, binary=False)
        for seed in seeds:
            print("seed: {}".format(seed))
            if seed in wv:
                neb_scores = wv.similar_by_word(seed, topn=100)
                nebs = [neb for neb, _ in neb_scores]
                if fout is not None:
                    fout.write(", ".join([seed] + nebs)+"\n")
                # Keep only the mutual neighbours: seed must appear in the
                # neighbour's own top-100 list.
                for neb in nebs:
                    neb_nebs = {word for word, _ in wv.similar_by_word(neb, topn=100)}
                    if seed in neb_nebs:
                        print("neb: {} for seed: {}".format(neb, seed))
            else:
                # Only a missing vocabulary entry is treated as "not found";
                # any other error now propagates instead of being hidden.
                print("not in the dictionary!")
            print("\n")
    finally:
        # Close the results file even if loading or a lookup fails.
        if fout is not None:
            fout.close()
    
# Only the target-keyword mapping is needed here; the other three return
# values of read_files() are discarded.
_, _, target_dict, _ = read_files()
real_seed = list(target_dict.keys())
print(real_seed)
# Same embedding path that w2v_detection() exports the trained vectors to.
eval_w2v_embed('enwiki-20221020-pages-articles-multistream-index.txt', real_seed, 'data/eval_w2v_embed.txt')

['acetaminophen and oxycodone combination', 'percocet', 'alprazolam', 'xanax', 'amphetamine', 'amphetamine and dextroamphetamine combination', 'adderall', 'buprenorphine and naloxone combination', 'suboxone', 'clonazepam', 'klonopin', 'cocaine', 'crack cocaine', 'ecstasy', 'mdma', 'molly', 'fentanyl', 'flunitrazepam', 'rohypnol', 'gamma-hydroxybutyric acid', 'ghb', 'heroin', 'hydrocodone', 'norco', 'vicodin', 'lorcet', 'hydromorphone', 'dilaudid', 'ketamine', 'ketalar', 'khat', 'lysergic acid diethylamide', 'lsd', 'marijuana', 'marijuana concentrates', 'hash oil', 'mescaline', 'peyote', 'methamphetamine', 'methylphenidate', 'ritalin', 'concerta', 'daytrana', 'morphine', 'opium', 'oxaydo', 'oxycodone', 'oxycontin', 'roxicodone', 'phencyclidine', 'pcp', 'promethazine', 'psilocybin mushrooms', 'synthetic cannabinoids', 'synthetic cathinones', 'steroids', 'u-47700']
seed: acetaminophen and oxycodone combination
not in the dictionary!


seed: percocet
neb: oxycodone for seed: percocet
neb: 