In [146]:
import numpy as np
import pandas as pd
import re
from collections import defaultdict

from gensim.models import KeyedVectors, Word2Vec, FastText
from gensim.models.word2vec import LineSentence
from sklearn.metrics.pairwise import cosine_similarity

In [175]:
def read_files(answer_path='data/euphemism_answer_drug.txt',
               target_path='data/target_keywords_drug.txt',
               phrase_path='data/AutoPhrase_multi-words.txt'):
    """Load the ground-truth answers, target keywords and candidate phrases.

    Parameters
    ----------
    answer_path : str
        Text file with one ``formal_name: euphemism; euphemism; ...`` entry
        per line. Lines are lower-cased before parsing.
    target_path : str
        Text file with one tab-separated group of keywords per line. All
        keywords on the same line share that line's zero-based index.
    phrase_path : str
        Tab-separated file; the values at odd column positions (1, 3, ...)
        of every line are collected.

    Returns
    -------
    answer : collections.defaultdict(list)
        Maps each euphemism to the list of formal names it was listed under.
    drug_formal : list of str
        Sorted, de-duplicated formal names that have at least one euphemism.
    target_dict : dict
        Maps each target keyword to the index of the line it appeared on.
    euph : list of str
        Odd-column values of ``phrase_path`` in file order.
    """
    answer = defaultdict(list)

    with open(answer_path) as f:
        for line in f:
            parts = line.lower().split(':')
            # Blank or malformed lines carry no "formal: euphemisms" pair;
            # indexing parts[1] on them used to raise IndexError.
            if len(parts) < 2:
                continue

            formal = parts[0].strip()
            for fragment in re.split('[;\\n]+', parts[1]):
                # Strip before testing for emptiness so whitespace-only
                # fragments (e.g. after a trailing "; ") are not stored
                # under the key ''.
                euphemism = fragment.strip()
                if euphemism:
                    answer[euphemism].append(formal)

    drug_formal = sorted({name for names in answer.values() for name in names})

    target_dict = {}
    with open(target_path) as f:
        # The line index is used as the group id even when a line is blank,
        # so ids always correspond to line positions in the file.
        for group_id, line in enumerate(f):
            for keyword in line.strip().split('\t'):
                keyword = keyword.strip()
                if keyword:
                    target_dict[keyword] = group_id

    euph = []
    with open(phrase_path) as f:
        for line in f:
            # Keep only the odd-indexed columns of each row.
            euph.extend(line.strip().split('\t')[1::2])

    return answer, drug_formal, target_dict, euph

In [176]:
def train_w2v_embed(target, data, emb, window=6, vector_size=100, alpha=0.0001, workers=30):
    """Train a gensim ``Word2Vec`` model.

    Parameters
    ----------
    target : bool
        When exactly ``False``, ``data`` is treated as a path to a corpus
        file (read through ``LineSentence``), words seen fewer than 5 times
        are dropped, and the trained vectors are written to ``emb`` in the
        textual word2vec format. For any other value, ``data`` is passed to
        ``Word2Vec`` directly, ``min_count`` is 1 and nothing is saved.
    data : str or iterable
        Corpus path (``target is False``) or the sentences themselves.
    emb : str
        Output path for the embedding file; only used when ``target is False``.
    window, vector_size, alpha, workers :
        Forwarded unchanged to ``Word2Vec``. The defaults reproduce the
        values that were previously hard-coded here.

    Returns
    -------
    gensim.models.Word2Vec
        The trained model.
    """
    # NOTE(review): alpha=0.0001 equals gensim's default min_alpha, so the
    # learning rate never decays and stays far below the library default of
    # 0.025. Confirm this is intentional; it can leave vectors close to
    # their random initialisation.
    corpus_from_file = target is False
    sentences = LineSentence(data) if corpus_from_file else data

    model = Word2Vec(
        sentences,
        window=window,
        vector_size=vector_size,
        min_count=5 if corpus_from_file else 1,
        alpha=alpha,
        workers=workers,
    )

    if corpus_from_file:
        model.wv.save_word2vec_format(emb, binary=False)

    return model

In [186]:
def get_final_test(euphemism_answer, top_words, input_keywords):
    """Build the evaluation labels for every candidate word.

    A candidate keeps its answer list from ``euphemism_answer`` only when it
    is a known euphemism AND at least one of ``input_keywords`` appears in
    that list. Every other candidate is labelled with ``['None']``.

    Returns a dict mapping each word of ``top_words`` to its label list.
    """
    def _labels_for(word):
        # Membership test first, so unknown words are never looked up
        # with [] on the answer mapping.
        if word not in euphemism_answer:
            return ['None']
        for keyword in input_keywords:
            if keyword in euphemism_answer[word]:
                return euphemism_answer[word]
        return ['None']

    return {word: _labels_for(word) for word in top_words}

In [187]:
def print_final(fin_out, fin_test, target_n):
    """Print the average rank and top-k accuracy of the predictions.

    Parameters
    ----------
    fin_out : sequence
        ``fin_out[i]`` is the ordered sequence of predicted group ids for
        the i-th entry of ``fin_test`` (in dict iteration order).
    fin_test : dict
        Maps each evaluated word to its list of accepted formal names.
    target_n : dict
        Maps each target keyword to its integer group id.

    Returns
    -------
    int
        Always 0.

    Notes
    -----
    The rank of a word is the 1-based position of the first predicted group
    that contains one of its accepted names. When no prediction matches, the
    rank is the number of predictions that were examined.
    """
    # Nothing to score: avoid ZeroDivisionError on the average and
    # ValueError from max() on an empty mapping.
    if not fin_test or not target_n:
        print('No euphemisms or target keywords to evaluate.')
        return 0

    # target_n_list[g] holds every keyword whose group id is g, built in one
    # pass instead of rescanning the whole mapping once per group.
    target_n_list = [[] for _ in range(max(target_n.values()) + 1)]
    for name, group_id in target_n.items():
        if group_id >= 0:
            target_n_list[group_id].append(name)

    ranking_list = []
    for i, word in enumerate(fin_test):
        pos = 0
        for pos, group_id in enumerate(fin_out[i], start=1):
            if any(name in target_n_list[group_id] for name in fin_test[word]):
                break
        ranking_list.append(pos)

    print('Average ranking is {:.2f} for {:d} euphemisms.'.format(
        sum(ranking_list) / len(ranking_list), len(ranking_list)))

    n_groups = len(target_n_list)
    topk_acc = [sum(rank <= k + 1 for rank in ranking_list) / len(fin_test)
                for k in range(n_groups)]

    # Two aligned rows: the k values, then the accuracy at each k.
    print('[Top-k Accuracy]: ' + ''.join('|  {:2d}  '.format(k + 1) for k in range(n_groups)))
    print(' ' * 18 + ''.join('| {:.2f} '.format(acc) for acc in topk_acc))

    return 0

In [183]:
def w2v_detection(corpus_path='data/input.txt',
                  emb_path='enwiki-20221020-pages-articles-multistream-index.txt',
                  out_path='data/euphemism_word2vec_embedding.txt',
                  topn=1000,
                  vocab_limit=20000):
    """Rank euphemism candidates by similarity to the averaged seed vector.

    Steps:
      1. Load the target keywords via ``read_files``.
      2. Train Word2Vec on ``corpus_path`` and save the vectors to
         ``emb_path`` (done by ``train_w2v_embed``).
      3. Reload at most ``vocab_limit`` vectors from ``emb_path`` and average
         the vectors of the target keywords found among them.
      4. Query the trained model for the ``topn`` words most similar to that
         average, drop the target keywords themselves, print the result and
         write it to ``out_path`` one word per line.
      5. Read the AutoPhrase candidate lines (not used further yet).

    Raises
    ------
    ValueError
        If none of the target keywords is present in the loaded embeddings.
    """
    _, _, target_dict, _ = read_files()

    w2v_model = train_w2v_embed(False, corpus_path, emb_path)

    # Load the file that train_w2v_embed just wrote. The previous code read
    # from an undefined name (`emb_fn`), which raises NameError on a fresh
    # kernel.
    emb_dict = KeyedVectors.load_word2vec_format(emb_path, binary=False, limit=vocab_limit)

    # --- Detection ---
    seed_words = set(target_dict)
    target_vec = np.array([emb_dict[seed] for seed in target_dict if seed in emb_dict])
    if len(target_vec) == 0:
        # Averaging zero vectors would yield NaN and a meaningless query.
        raise ValueError('None of the target keywords is in the embedding vocabulary.')
    avg_target_vec = np.mean(target_vec, axis=0)

    top_k = [
        word
        for word, _ in w2v_model.wv.similar_by_vector(avg_target_vec, topn=topn)
        if word not in seed_words
    ]
    print(top_k)

    # NOTE(review): this used to be the absolute path '/data/...'; every
    # other file in this notebook lives under the relative 'data/' folder,
    # so the leading slash is assumed to be a typo - confirm.
    with open(out_path, 'w') as fout:
        fout.writelines(word + '\n' for word in top_k)

    # --- Identification ---
    # TODO: euph_candidates is collected but not used yet.
    euph_candidates = []
    with open('data/AutoPhrase_multi-words.txt', 'r') as fin:
        for line in fin:
            euph_candidates.append(line.strip())
    
    



# Run the detection pipeline; it prints the ranked candidate word list.
w2v_detection() 
        

['bullshit', 'zealand', 'officials', 'kept', 'flairing', 'style', 'chance', 'obscure', 'stars', 'friend', '21st', 'mountain', 'tree', 'children', 'forcing', 'thrills', 'lube', 'bill', 'occured', 'captcha', 'so', 'empath', 'gwern', 'oil', 'state', 'protection', 'relaxed', 'today', 'latest', 'leaning', 'host', 'hall', 'using', 'mr', 'guys', 'breathing', 'verification', 'lifetime', 'midst', 'behold', 'margins', 'website', 'highest', 'scope', 'successfully', 'quiet', 'logging', 'sudden', 'heshe', 'privileges', 'reminder', 'flavor', 'mix', 'lmk', 'consisted', 'ordered', 'gas', 'packaging', 'figure', 'diamond', 'quicker', 'informed', 'upper', 'stepped', 'starbucks', 'holyghost', 'esp', 'companies', 'neighborhood', 'purefire', 'rbitcoin', 'overwhelmed', 'steady', 'neat', 'comm', 'tor', 'breach', 'settings', 'export', 'confirm', 'reduced', 'hypothetically', 'r039', 'acropolis', 'might', 'opted', 'died', 'negative', 'beautifully', 'withdrawn', 'afford', 'hypothetical', 'education', 'mixture', '

In [121]:
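# Find embeddings from the input file and compare them with the target embeddings
# (using window=2~5 makes the results come out as phrases).
# Then compare against the AutoPhrase output and rank the candidates.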