# Masked Model

In [1]:
import nltk
import random
import torch
import string
from collections import defaultdict
from nltk.corpus import stopwords
from pytorch_pretrained_bert import BertTokenizer, BertForMaskedLM
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")
print('Initialize BERT vocabulary...')
# Tokenizer built from the vocabulary file stored next to the local model files.
bert_tokenizer = BertTokenizer(vocab_file='data/BERT_model_reddit/vocab.txt')
print('Initialize BERT model...')
# Load the masked-LM model from the local directory and move it to `device`.
bert_model = BertForMaskedLM.from_pretrained('data/BERT_model_reddit').to(device)
# Put the model in evaluation mode; only inference is done in this notebook.
# As the last expression of the cell, this call also echoes the model structure.
bert_model.eval()

Using cpu device
Initialize BERT vocabulary...
Initialize BERT model...


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
   

In [3]:
''' Masked Language Model '''
def MLM(sgs, input_keywords, thres=1, filter_uninformative=1):
    """Rank candidate words for the [MASK] slot over a list of masked sentences.

    Every sentence is scored with the module-level ``bert_model`` /
    ``bert_tokenizer``. For each sentence the 50 most probable fillers of its
    [MASK] token are collected; the probabilities of the fillers that survive
    the filters below are accumulated per word across all kept sentences.

    Args:
        sgs: List of masked sentences (strings expected to contain "[MASK]").
        input_keywords: Re-iterable collection of keyword strings. They are
            used both to judge whether a sentence is informative and to
            exclude the keywords themselves from the candidate ranking.
        thres: How many of the top predictions are inspected when counting
            keyword hits for the informativeness filter.
        filter_uninformative: When equal to 1, a sentence is skipped unless at
            least two input keywords appear among its top ``thres``
            predictions.

    Returns:
        A 3-tuple ``(out, out_tuple, good_sgs)``:
        ``out`` is the list of candidate words sorted by accumulated
        probability (descending), ``out_tuple`` is the same ranking as
        ``[word, score]`` pairs, and ``good_sgs`` is the list of sentences
        that passed the informativeness filter.
    """
    MASK, CLS, SEP = '[MASK]', '[CLS]', '[SEP]'
    MLM_k = 50  # number of predictions kept for each masked sentence

    # Nothing to score: return the same empty result the loop would produce,
    # without touching the model or the stopword corpus.
    if len(sgs) == 0:
        return [], [], []

    def to_bert_input(tokens, bert_tokenizer):
        """Turn a token list into (token ids, segment ids, attention mask) tensors."""
        # Map the string tokens to integer vocabulary ids.
        token_idx = torch.tensor(bert_tokenizer.convert_tokens_to_ids(tokens))
        sep_idx = tokens.index('[SEP]')
        # Same shape as token_idx, filled with zeros.
        segment_idx = token_idx * 0
        # Everything after the first [SEP] belongs to segment 1.
        segment_idx[(sep_idx + 1):] = 1
        # True wherever the token id is non-zero.
        mask = (token_idx != 0)
        # Add a leading batch dimension and move the tensors to `device`.
        return (
            token_idx.unsqueeze(dim=0).to(device),
            segment_idx.unsqueeze(dim=0).to(device),
            mask.unsqueeze(dim=0).to(device),
        )

    def single_MLM(message):
        """Return the top ``MLM_k`` [word, probability] pairs for one sentence.

        Returns an empty list when the sentence tokenizes to nothing or when
        no [MASK] token is present after tokenization.
        """
        tokens = bert_tokenizer.tokenize(message)
        if len(tokens) == 0:
            return []
        if tokens[0] != CLS:
            tokens = [CLS] + tokens
        if tokens[-1] != SEP:
            tokens.append(SEP)
        token_idx, segment_idx, mask = to_bert_input(tokens, bert_tokenizer)

        # Without a [MASK] token there is nothing to predict. The previous
        # code fell through to the result line with its top-k variables
        # unassigned, raising UnboundLocalError.
        mask_positions = [idx for idx, token in enumerate(tokens) if token == MASK]
        if not mask_positions:
            return []

        with torch.no_grad():  # inference only, no gradient tracking needed
            logits = bert_model(token_idx, segment_idx, mask, masked_lm_labels=None)
        logits = logits.squeeze(0)  # drop the batch dimension
        probs = torch.softmax(logits, dim=-1)

        # When several [MASK] tokens exist, the last one is used (this matches
        # the earlier loop, which overwrote its result on every mask).
        topk_prob, topk_indices = torch.topk(probs[mask_positions[-1], :], MLM_k)
        # .numpy() only works on CPU tensors, hence the .cpu() copy first.
        topk_tokens = bert_tokenizer.convert_ids_to_tokens(topk_indices.cpu().numpy())
        return [[topk_tokens[i], float(topk_prob[i])] for i in range(MLM_k)]

    # Built once: the stopword list used to be reloaded for every candidate word.
    stop_words = set(stopwords.words('english'))
    keyword_set = set(input_keywords)

    MLM_score = defaultdict(float)
    # Show a progress bar only when there are at least 10 sentences.
    temp = sgs if len(sgs) < 10 else tqdm(sgs)
    skip_ms_num = 0
    good_sgs = []
    for sgs_i in temp:
        try:
            top_words = single_MLM(sgs_i)
        except KeyError:
            print("KeyError occurred in MLM, and the sentence is: ", sgs_i)
            continue

        # Count how many input keywords appear among the top `thres` predictions.
        top_thres = {x[0] for x in top_words[:thres]}
        seen_input = sum(1 for input_i in input_keywords if input_i in top_thres)
        if filter_uninformative == 1 and seen_input < 2:
            skip_ms_num += 1
            continue

        good_sgs.append(sgs_i)
        for word, prob in top_words:
            if word in string.punctuation:
                continue
            if word in stop_words:
                continue
            if word in keyword_set:
                continue
            if word[:2] == '##':  # '##' marks a BERT word-piece continuation, not a word
                continue
            MLM_score[word] += prob

    # Candidate words ordered by accumulated probability, highest first.
    out = sorted(MLM_score, key=lambda x: MLM_score[x], reverse=True)
    out_tuple = [[x, MLM_score[x]] for x in out]
    if len(sgs) >= 10:
        print('The percentage of uninformative masked sentences is {:d}/{:d} = {:.2f}%'.format(skip_ms_num, len(sgs), float(skip_ms_num)/len(sgs)*100))
    return out, out_tuple, good_sgs

In [4]:
''' topk part '''
def euphemism_detection(input_keywords, all_text, ms_limit, filter_uninformative):
    """Build masked sentences around the input keywords and rank [MASK] fillers.

    Each sentence of ``all_text`` is word-tokenized; for every input keyword
    that occurs as a token, one masked sentence is produced in which the first
    occurrence of that keyword is replaced by "[MASK]". The masked sentences
    are shuffled, truncated to ``ms_limit`` and handed to ``MLM``.

    Args:
        input_keywords: Keyword strings to mask out.
        all_text: Iterable of sentences (strings).
        ms_limit: Upper bound on the number of masked sentences passed to MLM.
        filter_uninformative: Forwarded unchanged to ``MLM``.

    Returns:
        ``(top_words, informative)``: the ranked candidate words and the
        masked sentences that ``MLM`` kept.
    """
    stars = '*' * 40
    print('\n' + stars + ' [Euphemism Detection] ' + stars)
    print('[util.py] Input Keyword: ', input_keywords, sep='')
    print('[util.py] Extracting masked sentences for input keywords...')

    masked_sentence = []
    for sentence in tqdm(all_text):
        words = nltk.word_tokenize(sentence)
        for keyword in input_keywords:
            try:
                position = words.index(keyword)
            except ValueError:
                # Keyword does not occur as a token in this sentence.
                continue
            before = ' '.join(words[:position])
            after = ' '.join(words[position + 1:])
            # e.g. 'a b [MASK] d e f'
            masked_sentence.append(before + ' [MASK] ' + after)

    # Random sample of at most ms_limit masked sentences.
    random.shuffle(masked_sentence)
    masked_sentence = masked_sentence[:ms_limit]

    print('[util.py] Generating top candidates...')
    ranked_words, _, kept_sentences = MLM(
        masked_sentence,
        input_keywords,
        thres=5,
        filter_uninformative=filter_uninformative,
    )
    return ranked_words, kept_sentences

In [11]:
''' Printing functions '''
class print_color:
    """ANSI escape sequences for coloring and styling terminal output."""

    # Foreground colors
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Resets every color/attribute set before it
    END = '\033[0m'


def color_print_top_words(top_words, gt_euphemism, topk=30):
    """Print the first ``topk`` candidate words, highlighting ground-truth hits.

    Args:
        top_words: Ranked list of candidate words (strings).
        gt_euphemism: Container of ground-truth euphemisms; only membership
            tests (``word in gt_euphemism``) are performed on it.
        topk: Number of leading candidates to print. Defaults to 30.

    Returns:
        None. The candidates are written to stdout on a single line, each
        followed by ", "; words found in ``gt_euphemism`` are printed in bold
        purple using the ANSI codes from ``print_color``.
    """
    print('[Euphemism Candidates]: ')
    for word in top_words[:topk]:
        if word in gt_euphemism:
            print(print_color.BOLD + print_color.PURPLE + word + print_color.END, end=', ')
        else:
            print(word, end=', ')
    print()

In [6]:
''' Evaluation '''
def evaluate_detection(top_words, gt_euphemism):
    """Print the candidate list and its top-k precision against the ground truth.

    Args:
        top_words: Ranked candidate words, as returned by
            ``euphemism_detection``.
        gt_euphemism: Ground-truth euphemisms; only membership tests are
            performed on it (presumably a dict keyed by euphemism - confirm
            against ``read_all_data``).

    Returns:
        Always 0. The results are reported on stdout.
    """
    color_print_top_words(top_words, gt_euphemism)

    # topk_precision_list[i] = (# ground-truth hits among the first i+1 words) / (i+1)
    #
    # rank   hit   running hits   precision
    # 1      1     1              1/1 = 1
    # 2      0     1              1/2 = 0.5
    # 3      0     1              1/3 = 0.3333
    # 4      1     2              2/4 = 0.5
    topk_precision_list = []
    cumulative_hits = 0
    for rank, word in enumerate(top_words, start=1):
        if word in gt_euphemism:
            cumulative_hits += 1
        topk_precision_list.append(cumulative_hits / rank)

    for topk in [10, 20, 30, 50]:
        # Top-k precision is defined as soon as at least k candidates exist,
        # so a list of exactly k candidates must be reported too (<=, not <).
        if topk <= len(topk_precision_list):
            print('Top-{:d} precision is {:.2f}'.format(topk, topk_precision_list[topk-1]))
    return 0

In [14]:
''' REAL main '''

# from identification import euphemism_identification
from read_files import read_all_data

# Input file paths handed to read_all_data, in the order it expects them.
dataset = 'data/output/processed_corpus.txt'
euph_file = 'data/euphemism_answer_drug.txt'
t_file = 'data/target_keywords_drug.txt'
auto_file = 'data/AutoPhrase.txt'
''' Read Data '''
# read_all_data returns five values; the last one is not used in this notebook.
# all_text, euphemism_answer and drug_formal feed the detection cell further down.
all_text, euphemism_answer, drug_formal, target_name, _ = read_all_data(dataset, euph_file, t_file, auto_file)

[read_data.py] Reading data with read_all_data...
[read_data.py] Reading data with read_files...


100%|██████████| 6174234/6174234 [01:37<00:00, 63005.25it/s]

[read_data.py] Finish reading data using 99.10s





In [15]:
''' Euphemism Detection '''
# Rank euphemism candidates from at most 2000 masked sentences, then score them.
top_words, informative = euphemism_detection(drug_formal, all_text, ms_limit=2000, filter_uninformative=1)
evaluate_detection(top_words, euphemism_answer)

# Save the masked sentences that passed the informativeness filter, one per line.
with open('data/output/viz_informative_sentence.txt', 'w') as fout:
    fout.writelines(f"{sent}\n" for sent in informative)


**************************************** [Euphemism Detection] ****************************************
[util.py] Input Keyword: ['acetaminophen and oxycodone combination', 'adderall', 'alprazolam', 'amphetamine', 'amphetamine and dextroamphetamine combination', 'buprenorphine and naloxone combination', 'clonazepam', 'cocaine', 'concerta', 'crack cocaine', 'daytrana', 'dilaudid', 'ecstasy', 'fentanyl', 'flunitrazepam', 'gamma-hydroxybutyric acid', 'ghb', 'hash oil', 'heroin', 'hydrocodone', 'hydromorphone', 'ketalar', 'ketamine', 'khat', 'klonopin', 'lorcet', 'lsd', 'lysergic acid diethylamide', 'marijuana', 'marijuana concentrates', 'mdma', 'mescaline', 'methamphetamine', 'methylphenidate', 'molly', 'morphine', 'norco', 'opium', 'oxaydo', 'oxycodone', 'oxycontin', 'pcp', 'percocet', 'peyote', 'phencyclidine', 'promethazine', 'psilocybin mushrooms', 'ritalin', 'rohypnol', 'roxicodone', 'steroids', 'suboxone', 'synthetic cannabinoids', 'synthetic cathinones', 'u-47700', 'vicodin', 'xan

100%|██████████| 246263/246263 [00:29<00:00, 8312.62it/s]


[util.py] Generating top candidates...


100%|██████████| 2000/2000 [06:26<00:00,  5.17it/s]

The percentage of uninformative masked sentences is 1827/2000 = 91.35%
[Euphemism Candidates]: 
[1m[95mweed[0m, [1m[95macid[0m, drug, alcohol, [1m[95mcoke[0m, cannabis, drugs, mushrooms, [1m[95mspeed[0m, md, pills, crack, [1m[95mpot[0m, something, [1m[95mpowder[0m, [1m[95mblow[0m, [1m[95mk[0m, [1m[95mcrystal[0m, tobacco, [1m[95mlucy[0m, [1m[95mlean[0m, [1m[95mhash[0m, l, [1m[95mh[0m, [1m[95mstuff[0m, [1m[95mhydro[0m, [1m[95me[0m, [1m[95mcactus[0m, psychedelic, cigarettes, 
Top-10 precision is 0.40
Top-20 precision is 0.50
Top-30 precision is 0.57
Top-50 precision is 0.48



