## Library

In [1]:
import numpy as np
import tensorflow as tf
from copy import deepcopy

import pickle
from tqdm.notebook import tqdm

def load_pkl(file_path) :
    
    with open(file_path, 'rb') as f:
        df = pickle.load(f)
        
    return df

def save_pkl(df, file_path) :
    
    with open(file_path, 'wb') as f:
        pickle.dump(df, f)
        

## Tokenizer 불러오기

In [2]:
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizerFast

# classtransformers.BertTokenizerFast(
# vocab_file, 
# tokenizer_file=None, 
# do_lower_case=True, 
# unk_token='[UNK]', 
# sep_token='[SEP]', 
# pad_token='[PAD]', 
# cls_token='[CLS]', 
# mask_token='[MASK]', 
# tokenize_chinese_chars=True, 
# strip_accents=None, **kwargs

In [3]:
tokenizer_for_load = BertTokenizerFast.from_pretrained('./model/BertTokenizer-6000-32000-vocab.txt'
                                                   , strip_accents=False
                                                   , lowercase=False)

Calling BertTokenizerFast.from_pretrained() with the path to a single file or url is deprecated


## 데이터 불러오기

In [43]:
dict1 = load_pkl('./dt/wiki_kor_210520_pages-articles/wiki_ko_dict1_p.pkl')
# dict2 = load_pkl('./dt/wiki_kor_210520_pages-articles/wiki_ko_dict2_p.pkl')

In [46]:
documents_ids = list(dict1.keys())

vocab_size = tokenizer_for_load.vocab_size

train_set = []

r_func = np.random.random

for document_id, document_key in tqdm(enumerate(documents_ids)) :
    
    current_document = dict1[document_key]
    
    for statement_id, current_statement in enumerate(current_document[:-2]) :
        
        train_dict = {}
        
        current_statement = current_document[statement_id]

        if r_func() > 0.5 :
            nsp_token = [0, 1]
            r_document_ind = document_id
            
            while r_document_ind == document_id :
                r_document_ind = np.random.randint(0, len(documents_ids))

            r_document = dict1[documents_ids[r_document_ind]]

            r_statement_ind = np.random.randint(0, len(r_document))
            next_statement = r_document[r_statement_ind]

        else :
            nsp_token = [1, 0]
            next_statement = current_document[statement_id+1]
        
        tokenized_current = tokenizer_for_load(current_statement, return_tensors="tf")
        tokenized_next = tokenizer_for_load(next_statement, return_tensors="tf")
        
        token_data = np.concatenate([tokenized_current['input_ids'].numpy()[0], tokenized_next['input_ids'].numpy()[0, 1:]])
        
        masked_data = deepcopy(token_data)
        masked_position = []
        for token_id in range(len(masked_data)) :
            
            if masked_data[token_id] in [0, 1, 2, 3, 4] :
                continue
                
            if r_func() < 0.15 :
                masked_position.append(token_id)
                how_change = r_func()
                if how_change > 0.2 :
                    
                    masked_data[token_id] = 4
                    
                elif how_change > 0.1 :
                    random_token = np.random.randint(5, vocab_size)
                    
                    while masked_data[token_id] == random_token :
                        random_token = np.random.randint(5, vocab_size)
                    masked_data[token_id] = random_token
                    
                
                else :
                    
                    continue
                
        
        
        train_dict['x'] = masked_data
        train_dict['label'] = token_data
        train_dict['NSP'] = nsp_token
        train_dict['masked_position'] = masked_position
        
        train_set.append(train_dict)

0it [00:00, ?it/s]

In [47]:
chck_id = 29

print(tokenizer_for_load.convert_ids_to_tokens(train_set[chck_id]['x']))
print(tokenizer_for_load.convert_ids_to_tokens(train_set[chck_id]['label']))

['[CLS]', '특히', '국제', '분쟁', '조정을', '[MASK]', '[MASK]', '김일성', ',', '아이티', '##의', '세', '##드라', '##스', '장군', ',', '팔레', '[MASK]', '##스타', '##인의', '하마', '##스', ',', '보스니아', '##의', '세르비아', '##계', '정권', '##자라는', '미국', '정부에', '대해', '협상을', '거부', '[MASK]', '사태', '##의', '위기를', '초래', '##한', '인물', '및', '단체를', '직접', '만나', '[MASK]', '##의', '원인을', '사망한', '##적으로', '해결하기', '위해', '힘썼다', '.', '[SEP]', '죽음의', '외', '##인', '##부대', '##에서', '[MASK]', '나가', '[MASK]', '위해', '##선', '정해진', '기간을', '채', '##우', '##든지', ',', '임무', '수행', '##의', '대가로', '받는', '돈을', '[MASK]', '비용을', '[MASK]', '##는', '길', '##밖에', '[MASK]', '.', '[SEP]']
['[CLS]', '특히', '국제', '분쟁', '조정을', '위해', '북한의', '김일성', ',', '아이티', '##의', '세', '##드라', '##스', '장군', ',', '팔레', '##인', '##스타', '##인의', '하마', '##스', ',', '보스니아', '##의', '세르비아', '##계', '정권', '같이', '미국', '정부에', '대해', '협상을', '거부', '##하면서', '사태', '##의', '위기를', '초래', '##한', '인물', '및', '단체를', '직접', '만나', '분쟁', '##의', '원인을', '근본', '##적으로', '해결하기', '위해', '힘썼다', '.', '[SEP]', '죽음의', '외', '##인', '##부

In [48]:
save_pkl(train_set, './dt/train_set1-masked-position.pkl')

In [49]:
# dict1 = load_pkl('./dt/wiki_kor_210520_pages-articles/wiki_ko_dict1_p.pkl')
dict2 = load_pkl('./dt/wiki_kor_210520_pages-articles/wiki_ko_dict2_p.pkl')

In [50]:
documents_ids = list(dict2.keys())

vocab_size = tokenizer_for_load.vocab_size

train_set = []

r_func = np.random.random

for document_id, document_key in tqdm(enumerate(documents_ids)) :
    
    current_document = dict2[document_key]
    
    for statement_id, current_statement in enumerate(current_document[:-2]) :
        
        train_dict = {}
        
        current_statement = current_document[statement_id]

        if r_func() > 0.5 :
            nsp_token = [0, 1]
            r_document_ind = document_id
            
            while r_document_ind == document_id :
                r_document_ind = np.random.randint(0, len(documents_ids))

            r_document = dict2[documents_ids[r_document_ind]]

            r_statement_ind = np.random.randint(0, len(r_document))
            next_statement = r_document[r_statement_ind]

        else :
            nsp_token = [1, 0]
            next_statement = current_document[statement_id+1]
        
        tokenized_current = tokenizer_for_load(current_statement, return_tensors="tf")
        tokenized_next = tokenizer_for_load(next_statement, return_tensors="tf")
        
        token_data = np.concatenate([tokenized_current['input_ids'].numpy()[0], tokenized_next['input_ids'].numpy()[0, 1:]])
        
        masked_data = deepcopy(token_data)
        
        for token_id in range(len(masked_data)) :
            
            if masked_data[token_id] in [0, 1, 2, 3, 4] :
                continue
                
            if r_func() < 0.15 :
                masked_position.append(token_id)
                how_change = r_func()
                if how_change > 0.2 :
                    
                    masked_data[token_id] = 4
                    
                elif how_change > 0.1 :
                    random_token = np.random.randint(5, vocab_size)
                    
                    while masked_data[token_id] == random_token :
                        random_token = np.random.randint(5, vocab_size)
                    masked_data[token_id] = random_token
                    
                
                else :
                    
                    continue
                
        
        
        train_dict['x'] = masked_data
        train_dict['label'] = token_data
        train_dict['NSP'] = nsp_token
        train_dict['masked_position'] = masked_position
        
        train_set.append(train_dict)

0it [00:00, ?it/s]

In [51]:
save_pkl(train_set, './dt/train_set2-masked-position.pkl')

In [52]:
train1 = load_pkl('./dt/train_set1-masked-position.pkl')
train2 = deepcopy(train_set)

In [53]:
train = train1 + train2

In [54]:
save_pkl(train, './dt/train_set-masked-position.pkl')

In [1]:
import numpy as np
import tensorflow as tf
from copy import deepcopy

import pickle
from tqdm.notebook import tqdm

def load_pkl(file_path) :
    
    with open(file_path, 'rb') as f:
        df = pickle.load(f)
        
    return df

def save_pkl(df, file_path) :
    
    with open(file_path, 'wb') as f:
        pickle.dump(df, f)
        

In [2]:
train = load_pkl('./dt/train_set-masked-position.pkl')

In [3]:
max_seq_len = 255

In [4]:
x = list(filter(lambda x: len(x['x']) <= max_seq_len, train))

In [None]:
save_pkl(x, './dt/train_set-maksed-position_under_{}.pkl'.format(max_seq_len))

In [None]:
save_pkl(x[:10000], './dt/train_set-maksed-position-sample-10000.pkl'.format(max_seq_len))