## Library

In [180]:
import numpy as np
import tensorflow as tf
from copy import deepcopy

import pickle
from tqdm.notebook import tqdm

def load_pkl(file_path) :
    
    with open(file_path, 'rb') as f:
        df = pickle.load(f)
        
    return df

def save_pkl(df, file_path) :
    
    with open(file_path, 'wb') as f:
        pickle.dump(df, f)
        

## Tokenizer 불러오기

In [106]:
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizerFast

# classtransformers.BertTokenizerFast(
# vocab_file, 
# tokenizer_file=None, 
# do_lower_case=True, 
# unk_token='[UNK]', 
# sep_token='[SEP]', 
# pad_token='[PAD]', 
# cls_token='[CLS]', 
# mask_token='[MASK]', 
# tokenize_chinese_chars=True, 
# strip_accents=None, **kwargs

In [163]:
tokenizer_for_load = BertTokenizerFast.from_pretrained('./model/BertTokenizer-6000-32000-vocab.txt'
                                                   , strip_accents=False
                                                   , lowercase=False)

Calling BertTokenizerFast.from_pretrained() with the path to a single file or url is deprecated


## 데이터 불러오기

In [164]:
dict1 = load_pkl('./dt/wiki_kor_210520_pages-articles/wiki_ko_dict1_p.pkl')
# dict2 = load_pkl('./dt/wiki_kor_210520_pages-articles/wiki_ko_dict2_p.pkl')

In [176]:
documents_ids = list(dict1.keys())

vocab_size = tokenizer_for_load.vocab_size

train_set = []

r_func = np.random.random

for document_id, document_key in tqdm(enumerate(documents_ids)) :
    
    current_document = dict1[document_key]
    
    for statement_id, current_statement in enumerate(current_document[:-2]) :
        
        train_dict = {}
        
        current_statement = current_document[statement_id]

        if r_func() > 0.5 :
            nsp_token = [0, 1]
            r_document_ind = document_id
            
            while r_document_ind == document_id :
                r_document_ind = np.random.randint(0, len(documents_ids))

            r_document = dict1[documents_ids[r_document_ind]]

            r_statement_ind = np.random.randint(0, len(r_document))
            next_statement = r_document[r_statement_ind]

        else :
            nsp_token = [1, 0]
            next_statement = current_document[statement_id+1]
        
        tokenized_current = tokenizer_for_load(current_statement, return_tensors="tf")
        tokenized_next = tokenizer_for_load(next_statement, return_tensors="tf")
        
        token_data = np.concatenate([tokenized_current['input_ids'].numpy()[0], tokenized_next['input_ids'].numpy()[0, 1:]])
        
        masked_data = deepcopy(token_data)
        
        for token_id in range(len(masked_data)) :
            
            if masked_data[token_id] in [0, 1, 2, 3, 4] :
                continue
                
            if r_func() < 0.15 :
                
                how_change = r_func()
                if how_change > 0.2 :
                    
                    masked_data[token_id] = 4
                    
                elif how_change > 0.1 :
                    
                    masked_data[token_id] = np.random.randint(5, vocab_size)
                
                else :
                    
                    continue
                
        
        
        train_dict['x'] = masked_data
        train_dict['label'] = token_data
        train_dict['NSP'] = nsp_token
        
        train_set.append(train_dict)

0it [00:00, ?it/s]

In [178]:
chck_id = 29

print(tokenizer_for_load.convert_ids_to_tokens(train_set[chck_id]['x']))
print(tokenizer_for_load.convert_ids_to_tokens(train_set[chck_id]['label']))

['[CLS]', '특히', '국제', '분쟁', '조정을', '위해', '북한의', '김일성', ',', '아이티', '##의', '세', '##드라', '##스', '장군', '[MASK]', '팔레', '##인', '##스타', '##인의', '하마', '##스', ',', '보스니아', '[MASK]', '세르비아', '##계', '[MASK]', '같이', '미국', '정부에', '대해', '협상을', '거부', '##하면서', '사태', '[MASK]', '위기를', '초래', '##한', '인물', '및', '단체를', '직접', '만나', '분쟁', '##의', '원인을', '근본', '##적으로', '해결하기', '위해', '힘썼다', '냄새', '[SEP]', '[MASK]', '46', '.', '80', '##10', '##km2이고', ',', '인구는', '2015년', '8월', '기준으로', '5', ',', '66', '##2명이다', '.', '[SEP]']
['[CLS]', '특히', '국제', '분쟁', '조정을', '위해', '북한의', '김일성', ',', '아이티', '##의', '세', '##드라', '##스', '장군', ',', '팔레', '##인', '##스타', '##인의', '하마', '##스', ',', '보스니아', '##의', '세르비아', '##계', '정권', '같이', '미국', '정부에', '대해', '협상을', '거부', '##하면서', '사태', '##의', '위기를', '초래', '##한', '인물', '및', '단체를', '직접', '만나', '분쟁', '##의', '원인을', '근본', '##적으로', '해결하기', '위해', '힘썼다', '.', '[SEP]', '넓이는', '46', '.', '80', '##10', '##km2이고', ',', '인구는', '2015년', '8월', '기준으로', '5', ',', '66', '##2명이다', '.', '[SEP]']


In [181]:
save_pkl(train_set, './dt/train_set1.pkl')

In [182]:
# dict1 = load_pkl('./dt/wiki_kor_210520_pages-articles/wiki_ko_dict1_p.pkl')
dict2 = load_pkl('./dt/wiki_kor_210520_pages-articles/wiki_ko_dict2_p.pkl')

In [184]:
documents_ids = list(dict2.keys())

vocab_size = tokenizer_for_load.vocab_size

train_set = []

r_func = np.random.random

for document_id, document_key in tqdm(enumerate(documents_ids)) :
    
    current_document = dict2[document_key]
    
    for statement_id, current_statement in enumerate(current_document[:-2]) :
        
        train_dict = {}
        
        current_statement = current_document[statement_id]

        if r_func() > 0.5 :
            nsp_token = [0, 1]
            r_document_ind = document_id
            
            while r_document_ind == document_id :
                r_document_ind = np.random.randint(0, len(documents_ids))

            r_document = dict2[documents_ids[r_document_ind]]

            r_statement_ind = np.random.randint(0, len(r_document))
            next_statement = r_document[r_statement_ind]

        else :
            nsp_token = [1, 0]
            next_statement = current_document[statement_id+1]
        
        tokenized_current = tokenizer_for_load(current_statement, return_tensors="tf")
        tokenized_next = tokenizer_for_load(next_statement, return_tensors="tf")
        
        token_data = np.concatenate([tokenized_current['input_ids'].numpy()[0], tokenized_next['input_ids'].numpy()[0, 1:]])
        
        masked_data = deepcopy(token_data)
        
        for token_id in range(len(masked_data)) :
            
            if masked_data[token_id] in [0, 1, 2, 3, 4] :
                continue
                
            if r_func() < 0.15 :
                
                how_change = r_func()
                if how_change > 0.2 :
                    
                    masked_data[token_id] = 4
                    
                elif how_change > 0.1 :
                    
                    masked_data[token_id] = np.random.randint(5, vocab_size)
                
                else :
                    
                    continue
                
        
        
        train_dict['x'] = masked_data
        train_dict['label'] = token_data
        train_dict['NSP'] = nsp_token
        
        train_set.append(train_dict)

0it [00:00, ?it/s]

In [185]:
save_pkl(train_set, './dt/train_set2.pkl')

In [186]:
train1 = load_pkl('./dt/train_set1.pkl')
train2 = deepcopy(train_set)

In [188]:
train = train1 + train2

In [190]:
save_pkl(train, './dt/train_set.pkl')

In [195]:
x = list(map(lambda x: len(x['x']), train))

In [196]:
np.min(x), np.max(x), np.mean(x)

(11, 6838, 55.00080972449242)

In [211]:
len(x)

3180094

In [201]:
from collections import Counter

cnt = Counter(x)

In [209]:
cumsum = 0
for k in list(filter(lambda x : x>255, cnt.keys())) :
   cumsum += cnt[k]

In [210]:
cumsum

2238

In [212]:
x = list(filter(lambda x: len(x['x']) <= 255, train))

In [None]:
save_pkl(x, './dt/train_set_under_255.pkl')