In [39]:
import numpy as np
import tensorflow as tf
from copy import deepcopy

import import_ipynb
from QBert import qbert_model

import pickle
from tqdm.notebook import tqdm

from tensorflow.keras.preprocessing.sequence import pad_sequences

def load_pkl(file_path) :
    """Deserialize and return the object stored in the pickle file ``file_path``.

    The file is opened in binary read mode and closed before the value is
    handed back to the caller.
    """
    # pickle can execute arbitrary code while loading; only read trusted files.
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

def save_pkl(df, file_path) :
    """Pickle ``df`` into ``file_path``.

    The target is opened in binary write mode, so an existing file at that
    path is overwritten. Nothing is returned.
    """
    with open(file_path, 'wb') as sink:
        pickle.dump(df, sink)

def create_padding_mask(x):
    """Build a padding mask for a batch of token-id sequences.

    Every position whose id equals 0 is marked with 1.0, every other position
    with 0.0. The computation is done directly in NumPy (the result has always
    been a NumPy array), so no TensorFlow op is created just to be converted
    back.

    Args:
        x: 2-D array-like of token ids with shape (batch_size, seq_len).

    Returns:
        np.ndarray of dtype float32 with shape (batch_size, 1, 1, seq_len).
    """
    ids = np.asarray(x)
    batch_size, seq_len = ids.shape[0], ids.shape[1]
    # (batch_size, 1, 1, key sequence length)
    # NOTE(review): the two singleton axes presumably let the mask broadcast
    # against attention scores inside the model - confirm against qbert_model.
    return (ids == 0).astype(np.float32).reshape(batch_size, 1, 1, seq_len)

def ind_to_weight(masked_pos, seq_len) :
    """Convert a list of positions into a length-``seq_len`` weight vector.

    Each index in ``masked_pos`` is one-hot encoded with depth ``seq_len`` and
    the encodings are summed over the first axis, so a position listed k times
    ends up with weight k and unlisted positions with weight 0.
    """
    one_hot_rows = tf.one_hot(masked_pos, seq_len)
    return tf.reduce_sum(one_hot_rows, axis = 0)

def create_segments(inputs, sep_id = 3) :
    """Assign a segment index to every token of one sequence.

    The counter starts at 0 and is incremented *after* each occurrence of
    ``sep_id``: a separator therefore belongs to the segment it closes and
    every token following it falls into the next segment.

    NOTE(review): on post-padded input the padding that follows the second
    separator receives index 2 - make sure the model's segment embedding can
    index that value.

    Args:
        inputs: 1-D iterable of token ids.
        sep_id: token id that closes a segment. Defaults to 3, the value that
            used to be hard-coded (presumably the [SEP] id - confirm against
            the tokenizer vocabulary).

    Returns:
        np.ndarray holding one segment index per element of ``inputs``.
    """
    segment = []
    segment_num = 0

    for token_id in inputs :
        segment.append(segment_num)

        if token_id == sep_id :
            segment_num += 1

    return np.array(segment)

In [40]:
class BertModule(tf.keras.Model) :
    """Pre-training wrapper around ``qbert_model``.

    Adds two heads on top of the encoder's ``'sequence_output'``:

    * a masked-LM head that projects every position onto the vocabulary with
      the pseudo-inverse of the encoder's first weight matrix, and
    * a 2-way softmax classification head fed with the first position.

    ``call`` returns the combined loss instead of predictions; the caller is
    responsible for turning that output into a training objective.
    """

    def __init__(self, vocab_size, max_seq_len, num_layers, dff, d_model, num_heads, dropout, name) :
        """Build the encoder and the classification head.

        All arguments are forwarded unchanged to ``qbert_model``;
        ``vocab_size`` is additionally kept on the instance.
        """
        super(BertModule, self).__init__()
        self.Bert = qbert_model(vocab_size, max_seq_len, num_layers, dff, d_model, num_heads, dropout, name)
        self.dense_cls = tf.keras.layers.Dense(2, activation = 'softmax', use_bias = False)
        self.vocab_size = vocab_size

    def _forward_heads(self, x, mask, segments) :
        """Run the encoder and return (LM logits, classification probabilities)."""
        bert_outputs = self.Bert([x, mask, segments])

        y_pred = bert_outputs['sequence_output'] # encoder output at every position

        # layers[1].weights[0] is assumed to be the token-embedding matrix (the
        # index-based lookup is fragile - confirm against qbert_model).
        # NOTE(review): the pseudo-inverse is recomputed on every forward pass;
        # it is kept to preserve the original model definition, but the usual
        # tied-weight head multiplies by the transposed embedding - confirm intent.
        decode_matrix = tf.linalg.pinv(self.Bert.layers[1].weights[0])

        lm_logits = tf.matmul(y_pred, decode_matrix)
        pred_cls = self.dense_cls(y_pred[:, 0])

        return lm_logits, pred_cls

    def call(self, inputs) :
        """Compute the pre-training loss.

        Args:
            inputs: sequence ``[x, mask, lm, nsp, weight, segments]``.
                ``x``, ``mask`` and ``segments`` are passed to the encoder,
                ``lm`` holds the target token id of every position, ``weight``
                multiplies the per-position LM loss, and ``nsp`` multiplies the
                negative log of the 2-way classification output (presumably a
                one-hot (batch, 2) target - confirm against the data set).

        Returns:
            Tensor with the LM loss of each example (averaged over axis 1) plus
            the batch-mean classification loss.
        """
        x, mask, lm, nsp, weight, segments = inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], inputs[5]

        lm_logits, pred_cls = self._forward_heads(x, mask, segments)

        # Cross-entropy taken from the logits: mathematically equal to
        # sum(one_hot(lm) * -log(softmax(logits))) but it cannot produce
        # 0 * inf = NaN when a probability underflows, and it avoids building a
        # (batch, seq, vocab) one-hot tensor. Labels must lie in [0, vocab size).
        lm_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels = tf.cast(lm, dtype = tf.int32), logits = lm_logits)
        lm_losses = lm_losses * tf.cast(weight, dtype = lm_losses.dtype)
        # NOTE(review): this averages over every position, not only over the
        # positions with non-zero weight; unchanged from the original.
        lm_losses = tf.reduce_mean(lm_losses, axis = 1)

        nsp = tf.cast(nsp, dtype = tf.float32)
        # Keep the log argument away from zero so the loss stays finite.
        safe_cls = tf.clip_by_value(pred_cls, tf.keras.backend.epsilon(), 1.0)
        cls_losses = tf.reduce_mean(tf.reduce_sum(nsp * -tf.math.log(safe_cls), axis = 1))

        total_loss = lm_losses + cls_losses

        return total_loss

    def get_pretrained_result(self, inputs) :
        """Return the predictions of both heads.

        Args:
            inputs: sequence ``[x, mask, segments]`` passed to the encoder.

        Returns:
            ``(pred_lm, pred_cls)``: softmax over the vocabulary for every
            position, and the 2-way softmax of the classification head.
        """
        x, mask, segments = inputs[0], inputs[1], inputs[2]

        lm_logits, pred_cls = self._forward_heads(x, mask, segments)

        pred_lm = tf.math.softmax(lm_logits)

        return pred_lm, pred_cls

In [41]:
# Load the training examples from a pickle file. The commented-out lines are
# alternative data files; exactly one load_pkl call should be active.
# train = load_pkl('./dt/train_set_under_255.pkl')
train = load_pkl('./dt/train_set-maksed-position-sample-10000.pkl')
# train = load_pkl('./dt/train_set-masked-position.pkl')

In [42]:
# Keep only the examples whose token sequence holds at most 130 ids
# (the same value that is assigned to max_seq_len further down).
train = [example for example in train if len(example['x']) <= 130]

In [43]:
# Model hyper-parameters. The value in each trailing comment is the larger
# alternative setting (presumably the BERT-base configuration - confirm).
vocab_size = 32000
max_seq_len = 130 # 512
num_layers = 6 # 12
d_model = 384 # 768
dff = 2 * d_model # 768 * 4
num_heads = 6 # 12
dropout = 0.1
name = 'qbert_210603'

In [50]:
# Token ids and LM labels, post-padded to max_seq_len.
x = pad_sequences([example['x'] for example in train], max_seq_len, padding = 'post')
y = pad_sequences([example['label'] for example in train], max_seq_len, padding = 'post')

# One 'NSP' target per example.
nsp = np.asarray([example['NSP'] for example in train])

# Per-position weights built from each example's masked positions.
weight = np.array([ind_to_weight(example['masked_position'], max_seq_len) for example in train])

# Padding mask derived from the padded token ids.
mask = create_padding_mask(x)

# Segment indices for every padded row.
segments = np.array(list(map(create_segments, x)))

In [56]:
# Training schedule.
epochs = 1
batch_size = 32

# Adam step size and moment decay rates.
lr = 1e-4
optimizer = tf.keras.optimizers.Adam(learning_rate = lr, beta_1 = 0.9, beta_2 = 0.999)

In [52]:
# Instantiate the pre-training model from the hyper-parameters defined above.
pretrainBert = BertModule(
    vocab_size = vocab_size,
    max_seq_len = max_seq_len,
    num_layers = num_layers,
    dff = dff,
    d_model = d_model,
    num_heads = num_heads,
    dropout = dropout,
    name = name,
)

In [53]:
# BertModule.call already returns the loss, and fit() is given all-zero targets,
# so with loss='mse' the quantity Keras minimizes is the mean of the squared
# loss values rather than the loss itself.
pretrainBert.compile(optimizer=optimizer, loss ='mse')

In [49]:
# Placeholder targets: one zero per training example.
false_y = np.array([0] * len(x))

In [54]:
# Print the layer table of the wrapped encoder model.
pretrainBert.Bert.summary()

Model: "qbert_210603"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inputs (InputLayer)             [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, None, 384)    12288000    inputs[0][0]                     
__________________________________________________________________________________________________
position_embedding_2 (PositionE (None, None, 384)    49920       embedding_4[0][0]                
__________________________________________________________________________________________________
segments (InputLayer)           [(None, None)]       0                                            
_______________________________________________________________________________________

In [57]:
# The input order must match the unpacking in BertModule.call:
# [token ids, padding mask, LM labels, NSP targets, LM weights, segment ids].
# `weight` is the array built from the masked positions when the training
# arrays are prepared; the name `masked_lm_weight` used here before is never
# defined in this notebook and raised a NameError on a fresh kernel.
hist = pretrainBert.fit(
    x = [x, mask, y, nsp, weight, segments],
    y = false_y,
    batch_size = batch_size,
    epochs = epochs,
)



In [58]:
# Date tag embedded in the checkpoint path.
today = '210604'

# Path layout: ./model/BertPretrained-<today>-<max_seq_len>-<num_layers>-<d_model>-<num_heads>/
pretrainBert.save_weights('./model/BertPretrained-{}-{}-{}-{}-{}/'.format(today, max_seq_len, num_layers, d_model, num_heads))

## 결과 확인

In [59]:
# Reload the weights from the path built with the same format string and
# variables (today, max_seq_len, num_layers, d_model, num_heads) as the save call.
pretrainBert.load_weights('./model/BertPretrained-{}-{}-{}-{}-{}/'.format(today, max_seq_len, num_layers, d_model, num_heads))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1b92d0e1f60>

In [60]:
# Pick one training example at a random index. No seed is set in the visible
# cells, so a different example may be drawn on every run.
sample_train_set = train[np.random.randint(0, len(train))]

In [61]:
from transformers import BertTokenizerFast

# Build the tokenizer directly from the WordPiece vocabulary file: handing a
# single file to from_pretrained() is deprecated. The casing switch of
# BertTokenizerFast is `do_lower_case`; the `lowercase` keyword used before is
# not one of its parameters, so it did not turn lower-casing off.
tokenizer_for_load = BertTokenizerFast(vocab_file = './model/BertTokenizer-6000-32000-vocab.txt'
                                       , strip_accents = False
                                       , do_lower_case = False)

Calling BertTokenizerFast.from_pretrained() with the path to a single file or url is deprecated


In [62]:
# Map the sampled example's input ids ('x') back to tokens and join them with
# spaces so the model input can be read.
train_statement = ' '.join(tokenizer_for_load.convert_ids_to_tokens(sample_train_set['x']))
train_statement

'[CLS] 반면에 [MASK] ##놀 ##리 ##식 커널 [MASK] ##든 [MASK] 커널 ##이 ##든 전통적인 [MASK] 설계 ##는 하드웨어 [MASK] ##화 계층 ( ##랑의 [MASK] 이나 장치 [MASK] 아래 자원을 숨 ##김 ##으로써 하드웨어 [MASK] 추상 ##화한 ##다 . [SEP] 한 예로 전통적인 시스템에서 물리 메모리 [MASK] 할당 ##할 때 실제 [MASK] 알려 ##주지 않기 때문에 오프 ##셋 ##과 기억 관리 장치를 통해서 ##만 문제를 해결 할 수 있다 . [SEP]'

In [63]:
# Same decoding for the example's target ids ('label'), for comparison with
# the input shown above.
train_statement = ' '.join(tokenizer_for_load.convert_ids_to_tokens(sample_train_set['label']))
train_statement

'[CLS] 반면에 모 ##놀 ##리 ##식 커널 ##이 ##든 마이크로 커널 ##이 ##든 전통적인 커널 설계 ##는 하드웨어 추상 ##화 계층 ( hal ) 이나 장치 드라이버 아래 자원을 숨 ##김 ##으로써 하드웨어 ##를 추상 ##화한 ##다 . [SEP] 한 예로 전통적인 시스템에서 물리 메모리 ##가 할당 ##할 때 실제 위치를 알려 ##주지 않기 때문에 오프 ##셋 ##과 기억 관리 장치를 통해서 ##만 문제를 해결 할 수 있다 . [SEP]'

In [64]:
# sample_train_set = train[np.random.randint(0, len(train))]

train_x = tf.reshape(sample_train_set['x'], (1, -1))
train_x = pad_sequences(train_x, max_seq_len, padding = 'post')
mask = create_padding_mask(train_x)

segments = tf.reshape(create_segments(train_x[0]), (1, -1))

In [66]:
# Optional: load a different checkpoint before predicting.
# pretrainBert.load_weights('./model/model_weight_2106031943-epoch-1-0-loss-0.707.tf')
# Run both heads on the sampled example, then decode the highest-scoring
# vocabulary id of every position of the first (only) batch row.
lm, nls = pretrainBert.get_pretrained_result([train_x, mask, segments])
train_statement = ' '.join(tokenizer_for_load.convert_ids_to_tokens(tf.argmax(lm, axis = 2)[0]))
train_statement

', , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ,'

In [67]:
# Two-way softmax output of the classification head for the sampled example.
nls

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.5398552 , 0.46014476]], dtype=float32)>