In [39]:
import numpy as np
import tensorflow as tf
from copy import deepcopy

import import_ipynb
from QBert import qbert_model

import pickle
from tqdm.notebook import tqdm

from tensorflow.keras.preprocessing.sequence import pad_sequences

def load_pkl(file_path) :
    
    with open(file_path, 'rb') as f:
        df = pickle.load(f)
        
    return df

def save_pkl(df, file_path) :
    
    with open(file_path, 'wb') as f:
        pickle.dump(df, f)

def create_padding_mask(x):
    init_shape = x.shape
    mask = tf.cast(tf.math.equal(x, 0), tf.float32)
    # (batch_size, 1, 1, key의 문장 길이)
    return np.array(mask).reshape(init_shape[0], 1,1, init_shape[1])

def ind_to_weight(masked_pos, seq_len) :
    return tf.reduce_sum(tf.one_hot(masked_pos, seq_len), axis = 0)

def create_segments(inputs) :
    
    segment = []
    segment_num = 0
    
    for i, x in enumerate(inputs) :
        
        segment.append(segment_num)
        
        if x == 3 :
            segment_num+=1
            
    return np.array(segment)

In [40]:
class BertModule(tf.keras.Model) :

    def __init__(self, vocab_size, max_seq_len, num_layers, dff, d_model, num_heads, dropout, name) :
        super(BertModule, self).__init__()
        self.Bert = qbert_model(vocab_size, max_seq_len, num_layers, dff, d_model, num_heads, dropout, name)
        self.dense_cls = tf.keras.layers.Dense(2, activation = 'softmax', use_bias = False)
        self.vocab_size = vocab_size
    
    def call(self, inputs) :
        
        x, mask, lm, nsp, weight, segments = inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], inputs[5]
    
        bert_outputs = self.Bert([x, mask, segments])

        y_pred = bert_outputs['sequence_output']

        decode_matrix = tf.linalg.pinv(self.Bert.layers[1].weights[0])

        pred_lm =  tf.math.softmax(tf.matmul(y_pred, decode_matrix))
        pred_cls = self.dense_cls(y_pred[:, 0])

        true_y_lm = tf.cast(tf.one_hot(tf.cast(lm, dtype = tf.int32), depth = self.vocab_size), dtype = tf.float32)

        lm_losses = (tf.reduce_sum(true_y_lm * -tf.math.log(pred_lm), axis = 2))
        lm_losses = lm_losses * weight
        lm_losses = tf.reduce_mean(lm_losses, axis = 1)

        nsp = tf.cast(nsp, dtype = tf.float32)
        cls_losses = tf.reduce_mean(tf.reduce_sum(nsp * -tf.math.log(pred_cls), axis = 1))

        total_loss = lm_losses + cls_losses

        return total_loss
    
    def get_pretrained_result(self, inputs) :
        
        x, mask, segments = inputs[0], inputs[1], inputs[2]
    
        bert_outputs = self.Bert([x, mask, segments])

        y_pred = bert_outputs['sequence_output']

        decode_matrix = tf.linalg.pinv(self.Bert.layers[1].weights[0])

        pred_lm =  tf.math.softmax(tf.matmul(y_pred, decode_matrix))
        pred_cls = self.dense_cls(y_pred[:, 0])
        
        return pred_lm, pred_cls

In [41]:
# train = load_pkl('./dt/train_set_under_255.pkl')
train = load_pkl('./dt/train_set-maksed-position-sample-10000.pkl')
# train = load_pkl('./dt/train_set-masked-position.pkl')

In [42]:
train = list(filter(lambda x: len(x['x']) <= 130, train))

In [43]:
vocab_size = 32000
max_seq_len = 130
num_layers = 6
dff = 384 * 2
d_model = 384
num_heads = 6
dropout = .1
name = 'qbert_210603'

In [50]:
x = pad_sequences([ x['x'] for x in train ], max_seq_len, padding = 'post')
y = pad_sequences([ x['label'] for x in train ] , max_seq_len, padding = 'post')
nsp = np.asarray([ x['NSP'] for x in train ])

weight = np.array([ ind_to_weight(x['masked_position'], max_seq_len) for x in train])

mask = create_padding_mask(x)

segments = np.array([create_segments(i) for i in x])

In [56]:
epochs = 1
lr = 1e-4
batch_size = 32

optimizer = tf.keras.optimizers.Adam(lr, beta_1=0.9, beta_2=0.999)

In [52]:
pretrainBert = BertModule(vocab_size, max_seq_len, num_layers, dff, d_model, num_heads, dropout, name)

In [53]:
pretrainBert.compile(optimizer=optimizer, loss ='mse')

In [54]:
pretrainBert.Bert.summary()

Model: "qbert_210603"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inputs (InputLayer)             [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, None, 384)    12288000    inputs[0][0]                     
__________________________________________________________________________________________________
position_embedding_2 (PositionE (None, None, 384)    49920       embedding_4[0][0]                
__________________________________________________________________________________________________
segments (InputLayer)           [(None, None)]       0                                            
_______________________________________________________________________________________

In [49]:
false_y = np.array([ 0 for _ in range(len(x))])

In [None]:
hist = pretrainBert.fit(batch_size = batch_size, epochs = epochs
                      , x = [x, mask, y, nsp, masked_lm_weight, segments], y = false_y)

  9/310 [..............................] - ETA: 1:00:26 - loss: 3.0672

In [18]:
today = '210604'

pretrainBert.save_weights('./model/BertPretrained-{}-{}-{}-{}-{}/'.format(today, max_seq_len, num_layers, d_model, num_heads))

NotImplementedError: Saving the model to HDF5 format requires the model to be a Functional model or a Sequential model. It does not work for subclassed models, because such models are defined via the body of a Python method, which isn't safely serializable. Consider saving to the Tensorflow SavedModel format (by setting save_format="tf") or using `save_weights`.

## 결과 확인

In [29]:
pretrainBert.load_weights('./model/model_weight_2106031943-epoch-1-0-loss-0.707.tf')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1b9171635c0>

In [11]:
sample_train_set = train[np.random.randint(0, len(train))]

In [12]:
from transformers import BertTokenizerFast

tokenizer_for_load = BertTokenizerFast.from_pretrained('./model/BertTokenizer-6000-32000-vocab.txt'
                                                   , strip_accents=False
                                                   , lowercase=False)

Calling BertTokenizerFast.from_pretrained() with the path to a single file or url is deprecated


In [13]:
train_statement = ' '.join(tokenizer_for_load.convert_ids_to_tokens(sample_train_set['x']))
train_statement

'[CLS] 2016년 여름 ##철 ##에 장 ##마와 폭 ##염 ##에 대해 오 ##보를 계속 내어 비판을 샀다 . [SEP] 군과 관련된 조항 [MASK] 베르 ##사유 조약의 5번째 부분을 이룬다 似 [SEP]'

In [14]:
train_statement = ' '.join(tokenizer_for_load.convert_ids_to_tokens(sample_train_set['label']))
train_statement

'[CLS] 2016년 여름 ##철 ##에 장 ##마와 폭 ##염 ##에 대해 오 ##보를 계속 내어 비판을 샀다 . [SEP] 군과 관련된 조항 ##들이 베르 ##사유 조약의 5번째 부분을 이룬다 . [SEP]'

In [24]:
# sample_train_set = train[np.random.randint(0, len(train))]

train_x = tf.reshape(sample_train_set['x'], (1, -1))
train_x = pad_sequences(train_x, max_seq_len, padding = 'post')
mask = create_padding_mask(train_x)

segments = tf.reshape(create_segments(train_x[0]), (1, -1))

In [34]:
pretrainBert.load_weights('./model/model_weight_2106031943-epoch-1-0-loss-0.707.tf')
lm, nls = pretrainBert.get_pretrained_result([train_x, mask, segments])
train_statement = ' '.join(tokenizer_for_load.convert_ids_to_tokens(tf.argmax(lm, axis = 2)[0]))
train_statement

'##풍 400m 400m 400m ##집단 400m ##집단 400m 400m 400m ##집단 ##집단 ##집단 ##집단 ##집단 ##집단 ##집단 ##집단 ##집단 ##집단 ##집단 불안정 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .'

In [32]:
nls

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.31824154, 0.6817584 ]], dtype=float32)>

In [37]:
pretrainBert.load_weights('./model/model_weight_2106031943-epoch-1-1-loss-1.338.tf')
lm, nls = pretrainBert.get_pretrained_result([train_x, mask, segments])
train_statement = ' '.join(tokenizer_for_load.convert_ids_to_tokens(tf.argmax(lm, axis = 2)[0]))
train_statement

'##lay ##lay ##lay ##lay ##lay ##lay ##lay ##lay , ##lay , , , , , , , , , , , . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .'

In [38]:
nls

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.36835203, 0.631648  ]], dtype=float32)>

In [35]:
pretrainBert.load_weights('./model/model_weight_2106031943-epoch-1-2-loss-1.583.tf')
lm, nls = pretrainBert.get_pretrained_result([train_x, mask, segments])
train_statement = ' '.join(tokenizer_for_load.convert_ids_to_tokens(tf.argmax(lm, axis = 2)[0]))
train_statement

', , , , , , , , , , , . , , , , , . , , , . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .'

In [36]:
nls

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.49151692, 0.50848305]], dtype=float32)>