In [1]:
import numpy as np
import tensorflow as tf
from copy import deepcopy

import import_ipynb
from QBert import qbert_model

import pickle
from tqdm.notebook import tqdm

from tensorflow.keras.preprocessing.sequence import pad_sequences

def load_pkl(file_path):
    """Read and return the object pickled at ``file_path``.

    The file is opened in binary mode and closed again before returning.

    Note: ``pickle.load`` can execute arbitrary code while unpickling, so
    this should only be pointed at trusted files.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

def save_pkl(df, file_path):
    """Pickle ``df`` into ``file_path``, replacing any existing file."""
    serialized = pickle.dumps(df)
    with open(file_path, 'wb') as handle:
        handle.write(serialized)

def create_padding_mask(x):
    """Build a float mask that is 1.0 wherever ``x`` equals 0 and 0.0 elsewhere.

    Two singleton axes are inserted between the first and last axis of the
    mask so it can broadcast against attention scores.
    """
    is_padding = tf.math.equal(x, 0)
    padding_mask = tf.cast(is_padding, tf.float32)
    # (batch_size, 1, 1, key sentence length)
    return padding_mask[:, tf.newaxis, tf.newaxis, :]

importing Jupyter notebook from QBert.ipynb


In [2]:
# Alternative dataset file: './dt/train_set_under_255.pkl'
train = load_pkl(file_path='./dt/train_set_parse-10000.pkl')

In [3]:
# Input geometry
vocab_size = 32000    # one-hot depth for the LM labels; also passed to qbert_model
max_seq_len = 255     # target length used by pad_sequences

# Encoder architecture (forwarded to qbert_model)
num_layers = 12
num_heads = 12
d_model = 768
dff = 768             # NOTE(review): presumably the feed-forward width; it equals d_model here — confirm intended
dropout = 0.1

# Name forwarded to qbert_model
name = 'qbert_210602'

In [149]:
class BertModule(tf.keras.Model):
    """Pre-training wrapper around ``qbert_model`` with two output heads.

    The wrapped encoder's ``'sequence_output'`` feeds:
      * an LM head that projects every position through the pseudo-inverse of
        one of the encoder's weight matrices and applies a softmax, and
      * a bias-free 2-unit softmax ``Dense`` head applied to position 0.

    All constructor arguments are forwarded unchanged to ``qbert_model``.
    ``name`` is passed to the encoder only; the wrapper itself is created
    with its default Keras name.
    """

    def __init__(self, vocab_size, max_seq_len, num_layers, dff, d_model, num_heads, dropout, name):
        super().__init__()

        self.Bert = qbert_model(vocab_size, max_seq_len, num_layers, dff, d_model, num_heads, dropout, name)
        self.dense_cls = tf.keras.layers.Dense(2, activation='softmax', use_bias=False)

    def call(self, inputs):
        """Run the encoder and both heads.

        Args:
            inputs: Mapping with keys ``'input'`` and ``'mask'``; both values
                are handed to the encoder as ``[input, mask]``.

        Returns:
            ``[pred_lm, pred_cls]`` — softmax outputs of the LM head and of
            the classification head.
        """
        token_ids = inputs['input']
        padding_mask = inputs['mask']

        sequence_output = self.Bert([token_ids, padding_mask])['sequence_output']

        # NOTE(review): layers[1].weights[0] is presumably the token-embedding
        # matrix — confirm; the positional lookup breaks if qbert_model's layer
        # order changes. The pseudo-inverse is recomputed on every call.
        projection_source = self.Bert.layers[1].weights[0]
        decode_matrix = tf.linalg.pinv(projection_source)

        lm_scores = tf.matmul(sequence_output, decode_matrix)
        pred_lm = tf.math.softmax(lm_scores)

        # Classification head reads only the first position of the sequence.
        first_position = sequence_output[:, 0]
        pred_cls = self.dense_cls(first_position)

        return [pred_lm, pred_cls]

In [150]:
# Build the pre-training model from the hyper-parameters defined above.
pretrainBert = BertModule(
    vocab_size=vocab_size,
    max_seq_len=max_seq_len,
    num_layers=num_layers,
    dff=dff,
    d_model=d_model,
    num_heads=num_heads,
    dropout=dropout,
    name=name,
)

## Batch_size = 1

In [151]:
# Single-sample batch: reshape sample 0 to one row, post-pad it to max_seq_len,
# then derive the padding mask from the padded ids.
test_input = pad_sequences(
    tf.reshape(train[0]['x'], shape=(1, -1)),
    maxlen=max_seq_len,
    padding='post',
)
test_mask = create_padding_mask(test_input)

In [152]:
# Targets for the same sample: post-padded label ids and the NSP label.
test_y = pad_sequences(
    tf.reshape(train[0]['label'], shape=(1, -1)),
    maxlen=max_seq_len,
    padding='post',
)
test_nsp = train[0]['NSP']

In [153]:
# Forward pass on the single padded sample; the model returns [pred_lm, pred_cls].
pred_lm, pred_cls = pretrainBert({'input' : test_input, 
                                   'mask' : test_mask})

In [154]:
# Element-wise cross-entropy terms of the NSP label against pred_cls (not yet summed).
test_nsp * -tf.math.log(pred_cls)

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.      , 2.728628]], dtype=float32)>

In [182]:
# LM loss: one-hot labels times -log(pred_lm), summed over axis 2, then averaged.
# NOTE(review): the log is taken without clipping, so a predicted probability of
# exactly 0 produces inf (and 0 * inf = NaN in the product).
loss_lm = tf.reduce_mean(tf.reduce_sum(tf.one_hot(test_y, depth = vocab_size) * -tf.math.log(pred_lm), axis = 2))
# CLS loss: NSP label times -log(pred_cls), summed over axis 1, then averaged.
loss_cls = tf.reduce_mean(tf.reduce_sum(test_nsp * -tf.math.log(pred_cls), axis = 1))

In [185]:
# Combined pre-training loss: unweighted sum of the LM and CLS losses.
losses = loss_lm + loss_cls

In [186]:
# Display the combined loss tensor.
losses

<tf.Tensor: shape=(), dtype=float32, numpy=10.892082>

## Batch_size = 5

In [46]:
batch_size = 5

# Post-pad every sample's token ids and label ids to max_seq_len; NSP labels
# are collected as a plain list.
x = pad_sequences([sample['x'] for sample in train], maxlen=max_seq_len, padding='post')
y = pad_sequences([sample['label'] for sample in train], maxlen=max_seq_len, padding='post')
nsp = [sample['NSP'] for sample in train]

In [47]:
# The first `batch_size` entries of each collection form the trial batch.
batch_x, batch_y, batch_nsp = x[:batch_size], y[:batch_size], nsp[:batch_size]

In [48]:
# Padding mask for the trial batch (1.0 where the token id is 0).
batch_mask = create_padding_mask(batch_x)

In [49]:
# Forward pass on the trial batch; overwrites pred_lm / pred_cls from the single-sample run.
pred_lm, pred_cls = pretrainBert({'input' : batch_x, 
                                   'mask' : batch_mask})

In [61]:
# `BertLoss` is not defined anywhere in this notebook, so the original call
# raised NameError on a fresh kernel. The combined loss is computed inline
# with the same formula used for the batch-size-1 check above.
# NOTE(review): presumably this matches what the removed `BertLoss` returned — confirm.
batch_loss_lm = tf.reduce_mean(
    tf.reduce_sum(tf.one_hot(batch_y, depth = vocab_size) * -tf.math.log(pred_lm), axis = 2)
)
# batch_nsp is a plain Python list here, so convert it explicitly to float32.
batch_loss_cls = tf.reduce_mean(
    tf.reduce_sum(tf.cast(batch_nsp, tf.float32) * -tf.math.log(pred_cls), axis = 1)
)
a = batch_loss_lm + batch_loss_cls

In [66]:
# Show the batch loss as a plain NumPy value.
a.numpy()

10.902119

## Training

In [183]:
class BertLoss_LM(tf.keras.losses.Loss) :
    """Cross-entropy between label token ids and predicted token probabilities.

    Args:
        vocab_size: Optional one-hot depth for the labels. When omitted, the
            size of the last axis of ``y_pred`` is used, so the loss does not
            depend on a notebook-level ``vocab_size`` variable.
    """

    def __init__(self, vocab_size = None) :
        super(BertLoss_LM, self).__init__()
        self.vocab_size = vocab_size

    def call(self, y_true, y_pred):
        """Return the scalar LM loss.

        Args:
            y_true: Label token ids; cast to int32 before one-hot encoding.
            y_pred: Probabilities with the vocabulary on the last axis
                (rank 3 is assumed, since the class sum runs over axis 2).

        Returns:
            Per-position cross-entropy, averaged over all remaining axes.
        """
        depth = self.vocab_size
        if depth is None:
            depth = tf.shape(y_pred)[-1]

        labels = tf.one_hot(tf.cast(y_true, dtype = tf.int32), depth = depth, dtype = tf.float32)

        # Clip before the log: a probability that underflows to 0 gives
        # log(0) = -inf, and 0 * inf in the product below is NaN.
        probs = tf.clip_by_value(y_pred, tf.keras.backend.epsilon(), 1.0)

        # NOTE(review): no masking is applied, so zero-padded positions
        # (label id 0) contribute to the mean — confirm this is intended.
        loss_lm = tf.reduce_mean(tf.reduce_sum(labels * -tf.math.log(probs), axis = 2))

        return loss_lm

    
class BertLoss_CLS(tf.keras.losses.Loss) :
    """Cross-entropy between classification targets and predicted class probabilities."""

    def __init__(self) :
        super(BertLoss_CLS, self).__init__()

    def call(self, y_true, y_pred):
        """Return the scalar classification loss.

        Args:
            y_true: Targets multiplied element-wise with ``-log(y_pred)``;
                cast to the dtype of ``y_pred`` so integer targets also work
                when the loss is called directly.
            y_pred: Class probabilities with the classes on axis 1.

        Returns:
            Per-sample cross-entropy (sum over axis 1), averaged over the batch.
        """
        targets = tf.cast(y_true, y_pred.dtype)

        # Clip before the log so a probability of exactly 0 cannot produce
        # inf, or NaN via 0 * inf.
        probs = tf.clip_by_value(y_pred, tf.keras.backend.epsilon(), 1.0)

        loss_cls = tf.reduce_mean(tf.reduce_sum(targets * -tf.math.log(probs), axis = 1))

        return loss_cls


In [184]:
batch_size = 5

# Full training arrays: post-padded token ids / label ids and the NSP labels.
x = pad_sequences([sample['x'] for sample in train], maxlen=max_seq_len, padding='post')
y = pad_sequences([sample['label'] for sample in train], maxlen=max_seq_len, padding='post')
nsp = np.asarray([sample['NSP'] for sample in train])

# Padding mask for the whole dataset, derived from the padded ids.
mask = create_padding_mask(x)

In [185]:
# Optimisation settings
batch_size = 5
epochs = 1
lr = 1e-5

optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

# One loss per model output, in the order the model returns them: [LM, CLS].
loss_fn = [BertLoss_LM(), BertLoss_CLS()]

In [186]:
# Fresh, untrained model instance for the training run.
pretrainBert = BertModule(
    vocab_size=vocab_size,
    max_seq_len=max_seq_len,
    num_layers=num_layers,
    dff=dff,
    d_model=d_model,
    num_heads=num_heads,
    dropout=dropout,
    name=name,
)

In [187]:
# Attach the optimizer and the two losses; the loss list lines up with the
# model's two outputs [pred_lm, pred_cls].
pretrainBert.compile(loss=loss_fn, optimizer=optimizer)

In [None]:
# Train on the full arrays: inputs are passed as the dict the model's call()
# expects, targets as [LM labels, NSP labels] to match the two losses.
# NOTE(review): the recorded progress line below shows a multi-hour ETA for one epoch.
pretrainBert.fit(batch_size = batch_size, callbacks = None, epochs = epochs
                 , x = {'input' : x,
                         'mask' : mask}
                 , y = [ y, nsp ])

   8/2000 [..............................] - ETA: 11:58:29 - loss: 11.3361 - output_1_loss: 10.3536 - output_2_loss: 0.9825