In [1]:
import numpy as np
import tensorflow as tf
from copy import deepcopy

import import_ipynb
from QBert import qbert_model

import pickle
from tqdm.notebook import tqdm

from tensorflow.keras.preprocessing.sequence import pad_sequences

def load_pkl(file_path):
    """Deserialize and return the object stored in a pickle file.

    Args:
        file_path: Path of the pickle file; it is opened in binary read mode.

    Returns:
        The object reconstructed by ``pickle.load``.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(file_path, 'rb') as pkl_file:
        return pickle.load(pkl_file)

def save_pkl(df, file_path):
    """Serialize ``df`` with pickle and write it to ``file_path``.

    Args:
        df: Any picklable object.
        file_path: Destination path; opened in binary write mode, so an
            existing file at that path is overwritten.

    Returns:
        None.
    """
    with open(file_path, 'wb') as pkl_file:
        pickle.dump(df, pkl_file)

def create_padding_mask(x):
    """Build a float mask that marks the zero entries of ``x``.

    Entries of ``x`` equal to 0 become 1.0 in the mask and all other entries
    become 0.0. Two singleton axes are then inserted between the first and
    last axes of the mask.

    Args:
        x: Tensor or array of ids (assumed 2-D, (batch_size, seq_len) —
            TODO confirm against callers).

    Returns:
        A ``tf.float32`` tensor of ``x``'s shape with two extra singleton axes.
    """
    is_zero = tf.math.equal(x, 0)
    mask = tf.cast(is_zero, tf.float32)
    # (batch_size, 1, 1, key sentence length)
    return mask[:, tf.newaxis, tf.newaxis, :]

importing Jupyter notebook from QBert.ipynb


In [2]:
# Load the pickled training examples used by the cells below.
# Alternative data file (currently disabled):
# train = load_pkl('./dt/train_set_under_255.pkl')
train = load_pkl('./dt/train_set_parse-10000.pkl')

In [3]:
# Hyperparameters passed positionally to BertModule, which forwards them to qbert_model.
vocab_size = 32000  # also used as the one-hot depth when computing the LM loss
max_seq_len = 255  # also the target length given to pad_sequences
num_layers = 12
dff = 768  # NOTE(review): same value as d_model; presumably the feed-forward width — confirm this is intended
d_model = 768
num_heads = 12
dropout = .1
name = 'qbert_210602'

In [132]:
class BertModule(tf.keras.Model):
    """Keras model that wraps ``qbert_model`` and adds two prediction heads.

    ``call`` takes a dict with the keys ``'input'`` and ``'mask'`` and returns
    the pair ``(pred_lm, pred_cls)``; both are softmax outputs.
    """

    def __init__(self, vocab_size, max_seq_len, num_layers, dff, d_model, num_heads, dropout, name):
        """Create the encoder and the 2-way classification layer.

        All arguments are forwarded unchanged, in the same order, to
        ``qbert_model``. ``name`` is given to the encoder only; it is not
        passed to this wrapper's own Keras constructor.
        """
        super().__init__()

        self.Bert = qbert_model(
            vocab_size, max_seq_len, num_layers, dff, d_model, num_heads, dropout, name
        )

        # Bias-free 2-unit softmax layer, applied in ``call`` to sequence position 0.
        self.dense_cls = tf.keras.layers.Dense(2, activation='softmax', use_bias=False)

    def call(self, inputs):
        """Run the encoder and both heads.

        Args:
            inputs: Dict with ``'input'`` and ``'mask'`` entries; they are
                passed to the encoder as the list ``[input, mask]``.

        Returns:
            ``(pred_lm, pred_cls)``: the softmax of the projected sequence
            output, and the classifier output for sequence position 0.
        """
        token_ids = inputs['input']
        padding_mask = inputs['mask']

        sequence_output = self.Bert([token_ids, padding_mask])['sequence_output']

        # Project the hidden states with the pseudo-inverse of the first weight
        # of the encoder's layer at index 1.
        # NOTE(review): presumably that weight is the token-embedding matrix —
        # confirm against QBert.ipynb. The pseudo-inverse is recomputed on every call.
        decode_matrix = tf.linalg.pinv(self.Bert.layers[1].weights[0])
        vocab_scores = tf.matmul(sequence_output, decode_matrix)
        pred_lm = tf.math.softmax(vocab_scores)

        first_position = sequence_output[:, 0]
        pred_cls = self.dense_cls(first_position)

        return pred_lm, pred_cls

In [133]:
# Instantiate the pre-training model from the hyperparameters defined above.
pretrainBert = BertModule(
    vocab_size=vocab_size,
    max_seq_len=max_seq_len,
    num_layers=num_layers,
    dff=dff,
    d_model=d_model,
    num_heads=num_heads,
    dropout=dropout,
    name=name,
)

## Batch size = 1: forward pass and loss on a single example

In [134]:
# Reshape the first example's 'x' entry to a batch of one row, pad it at the
# end up to max_seq_len, and build the matching padding mask.
test_input = pad_sequences(
    tf.reshape(train[0]['x'], (1, -1)),
    maxlen=max_seq_len,
    padding='post',
)
test_mask = create_padding_mask(test_input)

In [135]:
# Targets for the same example: the 'label' entry padded like the input, plus
# the example's 'NSP' entry taken as-is.
test_y = pad_sequences(
    tf.reshape(train[0]['label'], (1, -1)),
    maxlen=max_seq_len,
    padding='post',
)
test_nsp = train[0]['NSP']

In [136]:
# Forward pass on the single padded example; BertModule.call reads the
# 'input' and 'mask' keys of the dict.
pred_lm, pred_cls = pretrainBert(dict(input=test_input, mask=test_mask))

In [181]:
# Display the per-class NSP loss terms: negative log of the classifier output,
# weighted element-wise by test_nsp (before summing over classes).
test_nsp * -tf.math.log(pred_cls)

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.       , 0.5143654]], dtype=float32)>

In [182]:
# Cross-entropy of the LM head against one-hot labels, summed over the vocabulary
# axis and averaged over every (example, position) pair.
# NOTE(review): positions whose label id is 0 are not excluded, so they count as
# targets for vocabulary id 0 in the mean — confirm this is intended.
# NOTE(review): log is taken of an already-softmaxed output; it yields -inf if a
# probability underflows to 0.
loss_lm = tf.reduce_mean(tf.reduce_sum(tf.one_hot(test_y, depth = vocab_size) * -tf.math.log(pred_lm), axis = 2))
# Cross-entropy of the classifier head, weighted by test_nsp, summed over the
# class axis and averaged over the batch.
loss_cls = tf.reduce_mean(tf.reduce_sum(test_nsp * -tf.math.log(pred_cls), axis = 1))

In [185]:
# Total loss: unweighted sum of the LM loss and the classification loss.
losses = loss_lm + loss_cls

In [186]:
# Display the combined loss value.
losses

<tf.Tensor: shape=(), dtype=float32, numpy=10.892082>

## Batch size = 5: padded arrays for batched training

In [197]:
batch_size = 5

# Pad every example's inputs and labels at the end up to max_seq_len.
# NOTE(review): pad_sequences truncates from the front by default
# (truncating='pre'), so any sequence longer than max_seq_len would lose its
# leading tokens — confirm no example exceeds the limit.
x = pad_sequences([example['x'] for example in train], maxlen=max_seq_len, padding='post')
y = pad_sequences([example['label'] for example in train], maxlen=max_seq_len, padding='post')
# NSP targets are collected into a plain Python list, one entry per example.
nsp = [example['NSP'] for example in train]

In [198]:
# Check the shape of the padded input array.
x.shape

(10000, 255)

In [199]:
# Check the shape of the padded label array.
y.shape

(10000, 255)

In [99]:
# Scratch check of the word-prediction and NSP losses.
# NOTE(review): y_true, y_pred and bert_model are not defined in any cell shown
# here — this cell depends on earlier kernel state.
y_word = y_true['WORD']
y_nsp = y_true['NSP']

losses = []  # NOTE(review): never appended to in this cell
# word prediction
for i in range(1, 3) :
    
    # Skip this position when the label in row 0 is 0 (only row 0 is checked).
    if y_word[0, i] == 0 :
        continue
        
    # NOTE(review): loop-invariant; the pseudo-inverse is recomputed on every iteration.
    decode_matrix = tf.linalg.pinv(bert_model.layers[1].weights[0])
    decoded_vocab =  tf.matmul(y_pred[:, i], decode_matrix)
    
    one_hot = tf.one_hot(y_word[:, i], depth = vocab_size)
    log_probs = -tf.nn.log_softmax(decoded_vocab)
    # Overwritten on each iteration, so only the last non-skipped position's loss is kept.
    loss_lm = tf.reduce_mean(tf.reduce_sum(one_hot * log_probs, axis = -1))
    
# NSP prediction


# A new Dense(2) layer is constructed here each time the cell runs.
log_probs = -tf.nn.log_softmax(tf.keras.layers.Dense(2)(y_pred[:, 0]))
loss_nsp = tf.reduce_mean(tf.reduce_sum(y_nsp * log_probs, axis = -1))

loss_lm, loss_nsp

(<tf.Tensor: shape=(), dtype=float32, numpy=10.336973>,
 <tf.Tensor: shape=(), dtype=float32, numpy=0.5224187>)

In [86]:
# IPython help lookup left over from development; safe to remove from the final notebook.
?tf.reduce_mean

In [11]:
# Training configuration: number of epochs and the Adam learning rate.
epochs = 2
lr = 1e-5

optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

In [None]:
class BertLoss(tf.keras.losses.Loss):
    """Keras loss that currently computes mean squared error over the last axis.

    NOTE(review): this is a placeholder; the LM and NSP cross-entropy terms
    explored elsewhere in this notebook are not implemented here yet.
    """

    def __init__(self):
        super().__init__()

    def call(self, y_true, y_pred):
        """Return the mean of squared differences along the last axis.

        Args:
            y_true: Target values; cast to the dtype of ``y_pred``.
            y_pred: Predicted values; converted to a tensor.

        Returns:
            Tensor with the last axis reduced by the mean of
            ``(y_pred - y_true) ** 2``.
        """
        # Use public TensorFlow APIs: `tf.convert_to_tensor_v2` is not an
        # exported symbol and `math_ops` was never imported in this notebook.
        y_pred = tf.convert_to_tensor(y_pred)
        y_true = tf.cast(y_true, y_pred.dtype)
        return tf.reduce_mean(tf.math.square(y_pred - y_true), axis=-1)