In [50]:
import numpy as np
import tensorflow as tf
from copy import deepcopy

import import_ipynb
from QBert import qbert_model

import pickle
from tqdm.notebook import tqdm

from tensorflow.keras.preprocessing.sequence import pad_sequences

def load_pkl(file_path) :
    
    with open(file_path, 'rb') as f:
        df = pickle.load(f)
        
    return df

def save_pkl(df, file_path) :
    
    with open(file_path, 'wb') as f:
        pickle.dump(df, f)

def create_padding_mask(x):
    init_shape = x.shape
    mask = tf.cast(tf.math.equal(x, 0), tf.float32)
    # (batch_size, 1, 1, key의 문장 길이)
    return np.array(mask).reshape(init_shape[0], 1,1, init_shape[1])

def ind_to_weight(masked_pos, seq_len) :
    return tf.reduce_sum(tf.one_hot(masked_pos, seq_len), axis = 0)
    

In [51]:
class BertModule(tf.keras.Model) :

    def __init__(self, vocab_size, max_seq_len, num_layers, dff, d_model, num_heads, dropout, name) :
        super(BertModule, self).__init__()
        self.Bert = qbert_model(vocab_size, max_seq_len, num_layers, dff, d_model, num_heads, dropout, name)
        self.dense_cls = tf.keras.layers.Dense(2, activation = 'softmax', use_bias = False)
        self.vocab_size = vocab_size
    
    def call(self, inputs) :
        
        x, mask, lm, nsp, weight = inputs[0], inputs[1], inputs[2], inputs[3], inputs[4]
    
        bert_outputs = self.Bert([x, mask])

        y_pred = bert_outputs['sequence_output']

        decode_matrix = tf.linalg.pinv(self.Bert.layers[1].weights[0])

        pred_lm =  tf.math.softmax(tf.matmul(y_pred, decode_matrix))
        pred_cls = self.dense_cls(y_pred[:, 0])

        true_y_lm = tf.cast(tf.one_hot(tf.cast(lm, dtype = tf.int32), depth = self.vocab_size), dtype = tf.float32)

        lm_losses = (tf.reduce_sum(true_y_lm * -tf.math.log(pred_lm), axis = 2))
#         lm_losses = lm_losses * weight
        lm_losses = tf.reduce_mean(lm_losses, axis = 1)

        nsp = tf.cast(nsp, dtype = tf.float32)
        cls_losses = tf.reduce_mean(tf.reduce_sum(nsp * -tf.math.log(pred_cls), axis = 1))

        total_loss = lm_losses + cls_losses

        return total_loss
    
    def get_pretrained_result(self, inputs) :
        
        x, mask = inputs[0], inputs[1]
    
        bert_outputs = self.Bert([x, mask])

        y_pred = bert_outputs['sequence_output']

        decode_matrix = tf.linalg.pinv(self.Bert.layers[1].weights[0])

        pred_lm =  tf.math.softmax(tf.matmul(y_pred, decode_matrix))
        pred_cls = self.dense_cls(y_pred[:, 0])
        
        return pred_lm, pred_cls

In [52]:
# train = load_pkl('./dt/train_set_under_255.pkl')
# train = load_pkl('./dt/train_set-maksed-position-sample-10000.pkl')
train = load_pkl('./dt/train_set-masked-position.pkl')

In [53]:
train = list(filter(lambda x: len(x['x']) <= 130, train))

In [57]:
train_s = train[:batch_size * 10000]

In [59]:
train = train_s

In [60]:
vocab_size = 32000
max_seq_len = 130
num_layers = 3
dff = 256
d_model = 100
num_heads = 5
dropout = .1
name = 'qbert_210603'

In [None]:
x = pad_sequences([ x['x'] for x in train ], max_seq_len, padding = 'post')
y = pad_sequences([ x['label'] for x in train ] , max_seq_len, padding = 'post')
nsp = np.asarray([ x['NSP'] for x in train ])

masked_lm_weight = np.array([ ind_to_weight(x['masked_position'], max_seq_len) for x in train])

mask = create_padding_mask(x)

In [None]:
epochs = 1
lr = 1e-4
batch_size = 256

optimizer = tf.keras.optimizers.Adam(lr, beta_1=0.9, beta_2=0.999)

In [9]:
pretrainBert = BertModule(vocab_size, max_seq_len, num_layers, dff, d_model, num_heads, dropout, name)

In [11]:
pretrainBert.compile(optimizer=optimizer, loss ='mse')

In [48]:
pretrainBert.Bert.summary()

Model: "qbert_210603"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inputs (InputLayer)             [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 100)    3200000     inputs[0][0]                     
__________________________________________________________________________________________________
tf_op_layer_Mul_1 (TensorFlowOp [(None, None, 100)]  0           embedding_1[0][0]                
__________________________________________________________________________________________________
position_embedding_1 (PositionE (None, None, 100)    13000       tf_op_layer_Mul_1[0][0]          
_______________________________________________________________________________________

In [12]:
false_y = np.array([ 0 for _ in range(len(x))])

In [13]:
hist = pretrainBert.fit(batch_size = batch_size, epochs = epochs
                        , x = [x, mask, y, nsp, masked_lm_weight[:]], y = false_y)



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "




In [14]:
today = '210603'

pretrainBert.save('./model/BertPretrained-{}-{}-{}-{}-{}.pt'.format(today, max_seq_len, num_layers, d_model, num_heads))

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: ./model/BertPretrained-210603-130-3-100-5.pt\assets


## 결과 확인

In [15]:
sample_train_set = train[np.random.randint(0, len(train))]

In [16]:
from transformers import BertTokenizerFast

tokenizer_for_load = BertTokenizerFast.from_pretrained('./model/BertTokenizer-6000-32000-vocab.txt'
                                                   , strip_accents=False
                                                   , lowercase=False)

Calling BertTokenizerFast.from_pretrained() with the path to a single file or url is deprecated


In [29]:
train_statement = ' '.join(tokenizer_for_load.convert_ids_to_tokens(sample_train_set['x']))
train_statement

'[CLS] 밀도 ##행렬 ##을 대각 ##화 ##하면 그 각 원소는 확률 formula _ 6 ##가 완비 , 이는 위의 통계 ##역학 [MASK] 정의 [MASK] 동등 ##하다 . [SEP] 콩 [MASK] [MASK] ##스트 퍼 ##블리 ##케이션 ##즈가 [MASK] 있으며 , 캘리포니아 샌프란시스코 ##에서 출판된 ##툉 [MASK] [SEP]'

In [30]:
train_statement = ' '.join(tokenizer_for_load.convert_ids_to_tokens(sample_train_set['label']))
train_statement

'[CLS] 밀도 ##행렬 ##을 대각 ##화 ##하면 그 각 원소는 확률 formula _ 6 ##가 되므로 , 이는 위의 통계 ##역학 ##적 정의 ##와 동등 ##하다 . [SEP] 콩 ##데 나 ##스트 퍼 ##블리 ##케이션 ##즈가 소유하고 있으며 , 캘리포니아 샌프란시스코 ##에서 출판된 ##다 . [SEP]'

In [44]:
sample_train_set = train[np.random.randint(0, len(train))]

train_x = tf.reshape(sample_train_set['x'], (1, -1))
train_x = pad_sequences(train_x, max_seq_len, padding = 'post')
mask = create_padding_mask(train_x)

In [45]:
lm, nls = pretrainBert.get_pretrained_result([train_x, mask, "", "", ""])

In [46]:
train_statement = ' '.join(tokenizer_for_load.convert_ids_to_tokens(tf.argmax(lm, axis = 2)[0]))
train_statement

"뉴저지 뉴저지 뉴저지 뉴저지 뉴저지 뉴저지 ' ' ' ' 뉴저지 뉴저지 뉴저지 뉴저지 뉴저지 뉴저지 뉴저지 뉴저지 뉴저지 ##m 뉴저지 ##m ##m 뉴저지 뉴저지 뉴저지 책임을 책임을 책임을 이때 이때 책임을 책임을 책임을 책임을 책임을 책임을 이때 책임을 책임을 책임을 책임을 이때 이때 책임을 책임을 책임을 이때 이때 이때 이때 이때 ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' 책임을 ' 책임을 책임을 책임을 책임을 책임을 책임을 ##소프트 책임을 책임을 책임을 책임을 책임을 책임을 책임을 책임을 책임을 책임을 책임을 책임을 책임을 책임을 책임을 책임을 책임을 ##m ##m ##m ##m ##m ##m ##m ' ' ##m ##m ##m ##m ##m ##m ##m ##m 이탈리아의 ##m ##m ##m ##m ##m ##m ##m ##m"

In [23]:
nls

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.5446929, 0.4553071]], dtype=float32)>

In [184]:
pretrainBert.Bert.layers[1].weights[0]

<tf.Variable 'embedding_19/embeddings:0' shape=(32000, 100) dtype=float32, numpy=
array([[ 0.00675264,  0.02614969,  0.04783888, ...,  0.00074808,
         0.0081974 ,  0.02426926],
       [ 0.02625655,  0.00214106,  0.02599093, ..., -0.0249519 ,
        -0.02679936,  0.01629817],
       [ 0.00712447, -0.02990005,  0.02621973, ..., -0.00580653,
        -0.00704775,  0.00537025],
       ...,
       [-0.0472474 ,  0.00386853, -0.00768339, ...,  0.02705099,
        -0.03506238,  0.0258582 ],
       [-0.00262028,  0.02314945,  0.00940512, ...,  0.02038808,
         0.02099202, -0.02432241],
       [ 0.02712569, -0.00255732, -0.04550922, ..., -0.00697222,
        -0.04350788,  0.0189664 ]], dtype=float32)>