In [1]:
import numpy as np
import tensorflow as tf
from copy import deepcopy

import import_ipynb
from QBert import qbert_model

import pickle
from tqdm.notebook import tqdm

from tensorflow.keras.preprocessing.sequence import pad_sequences

def load_pkl(file_path):
    """Read a pickle file and return the object stored in it.

    Args:
        file_path: Path of the pickle file; it is opened in binary read mode.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.

    Note:
        Unpickling can execute arbitrary code, so only load trusted files.
    """
    with open(file_path, mode='rb') as pkl_file:
        return pickle.load(pkl_file)

def save_pkl(df, file_path):
    """Pickle ``df`` into the file at ``file_path``.

    Args:
        df: Any picklable object.
        file_path: Destination path; it is opened in binary write mode, so an
            existing file is overwritten.

    Returns:
        None.
    """
    with open(file_path, mode='wb') as pkl_file:
        pickle.dump(df, pkl_file)

def create_padding_mask(x):
    """Return a float32 mask that is 1.0 where ``x`` equals 0 and 0.0 elsewhere.

    Args:
        x: Array or tensor of token ids. The indexing below uses two explicit
            slices, so it is presumably 2-D (batch_size, seq_len) - confirm
            against callers.

    Returns:
        The mask with two singleton axes inserted after the first axis; for a
        (batch_size, seq_len) input this is (batch_size, 1, 1, seq_len).
    """
    is_zero = tf.math.equal(x, 0)
    pad_mask = tf.cast(is_zero, tf.float32)
    # (batch_size, 1, 1, key sequence length)
    return pad_mask[:, tf.newaxis, tf.newaxis, :]

def ind_to_weight(masked_pos, seq_len):
    """Turn a collection of position indices into a per-position weight vector.

    Each index is one-hot encoded with depth ``seq_len`` and the encodings are
    summed over axis 0. For a 1-D list of indices the result has length
    ``seq_len`` and holds, at each position, how many times that position
    occurs in ``masked_pos``.

    Args:
        masked_pos: Position indices (presumably the masked-token positions of
            one sample - confirm against the data set).
        seq_len: Depth of the one-hot encoding.

    Returns:
        The summed one-hot tensor.
    """
    one_hot_rows = tf.one_hot(masked_pos, seq_len)
    return tf.reduce_sum(one_hot_rows, axis = 0)
    

importing Jupyter notebook from QBert.ipynb


In [2]:
# train = load_pkl('./dt/train_set_under_255.pkl')
# Load the pickled training samples. Later cells read the keys 'x', 'label',
# 'NSP' and 'masked_position' from each element of this collection.
train = load_pkl('./dt/train_set-maksed-position-sample-10000.pkl')

In [3]:
# Hyperparameters handed to BertModule / qbert_model.

# Vocabulary and sequence dimensions
vocab_size = 32000
max_seq_len = 255

# Encoder dimensions
num_layers = 12
num_heads = 12
d_model = 768
dff = 768

# Dropout rate and model name
dropout = 0.1
name = 'qbert_210602'

In [24]:
def BertModule(vocab_size, max_seq_len, num_layers, dff, d_model, num_heads, dropout, name) :
    """Build a functional Keras model that maps pre-training inputs to a loss.

    The model wraps ``qbert_model`` and computes, inside the graph, a masked
    language-model cross-entropy plus a next-sentence-prediction
    cross-entropy.

    Args:
        vocab_size: Vocabulary size; also the depth used to one-hot the labels.
        max_seq_len: Length of the label and weight inputs. The token-id input
            must be padded to this length so the per-token loss lines up with
            the weights.
        num_layers, dff, d_model, num_heads, dropout, name: Forwarded unchanged
            to ``qbert_model``.

    Returns:
        ``tf.keras.Model`` taking ``[x, mask, lm, nsp, weight]`` where
          * ``x``      - token ids, shape (batch, seq_len)
          * ``mask``   - padding mask, shape (batch, 1, 1, seq_len)
          * ``lm``     - integer label ids, shape (batch, max_seq_len)
          * ``nsp``    - one-hot NSP label, shape (batch, 2)
          * ``weight`` - per-position LM weight, shape (batch, max_seq_len)
        and returning the per-example total loss, shape (batch,).

    Notes:
        The loss is only returned as the model output; it is not registered
        with ``add_loss``, so ``fit`` has no training objective unless the
        caller adds one.
    """
    x = tf.keras.Input(shape = (None, ), name = 'MASKED_X_INPUT')
    mask = tf.keras.Input(shape = (1, 1, None), name = 'MASKING_INPUT')
    # Labels are integer token ids (one per position); they are one-hot encoded
    # below, so this input must NOT carry a vocab axis of its own.
    lm = tf.keras.Input(shape = (max_seq_len, ), name = 'WORD_LABEING_INPUT')
    nsp = tf.keras.Input(shape = (2, ), name = 'NEXT_SENTENCE_PREDICTION_INPUT')
    weight = tf.keras.Input(shape = (max_seq_len, ), name = 'MASKING_WEIGHT')

    Bert = qbert_model(vocab_size, max_seq_len, num_layers, dff, d_model, num_heads, dropout, name)
    dense_cls = tf.keras.layers.Dense(2, activation = 'softmax', use_bias = False)

    bert_outputs = Bert([x, mask])
    y_pred = bert_outputs['sequence_output']

    # NOTE(review): presumably Bert.layers[1].weights[0] is the token-embedding
    # matrix of shape (vocab_size, d_model), making its pseudo-inverse a
    # (d_model, vocab_size) decoder - confirm against QBert.ipynb.
    decode_matrix = tf.linalg.pinv(Bert.layers[1].weights[0])

    pred_lm = tf.math.softmax(tf.matmul(y_pred, decode_matrix))
    # The first position of the sequence output feeds the NSP classifier.
    pred_cls = dense_cls(y_pred[:, 0])

    # Clip probabilities before the log so an exact 0 cannot produce inf/NaN.
    eps = 1e-9
    log_pred_lm = tf.math.log(tf.clip_by_value(pred_lm, eps, 1.0))
    log_pred_cls = tf.math.log(tf.clip_by_value(pred_cls, eps, 1.0))

    true_y_lm = tf.one_hot(tf.cast(lm, dtype = tf.int32), depth = vocab_size, dtype = tf.float32)
    # Per-token negative log-likelihood, shape (batch, seq).
    token_nll = -tf.reduce_sum(true_y_lm * log_pred_lm, axis = 2)

    # Apply the masking weight BEFORE reducing so that only weighted positions
    # contribute, then normalise by the total weight of each example. The
    # lower bound of 1 avoids a division by zero when nothing is weighted.
    weight_total = tf.maximum(tf.reduce_sum(weight, axis = 1), 1.0)
    lm_losses = tf.reduce_sum(token_nll * weight, axis = 1) / weight_total

    # Per-example NSP cross-entropy, shape (batch,).
    cls_losses = -tf.reduce_sum(nsp * log_pred_cls, axis = 1)

    total_loss = lm_losses + cls_losses

    return tf.keras.Model([x, mask, lm, nsp, weight], total_loss)

In [25]:
# Build the pre-training model from the hyperparameters currently in scope.
# NOTE(review): the identical call appears again further down, after the
# hyperparameters are redefined, and replaces this instance.
pretrainBert = BertModule(vocab_size, max_seq_len, num_layers, dff, d_model, num_heads, dropout, name)

In [None]:
class BertLoss_LM(tf.keras.losses.Loss) :
    """Weighted masked-language-model cross-entropy.

    ``call`` expects ``y_true`` to be a mapping with
      * ``'y'``      - integer label ids, presumably (batch, seq_len)
      * ``'weight'`` - per-position weight with the same leading shape
    and ``y_pred`` to be a probability tensor with the vocabulary on axis 2.
    The one-hot depth comes from the notebook-level ``vocab_size``.

    NOTE(review): Keras' ``fit`` converts targets to tensors, so a mapping
    ``y_true`` is only usable when this loss is invoked directly, for example
    from a custom training loop - confirm the intended use.
    """

    def __init__(self) :
        super(BertLoss_LM, self).__init__()

    def call(self, y_true, y_pred):
        """Return the weight-normalised negative log-likelihood (a scalar)."""
        lm_y = y_true['y']
        weight = tf.cast(y_true['weight'], dtype = tf.float32)

        # One-hot the label ids (not the whole y_true mapping).
        batch_y = tf.one_hot(tf.cast(lm_y, dtype = tf.int32), depth = vocab_size, dtype = tf.float32)
        # Clip before the log so an exact-zero probability cannot yield inf/NaN.
        pred_lm = tf.clip_by_value(y_pred, 1e-9, 1.0)

        # Per-token negative log-likelihood, reduced over the vocabulary axis.
        token_nll = tf.reduce_sum(batch_y * -tf.math.log(pred_lm), axis = 2)

        # Only weighted positions contribute; the lower bound of 1 guards
        # against a division by zero when every weight is 0.
        weight_total = tf.maximum(tf.reduce_sum(weight), 1.0)
        loss_lm = tf.reduce_sum(token_nll * weight) / weight_total

        return loss_lm

    
class BertLoss_CLS(tf.keras.losses.Loss):
    """Cross-entropy between a label tensor and predicted class probabilities.

    ``call`` multiplies ``y_true`` element-wise with ``-log(y_pred)``, sums
    over axis 1 and averages the result over the remaining axis.
    """

    def __init__(self):
        super().__init__()

    def call(self, y_true, y_pred):
        """Return the mean over examples of the per-example cross-entropy."""
        neg_log_prob = -tf.math.log(y_pred)
        per_example = tf.reduce_sum(y_true * neg_log_prob, axis = 1)
        return tf.reduce_mean(per_example)


In [4]:
# class BertModule(tf.keras.Model) :
    
#     def __init__(self, vocab_size, max_seq_len, num_layers, dff, d_model, num_heads, dropout, name) :
#         super(BertModule, self).__init__()
        
#         self.max_seq_len = max_seq_len
#         self.d_model = d_model
        
#         self.Bert = qbert_model(vocab_size, max_seq_len, num_layers, dff, d_model, num_heads, dropout, name)
        
#         self.dense_cls = tf.keras.layers.Dense(2, activation = 'softmax', use_bias = False)
    
#     def call(self, inputs) :
        
#         x, mask, lm, nsp, weight = inputs['input'], inputs['mask'], inputs['lm'], inputs['nsp'], inputs['weight']
        
#         batch_size = tf.shape(input)[0]
        
#         bert_outputs = self.Bert([input, mask])
        
#         y_pred = bert_outputs['sequence_output']
        
#         decode_matrix = tf.linalg.pinv(self.Bert.layers[1].weights[0])
        
#         pred_lm =  tf.math.softmax(tf.matmul(y_pred, decode_matrix))
#         pred_cls = self.dense_cls(y_pred[:, 0])
        
#         return [ pred_lm, pred_cls]

## Training

In [80]:
# class BertLoss_LM(tf.keras.losses.Loss) :
    
#     def __init__(self) :
#         super(BertLoss_LM, self).__init__()
        
#     def call(self, y_true, y_pred):
        
#         lm_y = y_true['y']
#         weight = y_true['weight']
        
#         print(lm_y.shape)
#         print(weight.shape)
        
#         batch_y = tf.cast(tf.one_hot(tf.cast(y_true, dtype = tf.int32), depth = vocab_size), dtype = tf.float32)
#         pred_lm = y_pred
        
#         loss_lm = tf.reduce_mean(tf.reduce_sum(batch_y * -tf.math.log(pred_lm), axis = 2))
# #         loss_cls = tf.reduce_mean(tf.reduce_sum(batch_nsp * -tf.math.log(pred_cls), axis = 1))
        
#         return loss_lm

    
# class BertLoss_CLS(tf.keras.losses.Loss) :
    
#     def __init__(self) :
#         super(BertLoss_CLS, self).__init__()
        
#     def call(self, y_true, y_pred):
#         batch_nsp = y_true
#         pred_cls = y_pred
        
# #         loss_lm = tf.reduce_mean(tf.reduce_sum(tf.one_hot(batch_y, depth = vocab_size) * -tf.math.log(pred_lm), axis = 2))
#         loss_cls = tf.reduce_mean(tf.reduce_sum(batch_nsp * -tf.math.log(pred_cls), axis = 1))
        
#         return loss_cls


In [6]:
# Keep only the samples whose 'x' sequence has at most 130 entries.
train = [sample for sample in train if len(sample['x']) <= 130]

In [28]:
# Smaller hyperparameter set; replaces the values assigned earlier in the notebook.

# Vocabulary and sequence dimensions
vocab_size = 32000
max_seq_len = 130

# Encoder dimensions
num_layers = 3
num_heads = 5
d_model = 100
dff = 256

# Dropout rate and model name
dropout = 0.1
name = 'qbert_210603'

In [29]:
batch_size = 5

# Right-pad the token ids and the label ids of every sample to max_seq_len.
x = pad_sequences([sample['x'] for sample in train], maxlen = max_seq_len, padding = 'post')
y = pad_sequences([sample['label'] for sample in train], maxlen = max_seq_len, padding = 'post')

# One 'NSP' entry per sample, stacked into an array.
nsp = np.asarray([sample['NSP'] for sample in train])

# Per-sample weight vector built from the sample's 'masked_position' indices.
masked_lm_weight = np.array([
    ind_to_weight(sample['masked_position'], max_seq_len)
    for sample in train
])

# Mask that flags the zero (padding) entries of the padded token ids.
mask = create_padding_mask(x)

In [30]:
# new_lm = []

# for i in range(len(y)) :
#     new_lm.append({"y" : y[i],
#                   "weight" : masked_lm_weight[i]})

In [31]:
# Training settings used by the fit() call further down.
epochs = 1
lr = 1e-3
# Replaces the batch_size = 5 assigned in the data-preparation cell.
batch_size = 10

# Adam optimiser constructed with `lr` as its first positional argument.
optimizer = tf.keras.optimizers.Adam(lr)
# loss_fn = [ BertLoss_LM(), BertLoss_CLS() ]

In [32]:
# Rebuild the pre-training model with the hyperparameters currently in scope;
# this replaces any `pretrainBert` created by an earlier cell.
pretrainBert = BertModule(vocab_size, max_seq_len, num_layers, dff, d_model, num_heads, dropout, name)

In [33]:
# Compile with the optimiser only; no `loss` argument is passed.
# NOTE(review): BertModule returns the loss tensor as the model output and does
# not visibly register it with add_loss, so fit() may have nothing to minimise
# - confirm before training.
pretrainBert.compile(optimizer=optimizer)

In [91]:
# class qSeq(tf.keras.utils.Sequence) :
    
#     def __init__(self, x_set, mask_set, y_lm_set, y_nsp_set, masked_lm_weight, batch_size) :
        
#         self.x = x_set
#         self.mask = mask_set
#         self.y_lm = y_lm_set
#         self.y_nsp = y_nsp_set
#         self.masked_lm_weight = masked_lm_weight
#         self.batch_size = batch_size
        
#     def __len__(self) :
#         r = tf.math.ceil(len(self.x) / self.batch_size)
#         return r
    
#     def __getitem__(self, idx) :
#         batch_x = self.x[idx * self.batch_size : (idx+1) * batch_size]
#         batch_mask = self.mask[idx * self.batch_size : (idx+1) * batch_size]
#         batch_y_lm = self.y_lm[idx * self.batch_size : (idx+1) * batch_size]
#         batch_y_nsp = self.y_nsp[idx * self.batch_size : (idx+1) * batch_size]
#         batch_masked_lm_weight = self.masked_lm_weight[idx * self.batch_size : (idx+1) * batch_size]
        
#         return [{"input" : batch_x, "mask" : batch_mask},
#                        [[batch_y_lm, batch_masked_lm_weight], batch_y_nsp]]

In [92]:
# dataset = qSeq(x, mask, y, nsp, masked_lm_weight, batch_size)

In [None]:
# Smoke-test the loss graph on a single mini-batch instead of the whole data
# set. The LM head yields one float per (sample, position, vocabulary entry),
# so pushing all ~9.9k samples of length 130 through a 32000-word softmax in
# one call would need on the order of 165 GB for that tensor alone.
pretrainBert([x[:batch_size],
              mask[:batch_size],
              y[:batch_size],
              nsp[:batch_size],
              masked_lm_weight[:batch_size]])



In [100]:
# NOTE(review): `new_lm` is only built in a cell that is now commented out, so
# this line raises NameError on a fresh kernel (Restart & Run All).
np.array(new_lm).shape

(9909,)

In [101]:
# NOTE(review): this call does not match the functional BertModule defined in
# this notebook. It feeds inputs keyed 'input'/'mask' and separate targets,
# whereas that model takes five inputs and returns the loss itself. It also
# relies on `new_lm`, which exists only in a commented-out cell; converting
# that array of dicts is what raises the recorded
# "Unsupported object type dict" ValueError.
hist = pretrainBert.fit(batch_size = batch_size, callbacks = None, epochs = epochs
                             , x = {'input' : x,
                                     'mask' : mask}
                             , y = [ np.array(new_lm), nsp ])

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type dict).

In [19]:
# Save the model under a name built from max_seq_len, num_layers, d_model and
# num_heads. The recorded output shows an assets directory being written even
# though the path ends in '.pt'.
# NOTE(review): the date tag '210602' is hard-coded and differs from the
# 'qbert_210603' assigned to `name` in the second hyperparameter cell.
pretrainBert.save('./model/BertPretrained-210602-{}-{}-{}-{}.pt'.format(max_seq_len, num_layers, d_model, num_heads))

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: ./model/BertPretrained-210602-130-3-100-5.pt\assets


## 결과 확인

In [121]:
# Pick a training sample to inspect. The random pick on the first line is
# immediately overwritten by the fixed sample at index 29.
sample_train_set = train[np.random.randint(0, len(train))]
sample_train_set = train[29]

In [25]:
# Load the full pickle and keep a single randomly chosen element.
# NOTE(review): the index is drawn from [10000, 1000000); this raises
# IndexError if the file holds fewer than 1,000,000 items - confirm its size.
test_set = load_pkl('./dt/train_set_under_255.pkl')[np.random.randint(10000, 1000000)]

In [125]:
from transformers import BertTokenizerFast

# Build the tokenizer straight from the vocab file: calling from_pretrained()
# with a single file path is deprecated. `do_lower_case` is the option
# BertTokenizerFast actually reads; the former `lowercase=False` is not one of
# its parameters, which left lower-casing at its default of True.
tokenizer_for_load = BertTokenizerFast(vocab_file = './model/BertTokenizer-6000-32000-vocab.txt'
                                       , strip_accents = False
                                       , do_lower_case = False)

Calling BertTokenizerFast.from_pretrained() with the path to a single file or url is deprecated


In [127]:
# Map the label ids of the selected sample back to tokens and join them with
# single spaces so the sentence can be read.
train_statement = ' '.join(tokenizer_for_load.convert_ids_to_tokens(sample_train_set['label']))
train_statement

'[CLS] 특히 국제 분쟁 조정을 위해 북한의 김일성 , 아이티 ##의 세 ##드라 ##스 장군 , 팔레 ##인 ##스타 ##인의 하마 ##스 , 보스니아 ##의 세르비아 ##계 정권 같이 미국 정부에 대해 협상을 거부 ##하면서 사태 ##의 위기를 초래 ##한 인물 및 단체를 직접 만나 분쟁 ##의 원인을 근본 ##적으로 해결하기 위해 힘썼다 . [SEP] 넓이는 46 . 80 ##10 ##km2이고 , 인구는 2015년 8월 기준으로 5 , 66 ##2명이다 . [SEP]'

In [129]:
# Draw a random training sample for a single-example forward pass.
sample_train_set = train[np.random.randint(0, len(train))]

# Reshape the token ids into one row, right-pad that row to max_seq_len and
# build the matching padding mask.
train_x = tf.reshape(sample_train_set['x'], (1, -1))
train_x = pad_sequences(train_x, max_seq_len, padding = 'post')
# NOTE: this overwrites the `mask` built for the whole data set in the
# data-preparation cell.
mask = create_padding_mask(train_x)

In [130]:
# NOTE(review): the dict keys and the two returned values match the
# commented-out subclassed BertModule (which returns [pred_lm, pred_cls]), not
# the functional BertModule above (five inputs, one loss output). Confirm
# which model version this cell is meant to run against.
lm, nls = pretrainBert({"input" : train_x, 
                        "mask" : mask})

In [131]:
# Index of the largest entry along axis 2 of the LM output, one per position;
# the recorded result has shape (1, 130).
tf.argmax(lm, axis = 2)

<tf.Tensor: shape=(1, 130), dtype=int64, numpy=
array([[17328, 17360, 20083,  5483,  2019, 18409,  6030, 25117,  9202,
        14692, 26121,  3494, 27229,  4178,  2666, 25338, 13402,  1101,
        22674, 24924,  7654, 21774,  3057, 10126, 17184, 29053,    15,
        16914, 29689,   123, 18975,  1919, 29793, 14770, 25498, 15708,
        22909,  9951, 27557, 13215,   965,  9987,  2350, 20155, 14333,
         4858, 26958, 14489,  3173,  4480, 22968,  3308, 23452, 20518,
        31073,  2973,  8616, 13132, 23579, 12193, 29040, 26558, 31106,
        26844, 22248, 13991,  3920, 22616, 23662, 31055, 16463, 16199,
        20434, 12297, 10795, 27195,  3488, 10549, 15916, 25133,  9855,
         1907, 23976, 13786, 13825, 31484, 15073, 20471, 29217, 18239,
         3095, 23133,  1123,  2396, 17538, 16958,  7579,  9146,  7860,
        26091,  1059, 24044, 23068,  9914, 10082, 15687,  7680,  2289,
         1450, 27952, 31394, 25124,  4905,  3505,  3409, 17645,  8407,
        26843,   476,  1066, 

In [132]:
# Display the second value returned by the model call above; the recorded
# result is a (1, 2) float32 tensor.
nls

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.882038  , 0.11796198]], dtype=float32)>