In [2]:
!pip install import_ipynb

!pip install transformers



In [5]:
import os
from google.colab import drive

In [11]:
# Mount Google Drive at /content/qdrive; later cells chdir into it and read
# the training shards from paths under this mount point.
drive.mount('/content/qdrive')

Drive already mounted at /content/qdrive; to attempt to forcibly remount, call drive.mount("/content/qdrive", force_remount=True).


In [6]:
# Work from the project folder on Drive so that `from QBert import qbert_model`
# (resolved through import_ipynb) can locate QBert.ipynb in the current directory.
os.chdir('/content/qdrive/MyDrive/Colab Notebooks/QBert')

In [7]:
import numpy as np
import tensorflow as tf
from copy import deepcopy

import import_ipynb
from QBert import qbert_model

import pickle
from tqdm.notebook import tqdm

from tensorflow.keras.preprocessing.sequence import pad_sequences

def load_pkl(file_path) :
    """Read and return the single pickled object stored at ``file_path``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    you produced yourself.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

def save_pkl(df, file_path) :
    """Pickle ``df`` into ``file_path``, replacing any existing file.

    Returns ``None``; the only effect is the file written on disk.
    """
    with open(file_path, mode = 'wb') as sink:
        pickle.dump(df, sink)

def create_padding_mask(x):
    """Build a padding mask for a 2-D batch of token ids.

    Entries of ``x`` equal to 0 become 1.0 and every other entry becomes 0.0.
    The result is a NumPy float32 array of shape
    ``(x.shape[0], 1, 1, x.shape[1])``.
    """
    n_rows, n_cols = x.shape[0], x.shape[1]
    is_pad = tf.math.equal(x, 0)
    pad_mask = tf.cast(is_pad, tf.float32)
    # (batch size, 1, 1, key sentence length)
    return np.array(pad_mask).reshape(n_rows, 1, 1, n_cols)

def ind_to_weight(masked_pos, seq_len) :
    """Convert a list of positions into a length-``seq_len`` weight vector.

    Each index in ``masked_pos`` is one-hot encoded with depth ``seq_len`` and
    the rows are summed, so listed positions get a weight equal to the number
    of times they appear and all other positions get 0.
    """
    one_hot_rows = tf.one_hot(masked_pos, seq_len)
    return tf.reduce_sum(one_hot_rows, axis = 0)

def create_segments(inputs, sep_id = 3) :
    """Assign a segment index to every token of one id sequence.

    The segment counter starts at 0 and is bumped *after* each occurrence of
    ``sep_id``: a separator token belongs to the segment it closes, and every
    token after it (including trailing padding) gets the next index.

    Args:
        inputs: 1-D sequence (list or array) of integer token ids.
        sep_id: token id that ends a segment. Defaults to 3, the value that
            was previously hard-coded here.

    Returns:
        1-D integer ``np.ndarray`` with the same length as ``inputs``.
    """
    is_sep = np.asarray(inputs) == sep_id
    # Exclusive running count of separators: cumsum counts the separator at
    # the current position too, so subtract it back out.
    return np.cumsum(is_sep) - is_sep

importing Jupyter notebook from QBert.ipynb


In [8]:
class BertModule(tf.keras.Model) :
    """Pre-training wrapper around ``qbert_model`` (masked-LM + two-class task).

    ``call`` does not return predictions: it returns a per-example loss
    tensor, which is why the notebook trains this model against dummy
    all-zero targets. ``get_pretrained_result`` returns the actual
    predictions.
    """

    def __init__(self, vocab_size, max_seq_len, num_layers, dff, d_model, num_heads, dropout, name) :
        super(BertModule, self).__init__()
        self.Bert = qbert_model(vocab_size, max_seq_len, num_layers, dff, d_model, num_heads, dropout, name)
        # Two-way softmax classifier applied to the hidden state of the first token.
        self.dense_cls = tf.keras.layers.Dense(2, activation = 'softmax', use_bias = False)
        self.vocab_size = vocab_size

    def _lm_logits(self, sequence_output) :
        """Project encoder hidden states to vocabulary logits.

        The projection matrix is the pseudo-inverse of the first weight of
        ``self.Bert.layers[1]`` (presumably the token-embedding matrix;
        confirm in QBert.ipynb).
        NOTE(review): the pseudo-inverse is recomputed on every forward pass;
        conventional weight tying would use the transpose instead. Left
        unchanged because it alters the model.
        """
        decode_matrix = tf.linalg.pinv(self.Bert.layers[1].weights[0])
        return tf.matmul(sequence_output, decode_matrix)

    def call(self, inputs) :
        """Compute the combined pre-training loss.

        Args:
            inputs: sequence whose first six entries are, in order,
                ``x`` (token ids), ``mask`` (padding mask), ``lm`` (target
                token ids), ``nsp`` (targets multiplied against the two-class
                prediction), ``weight`` (per-position LM loss weights) and
                ``segments`` (segment ids).

        Returns:
            Tensor holding the per-example masked-LM loss plus the batch-mean
            classification loss.
        """
        x, mask, lm, nsp, weight, segments = inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], inputs[5]

        bert_outputs = self.Bert([x, mask, segments])

        y_pred = bert_outputs['sequence_output']

        # log_softmax is mathematically -(-log(softmax(.))) but stays finite
        # when a probability underflows; the previous log(softmax) form could
        # produce 0 * inf = NaN at positions where the one-hot/weight is zero.
        log_pred_lm = tf.nn.log_softmax(self._lm_logits(y_pred))
        pred_cls = self.dense_cls(y_pred[:, 0])

        true_y_lm = tf.cast(tf.one_hot(tf.cast(lm, dtype = tf.int32), depth = self.vocab_size), dtype = tf.float32)

        lm_losses = tf.reduce_sum(true_y_lm * -log_pred_lm, axis = 2)
        lm_losses = lm_losses * weight
        # Mean over every sequence position (axis 1), not only the weighted ones.
        lm_losses = tf.reduce_mean(lm_losses, axis = 1)

        nsp = tf.cast(nsp, dtype = tf.float32)
        # Clip before the log so an underflowed class probability cannot yield NaN.
        safe_pred_cls = tf.clip_by_value(pred_cls, tf.keras.backend.epsilon(), 1.0)
        cls_losses = tf.reduce_mean(tf.reduce_sum(nsp * -tf.math.log(safe_pred_cls), axis = 1))

        total_loss = lm_losses + cls_losses

        return total_loss

    def get_pretrained_result(self, inputs) :
        """Return ``(pred_lm, pred_cls)`` probabilities for a batch.

        Args:
            inputs: ``[x, mask]`` or ``[x, mask, segments]``. ``call`` always
                feeds the encoder three inputs, so segments can be supplied
                here as an optional third entry; with two entries the encoder
                is invoked exactly as before.
                NOTE(review): confirm whether qbert_model accepts the
                two-input form at all.

        Returns:
            pred_lm: softmax over the vocabulary for every position.
            pred_cls: two-class softmax computed from the first token.
        """
        x, mask = inputs[0], inputs[1]

        bert_inputs = [x, mask]
        if len(inputs) > 2 :
            bert_inputs.append(inputs[2])

        bert_outputs = self.Bert(bert_inputs)

        y_pred = bert_outputs['sequence_output']

        pred_lm = tf.math.softmax(self._lm_logits(y_pred))
        pred_cls = self.dense_cls(y_pred[:, 0])

        return pred_lm, pred_cls

In [9]:
def process_inputs(train, max_seq_len) :
    """Turn a list of sample dicts into padded model-input arrays.

    Samples whose ``'x'`` is longer than ``max_seq_len`` are dropped. Each
    remaining sample contributes its ``'x'``, ``'label'``, ``'NSP'`` and
    ``'masked_position'`` entries.

    Returns:
        Tuple ``(x, y, nsp, weight, segments, mask)``:
        post-padded token ids, post-padded labels, the stacked ``'NSP'``
        values, per-position weights built by ``ind_to_weight``, segment ids
        built by ``create_segments`` from each padded row, and the padding
        mask of ``x``.
    """
    kept = [sample for sample in train if len(sample['x']) <= max_seq_len]

    x = pad_sequences([sample['x'] for sample in kept], max_seq_len, padding = 'post')
    y = pad_sequences([sample['label'] for sample in kept], max_seq_len, padding = 'post')
    nsp = np.asarray([sample['NSP'] for sample in kept])

    weight = np.array([ind_to_weight(sample['masked_position'], max_seq_len) for sample in kept])
    segments = np.array([create_segments(row) for row in x])

    mask = create_padding_mask(x)

    return x, y, nsp, weight, segments, mask

In [10]:
# Model configuration, passed positionally to BertModule, which forwards all
# of it to qbert_model. The exact meaning of the forwarded values is defined
# in QBert.ipynb; the notes below mark what this notebook itself relies on.
vocab_size = 32000  # also the one-hot depth for masked-LM targets in BertModule.call
max_seq_len = 130  # process_inputs drops longer samples and post-pads the rest to this length
num_layers = 3  # presumably the number of encoder layers (confirm in QBert.ipynb)
dff = 256  # presumably the feed-forward width (confirm in QBert.ipynb)
d_model = 100  # presumably the hidden size (confirm in QBert.ipynb)
num_heads = 5  # presumably the number of attention heads (confirm in QBert.ipynb)
dropout = .1
name = 'qbert_210603'

In [11]:
data_path = '/content/qdrive/MyDrive/Data_Backup/210601_Bert_DT_BU/dt/'

# Training shards are the files named 'train_set-masked-position-*'.
# os.listdir returns entries in arbitrary order, so sort them: the training
# loop takes ds_set[0:1] and must pick the same shard on every run.
ds_set = sorted(
    fname for fname in os.listdir(data_path)
    if fname.startswith('train_set-masked-position-')
)

In [12]:
# Training-run settings read by the optimizer setup and the training loop.
epochs = 1  # iterations of the outer training loop
lr = 1e-4  # learning rate given to the Adam optimizer
batch_size = 128  # rows per train_on_batch call

In [13]:
# Instantiate the pre-training model from the configuration cell.
pretrainBert = BertModule(
    vocab_size, max_seq_len, num_layers, dff, d_model, num_heads, dropout, name
)

# BertModule.call returns the loss itself, and training feeds all-zero
# targets, so 'mse' here ends up minimising the mean *squared* model loss.
optimizer = tf.keras.optimizers.Adam(learning_rate = lr, beta_1 = 0.9, beta_2 = 0.999)
pretrainBert.compile(optimizer = optimizer, loss = 'mse')

In [14]:
from datetime import datetime

In [16]:
import math

In [None]:
# Manual training loop: for every epoch, load each selected shard, slice it
# into batches and run one optimizer step per batch with train_on_batch.
losses = []

for epoch in range(epochs) :

    total_step = 0
    now = datetime.now()
    print("EPOCH {} START ON {}".format(epoch, now))

    # The model is compiled once in an earlier cell. It is deliberately NOT
    # re-compiled here: building a fresh Adam optimizer every epoch would
    # throw away its accumulated moment estimates.

    # Only the first shard is used; widen the slice to train on more.
    for ds in ds_set[0:1] :

        print("TRAIN ON {}".format(ds))
        dataset = load_pkl(os.path.join(data_path, ds))
        print("Data Loaded..")
        # Must run on every shard: the arrays are deleted at the end of the
        # iteration and would otherwise only exist as stale kernel state.
        x, y, nsp, weight, segments, mask = process_inputs(dataset, max_seq_len)
        print("Data Processed")
        print("Start Training on {}".format(ds))

        steps = math.ceil(len(x) / batch_size)

        if steps == 0 :
            # Nothing to train on; skip so the bookkeeping below cannot fail.
            print("No samples in {}, skipped".format(ds))
            del dataset, x, y, nsp, weight, segments, mask
            continue

        for idx in range(steps) :

            rows = slice(idx * batch_size, (idx + 1) * batch_size)

            batch_x = x[rows]
            batch_y = y[rows]
            batch_mask = mask[rows]
            batch_nsp = nsp[rows]
            batch_weight = weight[rows]
            batch_segments = segments[rows]
            # Dummy all-zero targets (the model outputs its own loss). Sized
            # from the actual batch so the final, shorter batch also matches.
            false_y = np.zeros(len(batch_x), dtype = int)

            loss = pretrainBert.train_on_batch(
                x = [batch_x, batch_mask, batch_y, batch_nsp, batch_weight, batch_segments],
                y = false_y,
            )
            losses.append(loss)

            # Report every 500 steps and on the last step of the shard.
            if (idx % 500 == 0) or (idx + 1 == steps) :
                print("Training -- {}/{} step -- {}".format(idx + 1, steps, losses[-1]))

            total_step += 1

        print("Trained on {}_Loss {}".format(ds, losses[-1]))

        # Release the shard before the next one is loaded.
        del batch_x, batch_y, batch_mask, batch_nsp, batch_weight, batch_segments
        del dataset, x, y, nsp, weight, segments, mask, false_y

    finished = datetime.now()
    print("EPOCH {} END ON {} ({} 소요) ".format(epoch, finished, finished - now))

    # NaN stands in for the loss if no batch was trained at all.
    last_loss = losses[-1] if losses else float('nan')
    pretrainBert.save_weights('/content/qdrive/MyDrive/Data_Backup/210601_Bert_DT_BU/model_weight_2106031849-epoch-{}-loss-{:.3f}.tf'.format(epoch, last_loss))

EPOCH 0 START ON 2021-06-03 09:59:59.259551
TRAIN ON train_set-masked-position-0.pkl
Data Loaded..
Data Processed
Start Training on train_set-masked-position-0.pkl
Training -- 1/2318 step -- 2.3771562576293945
Training -- 11/2318 step -- 3.0186190605163574
Training -- 21/2318 step -- 1.7656855583190918
Training -- 31/2318 step -- 2.1527233123779297
Training -- 41/2318 step -- 2.0238115787506104
Training -- 51/2318 step -- 1.857027530670166
Training -- 61/2318 step -- 1.5402508974075317
Training -- 71/2318 step -- 2.060739040374756
Training -- 81/2318 step -- 2.029837131500244
Training -- 91/2318 step -- 2.3308801651000977
Training -- 101/2318 step -- 1.7787905931472778
Training -- 111/2318 step -- 1.7355079650878906
Training -- 121/2318 step -- 1.855301856994629
Training -- 131/2318 step -- 2.0016918182373047
Training -- 141/2318 step -- 1.7667165994644165
Training -- 151/2318 step -- 1.9967494010925293
Training -- 161/2318 step -- 1.8256560564041138
Training -- 171/2318 step -- 1.9883

In [17]:
# Debug helper: slice one batch by hand. Requires x, y, mask, nsp, weight and
# segments (the outputs of process_inputs) to exist in the kernel.
idx = 0

batch_x = x[idx * batch_size: (idx+1)*batch_size]
batch_y = y[idx * batch_size: (idx+1)*batch_size]
batch_mask = mask[idx * batch_size: (idx+1)*batch_size]
batch_nsp = nsp[idx * batch_size: (idx+1)*batch_size]
batch_weight = weight[idx * batch_size: (idx+1)*batch_size]
batch_segments = segments[idx * batch_size: (idx+1)*batch_size]
# Dummy all-zero targets, sized from the actual batch so they still match
# when fewer than batch_size rows are available.
false_y = np.zeros(len(batch_x), dtype = int)

In [18]:
# Run one optimizer step on the hand-sliced batch. The input order must
# match what BertModule.call reads: x, mask, lm, nsp, weight, segments.
loss = pretrainBert.train_on_batch(
    x = [batch_x, batch_mask, batch_y, batch_nsp, batch_weight, batch_segments],
    y = false_y,
)



In [52]:
# Train with Keras' own loop instead of the manual one. BertModule.call reads
# six inputs (x, mask, lm, nsp, weight, segments), so all six are passed; the
# per-position weights come from process_inputs as `weight`. The dummy
# all-zero targets need one entry per sample, not one per batch row.
hist = pretrainBert.fit(
    x = [x, mask, y, nsp, weight, segments],
    y = np.zeros(len(x), dtype = int),
    batch_size = batch_size,
    epochs = epochs,
)



In [53]:
# Persist the current model weights to the backup folder on Drive.
pretrainBert.save_weights(
    '/content/qdrive/MyDrive/Data_Backup/210601_Bert_DT_BU/model_weight_1_210603_500000_wi_weight.tf'
)