In [43]:
import pandas as pd

import tensorflow as tf
from evaluate import load

from transformers import AutoTokenizer, TFBertModel

from tqdm import tqdm, trange

In [4]:
BATCH_SIZE = 2
AUTOTUNE = tf.data.AUTOTUNE
MODEL_PATH = 'dbmdz/bert-base-turkish-cased'
TRAIN_DIR = '/content/drive/MyDrive/adversarial-taboo/adversarial-taboo datasets/my_quad.csv'

In [5]:
bert_tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
bert_model = TFBertModel.from_pretrained(MODEL_PATH)

## freeze the model
bert_model.trainable = False
max_position_embeddings = bert_model.config.max_position_embeddings
vocab_size = bert_model.config.vocab_size

Some layers from the model checkpoint at dbmdz/bert-base-turkish-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at dbmdz/bert-base-turkish-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [6]:
def create_tensor_slices(df):
    inputs = bert_tokenizer(df['answer'].tolist(), df['cloze'].tolist(), 
                            padding='max_length', truncation=True, 
                            return_tensors='tf')
    outputs = bert_tokenizer(df['question'].tolist(), 
                             padding='max_length', truncation=True, 
                             return_tensors='tf')
    
    return tf.data.Dataset.from_tensor_slices((
        {'input_ids': inputs.input_ids,
            'attention_mask': inputs.attention_mask,
            'token_type_ids': inputs.token_type_ids,
            'decoder_attention_mask': outputs.attention_mask,
            'labels': outputs.input_ids
        }, 
        outputs.input_ids)
    )

def create_dataset(train_dir, batch_size):
    trainval_df = pd.read_csv(train_dir)

    train_df = trainval_df.sample(frac=.85, random_state=42)
    val_df = trainval_df.drop(train_df.index)

    train_data = create_tensor_slices(train_df).shuffle(1000).batch(batch_size)
    val_data = create_tensor_slices(val_df).shuffle(1000).batch(batch_size)

    return train_data, val_data


train_data, val_data = create_dataset(TRAIN_DIR, BATCH_SIZE)

In [7]:
def create_model():
    input_ids = tf.keras.Input(shape=512, dtype=tf.int32)
    attention_mask = tf.keras.Input(shape=512, dtype=tf.int32)
    token_type_ids = tf.keras.Input(shape=512, dtype=tf.int32)

    decoder_attention_mask = tf.keras.Input(shape=512, dtype=tf.int32)
    labels = tf.keras.Input(shape=512, dtype=tf.int32)

    bert_outs = bert_model(input_ids=input_ids, 
                           attention_mask=attention_mask, 
                           token_type_ids=token_type_ids)
    
    decoder_encodings = tf.keras.layers.Embedding(vocab_size, 32, input_length=max_position_embeddings)(labels)
    rnn_outs = tf.keras.layers.LSTM(32, return_sequences=True)(decoder_encodings)
    dense_outs = tf.keras.layers.Dense(vocab_size, activation='softmax')(rnn_outs)

    model = tf.keras.Model(inputs={'input_ids': input_ids, 
                                   'attention_mask': attention_mask, 
                                   'token_type_ids': token_type_ids,
                                   'decoder_attention_mask': decoder_attention_mask,
                                   'labels': labels}, 
                           outputs=dense_outs)
    return model

model = create_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 512)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 512, 32)      1024000     ['input_5[0][0]']                
                                                                                                  
 lstm (LSTM)                    (None, 512, 32)      8320        ['embedding[0][0]']              
                                                                                                  
 input_2 (InputLayer)           [(None, 512)]        0           []                               
                                                                                              

In [67]:
bleu = load('sacrebleu')

def get_bleu(x, y):
    x = tf.cast(x, tf.int64)
    y = tf.argmax(y, axis=-1)

    x = bert_tokenizer.batch_decode(x)
    y = bert_tokenizer.batch_decode(y)

    results = bleu.compute(references=x, predictions=y)
    return results['score']

In [68]:
model.compile(optimizer=tf.keras.optimizers.Adadelta(), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(), 
              metrics=get_bleu, run_eagerly = True)

In [None]:
model.fit(train_data, epochs=10, validation_data=val_data)

Epoch 1/10
  9/566 [..............................] - ETA: 37:26 - loss: 10.3723 - get_bleu: 0.0000e+00