https://stackoverflow.com/questions/64156202/add-dense-layer-on-top-of-huggingface-bert-model

# Install requirements

In [None]:
!pip install Datasets
!pip install transformers
!pip install scikit-learn

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, TFAutoModel, TFAutoModelForSequenceClassification
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
import datasets
import keras

## Define custom model

In [None]:
class CustomBERTModel(keras.Model):
    """BERT encoder followed by a two-layer dense classification head.

    The final hidden state of the first token is fed through ``linear1`` and
    then ``linear2``; neither layer has an activation, so the output is one
    unnormalised score (logit) per class.

    Args:
        model_name: Hugging Face checkpoint passed to
            ``TFAutoModel.from_pretrained``.
        hidden_units: Output width of the intermediate dense layer.
        num_classes: Number of output logits (2 in this example).
    """

    def __init__(self, model_name="bert-base-uncased", hidden_units=256, num_classes=2):
        super().__init__()
        self.bert = TFAutoModel.from_pretrained(model_name)
        # New layers stacked on top of the pretrained encoder.
        self.linear1 = keras.layers.Dense(hidden_units)
        self.linear2 = keras.layers.Dense(num_classes)

    def call(self, inputs, training=False):
        """Return class logits for a batch.

        Args:
            inputs: An ``(input_ids, attention_mask)`` pair. Keras ``call``
                takes a single positional input, so both tensors travel
                together and are unpacked here.
            training: Reserved Keras flag, forwarded to the encoder.

        Returns:
            The output of ``linear2``, one row of logits per example.
        """
        ids, mask = inputs
        # Pass the mask by keyword so it cannot be bound to the wrong argument.
        encoder_output = self.bert(ids, attention_mask=mask, training=training)

        # last_hidden_state is (batch_size, sequence_length, hidden_size);
        # keep only the first token's embedding for each example.
        first_token_embedding = encoder_output.last_hidden_state[:, 0, :]

        return self.linear2(self.linear1(first_token_embedding))

### Use for debug only

In [None]:
# Debug only: force tf.function-decorated code to run eagerly so it can be
# stepped through and printed from (expect slower execution while enabled).
tf.config.run_functions_eagerly(True)
# Debug only: switch tf.data pipelines to debug mode as well.
tf.data.experimental.enable_debug_mode()

In [None]:
def preprocess_function(examples, tokenizer, max_length=128):
    """Tokenize the ``"text"`` entry of ``examples``.

    Args:
        examples: Mapping with a ``"text"`` key holding the text to tokenize.
        tokenizer: Callable tokenizer; it receives the text plus the options
            built below.
        max_length: Length every sequence is truncated and padded to.

    Returns:
        Whatever ``tokenizer`` returns (NumPy tensors are requested).
    """
    tokenizer_options = {
        "truncation": True,
        "padding": 'max_length',
        "max_length": max_length,
        "return_tensors": "np",
    }
    texts = examples["text"]
    return tokenizer(texts, **tokenizer_options)

In [None]:
def get_data(train_path):
    """Load a JSON-lines file and split it into train and test frames.

    Only the ``text`` and ``label`` columns are kept. The split holds out 20%
    of the rows, is stratified on ``label`` and uses a fixed random seed (42).

    Args:
        train_path: Path to the JSON-lines file to read.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames.
    """
    frame = pd.read_json(train_path, lines=True)
    frame = frame[['text', 'label']]
    splits = train_test_split(
        frame,
        test_size=0.2,
        stratify=frame['label'],
        random_state=42,
    )
    train_part, test_part = splits
    return train_part, test_part


In [None]:
file_path = 'data/subtaskA_train_monolingual.jsonl'

In [None]:
train_df, test_df = get_data(file_path)

In [None]:
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

In [None]:
# Keep only the first 100 rows of the training split and of the test split
# (for quick test runs).
train_df = train_df[:100]
test_df = test_df[:100]

In [None]:
labels = np.asarray(train_df['label'])

In [None]:
id2label = {0: "human", 1: "machine"}
label2id = {"human": 0, "machine": 1 }

In [None]:
# pandas dataframe to huggingface Dataset
train_dataset= Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [None]:
print(train_dataset)

In [None]:
dataset = datasets.DatasetDict({'train': train_dataset, 'test': test_dataset})

In [None]:
print(dataset)

In [None]:
dataset["train"][0]

In [None]:
# The tokenizer must come from the same checkpoint as the model weights
# ("bert-base-uncased" is what this notebook loads); a cased tokenizer would
# produce token ids from a different vocabulary.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
tokenizer("Hello, this is a sentence!", "And this sentence goes with it.")

In [None]:
# pre_tokenizer_columns = set(dataset["train"].features)
encoded_dataset = dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer})
# tokenizer_columns = list(set(encoded_dataset["train"].features) - pre_tokenizer_columns)
# print("Columns added by tokenizer:", tokenizer_columns)

In [None]:
encoded_dataset["train"].features["label"]

In [None]:
model = CustomBERTModel()

In [None]:
prepare_model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label2id), id2label=id2label, label2id=label2id)

In [None]:
tf_train_dataset = prepare_model.prepare_tf_dataset(encoded_dataset["train"],
                                      batch_size=8,
                                      shuffle=True,
                                      tokenizer=tokenizer)

In [None]:
# Build the evaluation pipeline from the held-out "test" split; using the
# "train" split here would report accuracy on data the model was trained on.
tf_test_dataset = prepare_model.prepare_tf_dataset(encoded_dataset["test"],
                                          batch_size=8,
                                          shuffle=False,
                                          tokenizer=tokenizer)

In [None]:
del prepare_model

In [None]:
optimizer = keras.optimizers.Adam(learning_rate=5e-5)
loss_fn = SparseCategoricalCrossentropy(from_logits=True)
train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()

In [None]:
model.compile(optimizer=optimizer, loss=loss_fn, metrics=[train_acc_metric])

In [None]:
def train_step(model, tf_train_dataset, tf_test_dataset, epochs=2):
    """Train ``model`` with a manual ``tf.GradientTape`` loop.

    Each epoch iterates over ``tf_train_dataset``, then measures accuracy on
    ``tf_test_dataset``. After the last epoch the weights are written to the
    ``'my_model'`` checkpoint.

    Uses the module-level ``loss_fn``, ``train_acc_metric`` and
    ``val_acc_metric``; both metrics are reset at the end of every epoch.

    Args:
        model: Model called as ``model((ids, mask), training=...)``.
        tf_train_dataset: Iterable of ``(features, labels)`` batches where
            ``features`` has ``'input_ids'`` and ``'attention_mask'`` entries.
        tf_test_dataset: Validation batches with the same structure.
        epochs: Number of passes over the training data.
    """
    # A fresh optimizer is created for this run of the loop.
    optimizer = keras.optimizers.Adam(learning_rate=5e-5)

    for epoch in range(epochs):
        print(f"\nStart of Training Epoch {epoch}")
        for step, batch in enumerate(tf_train_dataset):
            ids = batch[0]['input_ids']
            mask = batch[0]['attention_mask']
            y = batch[1]

            with tf.GradientTape() as tape:
                logits = model((ids, mask), training=True)
                loss_value = loss_fn(y, logits)

            # Use one variable list for both the gradients and the update so
            # each gradient is guaranteed to be paired with its own variable.
            variables = model.trainable_variables
            grads = tape.gradient(loss_value, variables)

            # Variables that did not contribute to the loss get a None
            # gradient; skip them instead of passing None to the optimizer.
            optimizer.apply_gradients(
                (grad, var)
                for grad, var in zip(grads, variables)
                if grad is not None
            )

            # Update training metric.
            train_acc_metric(y, logits)

            # Log every 10 batches.
            if step % 10 == 0:
                print(f"Training loss at step {step}: {float(loss_value):.4f}")
                # Running accuracy over the batches seen so far this epoch.
                train_acc = train_acc_metric.result()
                print(f"Training acc over epoch: {float(train_acc):.4f}")

        # Reset the training metric at the end of each epoch.
        train_acc_metric.reset_state()

        # Perform validation on the test data.
        for batch in tf_test_dataset:
            ids = batch[0]['input_ids']
            mask = batch[0]['attention_mask']
            y = batch[1]
            logits = model([ids, mask], training=False)
            # Update val metrics.
            val_acc_metric(y, logits)

        val_acc = val_acc_metric.result()

        # Print accuracy on the test set.
        print(f"Test acc: {float(val_acc):.4f}")

        # Reset val metrics at the end of each epoch.
        val_acc_metric.reset_state()

    model.save_weights('my_model', save_format='tf')

In [None]:
train_step(model, tf_train_dataset, tf_test_dataset, epochs=2)

### Save Model

In [None]:
# The recommended way to save a subclassed model is to use save_weights to create a TensorFlow SavedModel checkpoint
model.save_weights('my_model', save_format='tf')

### Load Model

In [None]:
loaded_model = CustomBERTModel()
loaded_model.load_weights('my_model')

### Train the loaded model on a batch

In [None]:
# Extract a batch from the training dataset
batch = next(iter(tf_train_dataset))

# Separate input data and target data from the batch
ids = batch[0]['input_ids']
mask = batch[0]['attention_mask']
y = batch[1]

In [None]:
# A freshly constructed model has no optimizer or loss attached, and
# train_on_batch needs both, so compile before the training step.
loaded_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=5e-5),
    loss=SparseCategoricalCrossentropy(from_logits=True),
)
# Run a single training step on the extracted batch
loss_value = loaded_model.train_on_batch((ids, mask), y)
print(f"Loss: {loss_value}")

### Check that the model has been preserved

In [None]:
loaded_model.load_weights('my_model')

In [None]:
# Start from a clean metric so earlier evaluations are not counted here
val_acc_metric.reset_state()
for step, batch in enumerate(tf_test_dataset):
    ids, mask = batch[0]['input_ids'], batch[0]['attention_mask']
    y = batch[1]
    logits = loaded_model([ids, mask], training=False)
    # Accumulate accuracy for this batch
    val_acc_metric.update_state(y, logits)
val_acc = val_acc_metric.result()
# Report accuracy over the whole test dataset
print(f"test acc: {float(val_acc):.4f}")

In [None]:
# Check that the model state has been preserved
# The dataset yields (feature dict, labels) batches, but CustomBERTModel.call
# unpacks an (ids, mask) pair, so re-pack each batch before predicting.
predict_dataset = tf_test_dataset.map(
    lambda features, labels: (
        (features['input_ids'], features['attention_mask']),
        labels,
    )
)
new_predictions = loaded_model.predict(predict_dataset)
# np.testing.assert_allclose(predictions, new_predictions, atol=1e-6)


### Another save

In [None]:
# Save the whole custom model to disk (not supported for a custom subclassed model)
model.save('path_to_my_model',save_format='tf')

In [None]:
# load the model
new_model = keras.models.load_model('path_to_my_model')