https://stackoverflow.com/questions/64156202/add-dense-layer-on-top-of-huggingface-bert-model

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Install requirements

In [2]:
!pip install Datasets
!pip install transformers
!pip install scikit-learn

Collecting Datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from Datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from Datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from Datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, Datasets
Successfully installed Datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [29]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, TFAutoModel, TFAutoModelForSequenceClassification
import tensorflow as tf
from keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.losses import SparseCategoricalCrossentropy,BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
import datasets
import keras

## Define custom model

Using a cased version of BERT will mean a bigger vocabulary, which in our case will slow down the training and use up more memory.
Some tricks that may (or may not) improve training time: **Mixed precision** and **jit_compile**.

In [None]:
# try to improve the learning time
from tensorflow.keras import mixed_precision
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

In [91]:
MODEL_ID = "bert-base-uncased" # define here the model to use

In [89]:
class CustomBERTModel(keras.Model):
    def __init__(self, base_model_id: str):
          super(CustomBERTModel, self).__init__()
          self.bert = TFAutoModel.from_pretrained(base_model_id)
          ### New layers:
          self.linear1 = keras.layers.Dense(256)
          self.linear2 = keras.layers.Dense(2) ## 2 is the number of classes in this example

    def call(self, inputs, training=False):
          # call expects only one positional argument, so you have to pass in a tuple and unpack. The next parameter is a special reserved training parameter.
          ids, mask = inputs
          sequence_output = self.bert(ids, mask, training=training).last_hidden_state

          # sequence_output has the following shape: (batch_size, sequence_length, 768)
          linear1_output = self.linear1(sequence_output[:,0,:]) ## extract the 1st token's embeddings

          linear2_output = self.linear2(linear1_output)

          return linear2_output

In [90]:
class CustomBert(keras.Model):
    """Appending additional layers to the BERT model, so that it can be used
    for multi-label classification.
    """

    def __init__(self, base_model_id: str, num_labels: int):
        super().__init__()
        self._base = TFAutoModel.from_pretrained(base_model_id)
        self._base.trainable = False

        self._additional_layers = keras.Sequential([
            Dropout(0.1),
            Dense(512, activation="relu"),
            Dense(256, activation="relu"),
            Dense(num_labels, activation="sigmoid"),
        ])

    def call(self, inputs):
        out = self._base(inputs)
        out = out["last_hidden_state"][:, 0, :]

        return self._additional_layers(out)

### Use for debug only

In [None]:
tf.config.run_functions_eagerly(True)
tf.data.experimental.enable_debug_mode()

In [57]:
def preprocess_function(examples, tokenizer, max_length=128):
    return tokenizer(examples["text"],
                     truncation=True,
                     padding='max_length',
                     max_length=max_length,
                     return_tensors="np")

In [62]:
def get_data(train_path):
   train_df = pd.read_json(train_path, lines=True)
   train_df = train_df[['text', 'label']]
   train_df, validation_df = train_test_split(train_df, test_size=0.2, stratify=train_df['label'], random_state=42)
   return train_df, validation_df


In [59]:
file_path = 'drive/MyDrive/models/subtaskA_dev_monolingual.jsonl'

In [63]:
train_df, validation_df = get_data(file_path)

In [64]:
print(f"Train shape: {train_df.shape}")
print(f"Validation shape: {validation_df.shape}")

Train shape: (4000, 2)
Validation shape: (1000, 2)


In [68]:
# take only 100 sample from training data (test purposes)
train_df = train_df[:100]
validation_df = validation_df[:100]

In [66]:
labels = np.asarray(train_df['label'])

In [69]:
id2label = {0: "human", 1: "machine"}
label2id = {"human": 0, "machine": 1 }

In [70]:
# pandas dataframe to huggingface Dataset
train_dataset= Dataset.from_pandas(train_df)
validation_dataset = Dataset.from_pandas(validation_df)

In [13]:
print(train_dataset)

Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 100
})


In [73]:
dataset = datasets.DatasetDict({'train': train_dataset, 'validation': validation_dataset})

In [74]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 100
    })
})


In [None]:
dataset["train"][0]

In [92]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [77]:
tokenizer("Hello, this is a sentence!", "And this sentence goes with it.")

{'input_ids': [101, 7592, 1010, 2023, 2003, 1037, 6251, 999, 102, 1998, 2023, 6251, 3632, 2007, 2009, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [78]:
# pre_tokenizer_columns = set(dataset["train"].features)
encoded_dataset = dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer})
# tokenizer_columns = list(set(encoded_dataset["train"].features) - pre_tokenizer_columns)
# print("Columns added by tokenizer:", tokenizer_columns)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [19]:
encoded_dataset["train"].features["label"]

Value(dtype='int64', id=None)

In [20]:
model = CustomBERTModel(MODEL_ID)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [79]:
model_bert = CustomBert(MODEL_ID, 2)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [80]:
prepare_model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_ID, num_labels=len(label2id), id2label=id2label, label2id=label2id)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [81]:
tf_train_dataset = prepare_model.prepare_tf_dataset(encoded_dataset["train"],
                                      batch_size=8,
                                      shuffle=True,
                                      tokenizer=tokenizer)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [85]:
tf_validation_dataset = prepare_model.prepare_tf_dataset(encoded_dataset["validation"],
                                          batch_size=8,
                                          shuffle=True,
                                          tokenizer=tokenizer)

In [None]:
del prepare_model

In [83]:
optimizer = keras.optimizers.Adam(learning_rate=5e-5)
# loss_fn = SparseCategoricalCrossentropy(from_logits=True)
loss_fn = BinaryCrossentropy(from_logits=False) # use with activation = sigmoid for last layer
train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()

In [86]:
# model.compile(optimizer=optimizer, loss=loss_fn, metrics=[train_acc_metric])
model_bert.compile(optimizer=optimizer, loss=loss_fn, metrics=[train_acc_metric])

In [53]:
def train_step(model, tf_train_dataset, tf_test_dataset, epochs=2):
  # train the model by using GradientTape
  optimizer = keras.optimizers.Adam(learning_rate=5e-5)

  for epoch in range(epochs):
      print(f"\nStart of Training Epoch {epoch}")
      for step, batch in enumerate(tf_train_dataset):
          ids = batch[0]['input_ids']
          mask = batch[0]['attention_mask']
          y = batch[1]

          with tf.GradientTape() as tape:
              logits = model((ids, mask), training=True)
              loss_value = loss_fn(y, logits)

          grads = tape.gradient(loss_value, model.trainable_weights)

          optimizer.apply_gradients(
                        (grad, var)
                        for (grad, var) in zip(grads, model.trainable_variables)
                        if grad is not None
                      )
          # Update training metric.
          train_acc_metric(y, logits)

          # Log every 200 batches.
          if step % 10 == 0:
              print(
                  "Training loss at step %d: %.4f"
                  % (step, float(loss_value))
              )
              #print accuracy on the training set
              train_acc = train_acc_metric.result()
              print("Training acc over epoch: %.4f" % (float(train_acc),))
          # Display metrics at the end of each epoch.

      train_acc_metric.reset_states()

      # perform validation on test data
      for step, batch in enumerate(tf_test_dataset):
          ids = batch[0]['input_ids']
          mask = batch[0]['attention_mask']
          y = batch[1]
          logits = model([ids, mask], training=False)
          # Update val metrics
          val_acc_metric(y, logits)

      val_acc = val_acc_metric.result()

      # print accuracy on the test set
      print("Test acc: %.4f" % (float(val_acc),))

      # Reset val metrics at the end of each epoch
      val_acc_metric.reset_states()

  model.save_weights('my_model', save_format='tf')

In [94]:
model_bert.fit(tf_train_dataset,
               validation_data= tf_validation_dataset,
               epochs=1)



<keras.src.callbacks.History at 0x7e31d51af7c0>

In [96]:
# predict on test data
predictions = model_bert.predict(tf_validation_dataset)



In [97]:
print(predictions[0])

[0.6173216 0.3826784]


In [54]:
train_step(model_bert, tf_train_dataset, tf_test_dataset, epochs=2) # not needed anymore - just use the fit() method


Start of Training Epoch 0
Training loss at step 0: 0.6964
Training acc over epoch: 0.5000


KeyboardInterrupt: 

### Save Model

In [None]:
# The recommended way to save a subclassed model is to use save_weights to create a TensorFlow SavedModel checkpoint
model.save_weights('my_model', save_format='tf')

### Load Model

In [None]:
loaded_model = CustomBERTModel(MODEL_ID)
loaded_model.load_weights('my_model')

### Train the loaded model on a batch

In [None]:
# Extract a batch from the training dataset
batch = next(iter(tf_train_dataset))

# Separate input data and target data from the batch
ids = batch[0]['input_ids']
mask = batch[0]['attention_mask']
y = batch[1]

In [None]:
# call the model on the extracted batch
loss_value = loaded_model.train_on_batch((ids, mask), y)
print(f"Loss: {loss_value}")

### Check that the model has been preserved

In [None]:
loaded_model.load_weights('my_model')

In [None]:
val_acc_metric.reset_state()
for step, batch in enumerate(tf_test_dataset):
        ids = batch[0]['input_ids']
        mask = batch[0]['attention_mask']
        y = batch[1]
        logits = loaded_model([ids, mask], training=False)
        # Update val metrics
        val_acc_metric(y, logits)
val_acc = val_acc_metric.result()
# print accuracy on the test set
print("test acc: %.4f" % (float(val_acc),))

In [None]:
# Check that the model state has been preserved
new_predictions = loaded_model.predict(tf_test_dataset)
# np.testing.assert_allclose(predictions, new_predictions, atol=1e-6)


### Another save

In [None]:
# save the custom model to file (not suported for custom class)
model.save('path_to_my_model',save_format='tf')

In [None]:
# load the model
new_model = keras.models.load_model('path_to_my_model')