https://stackoverflow.com/questions/64156202/add-dense-layer-on-top-of-huggingface-bert-model

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

# Install requirements

In [2]:
# !pip install Datasets
# !pip install transformers
# !pip install scikit-learn

In [3]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, TFAutoModel, TFAutoModelForSequenceClassification
import tensorflow as tf
from keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.losses import SparseCategoricalCrossentropy,BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
import datasets
import keras

2024-01-10 20:42:18.289852: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-10 20:42:18.334801: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-10 20:42:18.334834: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-10 20:42:18.335978: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-10 20:42:18.343298: I tensorflow/core/platform/cpu_feature_guar

## Define custom model

Using a cased version of BERT will mean a bigger vocabulary, which in our case will slow down the training and use up more memory.
Some tricks that may (or may not) improve training time: **Mixed precision** and **jit_compile**.

In [4]:
# try to improve the learning time
# from tensorflow.keras import mixed_precision
# policy = mixed_precision.Policy('mixed_float16')
# mixed_precision.set_global_policy(policy)

In [5]:
def scheduler(epoch, lr):
    """Learning-rate schedule passed to ``LearningRateScheduler``.

    Args:
        epoch: Index of the epoch that is about to start.
        lr: Learning rate currently in use.

    Returns:
        ``lr / 2`` when ``epoch`` is exactly 10, otherwise ``lr`` unchanged.
    """
    return lr / 2 if epoch == 10 else lr


In [6]:
MODEL_ID = "bert-base-uncased" # define here the model to use

In [7]:
class CustomBert(keras.Model):
    """Frozen pretrained encoder followed by a small dense classification head.

    The encoder is loaded with ``TFAutoModel.from_pretrained`` and marked as
    non-trainable, so only the head layers receive gradient updates. The head
    ends in a single sigmoid unit, i.e. the model emits one score per input
    sequence.
    """

    def __init__(self, base_model_id: str, num_labels: int):
        """Create the encoder and the dense head.

        Args:
            base_model_id: Identifier handed to ``TFAutoModel.from_pretrained``.
            num_labels: Accepted by the constructor but not used by this
                implementation; the width of the head is fixed.
        """
        super().__init__()

        # Pretrained encoder, frozen before it is attached to the model.
        encoder = TFAutoModel.from_pretrained(base_model_id)
        encoder.trainable = False
        self._base = encoder

        # Dense head applied to the encoder's first-token representation.
        head_layers = [
            Dropout(0.1),
            Dense(256, activation="relu"),
            Dense(8, activation="relu"),
            # Dropout(0.25),
            Dense(4, activation='relu'),
            Dense(2, activation='relu'),
            Dense(1, activation="sigmoid"),
        ]
        self._additional_layers = keras.Sequential(head_layers)

    def call(self, inputs):
        """Encode ``inputs`` and score the hidden state found at sequence position 0."""
        encoder_output = self._base(inputs)
        # Keep only index 0 along the second axis of `last_hidden_state`.
        first_token_state = encoder_output["last_hidden_state"][:, 0, :]
        return self._additional_layers(first_token_state)

### Use for debug only

In [8]:
# tf.config.run_functions_eagerly(True)
# tf.data.experimental.enable_debug_mode()

In [9]:
def preprocess_function(examples, tokenizer, max_length=128):
    """Tokenize the ``"text"`` entry of ``examples``.

    Args:
        examples: Mapping with a ``"text"`` key holding the text(s) to tokenize.
        tokenizer: Callable invoked as ``tokenizer(texts, **options)``.
        max_length: Value forwarded as the tokenizer's ``max_length`` option.

    Returns:
        Whatever ``tokenizer`` returns when called with truncation enabled,
        ``'max_length'`` padding and ``"np"`` return tensors.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": 'max_length',
        "max_length": max_length,
        "return_tensors": "np",
    }
    texts = examples["text"]
    return tokenizer(texts, **tokenizer_options)

In [10]:
def get_test_data(test_path):
    """Load a JSON-lines file and keep only its ``text`` and ``label`` columns.

    Args:
        test_path: Path of the JSON-lines file to read.

    Returns:
        A DataFrame with the columns ``text`` and ``label``, in that order.
    """
    wanted_columns = ['text', 'label']
    return pd.read_json(test_path, lines=True)[wanted_columns]

In [11]:
def get_data(train_path, test_size=0.1, random_state=42):
    """Load the training JSON-lines file and split it into train/validation frames.

    The split is stratified on the ``label`` column. A fixed ``random_state``
    makes the partition reproducible across kernel restarts; pass ``None`` to
    get a different random split on every call.

    Args:
        train_path: Path of the JSON-lines file to read.
        test_size: Share (or absolute number) of rows placed in the validation
            frame; forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        A ``(train_df, validation_df)`` tuple of DataFrames, each restricted to
        the ``text`` and ``label`` columns.
    """
    labelled_df = pd.read_json(train_path, lines=True)[['text', 'label']]
    train_df, validation_df = train_test_split(
        labelled_df,
        test_size=test_size,
        stratify=labelled_df['label'],
        random_state=random_state,
    )
    return train_df, validation_df


In [12]:
file_train_path = 'data/subtaskA_train_monolingual.jsonl'

In [13]:
train_df, validation_df = get_data(file_train_path)

In [14]:
file_test_path = 'data/subtaskA_dev_monolingual.jsonl'

In [15]:
test_df = get_test_data(file_test_path)

In [16]:
# Report the (rows, columns) shape of each split, one per line.
print(
    f"Train shape: {train_df.shape}",
    f"Validation shape: {validation_df.shape}",
    f"Test shape: {test_df.shape}",
    sep="\n",
)

Train shape: (107781, 2)
Validation shape: (11976, 2)
Test shape: (5000, 2)


In [17]:
# take only 100 samples from the training and validation data (for quick test runs)
# train_df = train_df[:100]
# validation_df = validation_df[:100]

In [18]:
# Class index -> class name, and the inverse mapping derived from it.
id2label = {0: "human", 1: "machine"}
label2id = {name: idx for idx, name in id2label.items()}

In [19]:
# Convert each pandas DataFrame into a Hugging Face Dataset.
# The train/validation frames come out of the split with a non-default index,
# which shows up in the resulting datasets as an `__index_level_0__` column.
train_dataset, validation_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, validation_df, test_df)
)

In [20]:
print(train_dataset)

Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 107781
})


In [21]:
dataset = datasets.DatasetDict({'train': train_dataset, 'validation': validation_dataset, 'test' : test_dataset})

In [22]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 107781
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 11976
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})


In [23]:
dataset["train"][0]

{'text': '\n\nThis paper addresses the problem of segmentation and labeling of high dimensional time series data in an unsupervised way. The authors propose a Recurrent Hidden Semi-Markov Model (R-HSMM) to tackle the limitation of existing HSMM models with simple conditional assumptions of observations. Strengths of the paper include the design of a structure encoding function to accelerate the inference while preserving accuracy, and the generalization of the penalty method to distribution space to enable simultaneous training of the model and encoding function. Weaknesses include the lack of a comparison between the proposed R-HSMM and other existing methods, and the lack of a discussion of the computational complexity of the proposed model.',
 'label': 1,
 '__index_level_0__': 45339}

In [24]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

In [25]:
# pre_tokenizer_columns = set(dataset["train"].features)
encoded_dataset = dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer})
# tokenizer_columns = list(set(encoded_dataset["train"].features) - pre_tokenizer_columns)
# print("Columns added by tokenizer:", tokenizer_columns)

Map:   0%|          | 0/107781 [00:00<?, ? examples/s]

Map:   0%|          | 0/11976 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [26]:
model_bert = CustomBert(MODEL_ID, 2)

2024-01-10 20:44:11.701189: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-10 20:44:11.756684: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-10 20:44:11.756980: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [27]:
prepare_model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_ID, num_labels=len(label2id), id2label=id2label, label2id=label2id)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# Build the training input pipeline from the tokenized train split:
# batches of 64, shuffled, created through the helper model's prepare_tf_dataset.
tf_train_dataset = prepare_model.prepare_tf_dataset(encoded_dataset["train"],
                                      batch_size=64,
                                      shuffle=True,
                                      tokenizer=tokenizer)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [29]:
# Validation pipeline: no shuffling, so rows stay in dataset order and the final
# partial batch is kept (prepare_tf_dataset's drop_remainder defaults to the
# value of shuffle). Every validation example is therefore evaluated.
tf_validation_dataset = prepare_model.prepare_tf_dataset(encoded_dataset["validation"],
                                          batch_size=64,
                                          shuffle=False,
                                          tokenizer=tokenizer)

In [30]:
# Test pipeline: no shuffling, so predictions line up with the rows of the test
# split and the final partial batch is kept (prepare_tf_dataset's
# drop_remainder defaults to the value of shuffle).
tf_test_dataset = prepare_model.prepare_tf_dataset(encoded_dataset["test"],
                                          batch_size=64,
                                          shuffle=False,
                                          tokenizer=tokenizer)

In [31]:
del prepare_model

In [32]:
# Optimizer and loss used by model.compile / the manual training loop.
optimizer = keras.optimizers.Adam(learning_rate=5e-4)
# loss_fn = SparseCategoricalCrossentropy(from_logits=False)
loss_fn = BinaryCrossentropy(from_logits=False) # use with activation = sigmoid for last layer

# The model emits a single sigmoid score per example, so accuracy has to be
# computed by thresholding that score. SparseCategoricalAccuracy would take the
# argmax over a size-1 axis (always 0) and only measure the share of label-0 rows.
train_acc_metric = tf.keras.metrics.BinaryAccuracy()
val_acc_metric = tf.keras.metrics.BinaryAccuracy()

# Applies `scheduler` at the start of every epoch during fit().
callback = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [33]:
# model.compile(optimizer=optimizer, loss=loss_fn, metrics=[train_acc_metric])
model_bert.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'], jit_compile=True)

In [34]:
def train_step(model, tf_train_dataset, tf_test_dataset, epochs=2,
               learning_rate=5e-5, log_every=10, weights_path='my_model'):
    """Train ``model`` with a manual ``tf.GradientTape`` loop, then save its weights.

    The function relies on the notebook-level ``loss_fn``, ``train_acc_metric``
    and ``val_acc_metric`` objects. After each training epoch it runs the model
    over ``tf_test_dataset`` and prints the resulting accuracy.

    Args:
        model: Model called as ``model((input_ids, attention_mask), training=...)``.
        tf_train_dataset: Iterable of ``(features, labels)`` batches, where
            ``features`` has ``'input_ids'`` and ``'attention_mask'`` entries.
        tf_test_dataset: Iterable with the same batch structure, used for the
            per-epoch evaluation.
        epochs: Number of passes over ``tf_train_dataset``.
        learning_rate: Learning rate of the Adam optimizer created for this run.
        log_every: Print the loss and running accuracy every ``log_every``
            training steps (must be a positive integer).
        weights_path: Path prefix handed to ``model.save_weights`` after training.
    """
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)

    def _unpack(batch):
        # Same input structure for training and evaluation: (ids, mask) and labels.
        features = batch[0]
        return (features['input_ids'], features['attention_mask']), batch[1]

    for epoch in range(epochs):
        print(f"\nStart of Training Epoch {epoch}")
        for step, batch in enumerate(tf_train_dataset):
            inputs, y = _unpack(batch)

            with tf.GradientTape() as tape:
                logits = model(inputs, training=True)
                loss_value = loss_fn(y, logits)

            trainable_vars = model.trainable_variables
            grads = tape.gradient(loss_value, trainable_vars)

            # Variables that did not take part in the forward pass have no
            # gradient; skip them instead of passing None to the optimizer.
            optimizer.apply_gradients(
                [(grad, var) for grad, var in zip(grads, trainable_vars) if grad is not None]
            )

            # Update training metric.
            train_acc_metric(y, logits)

            # Log every `log_every` batches.
            if step % log_every == 0:
                print(
                    "Training loss at step %d: %.4f"
                    % (step, float(loss_value))
                )
                # Running accuracy on the training batches seen so far this epoch.
                train_acc = train_acc_metric.result()
                print("Training acc over epoch: %.4f" % (float(train_acc),))

        # Start the next epoch with a fresh training metric.
        train_acc_metric.reset_state()

        # Evaluate on the dataset passed as `tf_test_dataset`.
        for batch in tf_test_dataset:
            inputs, y = _unpack(batch)
            logits = model(inputs, training=False)
            val_acc_metric(y, logits)

        val_acc = val_acc_metric.result()
        print("Test acc: %.4f" % (float(val_acc),))

        # Reset the evaluation metric at the end of each epoch.
        val_acc_metric.reset_state()

    model.save_weights(weights_path, save_format='tf')

In [None]:
# Monitor training on the held-out validation split; the test split is kept
# out of the training loop so it can serve as an unbiased final check.
model_bert.fit(tf_train_dataset,
               validation_data=tf_validation_dataset,
               epochs=20, callbacks=[callback])

Epoch 1/20


2024-01-10 20:44:32.058254: I external/local_xla/xla/service/service.cc:168] XLA service 0x55d3e93ebcf0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-01-10 20:44:32.058294: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2024-01-10 20:44:32.439276: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-01-10 20:44:32.516266: W tensorflow/compiler/tf2xla/kernels/assert_op.cc:38] Ignoring Assert operator custom_bert/tf_bert_model/bert/embeddings/assert_less/Assert/Assert
2024-01-10 20:44:35.855777: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
I0000 00:00:1704919482.135557    6644 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
W0000 00:00:1704919482.165132    664



2024-01-10 20:54:35.771615: W tensorflow/compiler/tf2xla/kernels/assert_op.cc:38] Ignoring Assert operator custom_bert/tf_bert_model/bert/embeddings/assert_less/Assert/Assert
W0000 00:00:1704920077.744989    6642 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20

In [None]:
# predict on test data (the reloaded model is later run on tf_test_dataset too)
predictions = model_bert.predict(tf_test_dataset)

In [None]:
model_bert.evaluate(tf_validation_dataset)

In [None]:
print(predictions[0])

### Save Model

In [None]:
# For a subclassed model, persist only the weights: save_format='tf' writes them
# as a TensorFlow checkpoint under the 'my_model' path prefix (not a full SavedModel).
model_bert.save_weights('my_model', save_format='tf')

### Load Model

In [None]:
# Rebuild the architecture with the CustomBert class defined in this notebook
# (same arguments as model_bert), then restore the saved weights into it.
loaded_model = CustomBert(MODEL_ID, 2)
loaded_model.load_weights('my_model')
# Keras needs a compiled model for train_on_batch / evaluate.
loaded_model.compile(optimizer=keras.optimizers.Adam(learning_rate=5e-4),
                     loss=loss_fn, metrics=['accuracy'])

### Train the loaded model on a batch

In [None]:
# Pull the first batch out of the training pipeline.
batch = next(iter(tf_train_dataset))

# batch[0] holds the model inputs, batch[1] the targets.
ids, mask = (batch[0][key] for key in ('input_ids', 'attention_mask'))
y = batch[1]

In [None]:
# Run a single training update of the loaded model on the extracted batch
# (inputs passed as an (ids, mask) tuple) and print the value it returns.
loss_value = loaded_model.train_on_batch((ids, mask), y)
print(f"Loss: {loss_value}")

### Check that the model has been preserved

In [None]:
loaded_model.load_weights('my_model')

In [None]:
# Manually score the reloaded model on tf_test_dataset with val_acc_metric.
# Start from a clean metric so earlier updates do not leak into this result.
val_acc_metric.reset_state()
for step, batch in enumerate(tf_test_dataset):
        # batch[0] is the feature dict, batch[1] the labels.
        ids = batch[0]['input_ids']
        mask = batch[0]['attention_mask']
        y = batch[1]
        # Forward pass in inference mode; `logits` holds the model outputs.
        logits = loaded_model([ids, mask], training=False)
        # Update val metrics
        val_acc_metric(y, logits)
val_acc = val_acc_metric.result()
# print accuracy on the test set
print("test acc: %.4f" % (float(val_acc),))

In [None]:
# Check that the model state has been preserved
new_predictions = loaded_model.predict(tf_test_dataset)
# np.testing.assert_allclose(predictions, new_predictions, atol=1e-6)


### Another save

In [None]:
# Save the whole custom model to 'path_to_my_model' using save_format='tf'.
# NOTE(review): the original note says this is not supported for the custom
# class — confirm that this cell runs before relying on the saved artifact.
model_bert.save('path_to_my_model',save_format='tf')

In [None]:
# load the model
new_model = keras.models.load_model('path_to_my_model')