In [None]:
!pip install transformers

### imports

In [None]:
import json
from transformers import pipeline

import numpy as np

from pprint import pprint
from tensorflow.keras.optimizers import Adam

### Use transformers get a model for predicting MGT

In [None]:
pipe = pipeline(model="facebook/bart-large-mnli")

### load data from jsonl

In [None]:
def load_data(file_path):
  data_list = []
  with open(file_path, 'r') as file:
    for line in file:
        # load each line as a JSON object
        data = json.loads(line.strip())
        data_list.append(data)
  return data_list


In [5]:
file_path = 'data/subtaskA_train_monolingual.jsonl'

In [6]:
test_path = 'data/subtaskA_dev_monolingual.jsonl'

In [None]:
data_loaded = load_data(file_path)

### get a random sample from data

In [None]:
test_data = data_loaded[100]['text']
print(test_data)
print("label is: ", data_loaded[100]['label'])

### Test using pipeline

In [None]:
# test this using the pipeline
result = pipe(test_data,
              candidate_labels=["Written by humans", "Written by AI"],)

In [None]:
print(type(test_data))

In [None]:
pprint(result)

In [None]:
def prepare_test_data():
  texts = []
  labels = []
  for data in data_loaded:
    texts.append(data['text'])
    labels.append(data['label'])
  return texts,labels


In [None]:
x_test, y_test = prepare_test_data()

In [None]:
print(len(x_test))

In [None]:
x_test_sample = x_test[:3]
y_test_sample = y_test[:3]

In [None]:
results = pipe(x_test_sample,
              candidate_labels=["Written by humans", "Written by AI"],)

In [None]:
pprint(results)

In [None]:
# verify results
correct = 0
for i in range(len(results)):
    # take the index of the label with the highet score from scores
    if results[i]['scores'].index(max(results[i]['scores'])) == y_test[i]:
      correct +=1
print("Accuracy is : ", correct / len(results))

# Zero-shot classification


In [None]:
classifier = pipeline("zero-shot-classification")
classifier(
    x_test_sample,
    candidate_labels=["human", "AI"],
)

# Fine-tuning: using BERT pre-trained model!

In [None]:
x_test, y_test = prepare_test_data()

In [None]:
print(len(x_test))

### Split data into train and test

In [None]:
# split the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(x_test, y_test, test_size=0.2, random_state=42)

In [None]:
# print the shapes of the train and test set
print(len(X_train), len(y_train), len(X_test), len(y_test))

In [None]:
# load the pre-trained BERt tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

In [None]:
# convert labels to numpy array
# the labels are already a list of 0 and 1s, so we can just convert that directly to a NumPy array without tokenization!
y_train = np.array(y_train)
y_test = np.array(y_test)

In [None]:
# Tokenize and pad the input sequences
X_train_tokens = tokenizer(X_train, padding=True, truncation=True, max_length=128, return_tensors="tf")
X_test_tokens = tokenizer(X_test, padding=True, truncation=True, max_length=128, return_tensors="tf")


In [None]:
print(X_train_tokens[0])

In [None]:
# prepare the model for training
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
model.fit(X_train_tokens['input_ids'], y_train, epochs=2, batch_size=32, validation_data=(X_test_tokens['input_ids'], y_test))

In [None]:
# evaluate the model on the test set:
scores = model.evaluate(X_test_tokens['input_ids'], y_test)
print("Accuracy:", scores[1])

### version 2

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
data_loaded[0]['text']

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokenized_data = tokenizer(X_train, return_tensors="np", padding=True, truncation=True, max_length=128)

In [None]:
tokenized_data = dict(tokenized_data)

In [None]:
# Load and compile our model
# for subtask A
id2label = {0 : "human" , 1: "machine"}
label2id = {"human": 0, "machine": 1}
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")

In [None]:
model.compile(Adam(3e-5))  # No loss argument!
# Hugging Face models automatically choose a loss that is appropriate for their task

In [None]:
model.fit(tokenized_data, y_train)

## Version 3

In [1]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import DataCollatorWithPadding
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer, TFBertForSequenceClassification, TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam
import datasets

2024-01-17 16:35:57.186879: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-17 16:35:57.231963: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-17 16:35:57.231992: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-17 16:35:57.233151: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-17 16:35:57.240491: I tensorflow/core/platform/cpu_feature_guar

In [None]:
tf.config.run_functions_eagerly(True)
tf.data.experimental.enable_debug_mode()

In [2]:
def preprocess_function(examples, tokenizer, max_length=128):
    return tokenizer(examples["text"],
                     truncation=True,
                     padding='max_length',
                     max_length=max_length,
                     return_tensors="np")

In [3]:
def get_data(train_path, split=True):
    train_df = pd.read_json(train_path, lines=True)
    if split:
       train_df, test_df = train_test_split(train_df, test_size=0.2, stratify=train_df['label'], random_state=42)
       return train_df, test_df
    else:
        return train_df


In [7]:
train_df, val_df = get_data(file_path)

In [8]:
test_df = get_data(test_path, False)

In [9]:
print(test_df.shape)

(5000, 5)


In [10]:
labels = np.asarray(train_df['label'])

In [11]:
id2label = {0: "human", 1: "machine"}
label2id = {"human": 0, "machine": 1 }

In [12]:
 # pandas dataframe to huggingface Dataset
train_dataset_ours = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [13]:
print(train_dataset_ours)

Dataset({
    features: ['text', 'label', 'model', 'source', 'id', '__index_level_0__'],
    num_rows: 95805
})


In [14]:
# convert to train_dataset_ours to <class 'datasets.dataset_dict.DatasetDict'>
new_dataset = datasets.DatasetDict({'train': train_dataset_ours, 'test': test_dataset})

In [15]:
print(new_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'model', 'source', 'id', '__index_level_0__'],
        num_rows: 95805
    })
    test: Dataset({
        features: ['text', 'label', 'model', 'source', 'id'],
        num_rows: 5000
    })
})


In [29]:
new_dataset["train"][0]

{'text': "When it comes to the 'why' we want to drink milk or water when we consume something 'rich' like chocolate, there are a few reasons to consider. To begin, both milk and water serve important roles in the process of digestion.\n\nFirst off, milk provides important nutritional benefits to the body. It can be a source of Omega-3 fatty acids, calcium, and proteins which all help contribute to a healthy lifestyle. Milk is also an excellent source of hydration, and it aids in the proper digestion of chocolate by coating the stomach and aiding in the absorption of fat. Milk is also believed to play a role in the control of blood sugar, as it helps to slow the release of sugar into the bloodstream when consumed.\n\nWater is also an important part of digestion, most notably for its calming effect on the stomach and intestines. It helps to reduce bloating, cramping, and stomach pains, while simultaneously boosting the digestion of one's food. By drinking a glass of water before or after

In [16]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [17]:
tokenizer("Hello, this is a sentence!", "And this sentence goes with it.")

{'input_ids': [101, 8667, 117, 1142, 1110, 170, 5650, 106, 102, 1262, 1142, 5650, 2947, 1114, 1122, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [18]:
def preprocess_function(examples):
    # Keys of the returned dictionary will be added to the dataset as columns
    return tokenizer(examples["text"],truncation=True, padding='max_length', max_length=128)

In [19]:
encoded_dataset = new_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/95805 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [20]:
encoded_dataset["train"].features["label"]

Value(dtype='int64', id=None)

In [21]:
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=len(label2id), id2label=id2label, label2id=label2id)

2024-01-17 16:37:56.967644: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-17 16:37:57.199914: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-17 16:37:57.200202: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [22]:
tf_train_dataset = model.prepare_tf_dataset(encoded_dataset["train"],
                                      batch_size=16,
                                      shuffle=True,
                                      tokenizer=tokenizer)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [23]:
tf_test_dataset = model.prepare_tf_dataset(encoded_dataset["test"],
                                          batch_size=16,
                                          shuffle=False,
                                           tokenizer=tokenizer)

In [24]:
model.compile(optimizer=Adam(3e-5), metrics=['acc'])  # No loss argument!

In [25]:
model.fit(tf_train_dataset, epochs=2)

Epoch 1/2


2024-01-17 16:38:28.369892: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f272033cb50 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-01-17 16:38:28.369925: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2024-01-17 16:38:28.376167: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-01-17 16:38:28.851151: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
I0000 00:00:1705509508.952330    3273 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/2


<keras.src.callbacks.History at 0x7f27d9b2ae10>

In [26]:
model.evaluate(tf_test_dataset)



[0.9728445410728455, 0.717199981212616]

In [27]:
# The recommended way to save a subclassed model is to use save_weights to create a TensorFlow SavedModel checkpoint
model.save_weights('fine_tuned_bert', save_format='tf')

### 2 directions: -> Change in BERT model or Change in Tokenizer model.

In [None]:
https://stackoverflow.com/questions/64156202/add-dense-layer-on-top-of-huggingface-bert-model

In [None]:
from tensorflow import keras
from transformers import AutoTokenizer, TFAutoModel
class CustomBERTModel(keras.Model):
    def __init__(self):
          super(CustomBERTModel, self).__init__()
          self.bert = TFAutoModel.from_pretrained("bert-base-uncased")
          ### New layers:
          self.linear1 = keras.layers.Dense(256)
          self.linear2 = keras.layers.Dense(2) ## 2 is the number of classes in this example

    def call(self, inputs, training=False):
          # call expects only one positional argument, so you have to pass in a tuple and unpack. The next parameter is a special reserved training parameter.
          ids, mask = inputs
          sequence_output = self.bert(ids, mask, training=training).last_hidden_state

          # sequence_output has the following shape: (batch_size, sequence_length, 768)
          linear1_output = self.linear1(sequence_output[:,0,:]) ## extract the 1st token's embeddings

          linear2_output = self.linear2(linear1_output)

          return linear2_output


In [None]:
model = CustomBERTModel()

In [None]:
train_acc_metric.reset_states()
val_acc_metric.reset_states()

In [None]:
def train_step(model, tf_train_dataset, tf_test_dataset, epochs=2):
  # train the model by using GradientTape
  optimizer = keras.optimizers.Adam(learning_rate=5e-5)
  loss_fn = SparseCategoricalCrossentropy(from_logits=True)
  train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
  val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
  for epoch in range(epochs):
      print(f"\nStart of Training Epoch {epoch}")
      for step, batch in enumerate(tf_train_dataset):
          # print(step)
          # print(batch)
          ids = batch[0]['input_ids']
          mask = batch[0]['attention_mask']
          y = batch[1]
          with tf.GradientTape() as tape:
              logits = model((ids, mask), training=True)
              loss_value = loss_fn(y, logits)
              # print(f"Loss at step {step}: {loss_value}")
          grads = tape.gradient(loss_value, model.trainable_weights)
          # Filter trainable weights that have gradients
          trainable_vars = [var for var, grad in zip(model.trainable_weights, grads) if grad is not None]

          # optimizer.apply_gradients(zip(grads, model.trainable_weights))
          optimizer.apply_gradients(
                        (grad, var)
                        for (grad, var) in zip(grads, model.trainable_variables)
                        if grad is not None
                      )
          # Update training metric.
          train_acc_metric(y, logits)

          # Log every 200 batches.
          if step % 10 == 0:
              print(
                  "Training loss at step %d: %.4f"
                  % (step, float(loss_value))
              )
              #print accuracy on the training set
              train_acc = train_acc_metric.result()
              print("Training acc over epoch: %.4f" % (float(train_acc),))
          # Display metrics at the end of each epoch.

      train_acc_metric.reset_states()
      # perform validation on test data
      for step, batch in enumerate(tf_test_dataset):
          ids = batch[0]['input_ids']
          mask = batch[0]['attention_mask']
          y = batch[1]
          logits = model([ids, mask], training=False)
          # Update val metrics
          val_acc_metric(y, logits)
      val_acc = val_acc_metric.result()
      # print accuracy on the test set
      print("Test acc: %.4f" % (float(val_acc),))
      # Reset val metrics at the end of each epoch
      val_acc_metric.reset_states()

  model.save_weights('my_model', save_format='tf')

In [None]:
train_step(new_model, tf_train_dataset, tf_test_dataset, epochs=2)

In [None]:
# The recommended way to save a subclassed model is to use save_weights to create a TensorFlow SavedModel checkpoint
model.save_weights('my_model', save_format='tf')

In [None]:
# restore the loaded model
new_model = CustomBERTModel()


In [None]:
new_model.compile(optimizer=optimizer, loss=loss_fn, metrics=[train_acc_metric])
# call the model on part of the training set to build the model

In [None]:
# Extract a batch from the training dataset
batch = next(iter(tf_train_dataset))
# Separate input data and target data from the batch
ids = batch[0]['input_ids']
mask = batch[0]['attention_mask']
y = batch[1]

In [None]:
# call the model on the extracted batch
loss_value = new_model.train_on_batch((ids, mask), y)
print(f"Loss: {loss_value}")

In [None]:
new_model.load_weights('my_model')

In [None]:
val_acc_metric.reset_state()
for step, batch in enumerate(tf_test_dataset):
        ids = batch[0]['input_ids']
        mask = batch[0]['attention_mask']
        y = batch[1]
        logits = new_model([ids, mask], training=False)
        # Update val metrics
        val_acc_metric(y, logits)
val_acc = val_acc_metric.result()
# print accuracy on the test set
print("test acc: %.4f" % (float(val_acc),))

In [None]:
# Check that the model state has been preserved
new_predictions = new_model.predict(tf_test_dataset)
np.testing.assert_allclose(predictions, new_predictions, atol=1e-6)


In [None]:
# save the custom model to file (not suported for custom class)
model.save('path_to_my_model',save_format='tf')

In [None]:
# load the model
new_model = keras.models.load_model('path_to_my_model')