In [5]:
file_path = 'data/subtaskA_train_monolingual.jsonl'

In [6]:
test_path = 'data/subtaskA_dev_monolingual.jsonl'

In [None]:
MODEL_ID = 'bert-base-cased'

## **Fine tuning using Bert Cased**

In [1]:
import pandas as pd
import numpy as np
from datasets import Dataset
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from transformers import AutoTokenizer
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam
import datasets

2024-01-17 16:35:57.186879: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-17 16:35:57.231963: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-17 16:35:57.231992: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-17 16:35:57.233151: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-17 16:35:57.240491: I tensorflow/core/platform/cpu_feature_guar

In [None]:
tf.config.run_functions_eagerly(True)
tf.data.experimental.enable_debug_mode()

In [2]:
def preprocess_function(examples, tokenizer):
    # Keys of the returned dictionary will be added to the dataset as columns
    return tokenizer(examples["text"],truncation=True, padding='max_length', max_length=128)

In [3]:
def get_data(train_path, split=True):
    train_df = pd.read_json(train_path, lines=True)
    if split:
       train_df, test_df = train_test_split(train_df, test_size=0.2, stratify=train_df['label'], random_state=42)
       return train_df, test_df
    else:
        return train_df


In [7]:
train_df, val_df = get_data(file_path)

In [8]:
test_df = get_data(test_path, False)

In [10]:
labels = np.asarray(train_df['label'])

In [11]:
id2label = {0: "human", 1: "machine"}
label2id = {"human": 0, "machine": 1 }

In [12]:
# pandas dataframe to huggingface Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [13]:
print(train_dataset)

Dataset({
    features: ['text', 'label', 'model', 'source', 'id', '__index_level_0__'],
    num_rows: 95805
})


In [14]:
# convert to train_dataset_ours to <class 'datasets.dataset_dict.DatasetDict'>
new_dataset = datasets.DatasetDict({'train': train_dataset, 'test': test_dataset})

In [15]:
print(new_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'model', 'source', 'id', '__index_level_0__'],
        num_rows: 95805
    })
    test: Dataset({
        features: ['text', 'label', 'model', 'source', 'id'],
        num_rows: 5000
    })
})


In [29]:
new_dataset["train"][0]

{'text': "When it comes to the 'why' we want to drink milk or water when we consume something 'rich' like chocolate, there are a few reasons to consider. To begin, both milk and water serve important roles in the process of digestion.\n\nFirst off, milk provides important nutritional benefits to the body. It can be a source of Omega-3 fatty acids, calcium, and proteins which all help contribute to a healthy lifestyle. Milk is also an excellent source of hydration, and it aids in the proper digestion of chocolate by coating the stomach and aiding in the absorption of fat. Milk is also believed to play a role in the control of blood sugar, as it helps to slow the release of sugar into the bloodstream when consumed.\n\nWater is also an important part of digestion, most notably for its calming effect on the stomach and intestines. It helps to reduce bloating, cramping, and stomach pains, while simultaneously boosting the digestion of one's food. By drinking a glass of water before or after

In [16]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

In [19]:
encoded_dataset = new_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer})

Map:   0%|          | 0/95805 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [21]:
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_ID, num_labels=len(label2id), id2label=id2label, label2id=label2id)

2024-01-17 16:37:56.967644: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-17 16:37:57.199914: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-17 16:37:57.200202: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [22]:
tf_train_dataset = model.prepare_tf_dataset(encoded_dataset["train"],
                                      batch_size=16,
                                      shuffle=True,
                                      tokenizer=tokenizer)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [23]:
tf_test_dataset = model.prepare_tf_dataset(encoded_dataset["test"],
                                          batch_size=16,
                                          shuffle=False,
                                           tokenizer=tokenizer)

In [24]:
model.compile(optimizer=Adam(3e-5), metrics=['acc'])  # No loss argument!

In [25]:
model.fit(tf_train_dataset, epochs=2)

Epoch 1/2


2024-01-17 16:38:28.369892: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f272033cb50 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-01-17 16:38:28.369925: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2024-01-17 16:38:28.376167: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-01-17 16:38:28.851151: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
I0000 00:00:1705509508.952330    3273 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/2


<keras.src.callbacks.History at 0x7f27d9b2ae10>

In [26]:
model.evaluate(tf_test_dataset)



[0.9728445410728455, 0.717199981212616]

In [27]:
# The recommended way to save a subclassed model is to use save_weights to create a TensorFlow SavedModel checkpoint
model.save_weights('fine_tuned_bert', save_format='tf')