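# Fine-Tuning BERT

Fine-tune `bert-base-uncased` to classify dialogue acts on the MapTask subset of the SILICONE dataset.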

In [5]:
import numpy as np
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, create_optimizer

In [6]:
# Load the "maptask" configuration of the "silicone" dataset. The result is
# indexed by split name ("train", "validation", "test") in the next cell.
dataset = load_dataset(path="silicone", name="maptask")

Found cached dataset silicone (C:/Users/asaju/.cache/huggingface/datasets/silicone/maptask/1.0.0/af617406c94e3f78da85f7ea74ebfbd3f297a9665cb54adbae305b03bc4442a5)
100%|██████████| 3/3 [00:00<00:00, 998.41it/s]


In [7]:
# Pull the three predefined splits out of the loaded dataset.
train_dataset, val_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

# For every split: X is the "Utterance" column, y is the "Dialogue_Act" column.
train_X, train_y = train_dataset["Utterance"], train_dataset["Dialogue_Act"]
val_X, val_y = val_dataset["Utterance"], val_dataset["Dialogue_Act"]
test_X, test_y = test_dataset["Utterance"], test_dataset["Dialogue_Act"]

In [8]:
# Tokenizer matching the checkpoint that is fine-tuned further down.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Encode each split with truncation and padding enabled, in train/val/test order.
train_encodings, val_encodings, test_encodings = (
    tokenizer(utterances, truncation=True, padding=True)
    for utterances in (train_X, val_X, test_X)
)

# Convert labels from string to one hot
def _one_hot_encode(labels, label_to_index):
    """Return a (len(labels), len(label_to_index)) float array of one-hot rows.

    Args:
        labels: sequence of label values, one per example.
        label_to_index: mapping from label value to its column index.

    Returns:
        A NumPy array of zeros with a single 1 per row, placed in the column
        that `label_to_index` assigns to that row's label.

    Raises:
        KeyError: if `labels` contains a value that is not a key of
            `label_to_index` (e.g. a class that never occurs in the training split).
    """
    unknown = set(labels).difference(label_to_index)
    if unknown:
        raise KeyError(f"labels missing from the training split: {sorted(unknown)}")

    n_rows = len(labels)
    columns = np.fromiter(
        (label_to_index[label] for label in labels), dtype=np.int64, count=n_rows
    )
    encoded = np.zeros((n_rows, len(label_to_index)))
    # Set all the "hot" entries in one vectorised assignment instead of a Python loop.
    encoded[np.arange(n_rows), columns] = 1
    return encoded


# The class vocabulary is taken from the training split only; np.unique returns it sorted.
label_list = np.unique(train_y)
label_dict = {label: i for i, label in enumerate(label_list)}

train_y_one_hot = _one_hot_encode(train_y, label_dict)
val_y_one_hot = _one_hot_encode(val_y, label_dict)
test_y_one_hot = _one_hot_encode(test_y, label_dict)

# Create the dataset
# Each element pairs the dict of tokenizer outputs with that example's one-hot label row.
train_dataset, val_dataset, test_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(encodings), one_hot_labels))
    for encodings, one_hot_labels in (
        (train_encodings, train_y_one_hot),
        (val_encodings, val_y_one_hot),
        (test_encodings, test_y_one_hot),
    )
)
# Create the model
# The size of the classification head is derived from the label vocabulary built
# above rather than hard-coded, so it always matches the width of the one-hot targets.
# The label-name mappings are stored on the model config so predictions can be
# mapped back to dialogue-act names.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_list),
    id2label={index: str(label) for label, index in label_dict.items()},
    label2id={str(label): index for label, index in label_dict.items()},
)

# Train the model
# TFTrainer / TFTrainingArguments are deprecated and no longer shipped in current
# transformers releases, so training goes through the Keras API that the TF model
# classes expose directly. The targets in train_dataset / val_dataset are one-hot
# rows, hence CategoricalCrossentropy (not the sparse variant) on the raw logits.
NUM_EPOCHS = 3           # total number of training epochs
TRAIN_BATCH_SIZE = 16    # batch size during training
EVAL_BATCH_SIZE = 64     # batch size for evaluation
LEARNING_RATE = 5e-5     # peak learning rate (the transformers TrainingArguments default)
WARMUP_STEPS = 500       # number of warmup steps for the learning rate schedule
WEIGHT_DECAY = 0.01      # strength of weight decay
SHUFFLE_SEED = 42        # fixed seed so the shuffling order is reproducible

# The tf.data datasets built above are unbatched; shuffle the training examples
# over the full split and batch both splits here.
num_train_examples = int(train_dataset.cardinality().numpy())
train_batches = train_dataset.shuffle(num_train_examples, seed=SHUFFLE_SEED).batch(TRAIN_BATCH_SIZE)
val_batches = val_dataset.batch(EVAL_BATCH_SIZE)

# AdamW-style optimizer with linear warmup followed by decay over the whole run.
num_train_steps = int(train_batches.cardinality().numpy()) * NUM_EPOCHS
optimizer, _lr_schedule = create_optimizer(
    init_lr=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=min(WARMUP_STEPS, num_train_steps),
    weight_decay_rate=WEIGHT_DECAY,
)

model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.CategoricalAccuracy(name="accuracy")],
)

model.fit(
    train_batches,
    validation_data=val_batches,
    epochs=NUM_EPOCHS,
    # Write TensorBoard logs every 10 batches to the same directory as before.
    callbacks=[tf.keras.callbacks.TensorBoard(log_dir="./logs", update_freq=10)],
)

# Persist the fine-tuned weights and config to the output directory.
model.save_pretrained("./results")

# Evaluate the model
model.evaluate(val_batches, return_dict=True)

Downloading tf_model.h5:   0%|          | 0.00/536M [00:00<?, ?B/s]

KeyboardInterrupt: 