# Model Training

## 1. Import Dependencies

In [13]:
import tensorflow as tf
from transformers import (
    AutoTokenizer,
    TFAutoModelForMaskedLM,
    TrainingArguments,
    Trainer,
)
import pandas as pd
from datasets import Dataset

## 2. Load the model and tokenizer

In [14]:
# Load BioBERT model and tokenizer (TensorFlow version).
# The checkpoint id is named once so the tokenizer and the model weights are
# always pulled from the same Hugging Face repository.
MODEL_CHECKPOINT = "dmis-lab/biobert-base-cased-v1.2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
# from_pt=True: the checkpoint is loaded from its PyTorch weights into the TF model.
model = TFAutoModelForMaskedLM.from_pretrained(MODEL_CHECKPOINT, from_pt=True)

# Check for GPU: list_physical_devices returns an empty (falsy) list when
# TensorFlow sees no GPU. NOTE(review): `device` is not read by any other
# cell visible here -- it looks informational only; confirm before relying on it.
device = "GPU" if tf.config.list_physical_devices("GPU") else "CPU"

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForMaskedLM: ['cls.predictions.decoder.bias']
- This IS expected if you are initializing TFBertForMaskedLM from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForMaskedLM from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


## 3. Load and Preprocess Dataset

### Helper functions

In [15]:
# Tokenization function

def tokenize_function(example):
    """Tokenize context/response pairs into fixed-length model inputs.

    Args:
        example: Mapping that provides the "context" and "response" entries
            to encode as a sentence pair.

    Returns:
        The output of the module-level ``tokenizer``, with every pair
        truncated and padded to exactly 512 tokens.
    """
    first_segment = example["context"]
    second_segment = example["response"]
    encoding = tokenizer(
        first_segment,
        text_pair=second_segment,
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    return encoding

In [16]:
# Convert to TensorFlow dataset

def tf_format(example):
    """Turn one tokenized record into a ``(features, labels)`` pair of tensors.

    Args:
        example: Mapping with "input_ids" and "attention_mask" entries.

    Returns:
        A 2-tuple: a dict holding the "input_ids" and "attention_mask"
        tensors, and a label tensor built from the same "input_ids" values.
    """
    token_ids = example["input_ids"]
    features = {
        "input_ids": tf.convert_to_tensor(token_ids),
        "attention_mask": tf.convert_to_tensor(example["attention_mask"]),
    }
    # NOTE(review): the labels are an unmasked copy of the inputs, so the
    # model can reach near-zero loss by copying its input. For masked-LM
    # fine-tuning, consider randomly masking input tokens and ignoring the
    # unmasked/padding positions in the loss -- confirm the intended objective.
    labels = tf.convert_to_tensor(token_ids)
    return features, labels

In [17]:
dataset_path = "../dataset/dataset.csv"

# Load dataset
df = pd.read_csv(dataset_path)

# Replace missing values with an empty string, THEN cast to str.
# The order matters: astype(str) on a NaN yields the literal text "nan",
# after which fillna("") has nothing left to fill and the tokenizer would
# encode the word "nan" for every missing context/response.
df["context"] = df["context"].fillna("").astype(str)
df["response"] = df["response"].fillna("").astype(str)


# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Tokenize dataset (batched=True hands tokenize_function whole column batches)
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Stream the tokenized rows into tf.data as (features, labels) pairs.
# The fixed length of 512 in the signature matches the max_length that
# tokenize_function pads/truncates to.
train_dataset = tf.data.Dataset.from_generator(
    lambda: (tf_format(sample) for sample in tokenized_dataset),
    output_signature=(
        {
            "input_ids": tf.TensorSpec(shape=(512,), dtype=tf.int32),
            "attention_mask": tf.TensorSpec(shape=(512,), dtype=tf.int32),
        },
        tf.TensorSpec(shape=(512,), dtype=tf.int32),
    ),
)


# Shuffle and batch
BATCH_SIZE = 8
train_dataset = train_dataset.shuffle(10000).batch(BATCH_SIZE)

                                                                 

## 4. Compile and Train Model

In [18]:
# Compile the model: token-level cross-entropy computed on raw logits,
# optimised with Adam at a learning rate of 5e-5.
# NOTE(review): train_dataset yields labels that are an unmasked copy of
# input_ids, so a very small loss here does not by itself show that the
# model learned anything beyond copying its input -- confirm the objective.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

model.compile(loss=loss_fn, optimizer=optimizer)

# Train the model for a single pass over train_dataset
EPOCHS = 1
model.fit(train_dataset, epochs=EPOCHS)

print("Training complete!")

2025-02-21 23:59:59.198182: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:14: Filling up shuffle buffer (this may take a while): 5966 of 10000
2025-02-22 00:00:04.335598: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:480] Shuffle buffer filled.


Training complete!


In [23]:
# Persist the fine-tuned weights and the tokenizer files side by side in the
# same directory. The tokenizer call stays last so the cell still displays
# the tuple of written tokenizer file paths.
model.save_pretrained(save_directory="biobert_model_tf")
tokenizer.save_pretrained(save_directory="biobert_model_tf")

('biobert_model_tf/tokenizer_config.json',
 'biobert_model_tf/special_tokens_map.json',
 'biobert_model_tf/vocab.txt',
 'biobert_model_tf/added_tokens.json',
 'biobert_model_tf/tokenizer.json')

In [24]:
# Evaluate the compiled loss over train_dataset.
# NOTE(review): this is the same shuffled training pipeline that was passed
# to model.fit -- no held-out split is visible in this notebook, so the
# number reported here is a training-set loss, not a generalisation metric.
# The shuffle buffer is also refilled before evaluation starts.
model.evaluate(train_dataset)

2025-02-22 07:43:37.679957: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:14: Filling up shuffle buffer (this may take a while): 4610 of 10000
2025-02-22 07:43:43.813034: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:480] Shuffle buffer filled.


    597/Unknown - 4033s 7s/step - loss: 5.4054e-05

KeyboardInterrupt: 