### While checking the code for grading you dont need to run this code, all the files required are uploaded to gogle drive and the link is shared as a text file in the main branch under the name Downloading required files, as it could not be uploaded manually or by github desktop.
#### This file is for Training the BERT model. This outputs a few files that are required by 6_BERT_Test.py and 7_final_ensemble.py

In [1]:
# Code required for my system as I was facing issues with NLTK
import os
import ssl
import nltk

# Set the NLTK_DATA environment variable to your provided path
os.environ["NLTK_DATA"] = "NLP-Project/nltk_data"

# Add this directory to nltk's search path
nltk.data.path.append("NLP-Project/nltk_data")

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')

print("Setup complete. NLTK data path set to:", os.environ["NLTK_DATA"])

Setup complete. NLTK data path set to: /Users/craigroberts/Documents/Coding/NLP/MediScan_NLP_Proj/nltk_data


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/craigroberts/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/craigroberts/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import BertForSequenceClassification, BertTokenizerFast, TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


In [3]:
train_df = pd.read_csv("Final_data/train_data.csv")  # Assumes columns 'claim' and 'label'
dev_df   = pd.read_csv("Final_data/dev_data.csv")

print("Train DataFrame shape:", train_df.shape)
print("Dev DataFrame shape:", dev_df.shape)

# Encode labels if they are strings
if train_df["label"].dtype == object:
    le = LabelEncoder()
    all_labels = pd.concat([train_df["label"], dev_df["label"]], axis=0)
    le.fit(all_labels)
    train_df["label_encoded"] = le.transform(train_df["label"])
    dev_df["label_encoded"] = le.transform(dev_df["label"])
    num_labels = len(le.classes_)
else:
    train_df["label_encoded"] = train_df["label"]
    dev_df["label_encoded"] = dev_df["label"]
    num_labels = len(np.unique(train_df["label"]))
    class DummyLE:
        pass
    le = DummyLE()
    le.classes_ = np.sort(np.unique(train_df["label"]))

print("Number of classes:", num_labels)
print("Label mapping:", dict(zip(le.classes_, range(num_labels))))

# Create Hugging Face Datasets using 'claim' and 'label_encoded'
train_dataset = Dataset.from_pandas(train_df[["claim", "label_encoded"]])
dev_dataset = Dataset.from_pandas(dev_df[["claim", "label_encoded"]])

# Rename label column to "labels"
train_dataset = train_dataset.rename_column("label_encoded", "labels")
dev_dataset = dev_dataset.rename_column("label_encoded", "labels")

# Remove extraneous columns (keep only 'claim' and 'labels')
cols_to_keep = ["claim", "labels"]
train_dataset = train_dataset.remove_columns([col for col in train_dataset.column_names if col not in cols_to_keep])
dev_dataset = dev_dataset.remove_columns([col for col in dev_dataset.column_names if col not in cols_to_keep])

print("Training samples:", len(train_dataset), "Dev samples:", len(dev_dataset))

Train DataFrame shape: (5338, 6)
Dev DataFrame shape: (2224, 6)
Number of classes: 2
Label mapping: {np.int64(0): 0, np.int64(1): 1}
Training samples: 5338 Dev samples: 2224


In [4]:
# Load the pre-trained BERT tokenizer ("bert-base-uncased")
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["claim"], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
dev_dataset = dev_dataset.map(tokenize_function, batched=True)

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
dev_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

print("Tokenization complete!")

Map: 100%|██████████| 5338/5338 [00:00<00:00, 38005.25 examples/s]
Map: 100%|██████████| 2224/2224 [00:00<00:00, 42526.44 examples/s]

Tokenization complete!





In [5]:
from transformers import EarlyStoppingCallback

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = np.mean(preds == labels)
    return {"accuracy": acc}

# Base training arguments
base_training_args = TrainingArguments(
    output_dir="./bert_output",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir="./logs",
    logging_steps=50,
)

# Load a fresh BERT model instance
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
model.to(device)

trainer = Trainer(
    model=model,
    args=base_training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

print("Trainer is set up!")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainer is set up!


In [6]:
import itertools

# Define grid for hyperparameters -> to find the best parameters, better than randomly trying to get the best pair
learning_rates = [2e-5, 3e-5]
train_batch_sizes = [16, 32]
eval_batch_sizes = [16, 32]

best_acc = 0
best_config = None
results = []

for lr, train_bs, eval_bs in itertools.product(learning_rates, train_batch_sizes, eval_batch_sizes):
    print(f"Training with lr={lr}, train_bs={train_bs}, eval_bs={eval_bs}")

    training_args = TrainingArguments(
        output_dir="./temp_bert_output",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=lr,
        per_device_train_batch_size=train_bs,
        per_device_eval_batch_size=eval_bs,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        logging_dir="./temp_logs",
        logging_steps=50,
        disable_tqdm=False,
    )

    # Load a new model instance for each run
    temp_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
    temp_model.to(device)

    temp_trainer = Trainer(
        model=temp_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    temp_trainer.train()
    eval_results = temp_trainer.evaluate()
    acc = eval_results["eval_accuracy"]
    results.append({"learning_rate": lr, "train_bs": train_bs, "eval_bs": eval_bs, "accuracy": acc})
    print(f"Configuration: lr={lr}, train_bs={train_bs}, eval_bs={eval_bs} -> Accuracy: {acc:.4f}")

    if acc > best_acc:
        best_acc = acc
        best_config = (lr, train_bs, eval_bs)

print("Best configuration:", best_config, "with accuracy:", best_acc)

Training with lr=2e-05, train_bs=16, eval_bs=16


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3997,0.420025,0.769784
2,0.3454,0.429787,0.797212
3,0.256,0.465635,0.792266


Configuration: lr=2e-05, train_bs=16, eval_bs=16 -> Accuracy: 0.7972
Training with lr=2e-05, train_bs=16, eval_bs=32


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4167,0.425258,0.76304
2,0.3569,0.411995,0.798561
3,0.2749,0.461236,0.794514


Configuration: lr=2e-05, train_bs=16, eval_bs=32 -> Accuracy: 0.7986
Training with lr=2e-05, train_bs=32, eval_bs=16


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4446,0.440266,0.756295
2,0.3664,0.401557,0.790468
3,0.3106,0.427917,0.785072


Configuration: lr=2e-05, train_bs=32, eval_bs=16 -> Accuracy: 0.7905
Training with lr=2e-05, train_bs=32, eval_bs=32


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4434,0.438584,0.756295
2,0.3668,0.397647,0.790917
3,0.3071,0.424058,0.78732


Configuration: lr=2e-05, train_bs=32, eval_bs=32 -> Accuracy: 0.7909
Training with lr=3e-05, train_bs=16, eval_bs=16


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4092,0.424321,0.766637
2,0.3359,0.399677,0.802158
3,0.2074,0.519337,0.801259


Configuration: lr=3e-05, train_bs=16, eval_bs=16 -> Accuracy: 0.8022
Training with lr=3e-05, train_bs=16, eval_bs=32


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4124,0.426446,0.767536
2,0.3389,0.408493,0.801709
3,0.2228,0.518688,0.79946


Configuration: lr=3e-05, train_bs=16, eval_bs=32 -> Accuracy: 0.8017
Training with lr=3e-05, train_bs=32, eval_bs=16


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4463,0.447674,0.755396
2,0.3644,0.410667,0.785971
3,0.2891,0.431585,0.788219


Configuration: lr=3e-05, train_bs=32, eval_bs=16 -> Accuracy: 0.7882
Training with lr=3e-05, train_bs=32, eval_bs=32


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4335,0.43449,0.758543
2,0.3444,0.394251,0.794514
3,0.2538,0.437141,0.797212


Configuration: lr=3e-05, train_bs=32, eval_bs=32 -> Accuracy: 0.7972
Best configuration: (3e-05, 16, 16) with accuracy: 0.802158273381295


In [7]:
# Rebuild training arguments with the best hyperparameters from the grid search
final_training_args = TrainingArguments(
    output_dir="./bert_output_final",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=best_config[0],
    per_device_train_batch_size=best_config[1],
    per_device_eval_batch_size=best_config[2],
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir="./logs_final",
    logging_steps=50,
)

# Load a new BERT model instance
final_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
final_model.to(device)

final_trainer = Trainer(
    model=final_model,
    args=final_training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Train the final model using the best hyperparameters
final_trainer.train()

# Evaluate the final model on the dev set
final_eval_results = final_trainer.evaluate()
print("Final Evaluation Results on Dev Set:")
print(final_eval_results)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4141,0.410068,0.769335
2,0.3363,0.407277,0.801709
3,0.2111,0.527986,0.798561


Final Evaluation Results on Dev Set:
{'eval_loss': 0.4072774350643158, 'eval_accuracy': 0.8017086330935251, 'eval_runtime': 19.2906, 'eval_samples_per_second': 115.289, 'eval_steps_per_second': 7.206, 'epoch': 3.0}


In [8]:
final_model.save_pretrained("Bert_Model_Final")
tokenizer.save_pretrained("Bert_Model_Final_Tokenizer")

import pickle
target_folder = "Compressed model folder"  # already exists
file_path = os.path.join(target_folder, "Bert_Model_Final_State.pkl")
with open(file_path, "wb") as f:
    pickle.dump(final_model.state_dict(), f)

print("Final BERT model and tokenizer saved in 'Bert_Model_Final', and state_dict pickled as 'Bert_Model_Final_State.pkl'.")

Final BERT model and tokenizer saved in 'Bert_Model_Final', and state_dict pickled as 'Bert_Model_Final_State.pkl'.
