In [None]:
import os
import ssl
import nltk

# Single source of truth for the project's NLTK data directory, so that the
# environment variable, the search path and the download target stay in sync.
NLTK_DATA_DIR = "/Users/craigroberts/Documents/Coding/NLP/MediScan_NLP_Proj/nltk_data"

# Set the NLTK_DATA environment variable to the project directory
os.environ["NLTK_DATA"] = NLTK_DATA_DIR

# Add the directory to nltk's search path (guarded so that re-running this
# cell does not append the same entry a second time)
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# Download into the project directory when it can be created. Without an
# explicit download_dir, nltk.download picks its own default location, which
# is not necessarily this directory. If the directory cannot be created
# (e.g. on another machine), pass None to keep that default behaviour.
try:
    os.makedirs(NLTK_DATA_DIR, exist_ok=True)
    _nltk_download_dir = NLTK_DATA_DIR
except OSError:
    _nltk_download_dir = None

# Optionally disable SSL verification to avoid certificate errors when
# downloading. The unverified context is installed only while the downloads
# run; the previous factory is restored afterwards so the bypass does not
# stay active for the rest of the session.
_default_https_context = getattr(ssl, "_create_default_https_context", None)
_create_unverified_https_context = getattr(ssl, "_create_unverified_context", None)
if _create_unverified_https_context is not None:
    ssl._create_default_https_context = _create_unverified_https_context

try:
    # Download necessary NLTK resources
    for _resource in ("averaged_perceptron_tagger_eng", "stopwords"):
        nltk.download(_resource, download_dir=_nltk_download_dir)
finally:
    if _create_unverified_https_context is not None and _default_https_context is not None:
        ssl._create_default_https_context = _default_https_context

print("Setup complete. NLTK data path set to:", os.environ["NLTK_DATA"])

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import BertForSequenceClassification, BertTokenizerFast, TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: seaborn with the "whitegrid" style
sns.set(style="whitegrid")

# Pick the compute device: CUDA when torch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Load the training and dev CSV files (adjust paths as needed).
# Both files are assumed to provide the columns 'claim' and 'label'.
train_df = pd.read_csv("Final_data/train_data.csv")
dev_df = pd.read_csv("Final_data/dev_data.csv")

print("Train DataFrame shape:", train_df.shape)
print("Dev DataFrame shape:", dev_df.shape)

# Encode labels as integer ids 0..num_labels-1, whatever their original dtype.
# The encoder is fitted on the labels of BOTH splits so that:
#   * a class that only occurs in the dev split is still counted in num_labels,
#   * numeric labels that are not already 0..n-1 are remapped to that range,
#   * the mapping printed below is the mapping that was actually applied.
# For integer labels that are already exactly 0..n-1 this is the identity.
le = LabelEncoder()
le.fit(pd.concat([train_df["label"], dev_df["label"]], axis=0))
train_df["label_encoded"] = le.transform(train_df["label"])
dev_df["label_encoded"] = le.transform(dev_df["label"])
num_labels = len(le.classes_)

print("Number of classes:", num_labels)
print("Label mapping:", dict(zip(le.classes_, range(num_labels))))

# Columns that the Hugging Face Datasets keep: the claim text and its label id
cols_to_keep = ["claim", "labels"]


def _frame_to_dataset(frame):
    """Turn a DataFrame with 'claim' and 'label_encoded' into a Dataset.

    The label column is renamed to "labels" and every column not listed in
    ``cols_to_keep`` is removed from the resulting Dataset.
    """
    ds = Dataset.from_pandas(frame[["claim", "label_encoded"]])
    ds = ds.rename_column("label_encoded", "labels")
    extras = [name for name in ds.column_names if name not in cols_to_keep]
    return ds.remove_columns(extras)


train_dataset = _frame_to_dataset(train_df)
dev_dataset = _frame_to_dataset(dev_df)

print("Training samples:", len(train_dataset), "Dev samples:", len(dev_dataset))

In [None]:
# Fast BERT tokenizer, loaded from the pre-trained checkpoint named below
bert_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(bert_checkpoint)

def tokenize_function(examples, max_length=128):
    """Tokenize the claims of a batch into fixed-length model inputs.

    Args:
        examples: Mapping with a "claim" entry holding the text(s) to encode.
        max_length: Length every sequence is padded or truncated to. Defaults
            to 128, the value that used to be hard-coded here, so existing
            calls (including ``Dataset.map(tokenize_function, ...)``) behave
            as before.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the claims.
    """
    return tokenizer(
        examples["claim"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

# Run the tokenizer over both splits, passing examples in batches
train_dataset, dev_dataset = (
    _split.map(tokenize_function, batched=True)
    for _split in (train_dataset, dev_dataset)
)

# Set format for PyTorch, exposing only the columns the model needs
model_input_columns = ["input_ids", "attention_mask", "labels"]
train_dataset.set_format("torch", columns=model_input_columns)
dev_dataset.set_format("torch", columns=model_input_columns)

print("Tokenization complete!")

In [None]:
from transformers import EarlyStoppingCallback

def compute_metrics(eval_pred):
    """Compute classification accuracy from an evaluation result.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each row
            is the argmax of its logits along the last axis.

    Returns:
        dict with the single key "accuracy": the mean of the element-wise
        comparison between predictions and labels.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    hits = predicted == gold
    return {"accuracy": np.mean(hits)}

# Baseline training configuration. Learning rate and batch sizes are
# placeholder values (to be tuned). Evaluation and checkpointing both run once
# per epoch, and the best checkpoint according to "accuracy" is reloaded at
# the end of training.
base_training_args = TrainingArguments(
    output_dir="./bert_output",
    logging_dir="./logs",
    logging_steps=50,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    num_train_epochs=5,
    weight_decay=0.01,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# Fresh BERT classifier sized for num_labels classes, moved to the selected device
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
)
model.to(device)

# Early stopping with a patience of 2
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

trainer = Trainer(
    model=model,
    args=base_training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

print("Trainer is set up!")

In [None]:
import itertools

import gc

# Hyperparameter grid: every combination of the three lists is trained once.
# NOTE(review): per_device_eval_batch_size presumably only changes evaluation
# throughput, not the measured accuracy -- confirm, and consider dropping it
# from the grid to halve the number of training runs.
learning_rates = [2e-5, 3e-5]
train_batch_sizes = [16, 32]
eval_batch_sizes = [16, 32]

# Start below any attainable accuracy so the first finished run always becomes
# the incumbent. Starting at 0 with a strict ">" would leave best_config as
# None if every run scored 0.
best_acc = float("-inf")
best_config = None
results = []

# Manual grid search loop
for lr, train_bs, eval_bs in itertools.product(learning_rates, train_batch_sizes, eval_batch_sizes):
    print(f"Training with lr={lr}, train_bs={train_bs}, eval_bs={eval_bs}")

    # Drop the previous run's trainer and model before the next model is
    # loaded, so two models plus optimizer state are not held at once.
    temp_trainer = None
    temp_model = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Define training arguments for this configuration
    training_args = TrainingArguments(
        output_dir="./temp_bert_output",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=lr,
        per_device_train_batch_size=train_bs,
        per_device_eval_batch_size=eval_bs,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        logging_dir="./temp_logs",
        logging_steps=50,
        disable_tqdm=False,
    )

    # Seed torch before the model is built so the newly initialised
    # classification head starts from the same weights in every configuration,
    # rather than from whatever RNG state the previous run left behind.
    torch.manual_seed(training_args.seed)

    # Load a new model instance for each run
    temp_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
    temp_model.to(device)

    temp_trainer = Trainer(
        model=temp_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    temp_trainer.train()
    eval_results = temp_trainer.evaluate()
    acc = eval_results["eval_accuracy"]
    results.append({"learning_rate": lr, "train_bs": train_bs, "eval_bs": eval_bs, "accuracy": acc})
    print(f"Configuration: lr={lr}, train_bs={train_bs}, eval_bs={eval_bs} -> Accuracy: {acc:.4f}")

    # Strict comparison: on a tie the earlier configuration is kept
    if acc > best_acc:
        best_acc = acc
        best_config = (lr, train_bs, eval_bs)

print("Best configuration:", best_config, "with accuracy:", best_acc)

In [None]:
# best_config is the (learning_rate, train_batch_size, eval_batch_size) tuple
# chosen by the grid search. It is still None when no configuration was
# selected, so fail with a clear message instead of a TypeError on indexing.
if best_config is None:
    raise RuntimeError(
        "No best configuration available: run the grid search cell "
        "(and check its results) before training the final model."
    )
best_lr, best_train_bs, best_eval_bs = best_config

# Rebuild training arguments with the best hyperparameters from the grid search
final_training_args = TrainingArguments(
    output_dir="./bert_output_final",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=best_lr,
    per_device_train_batch_size=best_train_bs,
    per_device_eval_batch_size=best_eval_bs,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir="./logs_final",
    logging_steps=50,
)

# Seed torch before the model is built so the newly initialised classification
# head does not depend on the RNG state left behind by earlier cells.
torch.manual_seed(final_training_args.seed)

# Load a new BERT model instance
final_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
final_model.to(device)

final_trainer = Trainer(
    model=final_model,
    args=final_training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Train the final model using the best hyperparameters
final_trainer.train()

# Evaluate the final model on the dev set.
# NOTE(review): the dev set also appears to be the data the hyperparameters
# were selected on, so this score is likely optimistic -- confirm whether a
# held-out test split exists.
final_eval_results = final_trainer.evaluate()
print("Final Evaluation Results on Dev Set:")
print(final_eval_results)

In [None]:
# Save the final BERT model and tokenizer using Hugging Face's save_pretrained method.
# Note that the two are written to different directories.
final_model.save_pretrained("Bert_Model_Final")
tokenizer.save_pretrained("Bert_Model_Final_Tokenizer")

# Optionally, pickle the model's state_dict.
# Tensors are moved to CPU first so that the pickle can also be loaded on a
# machine without the accelerator the model was trained on. The mapping type
# and its optional _metadata attribute are carried over unchanged.
import pickle
_state = final_model.state_dict()
_cpu_state = type(_state)((name, tensor.cpu()) for name, tensor in _state.items())
if hasattr(_state, "_metadata"):
    _cpu_state._metadata = _state._metadata
with open("Bert_Model_Final_State.pkl", "wb") as f:
    pickle.dump(_cpu_state, f)

print("Final BERT model saved in 'Bert_Model_Final', tokenizer saved in 'Bert_Model_Final_Tokenizer', and state_dict pickled as 'Bert_Model_Final_State.pkl'.")