In [1]:
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset
from transformers import BertForTokenClassification, BertTokenizerFast
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Colab-only: mount Google Drive at /content/drive so that the model directory
# and the NER data files referenced by later cells (all under
# /content/drive/MyDrive/...) are readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
class AIDataset(Dataset):
    """Token-classification dataset read from a CoNLL-style text file.

    Each non-blank line of the file holds whitespace-separated columns; the
    first column is taken as the word and the last column as its tag. Blank
    lines and lines starting with ``-DOCSTART-`` end the current sentence.

    Args:
        file_path: Path of the UTF-8 text file to read.
        tokenizer: Callable used in ``__getitem__``; it is called with a list
            of words and must return an object that supports
            ``['input_ids']``, ``['attention_mask']`` and
            ``word_ids(batch_index=0)``.
        label_map: Dict from tag string to integer id. It must contain ``'O'``,
            whose id is used for any tag missing from the map.
        max_length: Sequence length passed to the tokenizer and used for the
            returned label tensor.
    """

    def __init__(self, file_path, tokenizer, label_map, max_length=128):
        self.tokenizer = tokenizer
        self.label_map = label_map
        self.max_length = max_length
        self.texts = []   # one list of words per sentence
        self.labels = []  # one list of tag strings per sentence, parallel to texts

        sentence_words, sentence_tags = [], []
        with open(file_path, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                stripped = raw_line.strip()
                is_boundary = not stripped or stripped.startswith("-DOCSTART-")

                if not is_boundary:
                    columns = stripped.split()
                    sentence_words.append(columns[0])
                    sentence_tags.append(columns[-1])
                elif sentence_words:
                    # A boundary closes the sentence collected so far.
                    self._store_sentence(sentence_words, sentence_tags)
                    sentence_words, sentence_tags = [], []

        # The file may end without a trailing blank line.
        if sentence_words:
            self._store_sentence(sentence_words, sentence_tags)

    def _store_sentence(self, words, tags):
        """Append one finished sentence and its tags to the dataset."""
        self.texts.append(words)
        self.labels.append(tags)

    def __len__(self):
        """Return the number of sentences read from the file."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sentence ``idx`` and align its word tags to the tokens.

        Returns:
            Dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
            ``'labels'`` is a ``torch.long`` tensor of length ``max_length``
            holding the tag id on the first token of every word and ``-100``
            everywhere else (tokens without a word, and continuation tokens
            of a word).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            is_split_into_words=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Tags absent from label_map fall back to the id of 'O'.
        word_label_ids = [
            self.label_map.get(tag, self.label_map['O']) for tag in self.labels[idx]
        ]

        # Only the first token of each word carries the word's label.
        aligned = []
        last_word = None
        for current_word in encoding.word_ids(batch_index=0):
            starts_new_word = current_word is not None and current_word != last_word
            aligned.append(word_label_ids[current_word] if starts_new_word else -100)
            last_word = current_word

        # Force the label sequence to exactly max_length entries.
        aligned = aligned[:self.max_length]
        aligned.extend([-100] * (self.max_length - len(aligned)))

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(aligned, dtype=torch.long)
        }


In [4]:
# label_map = {
#     'O': 0,
#     'B-product': 1, 'I-product': 2,
#     'B-field': 3, 'I-field': 4,
#     'B-task': 5, 'I-task': 6,
#     'B-researcher': 7, 'I-researcher': 8,
#     'B-country': 9, 'I-country': 10,
#     'B-politician': 11, 'I-politician': 12,
#     'B-election': 13, 'I-election': 14,
#     'B-person': 15, 'I-person': 16,
#     'B-organisation': 17, 'I-organisation': 18,
#     'B-location': 19, 'I-location': 20,
#     'B-misc': 21, 'I-misc': 22,
#     'B-politicalparty': 23, 'I-politicalparty': 24,
#     'B-event': 25, 'I-event': 26,
#     'B-scientist': 27, 'I-scientist': 28,
#     'B-university': 29, 'I-university': 30,
#     'B-discipline': 31, 'I-discipline': 32,
#     'B-enzyme': 33, 'I-enzyme': 34,
#     'B-protein': 35, 'I-protein': 36,
#     'B-chemicalelement': 37, 'I-chemicalelement': 38,
#     'B-chemicalcompound': 39, 'I-chemicalcompound': 40,
#     'B-astronomicalobject': 41, 'I-astronomicalobject': 42,
#     'B-academicjournal': 43, 'I-academicjournal': 44,
#     'B-theory': 45, 'I-theory': 46,
#     'B-award': 47, 'I-award': 48,
#     'B-musicgenre': 49, 'I-musicgenre': 50,
#     'B-song': 51, 'I-song': 52,
#     'B-band': 53, 'I-band': 54,
#     'B-album': 55, 'I-album': 56,
#     'B-musicalartist': 57, 'I-musicalartist': 58,
#     'B-musicalinstrument': 59, 'I-musicalinstrument': 60,
#     'B-book': 61, 'I-book': 62,
#     'B-writer': 63, 'I-writer': 64,
#     'B-poem': 65, 'I-poem': 66,
#     'B-magazine': 67, 'I-magazine': 68,
#     'B-literarygenre': 69, 'I-literarygenre': 70,
#     'B-programlang': 71, 'I-programlang': 72,
#     'B-algorithm': 73, 'I-algorithm': 74,
#     'B-metrics': 75, 'I-metrics': 76,
#     'B-conference': 77, 'I-conference': 78
# }

# Tag -> class id for the entity types kept in this evaluation. The ids are
# not contiguous: they are the same ids these tags have in the full mapping
# kept (commented out) above, where entity types are numbered in pairs
# (B-x, I-x) and 'country' starts at 9.
label_map = {'O': 0}
label_map.update({
    f'{prefix}-{entity}': 9 + 2 * position + offset
    for position, entity in enumerate((
        'country',
        'politician',
        'election',
        'person',
        'organisation',
        'location',
        'misc',
        'politicalparty',
        'event',
    ))
    for offset, prefix in enumerate(('B', 'I'))
})


In [5]:
# Device configuration: run on the GPU when CUDA is available, else on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Setup
# The model and its tokenizer are loaded from the same saved directory; the
# path is defined once so the two cannot drift apart.
MODEL_DIR = '/content/drive/MyDrive/Capstone Project Data/Direct Fine-Tuning'

model = BertForTokenClassification.from_pretrained(MODEL_DIR)
tokenizer = BertTokenizerFast.from_pretrained(MODEL_DIR)

# Move the weights to the selected device and make inference mode explicit
# (dropout off). Both calls return the module itself; assigning the result
# also keeps the cell from echoing the full module repr as its output.
model = model.to(device).eval()

In [7]:
# DataLoader setup
# NOTE: despite the `dev_` prefix, the file loaded here is the politics
# `test.txt` split. Order is kept fixed (no shuffling) for evaluation.
dev_dataset = AIDataset(
    file_path='/content/drive/MyDrive/Capstone Project Data/English NER data (Domains)/politics/test.txt',
    tokenizer=tokenizer,
    label_map=label_map,
)
dev_loader = DataLoader(dataset=dev_dataset, batch_size=8, shuffle=False)

In [8]:
# Show the size of the model's classification head, and fail early if any id
# in label_map could never be produced by it: such ids would make the metrics
# computed later meaningless.
print("Number of labels in the model:", model.num_labels)
assert all(0 <= label_id < model.num_labels for label_id in label_map.values()), (
    "label_map contains ids outside the model's output range"
)

Number of labels in the model: 79


In [None]:
# Evaluation

def collect_token_predictions(model, loader, device, ignore_index=-100):
    """Run ``model`` over ``loader`` and gather per-token predictions.

    Only positions that are attended to (``attention_mask == 1``) and whose
    gold label differs from ``ignore_index`` are kept, i.e. the first token
    of every word as labelled by the dataset.

    Args:
        model: Token-classification model whose output exposes ``.logits``.
            It is switched to eval mode as a side effect.
        loader: Iterable of batches with ``'input_ids'``, ``'attention_mask'``
            and ``'labels'`` tensors.
        device: Device the inputs are moved to before the forward pass.
        ignore_index: Label value marking positions to skip.

    Returns:
        ``(predictions, references)``: two equally long lists of Python ints,
        in batch order and, within a batch, row by row.
    """
    model.eval()
    predictions, references = [], []

    with torch.no_grad():
        for batch in loader:
            logits = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits

            predicted = logits.argmax(dim=-1).cpu().numpy()
            # Labels and mask never need to leave the CPU: they are only used
            # to select positions, not as model input.
            gold = batch['labels'].cpu().numpy()
            attended = batch['attention_mask'].cpu().numpy() == 1

            # Boolean indexing flattens row by row, which matches collecting
            # the kept positions sample by sample.
            keep = attended & (gold != ignore_index)
            predictions.extend(predicted[keep].tolist())
            references.extend(gold[keep].tolist())

    return predictions, references


all_predictions, all_true_labels = collect_token_predictions(model, dev_loader, device)

if not all_true_labels:
    raise ValueError("No labelled tokens were found in the evaluation data")

accuracy = accuracy_score(all_true_labels, all_predictions)
print("Accuracy:", accuracy)

# Every id that occurs needs a display name. The model can predict ids that
# are not in label_map; those get a placeholder name so that target_names
# stays aligned one-to-one with unique_labels. (If the lengths differ,
# classification_report pairs names with rows positionally and the rows end
# up under the wrong class names.)
unique_labels = sorted(set(all_predictions) | set(all_true_labels))
id_to_label = {label_id: name for name, label_id in label_map.items()}
target_names = [
    id_to_label.get(label_id, f"UNMAPPED-{label_id}") for label_id in unique_labels
]

report = classification_report(
    all_true_labels,
    all_predictions,
    labels=unique_labels,
    target_names=target_names,
    zero_division=0
)

print("Classification Report:\n", report)
