In [1]:
!pip install transformers torch

In [2]:
!pip install seqeval

In [1]:
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.utils.data import DataLoader, Dataset
import torch
import numpy as np

In [4]:
# Mount Google Drive at /content/drive so the data files read later in this
# notebook (paths under /content/drive/MyDrive/...) are accessible.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from seqeval.metrics import f1_score, classification_report

class NERDataset(Dataset):
    """Token-classification dataset that aligns word-level tags with sub-word tokens.

    Each sentence is split on whitespace and tokenized as pre-split words, so
    the tokenizer can report which word every sub-token came from. Only the
    first sub-token of a word carries that word's label id; special tokens,
    padding and continuation sub-tokens are labelled ``-100`` so they are
    skipped by the loss and by downstream evaluation.

    Args:
        texts: Sequence of sentences, each either a whitespace-joined string
            or an already-split list of words.
        tags: Sequence of tag lists, one tag string per word of the
            corresponding sentence.
        tokenizer: A fast Hugging Face tokenizer (must support
            ``is_split_into_words`` and ``BatchEncoding.word_ids``).
        max_len: Fixed length every example is padded or truncated to.
        label_map: Mapping from tag string to integer id. Tags missing from
            the mapping fall back to the id of ``'O'``.
    """

    def __init__(self, texts, tags, tokenizer, max_len, label_map):
        self.texts = texts
        self.tags = tags
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_map = label_map

    def __len__(self):
        """Return the number of sentences."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one sentence.

        All three tensors are 1-D with ``max_len`` elements.
        """
        text = self.texts[item]
        tags = self.tags[item]

        # Sentences arrive as whitespace-joined strings; splitting recovers the
        # words the tags refer to. Pre-split input is accepted unchanged.
        words = text.split() if isinstance(text, str) else list(text)

        encoding = self.tokenizer(
            words,
            is_split_into_words=True,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Build labels from the tokenizer's own word alignment. This keeps the
        # label list exactly as long as input_ids (also when the sentence is
        # truncated) and stops word-level tags from sliding onto the wrong
        # sub-token when a word is split into several pieces.
        labels = []
        previous_word = None
        for word_idx in encoding.word_ids(batch_index=0):
            if word_idx is None or word_idx == previous_word or word_idx >= len(tags):
                # Special token, padding, continuation piece, or a word with no tag.
                labels.append(-100)
            else:
                tag = tags[word_idx]
                labels.append(self.label_map[tag] if tag in self.label_map else self.label_map['O'])
            previous_word = word_idx

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

# Define label_map
# Entity types in id order. 'O' takes id 0; every entity type then receives two
# consecutive ids, first for its 'B-' tag and then for its 'I-' tag.
_ENTITY_TYPES = (
    'product', 'field', 'task', 'researcher',
    'country', 'politician', 'election', 'person',
    'organisation', 'location', 'misc', 'politicalparty',
    'event', 'scientist', 'university', 'discipline',
    'enzyme', 'protein', 'chemicalelement', 'chemicalcompound',
    'astronomicalobject', 'academicjournal', 'theory', 'award',
    'musicgenre', 'song', 'band', 'album',
    'musicalartist', 'musicalinstrument', 'book', 'writer',
    'poem', 'magazine', 'literarygenre', 'programlang',
    'algorithm', 'metrics', 'conference',
)

label_map = {'O': 0}
label_map.update(
    (f'{prefix}-{entity}', 2 * position + offset)
    for position, entity in enumerate(_ENTITY_TYPES)
    for offset, prefix in enumerate('BI', start=1)
)


# Load the tokenizer and the token-classification model from the same checkpoint,
# sizing the classification head to the number of entries in label_map.
# NOTE(review): 'bert-base-uncased' is a base checkpoint, so the classification
# head is presumably not fine-tuned for these labels — confirm that evaluating
# it as-is is intended.
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(pretrained_name)
model = BertForTokenClassification.from_pretrained(
    pretrained_name,
    num_labels=len(label_map),
)

# Example function to read NER data
def read_ner_data(file_path):
    """Read a CoNLL-style NER file into parallel sentence and tag lists.

    Each non-blank line holds whitespace-separated columns; the first column
    is taken as the word and the last column as its tag. Blank (or
    whitespace-only) lines and lines starting with ``-DOCSTART-`` end the
    current sentence.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A ``(texts, tags)`` tuple where ``texts[i]`` is the i-th sentence with
        its words joined by single spaces and ``tags[i]`` is the list of tag
        strings for those words, in order.
    """
    texts, tags = [], []
    words, labels = [], []
    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            splits = line.split()
            # An empty split covers both '\n' and whitespace-only lines, which
            # would otherwise raise IndexError on splits[0] below.
            if not splits or line.startswith('-DOCSTART-'):
                if words:
                    texts.append(' '.join(words))
                    tags.append(labels)
                    words, labels = [], []
                continue
            words.append(splits[0])
            labels.append(splits[-1])
    # Flush the final sentence when the file does not end with a separator.
    if words:
        texts.append(' '.join(words))
        tags.append(labels)
    return texts, tags

# Read the sentences and their per-word tags from the science-domain file
file_path = '/content/drive/MyDrive/Capstone Project Data/English NER data (Domains)/science/train.txt'
texts, tags = read_ner_data(file_path)

# Wrap the data in a dataset (fixed-length examples) and batch it
max_len = 128
dataset = NERDataset(
    texts=texts,
    tags=tags,
    tokenizer=tokenizer,
    max_len=max_len,
    label_map=label_map,
)
loader = DataLoader(dataset, batch_size=32)

# Inverse of label_map: integer id -> tag string, used to decode predictions
idx_to_tag = dict((idx, tag) for tag, idx in label_map.items())


# Evaluate model
# Use the GPU when one is present, otherwise fall back to the CPU instead of
# failing on a hard-coded 'cuda' device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Labels are only compared on the host, so they stay off the device.
        labels = batch['labels']
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # One device-to-host copy per batch rather than one sync per sequence.
        pred_labels = torch.argmax(logits, dim=2).cpu()
        input_lengths = attention_mask.sum(dim=1).tolist()  # Attended tokens per sequence

        # Iterate over each sequence in the batch
        for i, input_length in enumerate(input_lengths):
            # Drop the first and last attended positions (CLS and SEP)
            pred_slice = pred_labels[i][1:input_length-1].tolist()
            true_slice = labels[i][1:input_length-1].tolist()
            # Convert indices to tags, keeping only positions whose true label is not -100
            predictions.append([idx_to_tag[p] for p, t in zip(pred_slice, true_slice) if t != -100])
            true_labels.append([idx_to_tag[t] for t in true_slice if t != -100])

# Calculate F1 Score
print("F1 Score:", f1_score(true_labels, predictions))
print(classification_report(true_labels, predictions))

