In [1]:
import os
import torch
import numpy as np
import seaborn as sns
import torch.nn.functional as F
from tqdm.auto import tqdm
from torch.optim import AdamW
import matplotlib.pyplot as plt
import json
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, Dataset
from transformers import BertForTokenClassification, BertTokenizerFast
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Mount Google Drive (Colab only) so the paths under /content/drive used by
# the later cells can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Tag vocabulary: the outside tag 'O' has id 0; every entity type then
# contributes a consecutive B-/I- pair, i.e. for the k-th type (0-based) in the
# tuple below the ids are B = 2*k + 1 and I = 2*k + 2. The tuple order fixes the
# ids, so new entity types must only ever be appended at the end.
label_map = {'O': 0}
label_map.update(
    (f'{prefix}-{entity}', 2 * position + offset)
    for position, entity in enumerate((
        'product', 'field', 'task', 'researcher', 'country',
        'politician', 'election', 'person', 'organisation', 'location',
        'misc', 'politicalparty', 'event', 'scientist', 'university',
        'discipline', 'enzyme', 'protein', 'chemicalelement', 'chemicalcompound',
        'astronomicalobject', 'academicjournal', 'theory', 'award', 'musicgenre',
        'song', 'band', 'album', 'musicalartist', 'musicalinstrument',
        'book', 'writer', 'poem', 'magazine', 'literarygenre',
        'programlang', 'algorithm', 'metrics', 'conference',
    ))
    for offset, prefix in enumerate(('B', 'I'), start=1)
)


In [4]:
# Sanity check: number of distinct tags; the classifier head's `num_labels`
# has to agree with this value.
len(label_map)

79

In [5]:
# Load the checkpoint as a token classifier. The classification head is sized
# from `label_map` instead of a hard-coded count, and the tag names are stored
# in the model config (id2label / label2id) so that a saved checkpoint carries
# readable tag names rather than the generic LABEL_<n> placeholders.
model_path = '/content/drive/MyDrive/Capstone Project Data/DAPT_Checkpoint'
id2label = {idx: tag for tag, idx in label_map.items()}
model = BertForTokenClassification.from_pretrained(
    model_path,
    num_labels=len(label_map),
    id2label=id2label,
    label2id=dict(label_map),
)
tokenizer = BertTokenizerFast.from_pretrained(model_path)

  self.pid = os.fork()
Some weights of BertForTokenClassification were not initialized from the model checkpoint at /content/drive/MyDrive/Capstone Project Data/DAPT_Checkpoint and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
class NERDataset(Dataset):
    """Token-classification dataset read from whitespace-separated ``token tag`` files.

    File format handled by the reader:
      * one ``token tag`` pair per line (exactly two whitespace-separated fields);
      * a blank line ends the current sentence;
      * lines starting with ``-DOCSTART-`` are skipped;
      * any other line is reported with ``print`` and skipped.

    Every sentence is tokenized immediately and stored as tensors padded or
    truncated to ``max_length``.

    Args:
        tokenizer: Callable used as ``tokenizer(words, is_split_into_words=True, ...)``
            whose result supports ``['input_ids']``, ``['attention_mask']`` and
            ``.word_ids(batch_index=0)``.
        file_paths: Iterable of paths to the annotated text files.
        label_map: Mapping from tag string to integer id; must contain ``'O'``.
        max_length: Length every sequence is padded/truncated to (default 128,
            the previously hard-coded value).
        first_subword_only: When False (default, the original behaviour) every
            word piece of a word receives that word's tag. When True only the
            first word piece is labelled and the remaining pieces keep -100 so
            the loss ignores them.

    Items are dicts with ``input_ids``, ``attention_mask`` and ``labels`` tensors.
    """

    def __init__(self, tokenizer, file_paths, label_map, max_length=128, first_subword_only=False):
        self.tokenizer = tokenizer
        self.label_map = label_map
        self.max_length = max_length
        self.first_subword_only = first_subword_only
        self.encodings = {'input_ids': [], 'attention_mask': []}
        self.labels = []
        # Tags already reported as missing from label_map (each is reported once).
        self._unknown_tags = set()

        # Read every file and tokenize sentence by sentence.
        for file_path in file_paths:
            with open(file_path, 'r', encoding='utf-8') as file:
                tokens, tag_labels = [], []
                for line in file:
                    line = line.strip()
                    if not line:
                        # Blank line: sentence boundary. Consecutive blank lines
                        # must not produce empty sentences.
                        if tokens:
                            self.process_sentence(tokens, tag_labels)
                            tokens, tag_labels = [], []
                        continue
                    if line.startswith("-DOCSTART-"):
                        continue

                    parts = line.split()
                    if len(parts) == 2:
                        token, tag = parts
                        tokens.append(token)
                        tag_labels.append(tag)
                    else:
                        print(f"Skipping malformed line: {line}")
                if tokens:  # File did not end with a blank line.
                    self.process_sentence(tokens, tag_labels)

    def _tag_to_index(self, tag):
        """Return the id for ``tag``; unknown tags fall back to 'O' and are reported once."""
        if tag in self.label_map:
            return self.label_map[tag]
        if tag not in self._unknown_tags:
            self._unknown_tags.add(tag)
            print(f"Unknown tag {tag!r} is not in label_map; labelling it as 'O'")
        return self.label_map['O']

    def process_sentence(self, tokens, tag_labels):
        """Tokenize one pre-split sentence and append its tensors to the dataset.

        Args:
            tokens: List of words of the sentence.
            tag_labels: List of tag strings, one per word.
        """
        encodings = self.tokenizer(
            tokens,
            is_split_into_words=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids = encodings['input_ids'][0]
        attention_mask = encodings['attention_mask'][0]

        # -100 everywhere by default: positions that are never overwritten
        # (word id None, i.e. special/padding positions) stay ignored by the loss.
        labels = torch.full(input_ids.shape, fill_value=-100, dtype=torch.long)

        # Maps each position of the encoded sequence to the index of its source word.
        word_ids = encodings.word_ids(batch_index=0)

        previous_word_idx = None
        current_label_idx = self.label_map['O']
        for token_idx, word_idx in enumerate(word_ids):
            if word_idx is None:
                continue
            starts_new_word = word_idx != previous_word_idx
            previous_word_idx = word_idx
            if starts_new_word:
                # If fewer tags than words were supplied, the last resolved
                # label is carried over (kept from the original behaviour).
                if word_idx < len(tag_labels):
                    current_label_idx = self._tag_to_index(tag_labels[word_idx])
            elif self.first_subword_only:
                # Continuation piece: leave it at -100.
                continue
            labels[token_idx] = current_label_idx

        self.encodings['input_ids'].append(input_ids)
        self.encodings['attention_mask'].append(attention_mask)
        self.labels.append(labels)

    def __len__(self):
        """Number of sentences stored."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tensors of sentence ``idx`` as a dict."""
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }


In [7]:
# Training split of every domain; all files share the same directory layout.
file_paths = [
    f'/content/drive/MyDrive/Capstone Project Data/English NER data (Domains)/{domain}/train.txt'
    for domain in ('ai', 'literature', 'music', 'politics', 'science')
]

# Tokenize everything up front, then serve shuffled mini-batches of 8 sentences.
dataset = NERDataset(tokenizer, file_paths, label_map)
dataloader = DataLoader(dataset, shuffle=True, batch_size=8)

In [8]:
class FocalLoss(torch.nn.Module):
    """Focal-style multi-class loss averaged over the non-ignored targets.

    Per position, with ``ce`` the cross-entropy and ``pt = exp(-ce)`` the
    probability assigned to the target class::

        loss = (alpha * (1 - pt) + (1 - alpha) * pt) * (1 - pt) ** gamma * ce

    Note that the alpha term is a pt-dependent weight, not the per-class alpha
    of the usual focal loss formulation; it is kept exactly as written.

    Args:
        alpha: Mixing factor of the pt-dependent weight.
        gamma: Focusing exponent applied to ``(1 - pt)``.
        ignore_index: Target value excluded from the loss and from the average
            (default -100, the value cross-entropy already ignored).
        eps: Lower bound for ``(1 - pt)`` before exponentiation. For a
            fractional gamma the derivative of ``x ** gamma`` is unbounded at
            ``x == 0``, which would give ``0 * inf = NaN`` in the backward pass
            once a target is predicted with saturated confidence.
    """

    def __init__(self, alpha=0.25, gamma=0.2, ignore_index=-100, eps=1e-8):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.ignore_index = ignore_index
        self.eps = eps

    def forward(self, inputs, targets):
        """Return the scalar loss.

        Args:
            inputs: Logits of shape (N, C).
            targets: Class indices of shape (N,); entries equal to
                ``ignore_index`` are skipped.

        Returns:
            Mean focal loss over the non-ignored targets, or 0 when every
            target is ignored.
        """
        # Per-position cross-entropy; ignored positions come back as 0.
        ce_loss = F.cross_entropy(inputs, targets, reduction='none', ignore_index=self.ignore_index)
        pt = torch.exp(-ce_loss)
        at = self.alpha * (1 - pt) + (1 - self.alpha) * pt
        # The clamp only changes values where pt is within eps of 1, where ce_loss
        # is (numerically) zero anyway, so the forward value is unaffected.
        modulating = (1 - pt).clamp(min=self.eps) ** self.gamma
        focal = at * modulating * ce_loss

        # Average over real targets only. A plain mean() would also divide by the
        # ignored (padding / special) positions and make the loss scale depend on
        # how much padding a batch contains.
        valid = targets != self.ignore_index
        return (focal * valid).sum() / valid.sum().clamp(min=1)

In [9]:
# Run on the GPU when one is visible, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model = model.to(device)

# AdamW over all model parameters with a learning rate of 2e-5.
optimizer = AdamW(params=model.parameters(), lr=2e-5)

In [10]:
# Loss object used by the training loop. gamma=0.001 overrides FocalLoss's
# default gamma (0.2); alpha is passed explicitly at its default value (0.25).
loss_fn = FocalLoss(alpha=0.25, gamma=0.001)

In [None]:
# Fine-tune the token classifier: 25 passes over `dataloader`, optimizing
# `loss_fn` on the flattened per-token logits.
model.train()
for epoch in range(25):  # Adjust the number of epochs if necessary
    total_loss = 0
    epoch_bar = tqdm(enumerate(dataloader), total=len(dataloader), desc=f"Epoch {epoch + 1}")
    for step, batch in epoch_bar:
        # Move the whole batch (input_ids, attention_mask, labels) to the device.
        tensors = {key: value.to(device) for key, value in batch.items()}

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Forward pass; labels are not passed to the model because the loss
        # is computed by `loss_fn` below.
        logits = model(tensors['input_ids'], attention_mask=tensors['attention_mask']).logits

        # Collapse batch and sequence dimensions: one row of class scores per token.
        flat_logits = logits.view(-1, model.config.num_labels)
        flat_labels = tensors['labels'].view(-1)

        loss = loss_fn(flat_logits, flat_labels)

        # Backpropagate and update the weights.
        loss.backward()
        optimizer.step()

        # Running mean of the loss over the steps seen so far in this epoch.
        total_loss += loss.item()
        epoch_bar.set_postfix({'loss': total_loss / (step + 1)})

    # Mean loss over the whole epoch.
    average_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}: Average Loss = {average_loss:.4f}")

In [13]:
# Write the fine-tuned model and its tokenizer into the same Drive folder so
# both can later be reloaded from a single path.
output_dir = '/content/drive/MyDrive/Capstone Project Data/Direct Fine-Tuning'
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print("Model saved successfully.")

Model saved successfully.
