In [None]:
# Install the third-party packages via shell escapes.
# NOTE(review): no versions are pinned here, so a rerun may resolve to different releases.
!pip install datasets transformers seqeval[gpu]
!pip install torchinfo

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting seqeval[gpu]
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.w

In [None]:
import os
import json
import tqdm
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification
from transformers import AutoTokenizer 
from transformers import RobertaForTokenClassification
# from torchinfo import summary

from torch import cuda
# Pick the accelerator when PyTorch reports one, otherwise stay on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [None]:
# Tag inventory; the position of each tag in this list is its integer class id.
ids_to_labels = ["O", "PER_B", "PER_I", "LOC_B", "LOC_I", "ORG_B", "ORG_I"]

# Reverse lookup (tag string -> class id), derived from the list so the two
# mappings cannot drift apart.
labels_to_ids = {tag: class_id for class_id, tag in enumerate(ids_to_labels)}

In [None]:
# File name of each dataset split, relative to the data directory.
splits = {'train': 'train.json', 'valid': 'validation.json', 'test': 'test.json'}


def load_split(split_name, data_dir="."):
    """Read one split from disk and return it in the forms the notebook uses.

    The JSON file is expected to hold two parallel lists under the keys
    ``'sentences'`` and ``'tags'``; each entry of ``'tags'`` is a sequence of
    values that ``int()`` can convert to an index into ``ids_to_labels``.

    Args:
        split_name: Key into ``splits`` (``'train'``, ``'valid'`` or ``'test'``).
        data_dir: Directory that contains the split files.

    Returns:
        A tuple ``(sentences, tags, labels, frame)`` where ``labels`` holds one
        comma-joined tag string per sentence and ``frame`` is a DataFrame with
        the columns ``sentence`` and ``word_labels``.

    Raises:
        KeyError: If ``split_name`` is not in ``splits`` or the file lacks an
            expected key.
    """
    path = os.path.join(data_dir, splits[split_name])
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(path, 'r', encoding='utf-8') as jsonfile:
        payload = json.load(jsonfile)

    sentences = payload['sentences']
    tags = payload['tags']
    # Turn every per-word tag id into its string name, one comma-joined string per sentence.
    labels = [
        ','.join(ids_to_labels[int(tag)] for tag in sentence_tags)
        for sentence_tags in tags
    ]
    frame = pd.DataFrame({"sentence": sentences, "word_labels": labels})
    return sentences, tags, labels, frame


# import dataset to dataframe
train_sentences, train_tags, train_labels, train_df = load_split('train')
valid_sentences, valid_tags, valid_labels, valid_df = load_split('valid')
test_sentences, test_tags, test_labels, test_df = load_split('test')

# train_df.head()

Unnamed: 0,sentence,word_labels
0,Kenyan Firms Eye Deals During Obama Summit Tag...,"O,O,O,O,O,PER_B,O,O,O,O,O,O,O,O,O,O,O,PER_B,O,..."
1,By Neville Otuki Kenya 's business leaders wer...,"O,PER_B,PER_I,LOC_B,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,"Industrialists , entrepreneurs and bankers sai...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,"More on This Kenya : Mombasa Road , Uhuru High...","O,O,O,LOC_B,O,LOC_B,LOC_I,O,LOC_B,LOC_I,O,LOC_..."
4,""" Local manufacturers will be looking for join...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


In [None]:
# Training configuration and backbone selection.
MAX_LEN = 128                                # pad / truncate length handed to the tokenizer
TRAIN_BATCH_SIZE, VALID_BATCH_SIZE = 8, 4    # loader batch sizes (train / evaluation)
EPOCHS = 50
LEARNING_RATE = 1e-5
MAX_GRAD_NORM = 10                           # max_norm used for gradient clipping

# Backbone switch: "bert-base" or "roberta-base".
model_name = "bert-base"
# model_name = "roberta-base"

In [None]:
# Load the fast tokenizer that matches the selected backbone.
if model_name == 'bert-base':
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
elif model_name == 'roberta-base':
    # add_prefix_space=True is passed because the inputs are pre-split into words.
    tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base", add_prefix_space=True)
else:
    # Fail here instead of leaving `tokenizer` undefined and crashing in a later cell.
    raise ValueError(
        f"Unsupported model_name {model_name!r}; expected 'bert-base' or 'roberta-base'."
    )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
class dataset(Dataset):
    """Token-classification dataset built from a sentence / word-label DataFrame.

    Each row of ``dataframe`` must provide a ``sentence`` column (whitespace
    separated words) and a ``word_labels`` column (comma separated tag names,
    one per word). Items are dictionaries of tensors: every field returned by
    the tokenizer plus ``labels``, where only the first sub-token of each word
    carries a class id and every other position is ``-100``.

    Args:
        dataframe: Source rows; accessed by position, so any index is accepted.
        tokenizer: Callable tokenizer whose result exposes ``word_ids()`` and
            ``items()`` (e.g. a Hugging Face fast tokenizer).
        max_len: Length to which every example is padded / truncated.
        label_map: Optional mapping from tag name to class id. Defaults to the
            notebook-level ``labels_to_ids``.
    """

    def __init__(self, dataframe, tokenizer, max_len, label_map=None):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_map = labels_to_ids if label_map is None else label_map

    def __getitem__(self, index):
        """Encode the example at position ``index`` and align its word labels."""
        # step 1: get the words and their labels (positional lookup, independent of the index)
        sentence = self.data["sentence"].iloc[index].strip().split()
        word_labels = self.data["word_labels"].iloc[index].split(",")

        # step 2: encode the pre-split words (pads / truncates to max_len)
        encoding = self.tokenizer(sentence,
                                  is_split_into_words=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)

        # step 3: label only the first word piece of each word
        word_ids = encoding.word_ids()
        labels = [self.label_map[label] for label in word_labels]
        # -100 marks positions excluded from the metrics (see the `!= -100` masks in train/valid)
        encoded_labels = np.full(len(word_ids), -100, dtype=int)

        previous_word_id = None
        for idx, word_id in enumerate(word_ids):
            # A new, non-special word starts whenever the word index changes.
            if word_id is not None and word_id != previous_word_id:
                # Index by the word id itself rather than by a running counter:
                # if a word produces no tokens its id is skipped, and a counter
                # would shift every following label by one.
                # Words without a label (fewer labels than words) fall back to class 0.
                encoded_labels[idx] = labels[word_id] if word_id < len(labels) else 0
            previous_word_id = word_id

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

    def __len__(self):
        """Number of rows captured when the dataset was constructed."""
        return self.len

In [None]:
# Report the (rows, columns) shape of every split, one line each.
print(
    "TRAIN Dataset: {}".format(train_df.shape),
    "TEST Dataset: {}".format(test_df.shape),
    "VALID Dataset: {}".format(valid_df.shape),
    sep="\n",
)

# Wrap each DataFrame in the torch Dataset defined above, sharing tokenizer and length.
training_set, testing_set, valid_set = (
    dataset(split_frame, tokenizer, MAX_LEN)
    for split_frame in (train_df, test_df, valid_df)
)

TRAIN Dataset: (3261, 2)
TEST Dataset: (1074, 2)
VALID Dataset: (401, 2)


In [None]:
# DataLoader settings. Only the training data is shuffled.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation loaders keep a fixed order: valid() averages accuracy per batch,
# so shuffling would change batch composition (and the reported metric) from
# one run to the next.
test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': False,
               'num_workers': 0
               }

training_loader = DataLoader(training_set, **train_params)
validing_loader = DataLoader(valid_set, **test_params)
testing_loader = DataLoader(testing_set, **test_params)

# Number of training batches per epoch.
len(training_loader)

408

In [None]:
# Load the pretrained backbone with a token-classification head sized to the tag set.
if model_name == 'bert-base':
    model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(labels_to_ids))
elif model_name == 'roberta-base':
    model = RobertaForTokenClassification.from_pretrained("FacebookAI/roberta-base", num_labels=len(labels_to_ids))
else:
    # Fail here instead of hitting a NameError on `model` a few lines further down.
    raise ValueError(
        f"Unsupported model_name {model_name!r}; expected 'bert-base' or 'roberta-base'."
    )
model.to(device)

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [None]:
# Training function: one full pass over `training_loader`.
def train(epoch):
    """Run one training epoch and print the running / final loss and accuracy.

    Uses the notebook-level ``model``, ``optimizer``, ``training_loader``,
    ``device`` and ``MAX_GRAD_NORM``.

    Args:
        epoch: Index of the current epoch. It is not read inside the function
            and is kept only so existing callers keep working.
    """
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)

        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels).to_tuple()
        tr_loss += loss.item()
        nb_tr_steps += 1

        if idx % 100 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy (bookkeeping only, no gradients needed)
        with torch.no_grad():
            flattened_targets = labels.view(-1)  # shape (batch_size * seq_len,)
            active_logits = tr_logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, dim=1)  # shape (batch_size * seq_len,)

            # only score positions that carry a real label (-100 marks ignored positions)
            active_positions = flattened_targets != -100
            active_labels = torch.masked_select(flattened_targets, active_positions)
            active_predictions = torch.masked_select(flattened_predictions, active_positions)

        tr_accuracy += accuracy_score(active_labels.cpu().numpy(), active_predictions.cpu().numpy())

        # backward pass
        optimizer.zero_grad()
        loss.backward()

        # Gradient clipping must sit between backward() and step(): clipping
        # before zero_grad()/backward() only touches the previous step's
        # gradients, which are then thrown away.
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )

        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [None]:
def valid(model, testing_loader):
    """Evaluate ``model`` on every batch of ``testing_loader``.

    Uses the notebook-level ``device`` and ``ids_to_labels``.

    Args:
        model: Token-classification model; called with ``input_ids``,
            ``attention_mask`` and ``labels`` and expected to return an output
            whose ``to_tuple()`` starts with ``(loss, logits)``.
        testing_loader: Iterable of batches with the keys ``input_ids``,
            ``attention_mask`` and ``labels``.

    Returns:
        ``(eval_loss, eval_accuracy, labels, predictions)``: loss and accuracy
        averaged over batches, followed by the gold and predicted tag names for
        every position whose label is not ``-100``, in loader order.
    """
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            loss, eval_logits = model(input_ids=ids, attention_mask=mask, labels=labels).to_tuple()

            eval_loss += loss.item()
            nb_eval_steps += 1

            if idx % 100 == 0:
                loss_step = eval_loss / nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compute evaluation accuracy
            flattened_targets = labels.view(-1)  # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, dim=1)  # shape (batch_size * seq_len,)

            # only score positions that carry a real label (-100 marks ignored positions)
            active_positions = flattened_targets != -100

            # Move each batch to the host once; extending a list with a device
            # tensor would store (and later synchronise on) one scalar at a time.
            gold = torch.masked_select(flattened_targets, active_positions).cpu().numpy()
            predicted = torch.masked_select(flattened_predictions, active_positions).cpu().numpy()

            eval_labels.extend(gold.tolist())
            eval_preds.extend(predicted.tolist())

            eval_accuracy += accuracy_score(gold, predicted)

    labels = [ids_to_labels[label_id] for label_id in eval_labels]
    predictions = [ids_to_labels[label_id] for label_id in eval_preds]

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return eval_loss, eval_accuracy, labels, predictions

In [None]:
import copy

# Train for EPOCHS epochs, validating every 5th epoch (starting with epoch 0)
# and keeping a snapshot of the weights with the best validation accuracy.
progress = tqdm.tqdm(total=EPOCHS)
# Start below any reachable accuracy so the first validation always produces a
# snapshot and `save_model` is defined for the cells that follow.
best_acc = float("-inf")

# Iterate over EPOCHS so the loop agrees with the progress-bar total and the
# hyperparameter cell (lower EPOCHS there for a quick smoke run).
for epoch in range(EPOCHS):
    train(epoch)

    if epoch % 5 == 0:
        loss, acc, _, _ = valid(model, validing_loader)
        print(f'Validate loss at {epoch} epoch with loss {loss}, and accuracy {acc}.')
        if acc > best_acc:
            best_acc = acc
            # Deep-copy: a plain `save_model = model` is only a second name for
            # the live model and would keep changing with further training.
            save_model = copy.deepcopy(model)
            print('Saving best model')

    progress.update()

progress.close()

In [None]:
# Score the retained model on the validation and test loaders; only the tag
# sequences (last two elements of valid()'s result) are kept.
vlabels, vpredictions = valid(save_model, validing_loader)[2:]
labels, predictions = valid(save_model, testing_loader)[2:]

In [None]:
# run test dataset: predict one tag per word for every test sentence
save_model.eval()  # make inference mode explicit instead of relying on an earlier valid() call
answer_tags = []

# Inference only: skip autograd bookkeeping for the forward passes.
with torch.no_grad():
    for sentence in tqdm.tqdm(test_sentences):

        inputs = tokenizer(sentence.split(),
                           is_split_into_words=True,
                           padding='max_length',
                           truncation=True,
                           max_length=MAX_LEN,
                           return_tensors="pt")

        # move to gpu
        ids = inputs["input_ids"].to(device)
        mask = inputs["attention_mask"].to(device)
        # forward pass
        outputs = save_model(ids, attention_mask=mask)
        logits = outputs[0]

        active_logits = logits.view(-1, save_model.num_labels)  # shape (batch_size * seq_len, num_labels)
        token_label_ids = torch.argmax(active_logits, dim=1).cpu().tolist()  # one class id per token

        # Keep the prediction of the first word piece of every word. Word
        # boundaries come from word_ids(), the same signal the training
        # dataset uses for label alignment.
        # NOTE(review): words cut off by truncation at MAX_LEN get no
        # prediction, so `prediction` can be shorter than the word list.
        prediction = []
        previous_word_id = None
        for word_id, label_id in zip(inputs.word_ids(batch_index=0), token_label_ids):
            if word_id is not None and word_id != previous_word_id:
                prediction.append(ids_to_labels[label_id])
            previous_word_id = word_id
        answer_tags.append({"tags": prediction})

# save result
directory = model_name.split('/')[-1]
os.makedirs(f'./{directory}', exist_ok=True)

output_json = {"type": "test", "sentences": test_sentences, "tags": test_tags, "answer_tags": answer_tags}
with open(f'./{directory}/result.json', 'w') as jsonfile:
    json.dump(output_json, jsonfile)

In [None]:
# save model
import os

directory = f"./model/{model_name}"

# exist_ok avoids the separate existence check and is a no-op on reruns.
os.makedirs(directory, exist_ok=True)

# Save the full tokenizer (vocabulary plus its configuration files) rather than
# only the vocabulary, so the directory holds everything save_pretrained emits.
tokenizer.save_pretrained(directory)
# save the model weights and its configuration file
save_model.save_pretrained(directory)
print('All files saved')