In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from pathlib import Path
import scipy
import numpy as np
import os,sys

# Make the bundled `utils` directory importable (the `import skseq` below relies on it).
# The path is joined with pathlib so the separator is correct on every OS; a literal
# '\\' separator only resolves on Windows.
currentdir = Path.cwd()
sys.path.insert(0, str(currentdir / 'utils'))
import skseq
import skseq.sequences.structured_perceptron as spc
from utils import *

This notebook contains all the code required to train the models and store them in the fitted models folder.

## Loading Data

In [None]:
# Corpus helper: provides read_sequence_list() and the word_dict / tag_dict
# that the perceptron cells below pass to StructuredPerceptron.
corpus = NerCorpus()

In [None]:
# Folder with the NER CSV files, relative to the notebook's working directory.
data_path = "./data"

# Training sequences for the structured perceptron (max_sent_len=100 is
# forwarded to the reader).
train_seq = corpus.read_sequence_list(
    data_path + "/train_data_ner.csv",
    max_sent_len=100,
)

## Structured Perceptron

### Structured Perceptron w/ given features

In [None]:
# Feature mapper over the training sequences using the provided ID feature set.
# build_features() is called here, before feature_mapper.dataset is handed to fit().
feature_mapper = skseq.sequences.id_feature.IDFeatures(train_seq)
feature_mapper.build_features()

In [None]:
# Number of passes over the training data.
num_epochs = 5

# Structured perceptron on the given feature set.
sp = spc.StructuredPerceptron(
    corpus.word_dict,
    corpus.tag_dict,
    feature_mapper,
)
sp.num_epochs = num_epochs  # keep the attribute equal to the value passed to fit()
sp.fit(feature_mapper.dataset, num_epochs)

Epoch: 0 Accuracy: 0.893766
Epoch: 1 Accuracy: 0.931943
Epoch: 2 Accuracy: 0.941427
Epoch: 3 Accuracy: 0.946285
Epoch: 4 Accuracy: 0.949950


In [None]:
# Persist the perceptron trained on the given features.
sp.save_model("./fitted_models/perceptron_5_iter_given")

### Structured Perceptron w/ extra features

In [None]:
from skseq.sequences.extended_features import *

# Feature mapper over the same training sequences, using the extended feature set.
# build_features() is called here, before feature_mapper_ext.dataset is handed to fit().
feature_mapper_ext = Extended_Features(train_seq)
feature_mapper_ext.build_features()

In [None]:
# Number of passes over the training data.
num_epochs = 5

# Structured perceptron on the extended feature set.
# Note: this rebinds `sp`, replacing the model trained on the given features.
sp = spc.StructuredPerceptron(
    corpus.word_dict,
    corpus.tag_dict,
    feature_mapper_ext,
)
sp.num_epochs = num_epochs  # keep the attribute equal to the value passed to fit()
sp.fit(feature_mapper_ext.dataset, num_epochs)

Epoch: 0 Accuracy: 0.929694
Epoch: 1 Accuracy: 0.943821
Epoch: 2 Accuracy: 0.947946
Epoch: 3 Accuracy: 0.950402
Epoch: 4 Accuracy: 0.952245


In [None]:
# Persist the perceptron trained on the extended features.
sp.save_model("./fitted_models/perceptron_5_iter_extra")

## Deep Learning approach

### Bi-LSTM

If errors occur when running this section, please see the following notebook in Google Colab where all the outputs can be seen: https://drive.google.com/file/d/1QGFU2hH6p8pU6hAaWe6dHxUKCng7t00V/view?usp=sharing

# Load the three splits. Missing cells are forward-filled from the row above.
# .ffill() is the supported spelling; fillna(method="ffill") is deprecated in recent pandas.
train_df = pd.read_csv('./data/train_data_ner.csv', encoding="latin1").ffill()
test_df = pd.read_csv("./data/test_data_ner.csv", encoding="latin1").ffill()
tiny_test_df = pd.read_csv("./data/tiny_test.csv", encoding="latin1").ffill()

In [None]:
# PREPROCESSING

# concatenating 3 df to create 1 vocabulary shared by every split
concatenated_df = pd.concat([train_df, test_df, tiny_test_df], ignore_index=True)

# creating vocabulary
word2idx, tag2idx, n_words, n_tags, tags = create_vocabulary_bilstm(concatenated_df)

# Reverse lookups (index -> word, index -> tag) for the shared vocabulary.
# The evaluation cells further down pass `idx2word` / `idx2tag` to
# tags_to_sentences() and plot_confusion_matrix_bilstm(); they are defined here,
# next to the vocabulary they invert, so those cells work on a fresh kernel.
idx2word = {idx: word for word, idx in word2idx.items()}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

# getting sentences
train_getter = SentenceGetter(train_df)
test_getter = SentenceGetter(test_df)
tiny_test_getter = SentenceGetter(tiny_test_df)

train_sentences = train_getter.sentences
test_sentences = test_getter.sentences
tiny_test_sentences = tiny_test_getter.sentences

# Encode and pad sequences for each dataset (same max_len for all three splits)
X_train, y_train = encode_and_pad_sequences_bilstm(train_sentences, tags, word2idx, tag2idx, n_words, max_len=50)
X_test, y_test = encode_and_pad_sequences_bilstm(test_sentences, tags, word2idx, tag2idx, n_words, max_len=50)
X_tiny_test, y_tiny_test = encode_and_pad_sequences_bilstm(tiny_test_sentences, tags, word2idx, tag2idx, n_words, max_len=50)

In [None]:
# Training split: wrapped as a dataset and served in shuffled mini-batches.
train_dataset = NERDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Validation uses the test split and keeps the stored order (no shuffling).
val_dataset = NERDataset(X_test, y_test)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [None]:
# Create class weights
# Computed from the training split's tag column; `weights` is passed to
# nn.CrossEntropyLoss(weight=...) when the model is set up. `unique_tags` is not
# referenced by the other cells visible in this notebook.
weights, unique_tags = create_class_weights(train_df['tags'])

In [None]:
# Optimisation settings
learning_rate = 0.001
weight_decay = 1e-5  # L2 regularization
n_epochs = 5

# Architecture settings
# NOTE(review): these three values are not passed to BiLSTMModel in this cell —
# confirm whether the constructor is meant to receive them.
embedding_dim = 100
hidden_dim = 256
dropout = 0.5

# Model, class-weighted loss and optimizer
model = BiLSTMModel(
    vocab_size=n_words,
    tagset_size=n_tags,
    padding_idx=n_words - 1,
)
loss_function = nn.CrossEntropyLoss(weight=weights)
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)

In [None]:
%%time

# Train and evaluate the model
# Runs n_epochs of training on train_loader with validation on val_loader; the four
# returned sequences (losses and accuracies for both loaders) are plotted in the next cell.
train_losses, valid_losses, train_accuracies, valid_accuracies = train_bilstm(train_loader, val_loader, model, loss_function, optimizer, n_epochs, n_tags)

In [None]:
# Plot the training/validation loss and accuracy histories returned by train_bilstm.
plot_metrics_bilstm(train_losses, valid_losses, train_accuracies, valid_accuracies)

In [None]:
# Define the path where you want to save the model
# NOTE(review): this is relative to the working directory, whereas the perceptron
# models are written under ./fitted_models/ — confirm the intended location.
model_save_path = "bi-lstm_model2.pth"

# Save the model's state dictionary (parameters only, not the model object itself)
torch.save(model.state_dict(), model_save_path)

In [None]:
# evaluating train and test set

In [None]:
# Vocabulary built from the training split only.
(
    tr_word2idx,
    tr_tag2idx,
    tr_n_words,
    tr_n_tags,
    tr_tags,
) = create_vocabulary_bilstm(train_df)

# Reverse lookups: index -> word and index -> tag for that training-only vocabulary.
tr_idx2word = {index: word for word, index in tr_word2idx.items()}
tr_idx2tag = {index: tag for tag, index in tr_tag2idx.items()}

In [None]:
# Get predicted and true tags for further analysis.
# train_loader is created with shuffle=True, so predictions drawn from it would not
# line up with train_sentences / y_train, which are in stored order. Iterate the same
# dataset through an unshuffled loader so prediction i belongs to sentence i.
train_eval_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
train_preds, train_labels = get_predictions_and_labels(model, train_eval_loader)

In [None]:
# Convert tags to sentences
# idx2word / idx2tag have to invert the vocabulary that encoded the model inputs
# (word2idx / tag2idx). NOTE(review): confirm both are defined before this cell runs.
tagged_sentences = tags_to_sentences(train_sentences, train_preds, idx2word, idx2tag)

# Print example sentences with predicted tags
# (the loop prints every element of tagged_sentences, not just a sample)
for sentence in tagged_sentences:
    print(sentence)

In [None]:
#plot_confusion_matrix_bilstm(y_train, train_preds, idx2tag, save=True) # getting ERROR

In [None]:
# Evaluate the model on the TRAINING set (train_loader), so the metrics are
# labelled "Train"; the equivalent cell for val_loader reports the "Test" numbers.
eval_loss, eval_accuracy, eval_accuracy_without_o, f1 = evaluate_bilstm_tiny_test(model, train_loader, loss_function, n_tags, tag2idx)

print(f"Train Loss: {eval_loss}")
print(f"Train Accuracy: {eval_accuracy}")
print(f"Train Accuracy without 'O': {eval_accuracy_without_o}")
print(f"Train F1 Score: {f1}")

In [None]:
# Vocabulary built from the test split only.
(
    ts_word2idx,
    ts_tag2idx,
    ts_n_words,
    ts_n_tags,
    ts_tags,
) = create_vocabulary_bilstm(test_df)

# Reverse lookups: index -> word and index -> tag for that test-only vocabulary.
ts_idx2word = {index: word for word, index in ts_word2idx.items()}
ts_idx2tag = {index: tag for tag, index in ts_tag2idx.items()}

In [None]:
# Get predicted and true tags for further analysis
# val_loader wraps the test split with shuffle=False, so the predictions follow
# the stored order of the test data.
test_preds, test_labels = get_predictions_and_labels(model, val_loader)

In [None]:
# Convert tags to sentences
# idx2word / idx2tag have to invert the vocabulary that encoded the model inputs
# (word2idx / tag2idx). NOTE(review): confirm both are defined before this cell runs.
tagged_sentences = tags_to_sentences(test_sentences, test_preds, idx2word, idx2tag)

# Print example sentences with predicted tags
# (the loop prints every element of tagged_sentences, not just a sample)
for sentence in tagged_sentences:
    print(sentence)

In [None]:
# Confusion matrix of the test-split labels (y_test) against the model predictions.
# save=True is passed through to the helper — presumably writes the figure to disk; verify in utils.
plot_confusion_matrix_bilstm(y_test, test_preds, idx2tag, save=True)

In [None]:
# Evaluate the model on the test set
# val_loader wraps the test split; despite its name, evaluate_bilstm_tiny_test is
# given whichever loader is passed in.
eval_loss, eval_accuracy, eval_accuracy_without_o, f1 = evaluate_bilstm_tiny_test(model, val_loader, loss_function, n_tags, tag2idx)

print(f"Test Loss: {eval_loss}")
print(f"Test Accuracy: {eval_accuracy}")
print(f"Test Accuracy without 'O': {eval_accuracy_without_o}")
print(f"Test F1 Score: {f1}")

### Pre-trained BERT

In [None]:
#  Install Python libraries using pip inside Colab
#!pip install accelerate -U
#!pip install -U transformers
#!pip install evaluate
#!pip install seqeval -U

In [4]:
import pandas as pd

# Please change DATA_DIR to the folder that holds the three CSV files.
# Keeping the location in one constant means a single edit when the notebook is
# run outside this particular Drive layout.
DATA_DIR = "/content/drive/MyDrive/nlp/ner"

train = pd.read_csv(f"{DATA_DIR}/train_data_ner.csv")
test = pd.read_csv(f"{DATA_DIR}/test_data_ner.csv")
tiny = pd.read_csv(f"{DATA_DIR}/tiny_test.csv")

In [5]:
# Import tokenizer from a pretrained BERT

from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification

# Checkpoint name shared by the tokenizer here and the token-classification model loaded later.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Collator handed to the Trainer below; it is built around the same tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [6]:
# Label -> integer id. 'O' is 0; each entity type then gets two consecutive ids,
# an odd one for its B- tag and the following even one for its I- tag.
entity_types = ['geo', 'gpe', 'tim', 'org', 'per', 'art', 'nat', 'eve']

tag_dict = {'O': 0}
for position, entity in enumerate(entity_types):
    begin_id = 2 * position + 1
    tag_dict["B-" + entity] = begin_id
    tag_dict["I-" + entity] = begin_id + 1
tag_dict

{'O': 0,
 'B-geo': 1,
 'I-geo': 2,
 'B-gpe': 3,
 'I-gpe': 4,
 'B-tim': 5,
 'I-tim': 6,
 'B-org': 7,
 'I-org': 8,
 'B-per': 9,
 'I-per': 10,
 'B-art': 11,
 'I-art': 12,
 'B-nat': 13,
 'I-nat': 14,
 'B-eve': 15,
 'I-eve': 16}

In [7]:
# Functions to load and preprocess the data

# Fix labels according to word_ids
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    Args:
        labels: label ids, one per word of the sentence.
        word_ids: for every token, the index of the word it came from, or
            None for tokens that belong to no word (special tokens).

    Returns:
        A list with one entry per element of ``word_ids``:
        -100 for tokens without a word, the word's label for the first token
        of a word, and for any further token of the same word the word's
        label with odd ids bumped by one (B-x -> I-x in this notebook's
        numbering); even ids are kept as they are.
    """
    aligned = []
    previous_word = None

    for wid in word_ids:
        if wid is None:
            # Special token: value the loss is expected to ignore.
            aligned.append(-100)
        elif wid != previous_word:
            # First token of a new word keeps the word's own label.
            aligned.append(labels[wid])
        else:
            # Continuation token of the same word: B-tags (odd ids) become I-tags.
            tag = labels[wid]
            aligned.append(tag + 1 if tag % 2 == 1 else tag)
        previous_word = wid

    return aligned

def tokenize_and_align_labels(examples, save_wordid=False):
    """Tokenize pre-split sentences and attach token-level labels.

    Uses the notebook-level ``tokenizer``.

    Args:
        examples: mapping with "words" (list of word lists) and "tags"
            (list of label-id lists, parallel to "words").
        save_wordid: when True, also return the per-sentence word ids.

    Returns:
        The tokenizer output with an added "labels" entry; if ``save_wordid``
        is True, a tuple ``(tokenizer_output, word_ids_per_sentence)``.
    """
    tokenized_inputs = tokenizer(
        examples["words"], truncation=True, is_split_into_words=True
    )
    sentence_tags = examples["tags"]

    # One aligned label list per sentence, in the same order as the input.
    tokenized_inputs["labels"] = [
        align_labels_with_tokens(word_tags, tokenized_inputs.word_ids(idx))
        for idx, word_tags in enumerate(sentence_tags)
    ]

    if not save_wordid:
        return tokenized_inputs

    word_id_lists = [
        tokenized_inputs.word_ids(idx) for idx in range(len(sentence_tags))
    ]
    return tokenized_inputs, word_id_lists

def format_data(df, save_wordid=False):
    """Turn a token-per-row frame into a list of model-ready examples.

    Reads the "sentence_id", "words" and "tags" columns. Note that the frame is
    modified in place: a "number_tag" column is added and "words" is cast to str.

    Args:
        df: DataFrame with one row per word.
        save_wordid: when True, also return the per-sentence word ids.

    Returns:
        A list of dicts (keys 'input_ids', 'token_type_ids', 'attention_mask',
        'labels'), one per sentence; if ``save_wordid`` is True, a tuple
        ``(dataset, wordids)``.
    """
    # Both columns are written back onto the caller's frame.
    df["number_tag"] = df["tags"].replace(tag_dict)
    df['words'] = df['words'].astype(str)

    # Collect the words and numeric tags of every sentence into parallel lists.
    per_sentence = df.groupby(by="sentence_id")
    data = {
        "words": per_sentence["words"].apply(list).tolist(),
        "tags": per_sentence["number_tag"].apply(list).tolist(),
    }

    tokenized = tokenize_and_align_labels(data, save_wordid=save_wordid)
    wordids = None
    if save_wordid:
        # With save_wordid the helper returns (encoding, word ids).
        tokenized, wordids = tokenized

    feature_keys = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']
    dataset = []
    for row in range(len(tokenized['input_ids'])):
        dataset.append({key: tokenized[key][row] for key in feature_keys})

    if save_wordid:
        return dataset, wordids
    return dataset

In [8]:
# Build the tokenized examples (plus per-sentence word ids) for each split.
# Note: `train_dataset` was also the name of the Bi-LSTM dataset earlier in the
# notebook; it is rebound here to the BERT examples.
train_dataset, train_wordids = format_data(train, save_wordid=True)
test_dataset, test_wordids = format_data(test, save_wordid=True)
tiny_dataset, tiny_wordids = format_data(tiny, save_wordid=True)

In [9]:
# define metrics

import numpy as np
import evaluate
# seqeval metric object; compute_metrics below calls metric.compute(...) on it.
metric = evaluate.load("seqeval")

def compute_metrics(eval_preds):
    """Compute overall precision, recall, F1 and accuracy for a batch of predictions.

    Relies on the notebook-level ``metric`` and ``label_names`` (the latter must
    be defined before this function is first called).

    Args:
        eval_preds: pair ``(logits, labels)``; the predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        Dict with keys "precision", "recall", "f1" and "accuracy".
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    def _is_scored(label_id):
        # Positions labelled -100 are excluded from scoring.
        return label_id != -100

    # Reference tag names, sentence by sentence.
    true_labels = []
    for label_row in labels:
        true_labels.append(
            [label_names[label_id] for label_id in label_row if _is_scored(label_id)]
        )

    # Predicted tag names at the same (scored) positions.
    true_predictions = []
    for pred_row, label_row in zip(predicted_ids, labels):
        true_predictions.append(
            [
                label_names[pred_id]
                for pred_id, label_id in zip(pred_row, label_row)
                if _is_scored(label_id)
            ]
        )

    # Compute the metrics with evaluate
    scores = metric.compute(predictions=true_predictions, references=true_labels)
    precision = scores["overall_precision"]
    recall = scores["overall_recall"]
    f1_score = scores["overall_f1"]
    accuracy = scores["overall_accuracy"]
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1_score,
        "accuracy": accuracy,
    }

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [10]:
# Define model

from transformers import AutoModelForTokenClassification
import torch

# Map labels and ids (same order and numbering as tag_dict: 'O' = 0, then B-/I- pairs)
label_names = ['O', 'B-geo', 'I-geo', 'B-gpe', 'I-gpe', 'B-tim', 'I-tim', 'B-org', 'I-org', 'B-per', 'I-per', 'B-art', 'I-art', 'B-nat', 'I-nat', 'B-eve', 'I-eve']
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

# Load a pre-trained model for token classification; the label maps are stored
# in the model config so predictions can be read back as tag names.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

# Report the device that will be used. A bare `torch.device('cuda')` expression only
# builds a device object and shows 'cuda' even on a CPU-only machine; it does not move
# the model. Device placement itself is left to the Trainer used below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


device(type='cuda')

In [11]:
# to make it faster
import accelerate
from transformers import TrainingArguments

# Define Training arguments
# Evaluation and checkpointing both happen once per epoch; checkpoints are written
# under the "bert-finetuned-ner4_v3" output directory.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    "bert-finetuned-ner4_v3",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
)



In [None]:
%%time
from transformers import Trainer

# Fine-tune on the training examples and evaluate on the test examples with
# compute_metrics (per the strategy set in `args`).
# NOTE(review): recent transformers releases deprecate the `tokenizer=` argument of
# Trainer in favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()
#!cp -r bert-finetuned-ner4_v3 /content/drive/MyDrive/nlp/ner/