In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from pathlib import Path
import scipy
import numpy as np
import os,sys

# Put the local ``utils`` directory at the front of the import path so the
# imports that follow can resolve modules located there.
currentdir = Path.cwd()
# Join with pathlib instead of a hard-coded '\\' so the entry is a valid path
# on every operating system, not only on Windows.
sys.path.insert(0, str(currentdir / "utils"))
import skseq
import skseq.sequences.structured_perceptron as spc
from utils import *

This notebook contains all the code required to train the models and store them in the `fitted_models` folder.

## Loading Data

In [2]:
# Corpus object used to read the sequence data; its word_dict and tag_dict
# are handed to the structured perceptron models further down.
corpus = NerCorpus()

In [3]:
# Directory that holds the NER CSV files.
data_path = "./data"

# Read the training split into a sequence list, passing max_sent_len=100
# to the corpus reader.
train_csv = f"{data_path}/train_data_ner.csv"
train_seq = corpus.read_sequence_list(train_csv, max_sent_len=100)

## Structured Perceptron

### Structured Perceptron w/ given features

In [4]:
# Feature mapper over the training sequences, using the IDFeatures class
# from skseq.sequences.id_feature.
feature_mapper = skseq.sequences.id_feature.IDFeatures(train_seq)
# Build the features; feature_mapper.dataset is what gets passed to fit() below.
feature_mapper.build_features()

In [5]:
# Epoch count defined once and reused for both the model attribute and fit().
num_epochs = 5

# Structured perceptron over the corpus dictionaries and the given feature set.
sp = spc.StructuredPerceptron(corpus.word_dict, corpus.tag_dict, feature_mapper)
sp.num_epochs = num_epochs
sp.fit(feature_mapper.dataset, num_epochs)

Epoch: 0 Accuracy: 0.893766
Epoch: 1 Accuracy: 0.931943
Epoch: 2 Accuracy: 0.941427
Epoch: 3 Accuracy: 0.946285
Epoch: 4 Accuracy: 0.949950


In [6]:
# Persist the perceptron trained on the given features under ./fitted_models.
sp.save_model("./fitted_models/perceptron_5_iter_given")

### Structured Perceptron w/ extra features

In [4]:
# NOTE(review): wildcard import placed mid-notebook; Extended_Features is
# presumably what it provides — confirm and consider importing it explicitly
# in the top import cell.
from skseq.sequences.extended_features import *

# Same two steps as for the given features, but with the Extended_Features mapper.
feature_mapper_ext = Extended_Features(train_seq)
feature_mapper_ext.build_features()

In [5]:
# Epoch count defined once and reused for both the model attribute and fit().
num_epochs = 5

# Structured perceptron over the corpus dictionaries and the extended feature set.
sp = spc.StructuredPerceptron(corpus.word_dict, corpus.tag_dict, feature_mapper_ext)
sp.num_epochs = num_epochs
sp.fit(feature_mapper_ext.dataset, num_epochs)

Epoch: 0 Accuracy: 0.929694
Epoch: 1 Accuracy: 0.943821
Epoch: 2 Accuracy: 0.947946
Epoch: 3 Accuracy: 0.950402
Epoch: 4 Accuracy: 0.952245


In [6]:
# Persist the perceptron trained on the extended features under ./fitted_models.
sp.save_model("./fitted_models/perceptron_5_iter_extra")

## Deep Learning approach

### Bi-LSTM

If errors occur when running this section, please see the following notebook in Google Colab where all the outputs can be seen: https://drive.google.com/file/d/1QGFU2hH6p8pU6hAaWe6dHxUKCng7t00V/view?usp=sharing

# Load the three NER splits as DataFrames (latin1-encoded CSVs) and
# forward-fill empty cells from the row above.
# NOTE(review): presumably only the first row of each sentence carries the
# sentence id, which is why forward-filling is needed — confirm against the CSVs.
# DataFrame.ffill() replaces fillna(method="ffill"), which pandas has deprecated.
train_df = pd.read_csv('./data/train_data_ner.csv', encoding="latin1").ffill()
test_df = pd.read_csv("./data/test_data_ner.csv", encoding="latin1").ffill()
tiny_test_df = pd.read_csv("./data/tiny_test.csv", encoding="latin1").ffill()

In [None]:
# PREPROCESSING

# One shared vocabulary is built over the rows of all three splits.
concatenated_df = pd.concat([train_df, test_df, tiny_test_df], ignore_index=True)
word2idx, tag2idx, n_words, n_tags, tags = create_vocabulary_bilstm(concatenated_df)

# One sentence getter per split, followed by the sentence lists they expose.
train_getter = SentenceGetter(train_df)
test_getter = SentenceGetter(test_df)
tiny_test_getter = SentenceGetter(tiny_test_df)
train_sentences, test_sentences, tiny_test_sentences = (
    train_getter.sentences,
    test_getter.sentences,
    tiny_test_getter.sentences,
)

# Encode and pad each split with the shared vocabulary (max_len=50 everywhere).
X_train, y_train = encode_and_pad_sequences_bilstm(
    train_sentences, tags, word2idx, tag2idx, n_words, max_len=50
)
X_test, y_test = encode_and_pad_sequences_bilstm(
    test_sentences, tags, word2idx, tag2idx, n_words, max_len=50
)
X_tiny_test, y_tiny_test = encode_and_pad_sequences_bilstm(
    tiny_test_sentences, tags, word2idx, tag2idx, n_words, max_len=50
)

In [None]:
# Datasets: the encoded training split, and the encoded test split which
# serves as the validation set during training.
train_dataset = NERDataset(X_train, y_train)
val_dataset = NERDataset(X_test, y_test)

# Loaders with batches of 32; only the training loader shuffles.
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=32,
    shuffle=False,
)

In [None]:
# Create class weights
# Computed from the 'tags' column of the training frame; `weights` is later
# passed to the loss function as its per-class weight.
weights, unique_tags = create_class_weights(train_df['tags'])

In [None]:
# Hyperparameters
embedding_dim = 100
hidden_dim = 256
dropout = 0.5
learning_rate = 0.001
weight_decay = 1e-5  # L2 regularization
n_epochs = 5

# Model, loss and optimizer.
# NOTE(review): embedding_dim, hidden_dim and dropout are defined above but
# not passed to BiLSTMModel here, so the model presumably falls back to its
# own defaults — confirm against the BiLSTMModel signature.
model = BiLSTMModel(
    vocab_size=n_words,
    tagset_size=n_tags,
    padding_idx=n_words - 1,
)
loss_function = nn.CrossEntropyLoss(weight=weights)
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)

In [None]:
%%time

# Run training for n_epochs, validating on val_loader; the helper hands back
# four values that are unpacked as loss and accuracy histories for plotting.
train_losses, valid_losses, train_accuracies, valid_accuracies = train_bilstm(
    train_loader,
    val_loader,
    model,
    loss_function,
    optimizer,
    n_epochs,
    n_tags,
)

In [None]:
# Plot the loss and accuracy histories returned by the training call.
plot_metrics_bilstm(train_losses, valid_losses, train_accuracies, valid_accuracies)

In [None]:
# Define the path where you want to save the model
# NOTE(review): this path is relative to the working directory, whereas the
# perceptron models are saved under ./fitted_models — confirm the intended location.
model_save_path = "bi-lstm_model2.pth"

# Save the model's state dictionary
torch.save(model.state_dict(), model_save_path)

In [1]:
# evaluating train and test set

In [None]:
#train
# Vocabulary built from the training split alone, with its reverse lookups.
tr_word2idx, tr_tag2idx, tr_n_words, tr_n_tags, tr_tags = create_vocabulary_bilstm(train_df)
tr_idx2word = {v: k for k, v in tr_word2idx.items()}
tr_idx2tag = {v: k for k, v in tr_tag2idx.items()}

# Reverse lookups for the vocabulary the model was actually trained with
# (word2idx / tag2idx, the ones used to encode X_train / y_train). The cells
# below reference idx2word and idx2tag, which were otherwise never defined,
# so they only ran when a stale kernel still held those names.
idx2word = {idx: word for word, idx in word2idx.items()}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

In [None]:
# Get predicted and true tags for further analysis.
# train_loader was created with shuffle=True, so predictions gathered from it
# come back in a random order and cannot be matched to train_sentences or
# y_train by position. Iterate over the same dataset without shuffling instead.
train_eval_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
train_preds, train_labels = get_predictions_and_labels(model, train_eval_loader)

In [None]:
# Combine the training sentences with the predicted tags, using the
# index-to-word and index-to-tag lookups.
tagged_sentences = tags_to_sentences(train_sentences, train_preds, idx2word, idx2tag)

# Print every tagged sentence (the loop covers the whole collection, not a sample).
for sentence in tagged_sentences:
    print(sentence)

In [None]:
#plot_confusion_matrix_bilstm(y_train, train_preds, idx2tag, save=True) # getting ERROR

In [None]:
# Evaluate the model on the training set (train_loader). The printed labels
# say "Train" because these numbers are training-set metrics; they were
# previously mislabelled as "Test".
eval_loss, eval_accuracy, eval_accuracy_without_o, f1 = evaluate_bilstm_tiny_test(model, train_loader, loss_function, n_tags, tag2idx)

print(f"Train Loss: {eval_loss}")
print(f"Train Accuracy: {eval_accuracy}")
print(f"Train Accuracy without 'O': {eval_accuracy_without_o}")
print(f"Train F1 Score: {f1}")

In [None]:
#test
# Vocabulary built from the test split alone.
(
    ts_word2idx,
    ts_tag2idx,
    ts_n_words,
    ts_n_tags,
    ts_tags,
) = create_vocabulary_bilstm(test_df)

# Reverse lookups: index -> word and index -> tag.
ts_idx2word = {index: word for word, index in ts_word2idx.items()}
ts_idx2tag = {index: tag for tag, index in ts_tag2idx.items()}

In [None]:
# Get predicted and true tags for further analysis
# val_loader wraps the test split and does not shuffle.
test_preds, test_labels = get_predictions_and_labels(model, val_loader)

In [None]:
# Combine the test sentences with the predicted tags, using the
# index-to-word and index-to-tag lookups.
tagged_sentences = tags_to_sentences(test_sentences, test_preds, idx2word, idx2tag)

# Print every tagged sentence (the loop covers the whole collection, not a sample).
for sentence in tagged_sentences:
    print(sentence)

In [None]:
# Confusion matrix of the encoded test labels against the test predictions.
# NOTE(review): save=True presumably also writes the figure to disk — confirm in the helper.
plot_confusion_matrix_bilstm(y_test, test_preds, idx2tag, save=True)

In [None]:
# Evaluate the model on the test set (val_loader wraps the test split).
eval_loss, eval_accuracy, eval_accuracy_without_o, f1 = evaluate_bilstm_tiny_test(
    model, val_loader, loss_function, n_tags, tag2idx
)

# Report the four metrics, one per line.
print("\n".join([
    f"Test Loss: {eval_loss}",
    f"Test Accuracy: {eval_accuracy}",
    f"Test Accuracy without 'O': {eval_accuracy_without_o}",
    f"Test F1 Score: {f1}",
]))

### Pre-trained BERT