# Experiment Set Up

Imports

In [None]:
# FastText high ram required
!curl -l https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz --output cc.en.300.bin.gz
!gzip -d cc.en.300.bin.gz

In [None]:
!pip3 install datasets transformers tqdm
!pip install nltk
!pip install sklearn-crfsuite
# !pip install fasttext
!pip install matplotlib

In [None]:
import os
import numpy as np
import torch
from transformers import (AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification)
from datasets import load_dataset, load_metric
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from transformers import TrainerCallback
from sklearn_crfsuite import CRF
from transformers import RobertaTokenizer, BertTokenizer, BertModel
from transformers import DistilBertTokenizer, DistilBertModel
import numpy as np
# import wandb
import transformers
from transformers import EarlyStoppingCallback
from tqdm.notebook import tqdm
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from gensim.models import Word2Vec, Doc2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import gensim
import gensim.downloader as api
print(list(api.info()['models']))
from sklearn.preprocessing import LabelEncoder
import gensim.downloader as api
# import fasttext
# import fasttext.util
from sklearn import svm
from sklearn.metrics import classification_report
import torch.nn as  nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed
import torch.optim as optim
import tqdm
import matplotlib.pyplot as plt


Load the Pretrained Embedding Models

In [None]:
# gensim's downloader module is imported both under its full name and as `api`;
# both names are used below and refer to the same module.
import gensim.downloader
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api

# Load the 'word2vec-google-news-300' pre-trained vectors through gensim's downloader.
# The result is stored in the global `word2vec_model`, which the w2v_* embedding helpers read.
word2vec_model = api.load('word2vec-google-news-300')

# Print the names of all models listed by gensim's downloader.
print(list(gensim.downloader.info()['models'].keys()))

In [None]:
!pip install fasttext
import fasttext
import fasttext.util

In [None]:
# Load the FastText binary model from the working directory (the file left by the
# curl/gzip cell at the top of the notebook). Read by ft_word_embedder via this global.
fasttext_model =  fasttext.load_model('cc.en.300.bin')

Load Dataset

In [None]:
# Fetch the PLOD-CW dataset and keep one handle per split.
dataset = load_dataset("surrey-nlp/PLOD-CW")
train_data, validation_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

# Quick look at the structure of the data.
print(dataset)
print('First example from the train split, ', train_data[0])
print('Column names,', train_data.column_names)

# Per split: token sequences (X_*), their POS tags (X_*_pos_tags) and the gold labels (y_*).
X_train, X_train_pos_tags, y_train = (
    train_data['tokens'], train_data['pos_tags'], train_data['ner_tags']
)
X_val, X_val_pos_tags, y_val = (
    validation_data['tokens'], validation_data['pos_tags'], validation_data['ner_tags']
)
X_test, X_test_pos_tags, y_test = (
    test_data['tokens'], test_data['pos_tags'], test_data['ner_tags']
)

Define Constants

In [None]:
# Hyper-parameters shared by the experiments below.
num_labels_pad = 5        # number of entries in label_encoding (four labels plus PAD)
lstm_units = 64
embedding_dim = 300       # same value as the default embedding_dim of the w2v_* helpers

# POS-tag embedding table: one random row per distinct POS tag seen in any split.
pos_embedding_dim = 50
unique_pos_tags = set(tag for sequence in X_train_pos_tags+X_val_pos_tags+X_test_pos_tags for tag in sequence)
# A locally seeded generator keeps the table identical across kernel restarts without
# touching NumPy's global random state; values are uniform in [0, 1) as with np.random.rand.
pos_embeddings = np.random.default_rng(42).random((len(unique_pos_tags), pos_embedding_dim))
# Enumerate the tags in sorted order: iteration order of a set of strings can change from
# one interpreter run to the next, which would silently reshuffle the tag -> row mapping.
pos_tag_to_index = {tag: idx for idx, tag in enumerate(sorted(unique_pos_tags))}
print("unique pos tags, ", unique_pos_tags)
print("unique pos tag to index, ", pos_tag_to_index)

# Longest sequence in each split, and the overall maximum across the three splits.
max_seq_length_train = max(len(seq) for seq in y_train)
max_seq_length_val = max(len(seq) for seq in y_val)
max_seq_length_test = max(len(seq) for seq in y_test)
combined_max_seq_length = max(max_seq_length_train, max_seq_length_val, max_seq_length_test)

# Label string -> integer id. NOTE: later cells in this notebook rebind `label_encoding`
# with different ids (PAD = 0), so helpers that read this global depend on which cell ran last.
label_encoding = {"B-O": 0, "B-AC": 1, "B-LF": 2, "I-LF": 3, "PAD": 4}
unique_labels = set(tag for sequence in y_train+y_val+y_test for tag in sequence)
print("unique labels, ", unique_labels)

Define Functions

In [None]:
def convert_labels(tag_sequence, pad_label="PAD"):
  """Remap a sequence of raw tag ids onto the contiguous ids 0..4.

  The fixed mapping is 0->0, 1->1, 3->2, 4->3 and 'PAD'->4. Any tag that is not a key
  of that mapping is replaced by the id that `pad_label` maps to.

  Args:
    tag_sequence: iterable of tags to convert.
    pad_label: key of the mapping whose value is used for unknown tags.

  Returns:
    A new list with one converted id per input tag.

  Raises:
    KeyError: if `pad_label` is not a key of the mapping.
  """
  id_remap = {0: 0, 1: 1, 3: 2, 4: 3, 'PAD': 4}
  fallback_id = id_remap[pad_label]
  return [id_remap.get(tag, fallback_id) for tag in tag_sequence]

def w2v_sequence_embedder_pad(token_list, max_sequence_length, embedding_dim=300):
  """Embed a token sequence with the global `word2vec_model`, padded/truncated to a fixed length.

  Tokens missing from `word2vec_model` are represented by an all-zero vector of size
  `embedding_dim`; zero vectors are also appended as padding.

  Args:
    token_list: sequence of tokens to look up.
    max_sequence_length: number of rows in the result.
    embedding_dim: size of the zero vectors used for unknown tokens and padding.

  Returns:
    A NumPy array built from exactly `max_sequence_length` vectors (when that length
    is positive).
  """
  vectors = [
      word2vec_model[token] if token in word2vec_model else np.zeros(embedding_dim,)
      for token in token_list
  ]

  shortfall = max_sequence_length - len(token_list)
  if shortfall > 0:
    # Sequence too short: append zero vectors.
    vectors += [np.zeros(embedding_dim,) for _ in range(shortfall)]
  elif shortfall < 0:
    # Sequence too long: keep only the leading part.
    vectors = vectors[:max_sequence_length]

  return np.array(vectors)


def encode_sequences_pad(tag_sequence, max_sequence_length, pad_label="PAD"):
  """Encode tags through the global `label_encoding` and pad/truncate to a fixed length.

  Tags that are not keys of `label_encoding` are encoded as the pad id.

  Args:
    tag_sequence: sequence of tag labels.
    max_sequence_length: desired length of the result.
    pad_label: key of `label_encoding` whose id is used for padding and unknown tags.

  Returns:
    A list of integer ids.
  """
  pad_id = label_encoding[pad_label]
  encoded = [label_encoding.get(tag, pad_id) for tag in tag_sequence]

  shortfall = max_sequence_length - len(tag_sequence)
  if shortfall > 0:
    encoded += [pad_id] * shortfall
  elif shortfall < 0:
    encoded = encoded[:max_sequence_length]
  return encoded

def encode_sequences(tag_sequence, pad_label='PAD'):
  """Encode tags through the global `label_encoding` without any padding.

  Args:
    tag_sequence: iterable of tag labels.
    pad_label: key of `label_encoding` whose id replaces tags missing from the mapping.

  Returns:
    A list with one integer id per input tag.
  """
  unknown_id = label_encoding[pad_label]
  return [label_encoding.get(tag, unknown_id) for tag in tag_sequence]

def add_padding(tag_sequence, max_sequence_length, pad_label="PAD"):
  """Pad or truncate an already-encoded tag sequence to a fixed length.

  The tags themselves are copied unchanged; only the padding value is taken from the
  global `label_encoding`.

  Args:
    tag_sequence: sequence of tags.
    max_sequence_length: desired length of the result.
    pad_label: key of `label_encoding` whose id is appended as padding.

  Returns:
    A new list; the input sequence is not modified.
  """
  pad_id = label_encoding[pad_label]
  padded = list(tag_sequence)

  shortfall = max_sequence_length - len(tag_sequence)
  if shortfall > 0:
    padded += [pad_id] * shortfall
  elif shortfall < 0:
    padded = padded[:max_sequence_length]
  return padded

from sklearn.metrics import accuracy_score

def print_classification_report(model, test_data, test_labels):
  """Print accuracy and a per-class report for `model`, ignoring PAD positions.

  Both the model output and `test_labels` are reduced with argmax over the last axis,
  flattened, and filtered to the positions whose true label is not the PAD id from the
  global `label_encoding`.

  Args:
    model: object exposing `predict`.
    test_data: model inputs (converted with np.array).
    test_labels: labels (converted with np.array); argmax over the last axis gives the class id.
  """
  inputs = np.array(test_data)
  gold = np.array(test_labels)

  predicted_ids = np.argmax(model.predict(inputs), axis=-1).flatten()
  gold_ids = np.argmax(gold, axis=-1).flatten()

  # Keep only the positions that carry a real (non-PAD) label.
  keep = gold_ids != label_encoding['PAD']
  predicted_ids = predicted_ids[keep]
  gold_ids = gold_ids[keep]

  accuracy = accuracy_score(gold_ids, predicted_ids)
  print(f"Accuracy: {accuracy}")

  real_label_names = [key for key in label_encoding if key != 'PAD']
  real_label_ids = [label_encoding[key] for key in real_label_names]
  report = classification_report(gold_ids, predicted_ids, target_names=real_label_names, labels=real_label_ids)
  print(report)

def plot_graphs(model_history, graph_name, graph_type, colour='blue'):
  """Plot one per-epoch series from a training history as a line chart.

  Args:
    model_history: object whose `.history` mapping holds the series.
    graph_name: title of the figure.
    graph_type: key into `model_history.history`; also used as y-axis label and in the legend.
    colour: passed to `plt.plot` as the third positional argument.
  """
  series = model_history.history[graph_type]
  epoch_numbers = range(1, len(series) + 1)   # epochs are numbered from 1

  plt.figure(figsize=(10, 6))
  plt.plot(epoch_numbers, series, colour, label=f'Training {graph_type}')
  plt.xlabel('Epoch')
  plt.ylabel(graph_type)
  plt.title(graph_name)
  plt.legend()
  plt.show()

def ft_word_embedder(token_sequence):
    """Return one vector per token, looked up with the global `fasttext_model`.

    Args:
        token_sequence: iterable of tokens.

    Returns:
        A list holding `fasttext_model.get_word_vector(token)` for each token, in order.
    """
    return [fasttext_model.get_word_vector(token) for token in token_sequence]

def w2v_word_embedder(token_list, embedding_dim=300):
  """Return one Word2Vec vector per token, without padding.

  Tokens missing from the global `word2vec_model` get an all-zero vector of size
  `embedding_dim`.

  Args:
    token_list: iterable of tokens.
    embedding_dim: size of the zero vector used for unknown tokens.

  Returns:
    A list with one vector per input token, in order.
  """
  return [
      word2vec_model[token] if token in word2vec_model else np.zeros(embedding_dim,)
      for token in token_list
  ]


# Data Analysis

### Sequence Length Distribution

In [None]:
!pip install matplotlib

import matplotlib.pyplot as plt

# Histogram settings read (as globals) by plot_sentence_length_distribution below.
bins = 30
figsize = (10, 6)

def plot_sentence_length_distribution(sentence_lengths, title, color):
    """Draw a histogram of sentence lengths on fixed axes.

    The x range (0-350) and y limit (0-300) are fixed, so plots produced by separate
    calls share the same scale. Bin count and figure size come from the globals
    `bins` and `figsize`.

    Args:
        sentence_lengths: sequence of sentence lengths.
        title: figure title.
        color: bar fill colour.
    """
    plt.figure(figsize=figsize)
    histogram_options = dict(bins=bins, color=color, edgecolor='black', range=(0, 350))
    plt.hist(sentence_lengths, **histogram_options)
    plt.ylim(0, 300)
    plt.title(title)
    plt.xlabel('Sentence Length')
    plt.ylabel('Frequency')
    plt.show()

# Number of tokens per sentence for each split, each shown as its own histogram.
train_sentence_lengths = list(map(len, X_train))
plot_sentence_length_distribution(
    train_sentence_lengths, 'Training Set Sentence Length Distribution', 'skyblue')

val_sentence_lengths = list(map(len, X_val))
plot_sentence_length_distribution(
    val_sentence_lengths, 'Validation Set Sentence Length Distribution', 'magenta')

test_sentence_lengths = list(map(len, X_test))
plot_sentence_length_distribution(
    test_sentence_lengths, 'Test Set Sentence Length Distribution', 'orange')

### Label Distribution

In [None]:
from collections import Counter
import matplotlib.pyplot as plt

# Flatten the training labels to token level and count how often each tag occurs.
ner_tags = [ner for sentence in train_data['ner_tags'] for ner in sentence]
ner_tag_counts = Counter(ner_tags)

# Split the (tag, count) pairs into two parallel tuples for plotting.
ner_labels, ner_counts = zip(*ner_tag_counts.items())

# Bar chart of tag frequencies in the training split.
plt.figure(figsize=(12, 8))
plt.bar(ner_labels, ner_counts, color='orange', alpha=0.7)
plt.xticks(rotation=45)
plt.title('NER Tag Distribution')
plt.xlabel('NER Tags')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

### POS Tag Distribution

In [None]:
from collections import Counter
import matplotlib.pyplot as plt

# Flatten the training POS tags to token level and count how often each tag occurs.
pos_tags = [pos for sentence in train_data['pos_tags'] for pos in sentence]
pos_tag_counts = Counter(pos_tags)

# Split the (tag, count) pairs into two parallel tuples for plotting.
pos_labels, pos_counts = zip(*pos_tag_counts.items())

# Bar chart of POS-tag frequencies in the training split.
plt.figure(figsize=(12, 8))
plt.bar(pos_labels, pos_counts, color='skyblue', alpha=0.7)
plt.xticks(rotation=45)
plt.title('POS Tag Distribution')
plt.xlabel('POS Tags')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

### Check Acronyms and Long Forms in the Data

In [None]:
def extract_specific_words(data, tags_of_interest):
    """Collect the tokens carrying each of the requested tags.

    Args:
        data: mapping with parallel 'tokens' and 'ner_tags' entries, each a sequence of
            per-sentence sequences.
        tags_of_interest: tags to collect tokens for.

    Returns:
        A dict with one key per requested tag; each value lists, in order of appearance
        and with repeats, the tokens labelled with that tag (an empty list if none).
    """
    words_of_interest = {}
    for wanted_tag in tags_of_interest:
        words_of_interest[wanted_tag] = []

    for sentence_tokens, sentence_tags in zip(data['tokens'], data['ner_tags']):
        for token, tag in zip(sentence_tokens, sentence_tags):
            if tag not in tags_of_interest:
                continue
            words_of_interest[tag].append(token)

    return words_of_interest

# Tags whose tokens we want to inspect: acronyms and the beginning/inside of long forms.
interest_tags = ['B-AC', 'B-LF', 'I-LF']

train_words = extract_specific_words(train_data, interest_tags)
validation_words = extract_specific_words(validation_data, interest_tags)
test_words = extract_specific_words(test_data, interest_tags)

# One line per (split, tag) pair, in the same order and with the same text as the
# nine separate print statements this loop replaces.
for split_label, split_words in (
    ("Training", train_words),
    ("Validation", validation_words),
    ("Test", test_words),
):
    for interest_tag in interest_tags:
        print(f"Words tagged as '{interest_tag}' in {split_label} Data:", split_words[interest_tag])

### Co-occurrence of Entity Types

In [None]:
from collections import defaultdict
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# For every pair of distinct tags, count the training sentences in which both appear.
co_occurrences = defaultdict(lambda: defaultdict(int))

for sentence_tags in train_data['ner_tags']:
    unique_tags = set(sentence_tags)
    # The nested loops visit every ordered pair (tag1, tag2) and also its mirror
    # (tag2, tag1), so one increment per visit already keeps the counts symmetric.
    # Incrementing the mirrored cell as well would count each sentence twice.
    for tag1 in unique_tags:
        for tag2 in unique_tags:
            if tag1 != tag2:
                co_occurrences[tag1][tag2] += 1

# Build the matrix with rows and columns in the same (sorted) tag order so the heatmap's
# diagonal lines up; pairs that never co-occur, and the diagonal itself, are 0.
tag_order = sorted(co_occurrences)
co_occurrence_matrix = pd.DataFrame(
    [[co_occurrences[row_tag].get(col_tag, 0) for col_tag in tag_order] for row_tag in tag_order],
    index=tag_order,
    columns=tag_order,
)
co_occurrence_matrix = co_occurrence_matrix.astype(int)

print(co_occurrence_matrix)

plt.figure(figsize=(12, 10))
sns.heatmap(co_occurrence_matrix, annot=True, fmt="d", cmap="YlGnBu")
plt.title("BIO Tag Type Co-occurrence Matrix")
plt.xlabel("BIO Tag Types")
plt.ylabel("BIO Tag Types")
plt.show()

# Experiment 1: Feature Representation Experimentation

### 1.1 Experiment: FastText

Prepare labels for SVM

In [None]:
# Flatten each split's per-sentence label lists into a single token-level list.
flat_labels_train = [label for sentence in y_train for label in sentence]
flat_labels_val = [label for sentence in y_val for label in sentence]
flat_labels_test = [label for sentence in y_test for label in sentence]

# Map every label to its integer id via label_encoding; a label that is missing from
# the mapping becomes None (dict.get without a default).
encoded_labels_train = list(map(label_encoding.get, flat_labels_train))
encoded_labels_val = list(map(label_encoding.get, flat_labels_val))
encoded_labels_test = list(map(label_encoding.get, flat_labels_test))

Prepare embeddings for SVM

In [None]:
# Flatten each split's per-sentence token lists into a single token-level list, so that
# every token becomes one classification sample.
flat_tokens_train = [item for sublist in X_train for item in sublist]
flat_tokens_val = [item for sublist in X_val for item in sublist]
flat_tokens_test = [item for sublist in X_test for item in sublist]

# One FastText vector per token (see ft_word_embedder).
train_token_ft_embeddings = ft_word_embedder(flat_tokens_train)
val_token_ft_embeddings = ft_word_embedder(flat_tokens_val)
test_token_ft_embeddings = ft_word_embedder(flat_tokens_test)

Train SVM

In [None]:
from sklearn import svm

# Linear SVM with default settings, trained per token on the FastText vectors.
svm_model_ft = svm.LinearSVC()
svm_model_ft.fit(train_token_ft_embeddings, encoded_labels_train)

Predict and obtain metrics

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# --- Validation split: predict once, then report each metric as it is computed ---
svm_ft_prediction_val = svm_model_ft.predict(val_token_ft_embeddings)
print("Validation report:\n", classification_report(encoded_labels_val, svm_ft_prediction_val))
accuracy = accuracy_score(encoded_labels_val, svm_ft_prediction_val)
print("Validation accuracy:", accuracy)
cm = confusion_matrix(encoded_labels_val, svm_ft_prediction_val)
print("Validation confusion matrix:\n", cm)

# --- Test split (`accuracy` and `cm` are rebound to the test values) ---
svm_ft_prediction_test = svm_model_ft.predict(test_token_ft_embeddings)
print("Test report:\n", classification_report(encoded_labels_test, svm_ft_prediction_test))
accuracy = accuracy_score(encoded_labels_test, svm_ft_prediction_test)
print("Test accuracy:", accuracy)
cm = confusion_matrix(encoded_labels_test, svm_ft_prediction_test)
print("test confusion matrix:\n", cm)

### 1.2 Experiment: Bag of Words (BoW)

In [None]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Bag-of-words features: the vocabulary is fitted on the training tokens only and then
# applied unchanged to the validation and test tokens. Each "document" is a single token.
# NOTE(review): CountVectorizer is used with its default settings; per the scikit-learn
# documentation these lowercase the input and only keep word tokens of two or more
# characters, which would drop case information and single-character tokens — confirm
# this is intended for acronym detection.
vectorizer = CountVectorizer()
train_token_bow_embeddings = vectorizer.fit_transform(flat_tokens_train)
val_token_bow_embeddings = vectorizer.transform(flat_tokens_val)
test_token_bow_embeddings = vectorizer.transform(flat_tokens_test)

# Linear SVM with default settings, trained per token on the bag-of-words features.
# `svm` is the sklearn.svm module imported in an earlier cell (this cell only imports SVC).
svm_model_bow = svm.LinearSVC()
svm_model_bow.fit(train_token_bow_embeddings, encoded_labels_train)

In [None]:
# --- Validation split: predict once, then report each metric as it is computed ---
svm_bow_prediction_val = svm_model_bow.predict(val_token_bow_embeddings)
print("Validation report:\n", classification_report(encoded_labels_val, svm_bow_prediction_val))
accuracy = accuracy_score(encoded_labels_val, svm_bow_prediction_val)
print("Validation accuracy:", accuracy)
cm = confusion_matrix(encoded_labels_val, svm_bow_prediction_val)
print("Validation confusion matrix:\n", cm)

# --- Test split (`accuracy` and `cm` are rebound to the test values) ---
svm_bow_prediction_test = svm_model_bow.predict(test_token_bow_embeddings)
print("Test report:\n", classification_report(encoded_labels_test, svm_bow_prediction_test))
accuracy = accuracy_score(encoded_labels_test, svm_bow_prediction_test)
print("Test accuracy:", accuracy)
cm = confusion_matrix(encoded_labels_test, svm_bow_prediction_test)
print("test confusion matrix:\n", cm)

### 1.3 Experiment: Word2Vec

In [None]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# One Word2Vec vector per token; tokens unknown to the model become zero vectors
# (see w2v_word_embedder).
train_token_w2v_embeddings = w2v_word_embedder(flat_tokens_train)
val_token_w2v_embeddings = w2v_word_embedder(flat_tokens_val)
test_token_w2v_embeddings = w2v_word_embedder(flat_tokens_test)

# Linear SVM with default settings, trained per token on the Word2Vec vectors.
# `svm` is the sklearn.svm module imported in an earlier cell (this cell only imports SVC).
svm_model_w2v = svm.LinearSVC()
svm_model_w2v.fit(train_token_w2v_embeddings, encoded_labels_train)

In [None]:
# --- Validation split: predict once, then report each metric as it is computed ---
svm_w2v_prediction_val = svm_model_w2v.predict(val_token_w2v_embeddings)
print("Validation report:\n", classification_report(encoded_labels_val, svm_w2v_prediction_val))
accuracy = accuracy_score(encoded_labels_val, svm_w2v_prediction_val)
print("Validation accuracy:", accuracy)
cm = confusion_matrix(encoded_labels_val, svm_w2v_prediction_val)
print("Validation confusion matrix:\n", cm)

# --- Test split (`accuracy` and `cm` are rebound to the test values) ---
svm_w2v_prediction_test = svm_model_w2v.predict(test_token_w2v_embeddings)
print("Test report:\n", classification_report(encoded_labels_test, svm_w2v_prediction_test))
accuracy = accuracy_score(encoded_labels_test, svm_w2v_prediction_test)
print("Test accuracy:", accuracy)
cm = confusion_matrix(encoded_labels_test, svm_w2v_prediction_test)
print("test confusion matrix:\n", cm)

# Experiment 2: Model Experimentation

### 2.1 Experiment: CRF

In [None]:
def word2features(sentence:list, i:int):
    """Build the CRF feature dict for the token at position `i` of `sentence`.

    Args:
        sentence: list of token strings.
        i: index of the token to describe.

    Returns:
        A dict of string/boolean features: the word itself, position flags, casing
        flags, 1-3 character prefixes and suffixes, the neighbouring words ('' at the
        sentence boundaries), and hyphen/digit/inner-capital flags.
    """
    word = sentence[i]
    last_index = len(sentence) - 1
    # Slices instead of word[0] / word[-1] so an empty token yields '' rather than IndexError.
    first_char = word[:1]
    features = {
        'word': word,
        'is_first': i == 0,                       # token opens the sentence
        'is_last': i == last_index,               # token closes the sentence
        'is_capitalized': first_char.upper() == first_char,
        'is_all_caps': word.upper() == word,      # no lowercase characters
        'is_all_lower': word.lower() == word,     # no uppercase characters
        # prefixes of the word
        'prefix-1': first_char,
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        # suffixes of the word
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        # Neighbouring tokens. `sentence` holds plain strings, so the whole neighbour is
        # used; indexing it with [0] would keep only its first character.
        'prev_word': '' if i == 0 else sentence[i-1],
        'next_word': '' if i == last_index else sentence[i+1],
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        'capitals_inside': word[1:].lower() != word[1:]   # uppercase after the first character
    }
    return features

def sent2features(tokens:list):
    """Return the list of word2features dicts for a sentence, one per token position."""
    sentence_features = []
    for index, _ in enumerate(tokens):
        sentence_features.append(word2features(tokens, index))
    return sentence_features

# Hand-crafted feature dicts, one list per sentence, for each split.
train_token_crf_embeddings=[sent2features(tokens) for tokens in X_train]
val_token_crf_embeddings=[sent2features(tokens) for tokens in X_val]
test_token_crf_embeddings=[sent2features(tokens) for tokens in X_test]

crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

# Train crf model
try:
    crf.fit(train_token_crf_embeddings, y_train)
except AttributeError as fit_error:
    # The original code ignored AttributeError from fit() without a trace. It is still
    # tolerated here (presumably a workaround for a sklearn-crfsuite / scikit-learn
    # version mismatch — TODO confirm), but it is reported so that a genuine failure is
    # visible before the predict calls further down.
    print(f"Warning: CRF.fit raised AttributeError (ignored): {fit_error}")

# Sanity check: features of the first training sentence and the length of its label sequence.
print(train_token_crf_embeddings[0])
print(len(y_train[0]))

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# --- Validation split ---
crf_prediction_val = crf.predict(val_token_crf_embeddings)
# The CRF returns one label list per sentence; flatten to token level for the metrics.
crf_prediction_val_flat = [label for sublist in crf_prediction_val for label in sublist]
print("Validation report:\n", classification_report(flat_labels_val, crf_prediction_val_flat))
accuracy = accuracy_score(flat_labels_val, crf_prediction_val_flat)
print("Validation accuracy:", accuracy)
cm = confusion_matrix(flat_labels_val, crf_prediction_val_flat)
print("Validation confusion matrix:\n", cm)

# --- Test split (`accuracy` and `cm` are rebound to the test values) ---
crf_prediction_test = crf.predict(test_token_crf_embeddings)
crf_prediction_test_flat = [label for sublist in crf_prediction_test for label in sublist]
print("Test report:\n", classification_report(flat_labels_test, crf_prediction_test_flat))
accuracy = accuracy_score(flat_labels_test, crf_prediction_test_flat)
print("Test accuracy:", accuracy)
cm = confusion_matrix(flat_labels_test, crf_prediction_test_flat)
print("Test confusion matrix:\n", cm)

### 2.2 Experiment: LSTM

In [None]:
import tensorflow as tf
from datasets import load_dataset
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, TimeDistributed, Masking, InputLayer, Embedding
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
import gensim.downloader as api
import os

# Ensure the directory for saving graphs exists
os.makedirs('lstmgraphs', exist_ok=True)

# Load the Word2Vec model
# (rebinds the global `word2vec_model` that an earlier cell already loaded)
print("Loading Word2Vec model...")
word2vec_model = api.load('word2vec-google-news-300')
print("Word2Vec model loaded.")

# Load the dataset from Hugging Face's 'datasets'
# (rebinds `dataset`, `train_data`, `validation_data` and `test_data` from the earlier cell)
print("Loading dataset...")
dataset = load_dataset("surrey-nlp/PLOD-CW")
train_data = dataset["train"]
validation_data = dataset["validation"]
test_data = dataset["test"]
print("Dataset loaded.")

# Constants
MAX_SEQ_LENGTH = 323   # fixed length every sequence is padded/truncated to
NUM_LABELS = 5         # number of entries in label_encoding, PAD included
EMBEDDING_DIM = 300    # size of the zero vectors used for unknown words and padding

# NOTE: this rebinds the global `label_encoding` with PAD = 0 and the real labels
# shifted to 1..4, unlike the mapping in the "Define Constants" cell (PAD = 4).
# Helpers that read `label_encoding` therefore behave differently once this cell has run.
label_encoding = {"B-O": 1, "B-AC": 2, "B-LF": 3, "I-LF": 4, "PAD": 0}

# Function to encode labels
def encode_labels(labels):
    """Convert nested label sequences to integer ids using the global `label_encoding`.

    Args:
        labels: iterable of label sequences.

    Returns:
        A list of lists of ids with the same nesting as the input.

    Raises:
        KeyError: if a label is not a key of `label_encoding`.
    """
    print("Encoding labels...")
    encoded = []
    for sequence in labels:
        encoded.append([label_encoding[label] for label in sequence])
    print("Labels encoded.")
    return encoded

# Function to embed tokens using Word2Vec
def embed_tokens(tokens):
    """Embed token sequences with the global `word2vec_model`, fixed to MAX_SEQ_LENGTH steps.

    Words missing from `word2vec_model` are represented by a zero vector of size
    EMBEDDING_DIM; shorter sequences are padded with such zero vectors and longer
    ones are truncated.

    NOTE(review): unknown words and padding share the same all-zero vector, so the
    Masking(mask_value=0.0) layer of the model built in this cell will presumably treat
    unknown words as padding as well — confirm this is intended.

    Args:
        tokens: iterable of token sequences.

    Returns:
        A NumPy array built from one embedded sequence per input sequence.
    """
    print("Embedding tokens...")
    embedded_tokens = []
    for token_sequence in tokens:
        vectors = []
        for word in token_sequence:
            if word in word2vec_model:
                vectors.append(word2vec_model[word])
            else:
                vectors.append(np.zeros(EMBEDDING_DIM))
        missing_steps = MAX_SEQ_LENGTH - len(vectors)
        if missing_steps > 0:
            vectors.extend(np.zeros(EMBEDDING_DIM) for _ in range(missing_steps))
        embedded_tokens.append(np.array(vectors[:MAX_SEQ_LENGTH]))
    print("Tokens embedded.")
    return np.array(embedded_tokens)

def pad_sequences_custom(data, maxlen, dtype='float32'):
    """Pad or truncate sequences at their end to `maxlen`, filling with 0.0.

    Thin wrapper around tf.keras.preprocessing.sequence.pad_sequences with
    padding='post' and truncating='post'.

    Args:
        data: sequences to pad.
        maxlen: target length.
        dtype: dtype passed through to pad_sequences.

    Returns:
        The value returned by pad_sequences.
    """
    print(f"Padding sequences to a maximum length of {maxlen}...")
    pad_options = dict(maxlen=maxlen, dtype=dtype, padding='post', truncating='post', value=0.0)
    padded_data = tf.keras.preprocessing.sequence.pad_sequences(data, **pad_options)
    print("Sequences padded.")
    return padded_data

# Prepare datasets
def prepare_data(data):
    """Turn one dataset split into model inputs and targets.

    Args:
        data: mapping with 'tokens' and 'ner_tags' entries.

    Returns:
        A 3-tuple: the embedded tokens from embed_tokens, the padded labels one-hot
        encoded over NUM_LABELS classes, and the padded integer labels themselves.
    """
    print("Preparing data...")
    token_sequences = data['tokens']
    label_ids = encode_labels(data['ner_tags'])
    embedded_tokens = embed_tokens(token_sequences)
    print("Token Embeddings: ", embedded_tokens.shape)
    padded_labels = pad_sequences_custom(label_ids, MAX_SEQ_LENGTH, dtype='int32')
    print("Data prepared.")
    one_hot_labels = to_categorical(padded_labels, num_classes=NUM_LABELS)
    return embedded_tokens, one_hot_labels, padded_labels


# NOTE: from here on X_*/y_* hold embedded inputs and one-hot targets; they overwrite
# the raw token/label lists of the same names created in the "Load Dataset" cell.
# Each *_sample_weight array is 0 at PAD positions and 1 everywhere else.
print("Preparing training data...")
X_train, y_train, train_padded_labels = prepare_data(train_data)
sample_weight = np.where(train_padded_labels == label_encoding["PAD"], 0, 1)
print("Training data ready.")

print("Preparing validation data...")
X_val, y_val, val_padded_labels = prepare_data(validation_data)
val_sample_weight = np.where(val_padded_labels == label_encoding["PAD"], 0, 1)
print("Validation data ready.")

print("Preparing test data...")
X_test, y_test, test_padded_labels = prepare_data(test_data)
test_sample_weight = np.where(test_padded_labels == label_encoding["PAD"], 0, 1)
print("Test data ready.")

def create_lstm_model(num_labels, lstm_units, vocab_size, embedding_size, w2v_weights, max_sequence_length):
    """Build (without compiling) a Masking -> LSTM -> per-timestep softmax tagger.

    The model consumes pre-computed embeddings of shape
    (max_sequence_length, embedding_size) per sample.

    Args:
        num_labels: number of output classes per timestep.
        lstm_units: number of LSTM units.
        vocab_size: unused by this function; kept in the signature.
        embedding_size: number of features per timestep.
        w2v_weights: unused by this function; kept in the signature.
        max_sequence_length: number of timesteps per sample.

    Returns:
        The uncompiled Sequential model.
    """
    return Sequential([
        # Timesteps whose features all equal 0.0 are masked.
        Masking(mask_value=0.0, input_shape=(max_sequence_length, embedding_size)),
        LSTM(lstm_units, return_sequences=True),
        TimeDistributed(Dense(num_labels, activation='softmax')),
    ])

# Model configuration.
NUM_LABELS_PAD = 5
LSTM_UNITS = 64
COMBINED_MAX_SEQ_LENGTH = 323
w2v_weights = word2vec_model.vectors

# vocab_size and w2v_weights are passed to create_lstm_model, which does not use them.
vocab_size, embedding_size = w2v_weights.shape
print("Word2Vec weights shape:", w2v_weights.shape)
print("Creating LSTM model...")
lstm_model = create_lstm_model(NUM_LABELS_PAD, LSTM_UNITS, vocab_size, EMBEDDING_DIM, w2v_weights, COMBINED_MAX_SEQ_LENGTH)
print("LSTM model created.")

# Train the model
print("Starting training...")
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
print("Before i am pased in: ", X_train.shape)
print("Starting training...")
history = lstm_model.fit(
    X_train,
    y_train,
    # Pass the validation sample weights as the third tuple element so PAD positions are
    # weighted 0 during validation too, matching `sample_weight` on the training side.
    validation_data=(X_val, y_val, val_sample_weight),
    epochs=30,
    batch_size=32,
    sample_weight=sample_weight
)
print("Training completed.")

def evaluate_model(model, X, y):
    """Report accuracy, a classification report and a confusion matrix, ignoring PAD.

    Predictions and targets are reduced with argmax over the last axis; positions whose
    true class is the PAD id from the global `label_encoding` are dropped before any
    metric is computed. The confusion-matrix heatmap is saved to
    'lstmgraphs/confusion_matrix.png' and then shown.

    Args:
        model: object exposing `predict`.
        X: model inputs.
        y: targets; argmax over the last axis gives the class id.
    """
    print("Evaluating model...")
    predicted_ids = np.argmax(model.predict(X), axis=-1)
    gold_ids = np.argmax(y, axis=-1)

    # Boolean indexing keeps only the non-PAD positions.
    keep = gold_ids != label_encoding["PAD"]
    y_pred_masked, y_true_masked = predicted_ids[keep], gold_ids[keep]

    accuracy = np.mean(y_pred_masked == y_true_masked)
    print(f"Accuracy: {accuracy * 100:.2f}%")

    # Names and ids of the real classes, in label_encoding order.
    labels_without_pad = []
    label_values_without_pad = []
    for name, class_id in label_encoding.items():
        if name != "PAD":
            labels_without_pad.append(name)
            label_values_without_pad.append(class_id)

    print("Classification Report:")
    print(classification_report(y_true_masked, y_pred_masked, labels=label_values_without_pad, target_names=labels_without_pad, zero_division=1))

    cm = confusion_matrix(y_true_masked, y_pred_masked, labels=label_values_without_pad)
    sns.heatmap(cm, annot=True, fmt='d', cmap='seismic', xticklabels=labels_without_pad, yticklabels=labels_without_pad)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.savefig('lstmgraphs/confusion_matrix.png')
    plt.show()

# Run evaluation
# Scores the trained LSTM on the prepared test split; PAD positions are excluded
# inside evaluate_model.
print("Evaluating on test data...")
evaluate_model(lstm_model, X_test, y_test)
print("Evaluation completed.")

### 2.3 Experiment: BiLSTM

In [None]:
# Import necessary libraries
import numpy as np
import tensorflow as tf
from datasets import load_dataset
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, TimeDistributed, Masking, InputLayer, Embedding, Bidirectional
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
import gensim.downloader as api
import os

# Ensure the directory for saving graphs exists
os.makedirs('lstmgraphs', exist_ok=True)

# Load the Word2Vec model (rebinds the global `word2vec_model`)
print("Loading Word2Vec model...")
word2vec_model = api.load('word2vec-google-news-300')
print("Word2Vec model loaded.")

# Load the dataset (rebinds `dataset` and the three split handles)
print("Loading dataset...")
dataset = load_dataset("surrey-nlp/PLOD-CW")
train_data = dataset["train"]
validation_data = dataset["validation"]
test_data = dataset["test"]
print("Dataset loaded.")

# Constants
MAX_SEQ_LENGTH = 323   # fixed length every sequence is padded/truncated to
NUM_LABELS = 5         # number of entries in label_encoding, PAD included
EMBEDDING_DIM = 300    # size of the zero vectors used for unknown words and padding

# PAD is id 0 and the real labels are 1..4 in this cell.
label_encoding = {"B-O": 1, "B-AC": 2, "B-LF": 3, "I-LF": 4, "PAD": 0}

def encode_labels(labels):
    """Convert nested label sequences to integer ids using the global `label_encoding`.

    Args:
        labels: iterable of label sequences.

    Returns:
        A list of lists of ids with the same nesting as the input.

    Raises:
        KeyError: if a label is not a key of `label_encoding`.
    """
    print("Encoding labels...")
    encoded = []
    for sequence in labels:
        encoded.append([label_encoding[label] for label in sequence])
    print("Labels encoded.")
    return encoded

def embed_tokens(tokens):
    """Embed token sequences with the global `word2vec_model`, fixed to MAX_SEQ_LENGTH steps.

    Words missing from `word2vec_model` are represented by a zero vector of size
    EMBEDDING_DIM; shorter sequences are padded with such zero vectors and longer
    ones are truncated.

    NOTE(review): unknown words and padding share the same all-zero vector, so the
    Masking(mask_value=0.0) layer of the model built in this cell will presumably treat
    unknown words as padding as well — confirm this is intended.

    Args:
        tokens: iterable of token sequences.

    Returns:
        A NumPy array built from one embedded sequence per input sequence.
    """
    print("Embedding tokens...")
    embedded_tokens = []
    for token_sequence in tokens:
        vectors = []
        for word in token_sequence:
            if word in word2vec_model:
                vectors.append(word2vec_model[word])
            else:
                vectors.append(np.zeros(EMBEDDING_DIM))
        missing_steps = MAX_SEQ_LENGTH - len(vectors)
        if missing_steps > 0:
            vectors.extend(np.zeros(EMBEDDING_DIM) for _ in range(missing_steps))
        embedded_tokens.append(np.array(vectors[:MAX_SEQ_LENGTH]))
    print("Tokens embedded.")
    return np.array(embedded_tokens)

def pad_sequences_custom(data, maxlen, dtype='float32'):
    """Pad or truncate sequences at their end to `maxlen`, filling with 0.0.

    Thin wrapper around tf.keras.preprocessing.sequence.pad_sequences with
    padding='post' and truncating='post'.

    Args:
        data: sequences to pad.
        maxlen: target length.
        dtype: dtype passed through to pad_sequences.

    Returns:
        The value returned by pad_sequences.
    """
    print(f"Padding sequences to a maximum length of {maxlen}...")
    pad_options = dict(maxlen=maxlen, dtype=dtype, padding='post', truncating='post', value=0.0)
    padded_data = tf.keras.preprocessing.sequence.pad_sequences(data, **pad_options)
    print("Sequences padded.")
    return padded_data

def prepare_data(data):
    """Turn one dataset split into model inputs and targets.

    Args:
        data: mapping with 'tokens' and 'ner_tags' entries.

    Returns:
        A 3-tuple: the embedded tokens from embed_tokens, the padded labels one-hot
        encoded over NUM_LABELS classes, and the padded integer labels themselves.
    """
    print("Preparing data...")
    token_sequences = data['tokens']
    label_ids = encode_labels(data['ner_tags'])
    embedded_tokens = embed_tokens(token_sequences)
    print("Token Embeddings: ", embedded_tokens.shape)
    padded_labels = pad_sequences_custom(label_ids, MAX_SEQ_LENGTH, dtype='int32')
    print("Data prepared.")
    one_hot_labels = to_categorical(padded_labels, num_classes=NUM_LABELS)
    return embedded_tokens, one_hot_labels, padded_labels


# Embed and encode every split. X_*/y_* are (re)bound to embedded inputs and one-hot
# targets; each *_sample_weight array is 0 at PAD positions and 1 everywhere else.
print("Preparing training data...")
X_train, y_train, train_padded_labels = prepare_data(train_data)
sample_weight = np.where(train_padded_labels == label_encoding["PAD"], 0, 1)
print("Training data ready.")

print("Preparing validation data...")
X_val, y_val, val_padded_labels = prepare_data(validation_data)
val_sample_weight = np.where(val_padded_labels == label_encoding["PAD"], 0, 1)
print("Validation data ready.")

print("Preparing test data...")
X_test, y_test, test_padded_labels = prepare_data(test_data)
test_sample_weight = np.where(test_padded_labels == label_encoding["PAD"], 0, 1)
print("Test data ready.")

def create_bilstm_model(num_labels, lstm_units, vocab_size, embedding_size, w2v_weights, max_sequence_length):
    """Build (without compiling) a Masking -> bidirectional LSTM -> per-timestep softmax tagger.

    The model consumes pre-computed embeddings of shape
    (max_sequence_length, embedding_size) per sample.

    Args:
        num_labels: number of output classes per timestep.
        lstm_units: number of units of the wrapped LSTM.
        vocab_size: unused by this function; kept in the signature.
        embedding_size: number of features per timestep.
        w2v_weights: unused by this function; kept in the signature.
        max_sequence_length: number of timesteps per sample.

    Returns:
        The uncompiled Sequential model.
    """
    return Sequential([
        # Timesteps whose features all equal 0.0 are masked.
        Masking(mask_value=0.0, input_shape=(max_sequence_length, embedding_size)),
        Bidirectional(LSTM(lstm_units, return_sequences=True)),
        TimeDistributed(Dense(num_labels, activation='softmax')),
    ])

# Model configuration.
NUM_LABELS_PAD = 5
LSTM_UNITS = 64
COMBINED_MAX_SEQ_LENGTH = 323
w2v_weights = word2vec_model.vectors

# vocab_size and w2v_weights are passed to create_bilstm_model, which does not use them.
vocab_size, embedding_size = w2v_weights.shape
print("Word2Vec weights shape:", w2v_weights.shape)
print("Creating LSTM model...")
# NOTE: the BiLSTM is stored under the same global name (`lstm_model`) as the plain
# LSTM from the previous experiment and therefore replaces it.
lstm_model = create_bilstm_model(NUM_LABELS_PAD, LSTM_UNITS, vocab_size, EMBEDDING_DIM, w2v_weights, COMBINED_MAX_SEQ_LENGTH)
print("LSTM model created.")

print("Starting training...")
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])  # Compile the model
print("Before i am pased in: ", X_train.shape)
print("Starting training...")
history = lstm_model.fit(
    X_train,
    y_train,
    # Pass the validation sample weights as the third tuple element so PAD positions are
    # weighted 0 during validation too, matching `sample_weight` on the training side.
    validation_data=(X_val, y_val, val_sample_weight),
    epochs=30,
    batch_size=32,
    sample_weight=sample_weight
)
print("Training completed.")

def evaluate_model(model, X, y):
    """Report accuracy, a classification report and a confusion matrix, ignoring PAD.

    Predictions and targets are reduced with argmax over the last axis; positions whose
    true class is the PAD id from the global `label_encoding` are dropped before any
    metric is computed. The confusion-matrix heatmap is saved to
    'lstmgraphs/confusion_matrix.png' and then shown.

    Args:
        model: object exposing `predict`.
        X: model inputs.
        y: targets; argmax over the last axis gives the class id.
    """
    print("Evaluating model...")
    predicted_ids = np.argmax(model.predict(X), axis=-1)
    gold_ids = np.argmax(y, axis=-1)

    # Boolean indexing keeps only the non-PAD positions.
    keep = gold_ids != label_encoding["PAD"]
    y_pred_masked, y_true_masked = predicted_ids[keep], gold_ids[keep]

    accuracy = np.mean(y_pred_masked == y_true_masked)
    print(f"Accuracy: {accuracy * 100:.2f}%")

    # Names and ids of the real classes, in label_encoding order.
    labels_without_pad = []
    label_values_without_pad = []
    for name, class_id in label_encoding.items():
        if name != "PAD":
            labels_without_pad.append(name)
            label_values_without_pad.append(class_id)

    print("Classification Report:")
    print(classification_report(y_true_masked, y_pred_masked, labels=label_values_without_pad, target_names=labels_without_pad, zero_division=1))

    cm = confusion_matrix(y_true_masked, y_pred_masked, labels=label_values_without_pad)
    sns.heatmap(cm, annot=True, fmt='d', cmap='seismic', xticklabels=labels_without_pad, yticklabels=labels_without_pad)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    # Save confusion matrix to a file
    plt.savefig('lstmgraphs/confusion_matrix.png')
    plt.show()

# Score the trained BiLSTM (held in `lstm_model`) on the prepared test split;
# PAD positions are excluded inside evaluate_model.
print("Evaluating on test data...")
evaluate_model(lstm_model, X_test, y_test)
print("Evaluation completed.")

# Experiment 3: POS Tags Inclusion and Additional Filtered Dataset

### 3.1 Experiment: Add POS Tags

In [None]:
import numpy as np
import tensorflow as tf
from datasets import load_dataset
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, TimeDistributed, Masking, InputLayer, Embedding, Bidirectional
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
import gensim.downloader as api
import os

# Output directory for the figures saved by evaluate_model.
os.makedirs('lstmgraphs', exist_ok=True)

print("Loading Word2Vec model...")
word2vec_model = api.load('word2vec-google-news-300')
print("Word2Vec model loaded.")

print("Loading dataset...")
dataset = load_dataset("surrey-nlp/PLOD-CW")
train_data = dataset["train"]
validation_data = dataset["validation"]
test_data = dataset["test"]
print("Dataset loaded.")

print("Preparing POS embeddings...")
train_pos_tags = train_data['pos_tags']
# Sort the tag inventory so every tag gets the same row index on every run;
# iterating a bare set of strings can yield a different order per interpreter
# session, which would silently reshuffle the embedding rows.
unique_pos_tags = sorted({tag for sequence in train_pos_tags for tag in sequence})
pos_embedding_dim = 50
# Fixed seed so the random POS vectors (uniform in [0, 1)) are reproducible.
POS_EMBEDDING_SEED = 42
pos_embeddings = np.random.default_rng(POS_EMBEDDING_SEED).random((len(unique_pos_tags), pos_embedding_dim))
pos_tag_to_index = {tag: idx for idx, tag in enumerate(unique_pos_tags)}
print("POS embeddings prepared.")

# Sequence length every example is padded/truncated to, number of classes
# (four tags plus PAD), and the width of the Word2Vec vectors.
MAX_SEQ_LENGTH = 323
NUM_LABELS = 5
EMBEDDING_DIM = 300

# PAD is id 0 so that zero-padded label sequences decode to PAD.
label_encoding = {"B-O": 1, "B-AC": 2, "B-LF": 3, "I-LF": 4, "PAD": 0}

def encode_labels(labels):
    """Map every tag in every sequence to its integer id via `label_encoding`.

    Args:
        labels: iterable of tag sequences.

    Returns:
        A list of lists of ids, one inner list per input sequence.
    """
    print("Encoding labels...")
    encoded = []
    for sequence in labels:
        encoded.append([label_encoding[tag] for tag in sequence])
    print("Labels encoded.")
    return encoded

def embed_tokens(tokens, pos_tags, pos_embeddings, pos_tag_to_index, pos_embedding_dim=50):
    """Build one [word vector | POS vector] row per token, padded to MAX_SEQ_LENGTH.

    Args:
        tokens: iterable of token sequences.
        pos_tags: iterable of POS-tag sequences, paired with `tokens` by position.
        pos_embeddings: table indexed by the values of `pos_tag_to_index`.
        pos_tag_to_index: mapping from POS tag to a row of `pos_embeddings`.
        pos_embedding_dim: width used for the zero rows that pad short sequences.

    Returns:
        np.array built from one array per sequence. Tokens missing from
        `word2vec_model` get a zero word vector; sequences are padded with
        all-zero rows and cut to MAX_SEQ_LENGTH rows.
    """
    print("Embedding tokens with POS...")

    def _row(token, pos_tag):
        # Word part: pretrained vector when known, zeros otherwise.
        word_part = word2vec_model[token] if token in word2vec_model else np.zeros(EMBEDDING_DIM)
        pos_part = get_pos_embedding(pos_tag, pos_embeddings, pos_tag_to_index, pos_embedding_dim)
        return np.concatenate((word_part, pos_part))

    batch = []
    for sentence, sentence_tags in zip(tokens, pos_tags):
        rows = [_row(token, pos_tag) for token, pos_tag in zip(sentence, sentence_tags)]

        shortfall = MAX_SEQ_LENGTH - len(rows)
        if shortfall > 0:
            rows.extend(np.zeros(EMBEDDING_DIM + pos_embedding_dim,) for _ in range(shortfall))

        batch.append(np.array(rows[:MAX_SEQ_LENGTH]))

    print("Tokens with POS embedded.")
    return np.array(batch)

def get_pos_embedding(pos_tag, pos_embeddings, pos_tag_to_index, pos_embedding_dim=50):
    """Return the embedding row for `pos_tag`, or zeros when the tag is unknown.

    Args:
        pos_tag: tag to look up.
        pos_embeddings: table indexed by the values of `pos_tag_to_index`.
        pos_tag_to_index: mapping from tag to row index.
        pos_embedding_dim: length of the zero vector returned for unknown tags.
    """
    if pos_tag in pos_tag_to_index:
        return pos_embeddings[pos_tag_to_index[pos_tag]]
    return np.zeros(pos_embedding_dim)

def pad_sequences_custom(data, maxlen, dtype='float32'):
    """Pad or truncate every sequence in `data` at the end to `maxlen`, filling with 0.0.

    Args:
        data: sequences handed to Keras' `pad_sequences`.
        maxlen: target length.
        dtype: dtype requested for the result.

    Returns:
        Whatever `pad_sequences` returns for these options.
    """
    print(f"Padding sequences to a maximum length of {maxlen}...")
    options = dict(maxlen=maxlen, dtype=dtype, padding='post', truncating='post', value=0.0)
    result = tf.keras.preprocessing.sequence.pad_sequences(data, **options)
    print("Sequences padded.")
    return result

def prepare_data(data):
    """Turn one dataset split into model inputs and targets.

    Args:
        data: split exposing 'tokens', 'pos_tags' and 'ner_tags' columns.

    Returns:
        (embedded_tokens, one_hot_labels, padded_labels): the word+POS feature
        array from `embed_tokens`, the one-hot targets over NUM_LABELS classes,
        and the integer label ids padded to MAX_SEQ_LENGTH.
    """
    print("Preparing data...")
    tokens = data['tokens']
    pos_tags = data['pos_tags']
    labels = encode_labels(data['ner_tags'])
    # Use the configured POS width rather than a literal 50, so the zero padding
    # rows always match the width of the rows in `pos_embeddings`.
    embedded_tokens = embed_tokens(tokens, pos_tags, pos_embeddings, pos_tag_to_index, pos_embedding_dim=pos_embedding_dim)
    print("Token Embeddings: ", embedded_tokens.shape)
    padded_labels = pad_sequences_custom(labels, MAX_SEQ_LENGTH, dtype='int32')
    print("Data prepared.")
    return embedded_tokens, to_categorical(padded_labels, num_classes=NUM_LABELS), padded_labels


# Build features and targets for each split. Alongside each, derive a per-position
# weight array that is 0 where the padded label id equals PAD and 1 elsewhere.
print("Preparing training data...")
X_train, y_train, train_padded_labels = prepare_data(train_data)
sample_weight = np.where(train_padded_labels == label_encoding["PAD"], 0, 1)
print("Training data ready.")

print("Preparing validation data...")
X_val, y_val, val_padded_labels = prepare_data(validation_data)
val_sample_weight = np.where(val_padded_labels == label_encoding["PAD"], 0, 1)
print("Validation data ready.")

print("Preparing test data...")
X_test, y_test, test_padded_labels = prepare_data(test_data)
test_sample_weight = np.where(test_padded_labels == label_encoding["PAD"], 0, 1)
print("Test data ready.")

def create_bilstm_model(num_labels, lstm_units, vocab_size, embedding_size, w2v_weights, max_sequence_length, pos_embedding_dim=50):
    """Build an (uncompiled) BiLSTM tagger over pre-embedded word+POS features.

    Args:
        num_labels: number of output classes per position.
        lstm_units: hidden units in each LSTM direction.
        vocab_size: unused; kept so existing calls keep working.
        embedding_size: width of the word-vector part of each input row.
        w2v_weights: unused; kept so existing calls keep working.
        max_sequence_length: number of positions per input sequence.
        pos_embedding_dim: width of the POS part of each input row. The default
            of 50 reproduces the previously hard-coded value.

    Returns:
        A Sequential model: Masking -> Bidirectional LSTM -> per-position softmax.
    """
    feature_dim = embedding_size + pos_embedding_dim
    model = Sequential()
    # Positions whose features are all 0.0 are treated as masked by the Masking layer.
    model.add(Masking(mask_value=0.0, input_shape=(max_sequence_length, feature_dim)))
    model.add(Bidirectional(LSTM(lstm_units, return_sequences=True)))
    model.add(TimeDistributed(Dense(num_labels, activation='softmax')))
    return model

# Model hyper-parameters for this experiment.
NUM_LABELS_PAD = 5
LSTM_UNITS = 64
COMBINED_MAX_SEQ_LENGTH = 323
w2v_weights = word2vec_model.vectors

vocab_size, embedding_size = w2v_weights.shape
print("Word2Vec weights shape:", w2v_weights.shape)
print("Creating LSTM model...")
lstm_model = create_bilstm_model(NUM_LABELS_PAD, LSTM_UNITS, vocab_size, EMBEDDING_DIM, w2v_weights, COMBINED_MAX_SEQ_LENGTH)
print("LSTM model created.")

print("Starting training...")
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])  # Compile the model
print("Before i am pased in: ", X_train.shape)
print("Starting training...")
# Weight validation positions the same way as training positions, so the
# validation loss and accuracy are computed over the same kind of tokens
# (PAD positions weighted 0) as the training figures they are compared with.
history = lstm_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val, val_sample_weight),
    epochs=30,
    batch_size=32,
    sample_weight=sample_weight
)
print("Training completed.")

def evaluate_model(model, X, y):
    """Score `model` on (X, y) with PAD positions removed, then report and plot.

    Args:
        model: object exposing `predict(X)` that returns per-position class scores.
        X: model inputs.
        y: one-hot encoded gold labels; the class index is taken with argmax on the last axis.

    Prints the token accuracy and a classification report, then draws the
    confusion matrix, saves it to 'lstmgraphs/confusion_matrix.png' and shows it.
    """
    print("Evaluating model...")
    predicted_ids = np.argmax(model.predict(X), axis=-1)
    gold_ids = np.argmax(y, axis=-1)

    # Only positions whose gold label is a real tag take part in the scores.
    keep = gold_ids != label_encoding["PAD"]
    predicted_ids, gold_ids = predicted_ids[keep], gold_ids[keep]

    accuracy = np.mean(predicted_ids == gold_ids)
    print(f"Accuracy: {accuracy * 100:.2f}%")

    tag_names = [name for name in label_encoding if name != "PAD"]
    tag_ids = [label_encoding[name] for name in tag_names]

    print("Classification Report:")
    report = classification_report(
        gold_ids,
        predicted_ids,
        labels=tag_ids,
        target_names=tag_names,
        zero_division=1,
    )
    print(report)

    matrix = confusion_matrix(gold_ids, predicted_ids, labels=tag_ids)
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='seismic',
        xticklabels=tag_names,
        yticklabels=tag_names,
    )
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.savefig('lstmgraphs/confusion_matrix.png')
    plt.show()

# Final held-out evaluation of the POS-augmented model on the test split.
print("Evaluating on test data...")
evaluate_model(lstm_model, X_test, y_test)
print("Evaluation completed.")

### 3.2 Experiment: Additional Data

In [None]:
import numpy as np
import tensorflow as tf
from datasets import load_dataset
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, TimeDistributed, Masking, InputLayer, Embedding, Bidirectional
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
import gensim.downloader as api
import os

# Output directory for the figures saved by evaluate_model.
os.makedirs('lstmgraphs', exist_ok=True)

print("Loading Word2Vec model...")
word2vec_model = api.load('word2vec-google-news-300')
print("Word2Vec model loaded.")

print("Loading dataset...")
dataset = load_dataset("surrey-nlp/PLOD-CW")
train_data = dataset["train"]
validation_data = dataset["validation"]
test_data = dataset["test"]
print("Dataset loaded.")

# Second corpus used to enlarge each split (see prepare_data).
print("Loading filtered dataset...")
filtereddataset = load_dataset("surrey-nlp/PLOD-filtered")
filtered_train_data = filtereddataset["train"]
filtered_valid_data = filtereddataset["validation"]
filtered_test_data = filtereddataset["test"]
print("Dataset loaded.")

# Sequence length every example is padded/truncated to, number of classes
# (four tags plus PAD), and the width of the Word2Vec vectors.
MAX_SEQ_LENGTH = 323
NUM_LABELS = 5
EMBEDDING_DIM = 300

# String tags of the main dataset -> class ids; PAD is 0 to match zero padding.
label_encoding = {"B-O": 1, "B-AC": 2, "B-LF": 3, "I-LF": 4, "PAD": 0}
# Integer tags of the filtered dataset -> the same class ids.
# NOTE(review): there is no entry for id 2, so encode_filtered_labels raises
# KeyError if that id ever appears — confirm it cannot occur in the slice used.
filtered_label_encoding = {0:1, 1:2, 3:3, 4:4, "PAD":0}

def encode_labels(labels):
    """Map every tag in every sequence to its integer id via `label_encoding`.

    Args:
        labels: iterable of tag sequences.

    Returns:
        A list of lists of ids, one inner list per input sequence.
    """
    print("Encoding labels...")
    encoded = []
    for sequence in labels:
        encoded.append([label_encoding[tag] for tag in sequence])
    print("Labels encoded.")
    return encoded

def encode_filtered_labels(labels):
    """Map every tag in every sequence to its class id via `filtered_label_encoding`.

    Args:
        labels: iterable of tag sequences from the filtered dataset.

    Returns:
        A list of lists of ids, one inner list per input sequence.
    """
    print("Encoding labels...")
    encoded = []
    for sequence in labels:
        encoded.append([filtered_label_encoding[tag] for tag in sequence])
    print("Labels encoded.")
    return encoded

def embed_tokens(tokens):
    """Look up a Word2Vec vector for every token and pad each sequence to MAX_SEQ_LENGTH.

    Args:
        tokens: iterable of token sequences.

    Returns:
        np.array built from one array per sequence. Tokens missing from
        `word2vec_model` get an all-zero vector, the same value used for the
        rows that pad short sequences; long sequences are cut to
        MAX_SEQ_LENGTH rows.
    """
    print("Embedding tokens...")
    batch = []
    for sentence in tokens:
        vectors = []
        for word in sentence:
            if word in word2vec_model:
                vectors.append(word2vec_model[word])
            else:
                vectors.append(np.zeros(EMBEDDING_DIM))

        shortfall = MAX_SEQ_LENGTH - len(vectors)
        if shortfall > 0:
            vectors.extend(np.zeros(EMBEDDING_DIM) for _ in range(shortfall))

        batch.append(np.array(vectors[:MAX_SEQ_LENGTH]))
    print("Tokens embedded.")
    return np.array(batch)

def pad_sequences_custom(data, maxlen, dtype='float32'):
    """Pad or truncate every sequence in `data` at the end to `maxlen`, filling with 0.0.

    Args:
        data: sequences handed to Keras' `pad_sequences`.
        maxlen: target length.
        dtype: dtype requested for the result.

    Returns:
        Whatever `pad_sequences` returns for these options.
    """
    print(f"Padding sequences to a maximum length of {maxlen}...")
    options = dict(maxlen=maxlen, dtype=dtype, padding='post', truncating='post', value=0.0)
    result = tf.keras.preprocessing.sequence.pad_sequences(data, **options)
    print("Sequences padded.")
    return result

def prepare_data(data, filtered_data):
    """Build inputs and targets from `data` plus the first 10% of `filtered_data`.

    Args:
        data: split exposing 'tokens' and string 'ner_tags' columns.
        filtered_data: split exposing 'tokens' and 'ner_tags' columns whose tags
            are keys of `filtered_label_encoding`.

    Returns:
        (features, one_hot_labels, label_ids): embeddings of both sources stacked
        along axis 0, the one-hot targets over NUM_LABELS classes, and the
        integer label ids padded to MAX_SEQ_LENGTH.
    """
    print("Preparing data...")
    base_tokens = data['tokens']
    base_labels = encode_labels(data['ner_tags'])
    base_features = embed_tokens(base_tokens)
    print("Token Embeddings: ", base_features.shape)
    base_label_ids = pad_sequences_custom(base_labels, MAX_SEQ_LENGTH, dtype='int32')
    print("Data prepared.")

    print("Preparing filtered data...")
    # Only the leading tenth of the extra corpus is used.
    extra_count = int(len(filtered_data) * 0.1)
    extra_tokens = filtered_data['tokens'][:extra_count]
    extra_labels = encode_filtered_labels(filtered_data['ner_tags'][:extra_count])
    extra_features = embed_tokens(extra_tokens)
    print("Filtered Token Embeddings: ", extra_features.shape)
    extra_label_ids = pad_sequences_custom(extra_labels, MAX_SEQ_LENGTH, dtype='int32')

    features = np.concatenate([base_features, extra_features], axis=0)
    label_ids = np.concatenate([base_label_ids, extra_label_ids], axis=0)
    one_hot_labels = to_categorical(label_ids, num_classes=NUM_LABELS)

    return features, one_hot_labels, label_ids


# Build features and targets for each split, each enlarged with the matching split
# of the filtered dataset. Alongside each, derive a per-position weight array that
# is 0 where the padded label id equals PAD and 1 elsewhere.
print("Preparing training data...")
X_train, y_train, train_padded_labels = prepare_data(train_data, filtered_train_data)
sample_weight = np.where(train_padded_labels == label_encoding["PAD"], 0, 1)
print("Training data ready.")

print("Preparing validation data...")
X_val, y_val, val_padded_labels = prepare_data(validation_data, filtered_valid_data)
val_sample_weight = np.where(val_padded_labels == label_encoding["PAD"], 0, 1)
print("Validation data ready.")

# NOTE(review): the test split is also enlarged with filtered data, so test scores
# from this cell are not measured on the same examples as the other experiments.
print("Preparing test data...")
X_test, y_test, test_padded_labels = prepare_data(test_data, filtered_test_data)
test_sample_weight = np.where(test_padded_labels == label_encoding["PAD"], 0, 1)
print("Test data ready.")

def create_bilstm_model(num_labels, lstm_units, vocab_size, embedding_size, w2v_weights, max_sequence_length):
    """Build an (uncompiled) BiLSTM tagger over pre-embedded token vectors.

    Args:
        num_labels: number of output classes per position.
        lstm_units: hidden units in each LSTM direction.
        vocab_size: not used by this function.
        embedding_size: width of each input row.
        w2v_weights: not used by this function.
        max_sequence_length: number of positions per input sequence.

    Returns:
        A Sequential model: Masking -> Bidirectional LSTM -> per-position softmax.
    """
    input_shape = (max_sequence_length, embedding_size)
    layers = [
        Masking(mask_value=0.0, input_shape=input_shape),
        Bidirectional(LSTM(lstm_units, return_sequences=True)),
        TimeDistributed(Dense(num_labels, activation='softmax')),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)
    return model

# Model hyper-parameters for this experiment.
NUM_LABELS_PAD = 5
LSTM_UNITS = 64
COMBINED_MAX_SEQ_LENGTH = 323
w2v_weights = word2vec_model.vectors

vocab_size, embedding_size = w2v_weights.shape
print("Word2Vec weights shape:", w2v_weights.shape)
print("Creating LSTM model...")
lstm_model = create_bilstm_model(NUM_LABELS_PAD, LSTM_UNITS, vocab_size, EMBEDDING_DIM, w2v_weights, COMBINED_MAX_SEQ_LENGTH)
print("LSTM model created.")

print("Starting training...")
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])  # Compile the model
print("Before i am pased in: ", X_train.shape)
print("Starting training...")
# Weight validation positions the same way as training positions, so the
# validation loss and accuracy are computed over the same kind of tokens
# (PAD positions weighted 0) as the training figures they are compared with.
history = lstm_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val, val_sample_weight),
    epochs=30,
    batch_size=32,
    sample_weight=sample_weight
)
print("Training completed.")

def evaluate_model(model, X, y):
    """Score `model` on (X, y) with PAD positions removed, then report and plot.

    Args:
        model: object exposing `predict(X)` that returns per-position class scores.
        X: model inputs.
        y: one-hot encoded gold labels; the class index is taken with argmax on the last axis.

    Prints the token accuracy and a classification report, then draws the
    confusion matrix, saves it to 'lstmgraphs/confusion_matrix.png' and shows it.
    """
    print("Evaluating model...")
    predicted_ids = np.argmax(model.predict(X), axis=-1)
    gold_ids = np.argmax(y, axis=-1)

    # Only positions whose gold label is a real tag take part in the scores.
    keep = gold_ids != label_encoding["PAD"]
    predicted_ids, gold_ids = predicted_ids[keep], gold_ids[keep]

    accuracy = np.mean(predicted_ids == gold_ids)
    print(f"Accuracy: {accuracy * 100:.2f}%")

    tag_names = [name for name in label_encoding if name != "PAD"]
    tag_ids = [label_encoding[name] for name in tag_names]

    print("Classification Report:")
    report = classification_report(
        gold_ids,
        predicted_ids,
        labels=tag_ids,
        target_names=tag_names,
        zero_division=1,
    )
    print(report)

    matrix = confusion_matrix(gold_ids, predicted_ids, labels=tag_ids)
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='seismic',
        xticklabels=tag_names,
        yticklabels=tag_names,
    )
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.savefig('lstmgraphs/confusion_matrix.png')
    plt.show()

# Final evaluation of the model trained with additional data on the combined test arrays.
print("Evaluating on test data...")
evaluate_model(lstm_model, X_test, y_test)
print("Evaluation completed.")

# Experiment 4: Fine-tuning Pre-trained LLMs

In [None]:
%pip install datasets
%pip install transformers
%pip install spacy
%pip install torch
%pip install spacy-transformers
%pip install transformers[torch]
%pip install seqeval

In [None]:
# Turn off tokenizer-level parallelism via the environment variable read by the tokenizers library.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Directory that receives the per-model training curves.
if not os.path.exists('llmgraphs'):
    os.makedirs('llmgraphs')

# NOTE(review): `dataset`, `train_data`, `validation_data` and `test_data` are not
# defined in this cell; it uses whatever an earlier cell left in the kernel.
print('Dataset loaded:', dataset)
print('Size of validation set:', len(validation_data))
print('Size of test set:', len(test_data))

print('First example from the train split:', train_data[0])
print('Column names:', train_data.column_names)

# Sequence-labelling metric used by compute_metrics.
metric = load_metric("seqeval")

# Tag <-> id mapping for the transformer models; `label_list[i]` is the tag whose id is i.
# Unlike the LSTM experiments there is no PAD class here.
label_encoding = {"B-O": 0, "B-AC": 1, "B-LF": 2, "I-LF": 3}
label_list = ["B-O", "B-AC", "B-LF", "I-LF"]

# Checkpoints to fine-tune and the dict that collects each one's final evaluation.
models_to_test = ["bert-base-uncased", "roberta-base", "albert-base-v2"]
results_summary = {}

def compute_metrics(p):
    """Convert raw predictions to tag strings and score them with `metric`.

    Args:
        p: pair (predictions, labels); predictions are reduced with argmax over
            axis 2, and positions whose label is -100 are dropped.

    Returns:
        Dict with 'precision', 'recall', 'f1' and 'accuracy', each taken from the
        matching 'overall_*' entry of the metric result (0 when absent).
    """
    scores, gold = p
    predicted = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted, gold):
        # Keep only the positions that carry a real label.
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(predicted_row, gold_row) if gold_id != -100]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    wanted = (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    )
    return {short_name: results.get(result_key, 0) for short_name, result_key in wanted}

def tokenize_and_align_labels(examples, tokenizer, label_encoding):
    """Tokenize pre-split words and give each sub-token position a label id or -100.

    Args:
        examples: batch with 'tokens' (word lists) and 'ner_tags' (tag lists).
        tokenizer: callable returning an object that offers
            `word_ids(batch_index=...)` and item assignment.
        label_encoding: mapping from tag to integer id.

    Returns:
        The tokenizer output with a 'labels' entry added. A position gets the id
        of its word's tag only when it is the first position of that word; special
        positions (word id None) and repeated word ids get -100.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, padding=True, is_split_into_words=True)

    def _align(batch_index, word_tags):
        word_ids = list(encoded.word_ids(batch_index=batch_index))
        # Pair each position with the word id of the position before it.
        preceding = [None] + word_ids[:-1]
        aligned = []
        for before, current in zip(preceding, word_ids):
            if current is not None and current != before:
                aligned.append(label_encoding[word_tags[current]])
            else:
                aligned.append(-100)
        return aligned

    encoded["labels"] = [_align(i, word_tags) for i, word_tags in enumerate(examples["ner_tags"])]
    return encoded

# Re-assigns models_to_test and results_summary (an earlier statement already set
# them to the same values) and creates, for every model, empty lists that
# LoggingCallback appends logged metrics to.
models_to_test = ["bert-base-uncased", "roberta-base", "albert-base-v2"]
results_summary = {}
metrics_history = {model: {'train_loss': [], 'eval_loss': [], 'eval_accuracy': [], 'train_accuracy': []} for model in models_to_test}

class LoggingCallback(TrainerCallback):
    """Trainer callback that records logged metrics for one model in `metrics_history`.

    Args:
        model_name: key of `metrics_history` whose lists receive the values.
    """

    # Keys of the log dict -> name of the history list each one is appended to.
    _TRACKED = (
        ('loss', 'train_loss'),
        ('eval_loss', 'eval_loss'),
        ('eval_accuracy', 'eval_accuracy'),
        ('train_accuracy', 'train_accuracy'),
    )

    def __init__(self, model_name):
        # Must be the double-underscore constructor: with any other name Python
        # never calls it, so LoggingCallback(model_name) would raise TypeError
        # and self.model_name would not exist.
        super().__init__()
        self.model_name = model_name

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Append every tracked metric present in `logs` to this model's history."""
        if not logs:
            # `logs` defaults to None; there is nothing to record in that case.
            return
        history = metrics_history[self.model_name]
        for log_key, history_key in self._TRACKED:
            if log_key in logs:
                # Read the same key that was tested for, so a present
                # 'train_accuracy' can never trigger a KeyError on 'accuracy'.
                history[history_key].append(logs[log_key])


# Rebuild the per-model histories (same structure as the earlier assignment) so the loop below starts from empty lists.
metrics_history = {model: {'train_loss': [], 'eval_loss': [], 'eval_accuracy': [],'train_accuracy': []} for model in models_to_test}

# Fine-tune each checkpoint in models_to_test on the token-classification task,
# store its final validation metrics in results_summary and save its curves.
for model_name in models_to_test:
    print(f"Processing {model_name}")
    # add_prefix_space is passed as True only for model names containing "roberta".
    tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True if "roberta" in model_name else False)
    # Tokenize both splits and attach the aligned label ids.
    tokenized_train_dataset = train_data.map(lambda x: tokenize_and_align_labels(x, tokenizer, label_encoding), batched=True)
    tokenized_val_dataset = validation_data.map(lambda x: tokenize_and_align_labels(x, tokenizer, label_encoding), batched=True)

    # One output class per entry of label_encoding.
    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_encoding))
    # Evaluate, checkpoint and log once per epoch; the checkpoint with the best
    # 'f1' is reloaded when training ends.
    training_args = TrainingArguments(
        output_dir=f"./{model_name}-finetuned-NER",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        learning_rate=3e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=20,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_val_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=DataCollatorForTokenClassification(tokenizer),
        callbacks=[LoggingCallback(model_name)]
    )

    trainer.train()
    # NOTE(review): if Trainer.evaluate emits a log event, the callback records one
    # more eval point here, before the curves below are drawn — confirm.
    results_summary[model_name] = trainer.evaluate(tokenized_val_dataset)

    # Left panel: loss curves.
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.plot(metrics_history[model_name]['train_loss'], label='Train Loss')
    plt.plot(metrics_history[model_name]['eval_loss'], label='Validation Loss')
    plt.title(f'{model_name} Training vs Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    # Right panel: accuracy curves.
    # NOTE(review): nothing in this loop produces a 'train_accuracy' log entry, so
    # the training-accuracy series may be empty — confirm.
    plt.subplot(1, 2, 2)
    plt.plot(metrics_history[model_name]['train_accuracy'], label='Training Accuracy')
    plt.plot(metrics_history[model_name]['eval_accuracy'], label='Validation Accuracy')
    plt.title(f'{model_name} Validation Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    # Save the figure to disk and close it instead of displaying it.
    plt.savefig(f'llmgraphs/{model_name}_training_validation_metrics.png')
    plt.close()

    # Drop references to this model's objects and release cached GPU memory before the next one.
    del model, tokenizer, trainer
    torch.cuda.empty_cache()

# Print every final validation metric of every model, formatted to four decimals.
# The format spec assumes each stored value is numeric.
for model_name, results in results_summary.items():
    print(f"\nFinal Results for {model_name}:")
    for key, value in results.items():
        print(f"{key}: {value:.4f}")