## LSTM

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from collections import defaultdict

def prepare_sequence(seq, to_ix, max_len=None):
    idxs = [to_ix.get(w, to_ix['<UNK>']) for w in seq]  # Use `<UNK>` token if word is not found
    if max_len:
        idxs += [to_ix['<PAD>']] * (max_len - len(idxs))  # Padding
    return torch.tensor(idxs, dtype=torch.long)

def read_data(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    len_sentence_max = 0
    training_data = []

    sentences = []
    tags = []
    temp_sentence = []
    temp_tags = []
    
    for line in lines:
        if line.strip() == "":  # New sentence begins after an empty line
            if temp_sentence and temp_tags:
                sentences.append(temp_sentence)
                tags.append(temp_tags)
                training_data.append((temp_sentence, temp_tags))
                if len(temp_sentence) > len_sentence_max:
                    len_sentence_max = len(temp_sentence)
                temp_sentence = []  # Reset temp lists for next sentence
                temp_tags = []
        else:
            word, tag = line.strip().split()
            temp_sentence.append(word)
            temp_tags.append(tag)
    # print(len_sentence_max, file_path)
    # Catch any remaining sentence not followed by empty line
    if temp_sentence and temp_tags:
        sentences.append(temp_sentence)
        tags.append(temp_tags)
        training_data.append((temp_sentence, temp_tags))
    
    return sentences, tags, training_data

def create_batches(training_data, word_to_ix, batch_size=32):
    # Sort your data by length (descending) to use pack_padded_sequence effectively
    sorted_data = sorted(training_data, key=lambda x: len(x[0]), reverse=True)
    batched_data = []
    
    for i in range(0, len(sorted_data), batch_size):
        # Get a batch of sentences
        batch = sorted_data[i:i + batch_size]
        
        # Compute max length of a sentence in this batch
        max_len = max(len(x[0]) for x in batch)
        
        # Prepare padded sequences and corresponding tags
        padded_sentences = []
        padded_tags = []
        for sentence, tags in batch:
            padded_sentences.append(prepare_sequence(sentence, word_to_ix, max_len=max_len))
            padded_tags.append(prepare_sequence(tags, tag_to_ix, max_len=max_len))
        
        # Pad the sequences using pad_sequence function
        batch_sentences = pad_sequence(padded_sentences, batch_first=True, padding_value=word_to_ix['<PAD>'])
        batch_tags = pad_sequence(padded_tags, batch_first=True, padding_value=tag_to_ix['O'])  # Assuming 'O' is the PAD tag
        
        batched_data.append((batch_sentences, batch_tags))
        # print(batched_data)
    return batched_data

# Example usage:
train_file_path = './conll2003/trainprocessed.txt'  # Replace with the path to your .txt file
valid_file_path = './conll2003/validprocessed.txt'  # Replace with the path to your .txt file
test_file_path = './conll2003/testprocessed.txt'  # Replace with the path to your .txt file

train_sentences, train_tags, training_data= read_data(train_file_path)
valid_sentences, valid_tags, valid_data= read_data(valid_file_path)
test_sentences, test_tags, test_data= read_data(test_file_path)


# print(training_data)

# Build the word and tag dictionaries based on the all data

tag_to_ix = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8, "<UNK>": 9, "<PAD>": 10}
ix_to_tag = {index: tag for tag, index in tag_to_ix.items()} # inverse of tag-index dict
# tag_to_ix['<PAD>'] = tag_to_ix['O']    # Assume 'O' is the PAD tag
# tag_to_ix['<UNK>'] = tag_to_ix['O']    # Assume 'O' is the PAD tag

# Initialize word frequency dictionary
word_freq = defaultdict(int)

# Count word frequencies
for sentence in train_sentences + valid_sentences + test_sentences:
    for word in sentence:
        word_freq[word] += 1

word_to_ix = {'<UNK>': 0, '<PAD>': 1}

# for sentence in train_sentences:
#     for word in sentence:
#         if word not in word_to_ix:
#             word_to_ix[word] = len(word_to_ix)
# for sentence in valid_sentences:
#     for word in sentence:
#         if word not in word_to_ix:
#             word_to_ix[word] = len(word_to_ix)
# for sentence in test_sentences:
#     for word in sentence:
#         if word not in word_to_ix:
#             word_to_ix[word] = len(word_to_ix)
for sentence in train_sentences + valid_sentences + test_sentences:
    for word in sentence:
        if word_freq[word] >= 3 and word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

# Assuming you've added '<PAD>' to word_to_ix and 'O' to tag_to_ix # Add a PAD token


batch_size = 32  # Define the batch size
train_batches = create_batches(training_data, word_to_ix, batch_size=batch_size)
valid_batches = create_batches(valid_data, word_to_ix, batch_size=batch_size)
test_batches = create_batches(test_data, word_to_ix, batch_size=batch_size)
print(test_batches)

  from .autonotebook import tqdm as notebook_tqdm


[(tensor([[10332,   533,  1698,  ...,  1895,   117,     9],
        [ 1417,   533,  1698,  ...,     1,     1,     1],
        [ 1421,  2195,   328,  ...,     1,     1,     1],
        ...,
        [ 1686,   264,   880,  ...,     1,     1,     1],
        [   53,    14,  4160,  ...,     1,     1,     1],
        [   53,  1328,  8623,  ...,     1,     1,     1]]), tensor([[ 3,  0,  0,  ...,  5,  0,  0],
        [ 5,  0,  0,  ..., 10, 10, 10],
        [ 5,  6,  0,  ..., 10, 10, 10],
        ...,
        [ 0,  0,  0,  ..., 10, 10, 10],
        [ 0,  0,  0,  ..., 10, 10, 10],
        [ 0,  0,  0,  ..., 10, 10, 10]])), (tensor([[  238,   262,   379,  ...,   313,  6932,     9],
        [10471,    66,     0,  ...,    39,   689,     9],
        [10535,    17,    39,  ...,  1910,   363,     9],
        ...,
        [ 1373,   417,   343,  ...,     1,     1,     1],
        [ 2207, 10415,    17,  ...,     1,     1,     1],
        [    0,     0, 10447,  ...,     1,     1,     1]]), tensor([[ 0,  0

In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from collections import defaultdict

class WordTagDataset(Dataset):
    def __init__(self, file_path, word_dict, tag_dict):
        self.sentences = []
        self.tag_sequences = []
        self.word_dict = word_dict
        self.tag_dict = tag_dict
        self.pad_index = self.word_dict['<PAD>']
        self.load_data(file_path)

    def load_data(self, file_path):
        current_sentence = []
        current_tags = []
        with open(file_path, 'r') as file:
            for line in file:
                line = line.strip()
                if line == '':
                    if current_sentence and current_tags:
                        self.sentences.append(current_sentence)
                        self.tag_sequences.append(current_tags)
                        current_sentence = []
                        current_tags = []
                else:
                    word, tag = line.split()
                    current_sentence.append(self.word_dict[word])
                    current_tags.append(self.tag_dict[tag])
            if current_sentence and current_tags:
                self.sentences.append(current_sentence)
                self.tag_sequences.append(current_tags)

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return torch.tensor(self.sentences[idx], dtype=torch.long), torch.tensor(self.tag_sequences[idx], dtype=torch.long)

# Function to build a master dictionary from multiple files
def build_master_dict(file_paths):
    word_dict = {'<PAD>': 0}
    tag_dict = {}
    for file_path in file_paths:
        with open(file_path, 'r') as file:
            for line in file:
                line = line.strip()
                if line:
                    word, tag = line.split()
                    if word not in word_dict:
                        word_dict[word] = len(word_dict)
                    if tag not in tag_dict:
                        tag_dict[tag] = len(tag_dict)
    return word_dict, tag_dict

def collate_fn(batch):
    sentences, tags = zip(*batch)
    max_length = max(len(sentence) for sentence in sentences)
    
    # Assuming that `pad_index` and a default padding value for tags are defined
    pad_index = transformer_train_dataset.word_dict['<PAD>']
    tag_pad_val = -1  # Assuming -1 is used for padding the tags
    
    sentences_padded = torch.full((len(sentences), max_length), pad_index, dtype=torch.long)
    tags_padded = torch.full((len(tags), max_length), tag_pad_val, dtype=torch.long)

    for i, (sentence, tag) in enumerate(zip(sentences, tags)):
        sentences_padded[i, :len(sentence)] = sentence
        tags_padded[i, :len(tag)] = tag

    return sentences_padded, tags_padded

# List of file paths to build the master dictionaries
file_paths = ['./conll2003/trainprocessed.txt', './conll2003/validprocessed.txt', './conll2003/testprocessed.txt']
master_word_dict, master_tag_dict = build_master_dict(file_paths)

# Create the datasets
transformer_train_dataset = WordTagDataset('./conll2003/trainprocessed.txt', master_word_dict, master_tag_dict)
transformer_valid_dataset = WordTagDataset('./conll2003/validprocessed.txt', master_word_dict, master_tag_dict)
transformer_test_dataset = WordTagDataset('./conll2003/testprocessed.txt', master_word_dict, master_tag_dict)

# Use the DataLoader to handle batching
train_data_loader = DataLoader(transformer_train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
valid_data_loader = DataLoader(transformer_valid_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_data_loader = DataLoader(transformer_test_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# print(test_data_loader)
# for batch in test_data_loader:
#     sentence, tag = batch
#     print(len(sentence))
#     print(sentence)
#     print(tag)

# Reverse dictionaries for later use
word_dict_reverse = {v: k for k, v in master_word_dict.items()}
tag_dict_reverse = {v: k for k, v in master_tag_dict.items()}
print(word_dict_reverse)
print(len(word_dict_reverse))
print(tag_dict_reverse)
print(master_word_dict)
print(master_tag_dict)

  from .autonotebook import tqdm as notebook_tqdm


30290
{0: 'B-ORG', 1: 'O', 2: 'B-MISC', 3: 'B-PER', 4: 'I-PER', 5: 'B-LOC', 6: 'I-ORG', 7: 'I-MISC', 8: 'I-LOC'}
{'B-ORG': 0, 'O': 1, 'B-MISC': 2, 'B-PER': 3, 'I-PER': 4, 'B-LOC': 5, 'I-ORG': 6, 'I-MISC': 7, 'I-LOC': 8}


In [4]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim * 2, tagset_size)
    def forward(self, sentences):
        # sentences is expected to be a tensor of dimensions: (batch_size, sequence_length)
        # Embedding layer expects input of shape (batch_size, sequence_length)
        embeds = self.word_embeddings(sentences)
        
        # LSTM layer expects input of shape (batch_size, sequence_length, embedding_dim)
        # No need to manually reshape embeds because setting batch_first=True handles this
        lstm_out, _ = self.lstm(embeds)
        
        # Reshape the output for the Linear layer
        # We concatenate the lstm outputs for the whole sequence (lstm_out) 
        # and we no longer need to manually account for the sequence length
        tag_space = self.hidden2tag(lstm_out.reshape(-1, self.hidden_dim * 2))
        
        # Compute the softmax on the reshaped tensor
        # The shape of tag_space is (batch_size * sequence_length, tagset_size)
        tag_scores = F.log_softmax(tag_space, dim=1)

        # Reshape tag_scores to (batch_size, sequence_length, tagset_size)
        tag_scores = tag_scores.view(sentences.size(0), sentences.size(1), -1)

        return tag_scores

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

# Assuming that `LSTMTagger` is already defined and imported
# Instantiate the model
EMBEDDING_DIM = 256
HIDDEN_DIM = 512
VOCAB_SIZE = len(master_word_dict)
TAGSET_SIZE = len(master_tag_dict)

model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, TAGSET_SIZE)
loss_function = nn.CrossEntropyLoss(ignore_index=-1)  # Negative Log Likelihood Loss
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Assuming you are using a GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def validate(model, valid_data_loader, tag_dict_reverse, device):
    model.eval()  # Switch to evaluation mode
    all_true_tags = []
    all_pred_tags = []
    with torch.no_grad():  # No gradients required for validation
        for sentences, tags in valid_data_loader:
            sentences = sentences.to(device)
            tags = tags.to(device)
            
            # Forward pass
            pred_scores = model(sentences)
            pred_scores = pred_scores.view(-1, TAGSET_SIZE)
            _, predicted = torch.max(pred_scores, 1)  # Get the indices of the max log-probability
            # Convert the tensors to lists and append to the list of all tags
            true_tag_idx = tags.view(-1).cpu().numpy()
            pred_tag_idx = predicted.cpu().numpy()

            valid_indices = true_tag_idx != -1
            all_true_tags.append([tag_dict_reverse[idx] for idx in true_tag_idx[valid_indices]])
            all_pred_tags.append([tag_dict_reverse[idx] for idx in pred_tag_idx[valid_indices]])

    # Convert indices to tag strings, excluding padding

    # Calculate and print classification report
    classificationReport = classification_report(all_true_tags, all_pred_tags, zero_division=0)
    
    return classificationReport

writer = SummaryWriter(f'runs_LSTM/LSTM_tagger_experiment_EMBEDDING_DIM{EMBEDDING_DIM}')

for epoch in range(50):  # Replace num_epochs with the actual number
    print("epoch:", epoch)
    total_loss = 0
    model.train()
    for sentences, tags in train_data_loader:
        model.zero_grad()
        
        sentences = sentences.to(device)  # Shape: (batch_size, sequence_length)
        tags = tags.to(device) # Shape: (batch_size, sequence_length)
        # print(sentences.shape)
        # print(tags.shape)
        # Forward pass
        pred_scores = model(sentences)  # Shape: (batch_size, sequence_length, tagset_size)
        pred_scores = pred_scores.view(-1, len(master_tag_dict))
        # Reshape tags to match the shape of pred_scores for loss computation
        # Flatten tags to match output (batch_size * sequence_length)
        tags = tags.view(-1)  
        
        # Flatten pred_scores to match (batch_size * sequence_length, tagset_size)
        # print(pred_scores.shape)
        # print(tags.shape)
        loss = loss_function(pred_scores, tags)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Total Training Loss: {total_loss}")
    writer.add_scalar('Training Loss', total_loss, epoch)

    # Evaluation step
    report = validate(model, valid_data_loader, tag_dict_reverse, device)
    print(report)
torch.save(model, f'LSTM_models/LSTM_tagger_experiment_EMBEDDING_DIM{EMBEDDING_DIM}.pt')
writer.close()

epoch: 0
Total Training Loss: 172.7196629792452
              precision    recall  f1-score   support

         LOC       0.71      0.73      0.72      1837
        MISC       0.62      0.61      0.62       922
         ORG       0.42      0.56      0.48      1341
         PER       0.38      0.37      0.38      1842

   micro avg       0.53      0.56      0.54      5942
   macro avg       0.53      0.57      0.55      5942
weighted avg       0.53      0.56      0.54      5942

epoch: 1
Total Training Loss: 67.90918926149607
              precision    recall  f1-score   support

         LOC       0.66      0.79      0.72      1837
        MISC       0.65      0.65      0.65       922
         ORG       0.52      0.58      0.55      1341
         PER       0.43      0.51      0.47      1842

   micro avg       0.55      0.63      0.59      5942
   macro avg       0.57      0.63      0.60      5942
weighted avg       0.56      0.63      0.59      5942

epoch: 2
Total Training Loss: 46.7

In [32]:
# Batching
import sys
from seqeval.metrics import classification_report, f1_score
from seqeval.scheme import IOB2

# def evaluate_model(data, model, tag_to_ix):
#     y_true = []
#     y_pred = []
#     with torch.no_grad():
#         for sentence_batch, targets_batch in data:
#             tag_scores = model(sentence_batch)
#             tag_probs = torch.softmax(tag_scores, dim=2)
#             _, predicted_tags = torch.max(tag_probs, dim=2)
#             for i in range(len(targets_batch)):
#                 targets = targets_batch[i]
#                 predicted = predicted_tags[i]
#                 valid_indices = targets != tag_to_ix["<PAD>"]
#                 y_true.extend(targets[valid_indices].tolist())
#                 y_pred.extend(predicted[valid_indices].tolist())
#     precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted', zero_division=0)
#     return precision, recall, f1

def evaluate_model(data, model, tag_dict_reverse):
    y_true = []
    y_pred = []
    with torch.no_grad():
        for sentence_batch, targets_batch in data:
            sentence_batch = sentence_batch.t()
            targets_batch = targets_batch.t()
            # print(sentence_batch.shape)
            # print(targets_batch.shape)

            tag_scores = model(sentence_batch)
            # print(tag_scores.shape)
            # tag_probs = torch.softmax(tag_scores, dim=2)
            # _, predicted_tags = torch.max(tag_probs, dim=2)
            tag_probs = torch.argmax(tag_scores, axis=2).cpu().numpy()
            print(tag_probs)
            # print(predicted_tags.shape)
            for i in range(len(targets_batch)):
                targets = targets_batch[i]
                predicted = predicted_tags[i]
                valid_indices = targets != -1
                # print(targets.shape)
                # print(predicted.shape)
                print(targets)
                print(predicted)
                print(predicted[valid_indices].tolist())
                y_true.append([tag_dict_reverse[index] for index in targets[valid_indices].tolist()])
                y_pred.append([tag_dict_reverse[index] for index in predicted[valid_indices].tolist()])
    # precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred)
    report = classification_report(y_true, y_pred, zero_division=0)
    return f1, report


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"

DIM_LIST = [32, 64, 128]

for dim in DIM_LIST:
    EMBEDDING_DIM = dim
    HIDDEN_DIM = dim

    model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(master_word_dict), len(master_tag_dict))
    loss_function = nn.NLLLoss(ignore_index=-1)
    # optimizer = optim.SGD(model.parameters(), lr=0.1)
    optimizer = optim.Adam(model.parameters(), lr = 1e-3)

    patience = 5
    counter = 0
    best_f1 = 0.0
    writer = SummaryWriter(f'runs_LSTM_makeup/lstm_tagger_experiment_{dim}')


    for epoch in range(200):  # you'd typically want to validate the number of epochs on real data
        total_loss = 0
        for sentence_batch, targets_batch in tqdm(train_data_loader):
            model.zero_grad()

            tag_scores = model(sentence_batch)
            loss = loss_function(tag_scores.view(-1, tag_scores.shape[-1]), targets_batch.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        writer.add_scalar('Training Loss', total_loss, epoch)
        print("loss", total_loss)

        # After each epoch, you can evaluate the model's performance on the validation set
        f1, report = evaluate_model(valid_data_loader, model, tag_dict_reverse)
        print(f"Epoch {epoch}: F1 {f1:.4f}")
        print(report)

        if f1 > best_f1:
            best_f1 = f1
            counter = 0  # reset the counter if the performance improved
            
        else:
            counter += 1
            if counter >= patience:  # if performance hasn't improved for 'patience' epochs
                print(f"Early stopping triggered after epoch {epoch}")
                break
    torch.save(model, f'models/batch32_dim-{dim}_lr0.1.pth')
    writer.close()

  0%|          | 0/439 [00:00<?, ?it/s]

100%|██████████| 439/439 [00:09<00:00, 45.37it/s]


loss 380.4792939722538
[[1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 ...
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]]


NameError: name 'predicted_tags' is not defined

In [17]:
from sklearn.metrics import accuracy_score

model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Load the checkpoint
checkpoint = torch.load('./models/batch32_dim100_lr0.1.pth')

# Restore the model and optimizer states
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# If you are resuming training or need information about the last epoch and loss, you can retrieve them too
epoch = checkpoint['epoch']
loss = checkpoint['loss']
def test_accuracy(test_batches):
    with torch.no_grad():
        all_predictions = []
        all_targets = []

        for batch_sentences, batch_tags in test_batches:
            # Run the forward pass
            tag_scores = model(batch_sentences)

            # Convert the scores to predictions
            predictions = torch.argmax(tag_scores, dim=2)

            # Flatten the predictions and the true tag indices
            predictions = predictions.view(-1)
            true_tags = batch_tags.view(-1)

            # Ignoring the padding
            non_pad_elements = true_tags != tag_to_ix['<PAD>']
            predictions = predictions[non_pad_elements]
            true_tags = true_tags[non_pad_elements]

            # Collect all predictions and true tags
            all_predictions.extend(predictions.tolist())
            all_targets.extend(true_tags.tolist())

    accuracy = accuracy_score(all_targets, all_predictions)
    return accuracy

print(f"Test Accuracy: {test_accuracy(test_batches):.4f}")

Test Accuracy: 0.6704


## Transformer

In [24]:
import torch
from torch.utils.data import DataLoader, Dataset

class WordTagDataset(Dataset):
    def __init__(self, file_path, word_dict, tag_dict):
        self.sentences = []
        self.tag_sequences = []
        self.word_dict = word_dict
        self.tag_dict = tag_dict
        self.pad_index = self.word_dict['<PAD>']
        self.load_data(file_path)

    def load_data(self, file_path):
        current_sentence = []
        current_tags = []
        with open(file_path, 'r') as file:
            for line in file:
                line = line.strip()
                if line == '':
                    if current_sentence and current_tags:
                        self.sentences.append(current_sentence)
                        self.tag_sequences.append(current_tags)
                        current_sentence = []
                        current_tags = []
                else:
                    word, tag = line.split()
                    current_sentence.append(self.word_dict[word])
                    current_tags.append(self.tag_dict[tag])
            if current_sentence and current_tags:
                self.sentences.append(current_sentence)
                self.tag_sequences.append(current_tags)

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return torch.tensor(self.sentences[idx], dtype=torch.long), torch.tensor(self.tag_sequences[idx], dtype=torch.long)

# Function to build a master dictionary from multiple files
def build_master_dict(file_paths):
    word_dict = {'<PAD>': 0}
    tag_dict = {}
    for file_path in file_paths:
        with open(file_path, 'r') as file:
            for line in file:
                line = line.strip()
                if line:
                    word, tag = line.split()
                    if word not in word_dict:
                        word_dict[word] = len(word_dict)
                    if tag not in tag_dict:
                        tag_dict[tag] = len(tag_dict)
    return word_dict, tag_dict

def collate_fn(batch):
    sentences, tags = zip(*batch)
    max_length = max(len(sentence) for sentence in sentences)
    
    # Assuming that `pad_index` and a default padding value for tags are defined
    pad_index = transformer_train_dataset.word_dict['<PAD>']
    tag_pad_val = -1  # Assuming -1 is used for padding the tags
    
    sentences_padded = torch.full((len(sentences), max_length), pad_index, dtype=torch.long)
    tags_padded = torch.full((len(tags), max_length), tag_pad_val, dtype=torch.long)

    for i, (sentence, tag) in enumerate(zip(sentences, tags)):
        sentences_padded[i, :len(sentence)] = sentence
        tags_padded[i, :len(tag)] = tag

    return sentences_padded, tags_padded

# List of file paths to build the master dictionaries
file_paths = ['./conll2003/trainprocessed.txt', './conll2003/validprocessed.txt', './conll2003/testprocessed.txt']
master_word_dict, master_tag_dict = build_master_dict(file_paths)

# Create the datasets
transformer_train_dataset = WordTagDataset('./conll2003/trainprocessed.txt', master_word_dict, master_tag_dict)
transformer_valid_dataset = WordTagDataset('./conll2003/validprocessed.txt', master_word_dict, master_tag_dict)
transformer_test_dataset = WordTagDataset('./conll2003/testprocessed.txt', master_word_dict, master_tag_dict)

# Use the DataLoader to handle batching
train_data_loader = DataLoader(transformer_train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
valid_data_loader = DataLoader(transformer_valid_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_data_loader = DataLoader(transformer_test_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

print(test_data_loader)
for batch in test_data_loader:
    sentence, tag = batch
    print(len(sentence))
    print(sentence)
    print(tag)

# Reverse dictionaries for later use
word_dict_reverse = {v: k for k, v in master_word_dict.items()}
tag_dict_reverse = {v: k for k, v in master_tag_dict.items()}
print(word_dict_reverse)
print(len(word_dict_reverse))
print(tag_dict_reverse)

<torch.utils.data.dataloader.DataLoader object at 0x000001E10DAA4220>
32
tensor([[ 3631,  1225, 11856,  ...,     0,     0,     0],
        [ 1903,  2875,     0,  ...,     0,     0,     0],
        [ 1620,     9,     0,  ...,     0,     0,     0],
        ...,
        [13495,    71,  3164,  ...,     0,     0,     0],
        [ 3620,  7744, 29850,  ...,     0,     0,     0],
        [ 6801, 29744, 29745,  ...,     0,     0,     0]])
tensor([[ 1,  5,  1,  ..., -1, -1, -1],
        [ 1,  1, -1,  ..., -1, -1, -1],
        [ 1,  1, -1,  ..., -1, -1, -1],
        ...,
        [ 3,  1,  1,  ..., -1, -1, -1],
        [ 1,  3,  4,  ..., -1, -1, -1],
        [ 3,  4,  1,  ..., -1, -1, -1]])
32
tensor([[ 1225,    17,    18,  ...,     0,     0,     0],
        [  252,   316,   239,  ...,     0,     0,     0],
        [ 8338,  3121,     0,  ...,     0,     0,     0],
        ...,
        [15785,  1509, 15853,  ...,     0,     0,     0],
        [21416, 26890,     0,  ...,     0,     0,     0],
     

In [30]:
def read_data(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    len_sentence_max = 0
    training_data = []

    sentences = []
    tags = []
    temp_sentence = []
    temp_tags = []
    
    for line in lines:
        if line.strip() == "":  # New sentence begins after an empty line
            if temp_sentence and temp_tags:
                sentences.append(temp_sentence)
                tags.append(temp_tags)
                training_data.append((temp_sentence, temp_tags))
                if len(temp_sentence) > len_sentence_max:
                    len_sentence_max = len(temp_sentence)
                temp_sentence = []  # Reset temp lists for next sentence
                temp_tags = []
        else:
            word, tag = line.strip().split()
            temp_sentence.append(word)
            temp_tags.append(tag)
    # print(len_sentence_max, file_path)
    # Catch any remaining sentence not followed by empty line
    if temp_sentence and temp_tags:
        sentences.append(temp_sentence)
        tags.append(temp_tags)
        training_data.append((temp_sentence, temp_tags))
    
    return sentences, tags, training_data

    
train_file_path = './conll2003/trainprocessed.txt'  # Replace with the path to your .txt file
valid_file_path = './conll2003/validprocessed.txt'  # Replace with the path to your .txt file
test_file_path = './conll2003/testprocessed.txt'  # Replace with the path to your .txt file

train_sentences, train_tags, training_data = read_data(train_file_path)
valid_sentences, valid_tags, valid_data = read_data(valid_file_path)
test_sentences, test_tags, test_data = read_data(test_file_path)

class MyDataset(Dataset):
    def __init__(self, sentences):
        self.words = []
        self.labels = []
        for i in range(len(sentences)):
            self.words.append(sentences[i][0])
            self.labels.append(sentences[i][1])
        print(self.words[0])
        print(self.labels[0])

    def __len__(self):
        return len(self.words)

    def __getitem__(self, index):
        words = self.words[index]
        labels = self.labels[index]
        return words, labels


train_dataset = MyDataset(training_data)
valid_dataset = MyDataset(valid_data)
test_dataset = MyDataset(test_data)

batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(test_data_loader)
for batch in test_data_loader:
    sentence, tag = batch
    print(len(sentence))
    print(sentence)
    print(tag)

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
['CRICKET', '-', 'LEICESTERSHIRE', 'TAKE', 'OVER', 'AT', 'TOP', 'AFTER', 'INNINGS', 'VICTORY', '.']
['O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['SOCCER', '-', 'JAPAN', 'GET', 'LUCKY', 'WIN', ',', 'CHINA', 'IN', 'SURPRISE', 'DEFEAT', '.']
['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O']
<torch.utils.data.dataloader.DataLoader object at 0x000001E10DAA4220>
32
tensor([[ 1891,   676, 27045,  ...,     0,     0,     0],
        [  134,   756,  1430,  ...,     0,     0,     0],
        [ 1464,   157,   418,  ...,     0,     0,     0],
        ...,
        [ 1807,   390,  5281,  ...,  1873,   132,     9],
        [ 4058,  1270,  2091,  ...,     0,     0,     0],
        [20195,    17,    89,  ...,     0,     0,     0]])
tensor([[ 1,  1,  2,  ..., -1, -1, -1],
        [ 1,  1,  1,  ..., -1, -1, -1],
        [ 1,  1,  1,  ..

In [31]:
import math
import os
from tempfile import TemporaryDirectory
from typing import Tuple

import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"

class TransformerModel(nn.Module):

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, tags: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout, batch_first=True)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.embedding = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.linear = nn.Linear(d_model, tags)

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor:
        """
        Arguments:
            src: Tensor, shape ``[seq_len, batch_size]``
            src_mask: Tensor, shape ``[seq_len, seq_len]``

        Returns:
            output Tensor of shape ``[seq_len, batch_size, ntoken]``
        """
        src = self.embedding(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        if src_mask is None:
            """Generate a square causal mask for the sequence. The masked positions are filled with float('-inf').
            Unmasked positions are filled with float(0.0).
            """
            src_mask = nn.Transformer.generate_square_subsequent_mask(src.size(1)).to(device)
        output = self.transformer_encoder(src, src_mask)
        output = self.linear(output)
        return output
    
class PositionalEncoding(nn.Module):
    
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(1)].transpose(0, 1)
        return self.dropout(x)

In [34]:
import torch.optim as optim
import itertools
from sklearn.metrics import precision_recall_fscore_support
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
from seqeval.scheme import IOB2
import numpy as np
# from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

def train_epoch(model, train_data_loader, loss_fn, optimizer, device):
    model.train()  # turn on train mode
    total_loss = 0

    for words, tags in tqdm(train_data_loader):
        words = words.to(device)
        tags = tags.to(device)
        optimizer.zero_grad()

        # Forward pass
        output = model(words)
        output_flat = output.view(-1, len(tag_dict_reverse))
        loss = loss_fn(output_flat, tags.view(-1))

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(train_batches)

def validate_model(model, valid_data_loader, loss_fn, device, idx2tag=tag_dict_reverse):
    model.eval()
    total_loss = 0
    predictions, true_labels = [], []

    with torch.no_grad():
        for input_tensor, target_tensor in valid_data_loader:
            input_tensor = input_tensor.to(device)
            target_tensor = target_tensor.to(device)

            # Forward pass
            output = model(input_tensor)
            loss = loss_fn(output.view(-1, len(tag_dict_reverse)), target_tensor.view(-1))
            total_loss += loss.item()

            # Convert the model output to tags
            output_tags = output.argmax(2) # Assuming output shape is (batch_size, sequence_length, num_tags)
            # Convert indices to tags
            for i in range(output.size(0)): # Iterate over each item in the batch
                pred_tags = [idx2tag[idx] for idx in output_tags[i].cpu().numpy()]
                true_tags = [idx2tag[idx] for idx in target_tensor[i].cpu().numpy()]
                
                pred_tags_filtered = [tag for tag, true_tag in zip(pred_tags, true_tags) if true_tag != '<PAD>']
                true_tags_filtered = [true_tag for true_tag in true_tags if true_tag != '<PAD>']
                # predictions.append(pred_tags)
                # true_labels.append(true_tags)
                predictions.append(pred_tags_filtered)
                true_labels.append(true_tags_filtered)

    # Calculate seqeval metrics
    # precision = precision_score(true_labels, predictions)
    # recall = recall_score(true_labels, predictions)
    # f1 = f1_score(true_labels, predictions)
    report = classification_report(true_labels, predictions, zero_division=0)

    return total_loss / len(valid_batches), report

dropouts = [0.2, 0.5]
nheads = [8, 16]
nlayers = [6, 8]

ntokens = len(word_dict_reverse)  # size of vocabulary
emsize = 256  # embedding dimension
d_hid = 1024  # dimension of the feedforward network model in ``nn.TransformerEncoder``
criterion = nn.CrossEntropyLoss(ignore_index=-1)

# Create a list of all possible combinations
hyperparameter_combinations = list(itertools.product(
     dropouts, nheads, nlayers))

for dropout, nhead, nlayer in hyperparameter_combinations:
    # TODO: modify dmodel and dhid
    model = TransformerModel(ntoken=ntokens, d_model=256, nhead=nhead, d_hid=1024, nlayers=nlayer, tags=len(tag_dict_reverse), dropout=dropout).to(device)
    model = model.to(device)
    optimizer = optim.Adam(model.parameters())
    # lr = 5.0  # learning rate
    # optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    # writer = SummaryWriter(f'runs_Transformer/transformer_tagger_experiment_dropout-{dropout}_nheads-{nhead}_nlayer-{nlayer}')
    patience = 5
    counter = 0
    best_loss = np.inf

    for epoch in range(100):
        epoch_loss = train_epoch(model, train_data_loader, criterion, optimizer, device)
        # epoch_loss = train_epoch_eval(model, train_batches, criterion, optimizer, device, tag_to_ix, ix_to_tag)
        val_loss, report = validate_model(model, valid_data_loader, criterion, device)
        print(f'Epoch {epoch} Loss: {epoch_loss}, Validation Loss: {val_loss}')
        print(report)
        # writer.add_scalar('Loss', epoch_loss, epoch)
        
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            counter = 0  # reset the counter if the performance improved
            
        else:
            counter += 1
            if counter >= patience:  # if performance hasn't improved for 'patience' epochs
                torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
                }, f'transformer_models/dropout-{dropout}_nheads-{nhead}_nlayer-{nlayer}_final.pt')
                print(f"Early stopping triggered after epoch {epoch}")
                break
    
    # writer.close()

  0%|          | 0/439 [00:00<?, ?it/s]


RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


### 2023-12-14 morning

In [30]:
import torch.optim as optim
import itertools
from sklearn.metrics import precision_recall_fscore_support
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
from seqeval.scheme import IOB2


ntoken = len(tag_to_ix) # words dictionary

def train_epoch(model, train_batches, loss_fn, optimizer, device):
    model.train()  # turn on train mode
    total_loss = 0

    for words, tags in tqdm(train_batches):
        words = words.to(device)
        tags = tags.to(device)
        optimizer.zero_grad()

        # Forward pass
        output = model(words)
        output_flat = output.view(-1, ntoken)
        loss = loss_fn(output_flat, tags.view(-1))

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(train_batches)

# def validate_model(model, valid_batches, loss_fn, device):
#     model.eval()
#     total_loss = 0
#     predictions, true_labels = [], []
#     with torch.no_grad():
#         for input_tensor, target_tensor in valid_batches:
#             input_tensor = input_tensor.to(device)
#             target_tensor = target_tensor.to(device)

#             # Forward pass
#             output = model(input_tensor)
#             loss = loss_fn(output.view(-1, ntoken), target_tensor.view(-1))
#             total_loss += loss.item()
#     return total_loss / len(valid_batches)

def validate_model(model, valid_batches, loss_fn, device, idx2tag=ix_to_tag):
    model.eval()
    total_loss = 0
    predictions, true_labels = [], []

    with torch.no_grad():
        for input_tensor, target_tensor in valid_batches:
            input_tensor = input_tensor.to(device)
            target_tensor = target_tensor.to(device)

            # Forward pass
            output = model(input_tensor)
            loss = loss_fn(output.view(-1, ntoken), target_tensor.view(-1))
            total_loss += loss.item()

            # Convert the model output to tags
            output_tags = output.argmax(2) # Assuming output shape is (batch_size, sequence_length, num_tags)
            # Convert indices to tags
            for i in range(output.size(0)): # Iterate over each item in the batch
                pred_tags = [idx2tag[idx] for idx in output_tags[i].cpu().numpy()]
                true_tags = [idx2tag[idx] for idx in target_tensor[i].cpu().numpy()]
                
                pred_tags_filtered = [tag for tag, true_tag in zip(pred_tags, true_tags) if true_tag != '<PAD>']
                true_tags_filtered = [true_tag for true_tag in true_tags if true_tag != '<PAD>']
                # predictions.append(pred_tags)
                # true_labels.append(true_tags)
                predictions.append(pred_tags_filtered)
                true_labels.append(true_tags_filtered)

    # Calculate seqeval metrics
    # precision = precision_score(true_labels, predictions)
    # recall = recall_score(true_labels, predictions)
    # f1 = f1_score(true_labels, predictions)
    report = classification_report(true_labels, predictions, zero_division=0)

    return total_loss / len(valid_batches), report

def save_model(model, optimizer, epoch, file_path):
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch
    }, file_path)



### Restart

In [31]:

import os
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


dropouts = [0.2, 0.5]
nheads = [8, 16]
nlayers = [6, 8]

criterion = nn.CrossEntropyLoss()

# Create a list of all possible combinations
hyperparameter_combinations = list(itertools.product(
     dropouts, nheads, nlayers))

for dropout, nhead, nlayer in hyperparameter_combinations:
    # TODO: modify dmodel and dhid
    print(len(tag_to_ix))
    model = TransformerModel(ntoken=ntoken, d_model=256, nhead=nhead, d_hid=1024, nlayers=nlayer, tags=len(tag_to_ix), dropout=dropout)
    print(model)
    model = model.to(device)
    optimizer = optim.Adam(model.parameters())
    # lr = 5.0  # learning rate
    # optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    writer = SummaryWriter(f'runs_Transformer/transformer_tagger_experiment_dropout-{dropout}_nheads-{nhead}_nlayer-{nlayer}')
    patience = 5
    counter = 0
    best_loss = np.inf

    for epoch in range(100):
        epoch_loss = train_epoch(model, train_batches, criterion, optimizer, device)
        # epoch_loss = train_epoch_eval(model, train_batches, criterion, optimizer, device, tag_to_ix, ix_to_tag)
        val_loss, report = validate_model(model, valid_batches, criterion, device)
        print(f'Epoch {epoch} Loss: {epoch_loss}, Validation Loss: {val_loss}')
        print(report)
        writer.add_scalar('Loss', epoch_loss, epoch)
        
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            counter = 0  # reset the counter if the performance improved
            
        else:
            counter += 1
            if counter >= patience:  # if performance hasn't improved for 'patience' epochs
                torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
                }, f'transformer_models/dropout-{dropout}_nheads-{nhead}_nlayer-{nlayer}_final.pt')
                print(f"Early stopping triggered after epoch {epoch}")
                break
    
    writer.close()

cuda
11
TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=1024, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=1024, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (embedding): Embedding(11, 256)
  (linear): Linear(in_features=256, out_features=11, bias=True)
)


RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
def load_model(model, optimizer, file_path):
    checkpoint = torch.load(file_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    return model, optimizer, epoch

model_to_load = TransformerModel(ntoken=ntoken, d_model=512, nhead=8, d_hid=2048, nlayers=6, dropout=0.2)
model_to_load = model_to_load.to(device)
optimizer_to_load = optim.Adam(model_to_load.parameters())

# Load the saved model
model_to_load, optimizer_to_load, start_epoch = load_model(model_to_load, optimizer_to_load, 'transformer_models/xxx.pt')

# Now `model_to_load` contains the weights from the saved model and is ready for making predictions
# Don't forget to set the model to evaluation mode
model_to_load.eval()