# PyTorch POS Tagging

## Requirements
- PyTorch
- huggingface datasets
- tqdm
- spacy

In [6]:
%pip install spacy
# %conda install spacy # or install using conda
%pip install datasets

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [7]:
# download resources for english
# `run` has to be replaced by `python` if run in a shell
%run -m spacy download en_core_web_sm

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

  warn('Unknown failure executing module: <%s>' % mod_name)


In [8]:
import os
import zipfile
import random
from functools import partial

from tqdm import tqdm
import torch
from datasets import load_dataset
from huggingface_hub import hf_hub_download
from torch.utils.data import DataLoader
from tqdm import trange

print("Torch Version: ", torch.__version__)

Torch Version:  2.5.1+cu124


Loads the POS tagging dataset from the Hugging Face hub and prepares it for further processing.

In [9]:
dataset = load_dataset("batterydata/pos_tagging")

Displays the loaded dataset followed by its training and test splits.

In [10]:
print(dataset)
print(dataset["train"])
print(dataset["test"])

DatasetDict({
    train: Dataset({
        features: ['words', 'labels'],
        num_rows: 13054
    })
    test: Dataset({
        features: ['words', 'labels'],
        num_rows: 1451
    })
})
Dataset({
    features: ['words', 'labels'],
    num_rows: 13054
})
Dataset({
    features: ['words', 'labels'],
    num_rows: 1451
})


### Some global settings

In [11]:
EMB_CACHE = os.path.expanduser("./glove/")
DATASET_ROOT = os.path.expanduser("./")
BATCH_SIZE = 16 # make sure that batches fit into your device's memory but note that the batch size influences your training (it is a hyperparameter)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # 'cuda' for GPU (optional specify device id) and 'cpu' for CPU

## Our neural network consists of one fully connected linear layer

The softmax is part of the loss function in PyTorch, so you can omit this in the forward function.

The embedding layer
- maps from indices to vectors
- is not trained (freezed)

In [12]:
class Net(torch.nn.Module):
    # this resembles a really simple neural network: an embedding layer followed by a fully
    # connected linear layer such that predictions are computed for each token in the sequence
    # and batch independently
    def __init__(self, embedding_vectors, num_classes, hidden_dim):
        super().__init__()
        # PyTorch's embedding layer maps from indices to embeddings, freeze will tell PyTorch to
        # not train this layer, i.e. not modifying any weight
        self.embedding = torch.nn.Embedding.from_pretrained(embedding_vectors, freeze=True)
        # a fully connected linear layer mapping the embedded vector to a vector of fixed size
        # (num_classes in this case)
        self.hidden_layer = torch.nn.Linear(embedding_vectors.size(1), hidden_dim)
        self.output_layer = torch.nn.Linear(hidden_dim, num_classes)

    def forward(self, inputs):
        # simple forwarding through our model
        # PyTorch takes care of keeping track of the operations for the backward pass
        emmedded_inputs = self.embedding(inputs)
        z_1 = self.hidden_layer(emmedded_inputs)
        a_1 = torch.nn.functional.leaky_relu(z_1, negative_slope=0.2)
        z_2 = self.output_layer(a_1)        
        return z_2 # softmax is applied in the loss function

### GloVe
GloVe embeddings were trained with a special objective.
Word pairs share the same underlying concept: Vector differences should be roughly equal.

<img src="https://nlp.stanford.edu/projects/glove/images/man_woman.jpg" width=500/>\
source: https://nlp.stanford.edu/projects/glove/



### Create iterator such that each iteration returns a batch from shuffled data

In [13]:
# Download the GloVe embeddings
glove = hf_hub_download("stanfordnlp/glove", "glove.6B.zip")

with zipfile.ZipFile(glove, "r") as f:
    print(f.namelist())

['glove.6B.100d.txt', 'glove.6B.200d.txt', 'glove.6B.300d.txt', 'glove.6B.50d.txt']


Open the 'glove.6B.300d.txt' file from the downloaded GloVe archive and print the first few lines for inspection.

In [14]:
# There are multiple files with different dimensionality of the features in the zip archive: 50d, 100d, 200d, 300d
filename = "glove.6B.300d.txt"
with zipfile.ZipFile(glove, "r") as f:
    for idx, line in enumerate(f.open(filename)):
        print(line)
        if idx == 5:
            break

b'the 0.04656 0.21318 -0.0074364 -0.45854 -0.035639 0.23643 -0.28836 0.21521 -0.13486 -1.6413 -0.26091 0.032434 0.056621 -0.043296 -0.021672 0.22476 -0.075129 -0.067018 -0.14247 0.038825 -0.18951 0.29977 0.39305 0.17887 -0.17343 -0.21178 0.23617 -0.063681 -0.42318 -0.11661 0.093754 0.17296 -0.33073 0.49112 -0.68995 -0.092462 0.24742 -0.17991 0.097908 0.083118 0.15299 -0.27276 -0.038934 0.54453 0.53737 0.29105 -0.0073514 0.04788 -0.4076 -0.026759 0.17919 0.010977 -0.10963 -0.26395 0.07399 0.26236 -0.1508 0.34623 0.25758 0.11971 -0.037135 -0.071593 0.43898 -0.040764 0.016425 -0.4464 0.17197 0.046246 0.058639 0.041499 0.53948 0.52495 0.11361 -0.048315 -0.36385 0.18704 0.092761 -0.11129 -0.42085 0.13992 -0.39338 -0.067945 0.12188 0.16707 0.075169 -0.015529 -0.19499 0.19638 0.053194 0.2517 -0.34845 -0.10638 -0.34692 -0.19024 -0.2004 0.12154 -0.29208 0.023353 -0.11618 -0.35768 0.062304 0.35884 0.02906 0.0073005 0.0049482 -0.15048 -0.12313 0.19337 0.12173 0.44503 0.25147 0.10781 -0.17716 0.03

Unpack GloVe embeddings from a zip file, build a word-to-index dictionary, and store each word's embedding vector in a list.

In [15]:
# Unpack the downloaded file
word_to_index = dict()
embeddings = []

with zipfile.ZipFile(glove, "r") as f:
    for idx, line in enumerate(f.open(filename)):
        values = line.split()
        word = values[0].decode("utf-8")
        features = torch.tensor([float(value) for value in values[1:]])
        word_to_index[word] = idx
        embeddings.append(features)

Unpack the GloVe embeddings, create a word-to-index dictionary, and store each word's embedding vector in a list.

In [16]:
# Last token in the vocabulary is '<unk>' which is used for out-of-vocabulary words
# We also add a '<pad>' token to the vocabulary for padding sequences
word_to_index["<pad>"] = len(word_to_index)
padding_token_id = word_to_index["<pad>"]
unk_token_id = word_to_index["<unk>"]

embeddings.append(torch.zeros(embeddings[0].shape))

# Convert the list of tensors to a single tensor
embeddings = torch.stack(embeddings)

print(f"Embedding shape: {embeddings.size(1)}")
print(f"Padding token id: {padding_token_id}")
print(f"Unknown token id: {unk_token_id}")

Embedding shape: 300
Padding token id: 400001
Unknown token id: 400000


Create dictionaries to map labels to indices and vice versa, and print the number of unique classes.

In [17]:
labels_unique = list(set([label for sample in dataset["train"] for label in sample["labels"]]))
print(labels_unique)
print(f"Number of classes: {len(labels_unique)}")
ctoi = {label: idx for idx, label in enumerate(labels_unique)}
itoc = {idx: label for label, idx in ctoi.items()}
print(ctoi)
print(itoc)

['-RRB-', 'VBZ', 'VB', 'NNP', 'LS', ':', 'VBG', 'WRB', ')', 'POS', 'TO', 'NN', 'JJ', 'FW', 'JJS', 'RP', 'IN', 'WP', 'PRP$', "''", 'SYM', '-LRB-', ',', '.', '$', 'PRP', 'WDT', '#', '-NONE-', 'NNS', 'JJR', '(', 'WP$', 'UH', 'VBD', 'NNPS', '``', 'CD', 'CC', 'VBP', 'EX', 'MD', 'RBS', 'DT', 'RBR', 'RB', 'VBN', 'PDT']
Number of classes: 48
{'-RRB-': 0, 'VBZ': 1, 'VB': 2, 'NNP': 3, 'LS': 4, ':': 5, 'VBG': 6, 'WRB': 7, ')': 8, 'POS': 9, 'TO': 10, 'NN': 11, 'JJ': 12, 'FW': 13, 'JJS': 14, 'RP': 15, 'IN': 16, 'WP': 17, 'PRP$': 18, "''": 19, 'SYM': 20, '-LRB-': 21, ',': 22, '.': 23, '$': 24, 'PRP': 25, 'WDT': 26, '#': 27, '-NONE-': 28, 'NNS': 29, 'JJR': 30, '(': 31, 'WP$': 32, 'UH': 33, 'VBD': 34, 'NNPS': 35, '``': 36, 'CD': 37, 'CC': 38, 'VBP': 39, 'EX': 40, 'MD': 41, 'RBS': 42, 'DT': 43, 'RBR': 44, 'RB': 45, 'VBN': 46, 'PDT': 47}
{0: '-RRB-', 1: 'VBZ', 2: 'VB', 3: 'NNP', 4: 'LS', 5: ':', 6: 'VBG', 7: 'WRB', 8: ')', 9: 'POS', 10: 'TO', 11: 'NN', 12: 'JJ', 13: 'FW', 14: 'JJS', 15: 'RP', 16: 'IN', 

Create functions to tokenize text, map tokens and labels to indices, and prepare the dataset for training.

In [18]:
def tokenize(text: str):
    return text.lower().split()


def map_token_to_index(token):
    # Return the index of the token or the index of the '<unk>' token if the token is not in the vocabulary
    return word_to_index.get(token, unk_token_id)


def map_text_to_indices(text: str):
    return [map_token_to_index(token.lower()) for token in text]


def map_labels_to_indices(labels: list):
    #return [ctoi[label] for label in labels]
    # TODO: Implement the mapping of the labels to indices
    return NotImplementedError


def prepare_dataset(dataset):
    #return map(lambda x: {"token_ids": map_text_to_indices(x["words"])}, dataset)
    dataset = dataset.map(lambda x: {"token_ids": map_text_to_indices(x["words"])}, num_proc=4)
    dataset = dataset.map(lambda x: {"label_ids": map_labels_to_indices(x["labels"])}, num_proc=4)
    return dataset


dataset_train_tokenized = prepare_dataset(dataset["train"])
dataset_valid_tokenized = prepare_dataset(dataset["test"])

# Print the first sample in the tokenized training dataset
print(dataset_train_tokenized[0].keys())

Map (num_proc=4): 100%|██████████| 13054/13054 [00:00<00:00, 15051.06 examples/s]
Map (num_proc=4): 100%|██████████| 1451/1451 [00:00<00:00, 1908.31 examples/s]


dict_keys(['words', 'labels', 'token_ids', 'label_ids'])


Tokenize the text, map tokens and labels to indices, and prepare the dataset for training.

In [21]:
def pad_inputs(batch, keys_to_pad=["token_ids", "label_ids"], padding_value=-1):
    # Pad keys_to_pad to the maximum length in batch
    padded_batch = {}
    for key in keys_to_pad:
        # Get maximum length in batch
        max_len = max([len(sample[key]) for sample in batch])
        # Pad all samples to the maximum length
        padded_batch[key] = torch.tensor(
            [
                sample[key] + [padding_value] * (max_len - len(sample[key]))
                for sample in batch
            ]
        )
    # Add remaining keys to the batch
    for key in batch[0].keys():
        if key not in keys_to_pad:
            padded_batch[key] = torch.tensor([sample[key] for sample in batch])
    return padded_batch


def get_dataloader(dataset, batch_size=32, shuffle=False):
    # Create a DataLoader for the dataset
    return DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=partial(pad_inputs, padding_value=padding_token_id),
        shuffle=shuffle,
    )


# We select the columns that we want to keep in the dataset
dataset_train_tokenized = dataset_train_tokenized.with_format(
    columns=["token_ids", "label_ids"]
)

dataset_valid_tokenized = dataset_train_tokenized = dataset_valid_tokenized.with_format(
    columns=["token_ids", "label_ids"]
)

# Create a DataLoader for the training dataset
dataloader_train = get_dataloader(dataset_train_tokenized, batch_size=8, shuffle=True)
dataloader_valid = get_dataloader(dataset_valid_tokenized, batch_size=8, shuffle=True)

for batch in dataloader_train:
    token_ids = batch["token_ids"]
    labels = batch["label_ids"]
    print(token_ids)
    print(labels)
    break

tensor([[  1627,   3637,   5949,     32,    287, 400000,      4,     30,   3479,
              1,      0,   1557,      5,   1431,   1296,      2, 400001, 400001,
         400001, 400001, 400001, 400001, 400001, 400001, 400001, 400001, 400001,
         400001, 400001, 400001, 400001, 400001, 400001, 400001, 400001, 400001,
         400001, 400001],
        [ 19743,  17047,     14,      7,   1207,      3,    387,      9,  19743,
         112810,      2, 400001, 400001, 400001, 400001, 400001, 400001, 400001,
         400001, 400001, 400001, 400001, 400001, 400001, 400001, 400001, 400001,
         400001, 400001, 400001, 400001, 400001, 400001, 400001, 400001, 400001,
         400001, 400001],
        [    63,     14,     80,  43144,     93, 400000,      3,    689,    281,
           1947,      1,      0,    560,     13,     42, 400000,     14,     36,
           1698,      4,      0,    347,   2488,   3311,    693,      2, 400001,
         400001, 400001, 400001, 400001, 400001, 400001, 

## Set up model, loss and optimizer
- Cross Entropy is Softmax + Negative Log Likelihood
- As optimizer we use Adam (adapts the learning rate per weight)

(run this only once as Jupyter keeps the model (including the weights) and the optimizer in memory)

In [None]:
# set up model and optimizer
model = Net(embedding_vectors=embeddings, num_classes=len(ctoi), hidden_dim=embeddings.size(1)//2).to(DEVICE)
criterion = torch.nn.CrossEntropyLoss(reduction='mean', ignore_index=padding_token_id)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
metric_dict = {'loss': '------', 'accuracy': '------'}

## Evaluation function comparing prediction with gold label

In [23]:
def evaluate(data_iter, net):
    correct_count = 0
    total_count = 0
    for i, batch in enumerate(data_iter):
        # extract input and labels
        
        token_ids = batch["token_ids"].to(device=DEVICE)
        labels = batch["label_ids"].to(device=DEVICE)

        # predict only
        with torch.no_grad():
            outputs = net(token_ids)
        outputs_classes = outputs.argmax(dim=2)

        # compute amount of correct predictions
        # sequence lengths within the batch might be different, so we need to take care of that
        inputs_lengths = (token_ids != 400001).sum(dim=1)
        
        total_count += inputs_lengths.sum()
        # iterate over each sample of the batch
        batch_size = outputs_classes.size(0)
        for i in range(batch_size):
            for j in range(inputs_lengths[i]):
                correct_count += int(outputs_classes[i][j] == labels[i][j])
    return correct_count/total_count.float().item()

## The actual training loop

- runs several epochs
- in each epoch
 - forward the batch
 - computes the loss for the output of the whole batch
 - reduces (e.g. average, sum) the loss
 - computes derivatives of weights by backpropagation
 - optimizer updates weights
 - evaluate on validation/development dataset

In [28]:
NUM_EPOCHS = 5

# a nice progress bar to make the waiting time much better
pbar = tqdm(total=NUM_EPOCHS*len(dataloader_train), postfix=metric_dict)

# run for NUM_EPOCHS epochs
for epoch in range(NUM_EPOCHS):
    # run for every data (in batches) of our iterator
    
    pbar.set_description(f"Epoch {epoch + 1}/{NUM_EPOCHS}")
    for i, batch in enumerate(dataloader_train):
        # extract input and labels
        token_ids = batch["token_ids"].to(device=DEVICE)
        labels = batch["label_ids"].to(device=DEVICE)

        # forward + backward + optimize
        outputs = model(token_ids)
        
        # 2D loss function expects input as (batch, prediction, sequence) and target as (batch, sequence) containing the class index
        loss = criterion(outputs.permute(0,2,1), labels)
        # otherwise use view function to get rid of sequence dimension by effectively concatenating all sequence items
        # loss = criterion(outputs.view(-1, len(classes)), labels.view(-1))

        # zero the parameter gradients
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # print statistics
        pbar.update(labels.size(0))
        metric_dict.update({'loss': f'{loss.item():6.3f}'})
        pbar.set_postfix(metric_dict)
        
    # evaluate on validation set after each epoch
    metric_dict.update({'accuracy': f'{100*evaluate(dataloader_valid, model):6.2f}%'})
    pbar.set_postfix(metric_dict)

Epoch 5/5: : 7255it [00:50, 142.98it/s, loss=0.185, accuracy=91.90%] 




## Randomly predict sample from test set

In [25]:
def map_list(list_: list, mapping: dict):
    return [mapping[item] for item in list_]

def tokens_to_index(tokens: list, vocabulary: dict):
    return map_list(tokens, vocabulary)

def indices_to_class(indices: list, classes: dict):
    return map_list(indices, classes)

In [27]:
dataset_valid_tokenized = prepare_dataset(dataset["test"])
sample_idx = random.randint(1, len(dataset_valid_tokenized))
sample = dataset_valid_tokenized[sample_idx]
# map tokens to index using vocabulary
# sample_tokens_indexed = tokens_to_index(sample.Text, vocab)
# build input vector and add batch dimension
sample_tensor = torch.tensor(sample["token_ids"]).unsqueeze(dim=0).to(DEVICE)

# forward / predict
with torch.no_grad():
    # get rid of batch dimension (is set to 1)
    outputs = model(sample_tensor).squeeze(dim=0)

predictions = [itoc[output.argmax(dim=0).item()] for output in outputs]
print("Input:", ' '.join(sample["words"]))
print(f"Prediction:   {predictions}")
print(f"Ground truth: {sample["labels"]}")
accuracy = sum([1 for pred, gt in zip(predictions, sample["labels"]) if pred == gt]) / len(sample["labels"])
print(f"Accuracy: {accuracy*100:.2f}%")

Input: This year it is expected *-1 to be a net importer and is said *-1 to be seeking *-2 to buy about 200,000 tons of sugar *-3 to meet internal needs , analysts said 0 *T*-4 .
Prediction:   ['DT', 'NN', 'PRP', 'VBZ', 'VBN', '-NONE-', 'TO', 'VB', 'DT', 'NN', 'NN', 'CC', 'VBZ', 'VBD', '-NONE-', 'TO', 'VB', 'VBG', '-NONE-', 'TO', 'VB', 'IN', 'CD', 'NNS', 'IN', 'NN', '-NONE-', 'TO', 'VB', 'JJ', 'VBZ', ',', 'NNS', 'VBD', '-NONE-', '-NONE-', '.']
Ground truth: ['DT', 'NN', 'PRP', 'VBZ', 'VBN', '-NONE-', 'TO', 'VB', 'DT', 'JJ', 'NN', 'CC', 'VBZ', 'VBN', '-NONE-', 'TO', 'VB', 'VBG', '-NONE-', 'TO', 'VB', 'IN', 'CD', 'NNS', 'IN', 'NN', '-NONE-', 'TO', 'VB', 'JJ', 'NNS', ',', 'NNS', 'VBD', '-NONE-', '-NONE-', '.']
Accuracy: 91.89%


### Interactive prediction

In [None]:
# TODO: remove and add as a task

# text = input("Please enter your text: ")

# # map tokens to index using vocabulary
# tokens = tokenizer(text)
# tokens_indexed = tokens_to_index(tokens, vocab)
# # build input vector and add batch dimension
# tensor = torch.tensor(tokens_indexed).unsqueeze(dim=0).to(DEVICE)

# # forward / predict
# with torch.no_grad():
#     # get rid of batch dimension (is set to 1)
#     outputs = model(tensor).squeeze(dim=0)

# print("Input:", tokens)
# print("Prediction:", indices_to_class(outputs.argmax(dim=1), classes))