# PyTorch POS Tagging

## Requirements
- PyTorch
- huggingface_hub
- datasets
- tqdm
- spacy
- ipywidgets

In [None]:
# Install the Hugging Face packages with pip (if you use conda, look up the matching conda command online)
%pip install datasets huggingface_hub

In [None]:
import zipfile
import random
from functools import partial

from tqdm.auto import tqdm
import torch
from datasets import load_dataset
from huggingface_hub import hf_hub_download
from torch.utils.data import DataLoader

print("Torch Version: ", torch.__version__)

Loads the POS tagging dataset from the Hugging Face hub and prepares it for further processing.

In [None]:
# Fetch the "batterydata/pos_tagging" dataset through the `datasets` library
dataset = load_dataset("batterydata/pos_tagging")
# Show an overview of what was loaded
print(dataset)

We only have a training and a test dataset, so we use some training samples for validation:

In [None]:
# Hold out 10% of the shuffled training samples; they become the "validation"
# split, while the remaining samples replace the original "train" split.
dataset_split = dataset["train"].train_test_split(test_size=0.1, shuffle=True)
dataset["train"], dataset["validation"] = dataset_split["train"], dataset_split["test"]
print(dataset)

Display the first training sample:

In [None]:
# Index 0 of the "train" split is the first training sample
print(dataset["train"][0])

## Data Processing

Download and unpack GloVe 300d embeddings from a zip file, build a word-to-index dictionary, and store each word's embedding vector in a list.

In [None]:
# Download the GloVe archive from the Hugging Face hub; `glove` is the path that is opened as a zip file below
glove = hf_hub_download("stanfordnlp/glove", "glove.6B.zip")

# There are multiple files with different dimensionality of the features in the zip archive: 50d, 100d, 200d, 300d
filename = "glove.6B.300d.txt"

# word -> position of its vector in `embeddings`
word_to_index = dict()
# one 1-D tensor per vocabulary entry, in file order
embeddings = []

# Open the archive AND the member file as context managers so that both handles
# are closed once parsing is finished (or if it fails midway).
with zipfile.ZipFile(glove, "r") as archive, archive.open(filename) as embedding_file:
    for idx, line in enumerate(embedding_file):
        # The member is read as bytes: the first field is the word, the rest are its features
        values = line.split()
        word = values[0].decode("utf-8")
        features = torch.tensor([float(value) for value in values[1:]])
        word_to_index[word] = idx
        embeddings.append(features)

Add padding and unknown tokens:

In [None]:
# Reserve a new id directly behind the loaded vocabulary for '<pad>', the token used to pad sequences
padding_token_id = len(word_to_index)
word_to_index["<pad>"] = padding_token_id
# '<unk>' stands in for out-of-vocabulary words; it is expected to already be part of the loaded vocabulary
unk_token_id = word_to_index["<unk>"]

# Give '<pad>' an all-zero vector and merge the list of per-word vectors into a single tensor
embeddings = torch.stack([*embeddings, torch.zeros(embeddings[0].size())])

print(f"Embedding shape: {embeddings.size(1)}")
print(f"Padding token id: {padding_token_id}")
print(f"Unknown token id: {unk_token_id}")

Create dictionaries to map labels to indices and vice versa, and print the number of unique classes.

In [None]:
# Collect every distinct label of the training split. The labels are sorted so
# that the label <-> index assignment is reproducible: iterating a plain set of
# strings can yield a different order in every Python process, which would
# silently change the class ids between kernel restarts.
labels_unique = sorted(
    {label for sample in dataset["train"] for label in sample["labels"]}
)
print(labels_unique)
print(f"Number of classes: {len(labels_unique)}")
# ctoi: label -> class index, itoc: class index -> label
ctoi = {label: idx for idx, label in enumerate(labels_unique)}
itoc = {idx: label for label, idx in ctoi.items()}
print(ctoi)
print(itoc)

Map tokens and labels to indices, and prepare the dataset for training.

In [None]:
def map_list_using_dict(mapping, keys: list, default=None):
    """Look up every key in ``mapping`` and return the results as a list.

    Keys without an entry in ``mapping`` yield ``default``; the result keeps
    the order and length of ``keys``.
    """
    return list(map(lambda key: mapping.get(key, default), keys))


def map_tokens_to_indices(tokens: list[str]):
    """Convert tokens to vocabulary ids.

    Every token is lower-cased before it is looked up in ``word_to_index``;
    tokens without an entry receive ``unk_token_id``.
    """
    lowercased_tokens = [token.lower() for token in tokens]
    return map_list_using_dict(word_to_index, lowercased_tokens, unk_token_id)


def map_labels_to_indices(labels: list):
    """Convert the labels of one sample to their class indices.

    Args:
        labels: The label of every token of a sample.

    Returns:
        A list holding the ``ctoi`` index of every label, in input order.

    Raises:
        KeyError: If a label has no entry in ``ctoi``. ``ctoi`` is built from
            the training split only, so failing loudly here is preferable to
            storing a placeholder that breaks tensor creation later on.
    """
    try:
        return [ctoi[label] for label in labels]
    except KeyError as err:
        raise KeyError(f"Label {err.args[0]!r} has no class index in ctoi") from err


def prepare_dataset(dataset):
    """Add the columns ``token_ids`` and ``label_ids`` to ``dataset``.

    For every sample, ``words`` is converted with ``map_tokens_to_indices`` and
    ``labels`` with ``map_labels_to_indices``. Returns the result of
    ``dataset.map`` (run with a single process).
    """

    def add_index_columns(sample):
        token_ids = map_tokens_to_indices(sample["words"])
        label_ids = map_labels_to_indices(sample["labels"])
        return {"token_ids": token_ids, "label_ids": label_ids}

    return dataset.map(add_index_columns, num_proc=1)


# Add the index columns to the dataset, then keep handles to the two splits used below
dataset = prepare_dataset(dataset)
dataset_train_tokenized = dataset["train"]
dataset_validation_tokenized = dataset["validation"]

# Print the first sample in the tokenized training dataset
print(dataset_train_tokenized[0].keys())
print(dataset_train_tokenized[0])

We again pad inputs to the maximum sequence length in the batch.\
But this time, we also have to pad the labels:

In [None]:
def pad_inputs(batch, keys_to_pad=("token_ids", "label_ids"), padding_value=-1):
    """Collate a list of samples into a dict of tensors, padding ragged keys.

    Args:
        batch: Non-empty list of samples. Each sample is a dict; the values of
            the keys in ``keys_to_pad`` must be lists.
        keys_to_pad: Keys whose lists are right-padded to the longest list of
            that key within the batch. The default is an immutable tuple so
            that no mutable object is shared between calls; any container of
            keys (e.g. a list) is accepted.
        padding_value: Value appended to the shorter lists. The same value is
            used for every padded key.

    Returns:
        A dict with one tensor per key: first the padded keys (in the order of
        ``keys_to_pad``), then all remaining keys of the first sample.
    """
    padded_batch = {}
    for key in keys_to_pad:
        # Longest sequence of this key within the batch
        max_len = max(len(sample[key]) for sample in batch)
        # Fill every sample up to that length
        padded_batch[key] = torch.tensor(
            [
                sample[key] + [padding_value] * (max_len - len(sample[key]))
                for sample in batch
            ]
        )
    # Keys that need no padding are converted to tensors as they are
    for key in batch[0]:
        if key not in keys_to_pad:
            padded_batch[key] = torch.tensor([sample[key] for sample in batch])
    return padded_batch


def get_dataloader(dataset, batch_size=32, shuffle=False):
    """Wrap ``dataset`` in a ``DataLoader`` whose batches are padded by ``pad_inputs``.

    ``padding_token_id`` is used as the padding value for every padded key.
    """
    collate = partial(pad_inputs, padding_value=padding_token_id)
    return DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate
    )


# Create a DataLoader for the training dataset with the selected columns
dataloader_train = get_dataloader(
    dataset_train_tokenized.with_format(columns=["token_ids", "label_ids"]),
    batch_size=8,
    shuffle=True,
)
# Same setup for the validation dataset
dataloader_validation = get_dataloader(
    dataset_validation_tokenized.with_format(columns=["token_ids", "label_ids"]),
    batch_size=8,
    shuffle=True,
)

# Peek at a single padded batch: print its token ids and label ids, then stop iterating.
# Both tensors are padded with padding_token_id (see get_dataloader).
for batch in dataloader_train:
    token_ids = batch["token_ids"]
    labels = batch["label_ids"]
    print(token_ids)
    print(labels)
    break

## Using GPUs

So far, we have not paid attention to which device the PyTorch operations are running on.\
By default, they run on the CPU, however, a GPU is usually much faster when performing tensor operations.\
For this, you will need to have a supported GPU available on the device where you execute this code.\
Our servers at the IMS provide GPUs (strauss, nandu, kiwi).\
You can either remotely connect your editor and run the code there, or connect to a remote Python Kernel.

Once there is a supported GPU available on your machine that runs the code, you can copy tensors and even models using the method `.to(device)` to `device`.\
`device` can be specified using `torch.device`:
```python
# 'cuda' for GPU (optionally specify device id, e.g., 'cuda:0' for the first GPU) and 'cpu' for CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
```

> Note that not specifying a device id might use all GPUs! Therefore, always set the device id or restrict the available GPUs, e.g., using the environment variable `CUDA_VISIBLE_DEVICES`.
> You can set this variable first using `export CUDA_VISIBLE_DEVICES=3` so that any executed command afterward will use the GPU with id 3 (the 4th GPU) or directly set it for your command using `CUDA_VISIBLE_DEVICES=3 command`.

You may also allocate tensors on a specific device during initialization:
```python
a = torch.tensor(..., device=device)
```
This works for all the tensor creation operations!

In [None]:
# Report whether PyTorch can use CUDA on this machine
print(f"CUDA available: {torch.cuda.is_available()}")
# Use the GPU when CUDA is available, otherwise fall back to the CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Our neural network consists of an embedding layer and two fully connected linear layers

The softmax is part of the loss function in PyTorch, so you can omit this in the forward function.

The embedding layer
- maps from indices to vectors
- is not trained (frozen)

In [None]:
class SimplePOSTagger(torch.nn.Module):
    """Token-wise POS tagger: frozen embeddings -> hidden linear layer -> output linear layer.

    The linear layers act on the last dimension only, so the class scores of a
    token depend on that token alone, independently of its position in the
    sequence and of the other samples in the batch.
    """

    def __init__(self, embedding_vectors, num_classes, hidden_dim):
        """Build the layers.

        Args:
            embedding_vectors: 2-D tensor of pretrained vectors, one row per
                token id; its second dimension is the embedding size.
            num_classes: Number of scores produced per token.
            hidden_dim: Width of the hidden layer.
        """
        super().__init__()
        embedding_dim = embedding_vectors.size(1)
        # Look-up table from token ids to the pretrained vectors; freeze=True
        # tells PyTorch to leave these weights untouched during training
        self.embedding = torch.nn.Embedding.from_pretrained(
            embedding_vectors, freeze=True
        )
        # Embedding vector -> hidden representation
        self.hidden_layer = torch.nn.Linear(
            in_features=embedding_dim, out_features=hidden_dim
        )
        # Hidden representation -> one score per class
        self.output_layer = torch.nn.Linear(
            in_features=hidden_dim, out_features=num_classes
        )

    def forward(self, inputs):
        """Return the unnormalised class scores for every token id in ``inputs``."""
        # PyTorch records these operations for the backward pass
        embedded = self.embedding(inputs)
        hidden = torch.nn.functional.leaky_relu(
            self.hidden_layer(embedded), negative_slope=0.2
        )
        # No softmax here: it is applied in the loss function
        return self.output_layer(hidden)

## Set up model, loss and optimizer
- Cross Entropy is Softmax + Negative Log Likelihood
- As optimizer we use Adam (adapts the learning rate per weight)

(run this only once as Jupyter keeps the model (including the weights) and the optimizer in memory)

In [None]:
# Set up model and optimizer and move model to device
model = SimplePOSTagger(embedding_vectors=embeddings, num_classes=len(ctoi), hidden_dim=128).to(DEVICE)
# Mean cross entropy; targets equal to padding_token_id (the value the labels are padded with) are ignored
criterion = torch.nn.CrossEntropyLoss(reduction='mean', ignore_index=padding_token_id)
# Adam over the model's parameters
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# Placeholder metrics shown in the progress bar until real values are available
metric_dict = {'loss': '------', 'accuracy': '------'}

## Evaluation function comparing prediction with gold label

In [None]:
def evaluate(data_iter, model):
    """Compute the token-level accuracy of ``model`` on ``data_iter``.

    Args:
        data_iter: Iterable of batches, each a dict with the padded tensors
            ``"token_ids"`` and ``"label_ids"`` (as produced by ``pad_inputs``).
        model: Module that returns per-token class scores for a batch of token
            ids; the predicted class is the argmax over dimension 2.

    Returns:
        float: Share of non-padding tokens whose predicted class equals the
        label, or ``0.0`` if ``data_iter`` yields no tokens at all.
    """
    correct_count = 0
    total_count = 0
    for batch in data_iter:
        # move data to device since our model is on the device
        token_ids = batch["token_ids"].to(device=DEVICE)
        labels = batch["label_ids"].to(device=DEVICE)

        # predict only
        with torch.no_grad():
            outputs_classes = model(token_ids).argmax(dim=2)

        # Sequences within a batch differ in length, so padded positions must
        # not be counted. Use the real padding id instead of a hard-coded
        # vocabulary size.
        is_token = token_ids != padding_token_id

        # Count matches over all real tokens of the batch at once
        correct_count += ((outputs_classes == labels) & is_token).sum().item()
        total_count += is_token.sum().item()

    if total_count == 0:
        # Nothing to evaluate; avoid dividing by zero
        return 0.0
    return correct_count / total_count

In [None]:
# Accuracy of the model in its current state on the validation dataset
accuracy = evaluate(dataloader_validation, model)
print(f"Accuracy on the validation dataset: {accuracy}")

## The actual training loop

- runs several epochs
- in each epoch
 - forward the batch
 - computes the loss for the output of the whole batch
 - reduces (e.g. average, sum) the loss
 - computes derivatives of weights by backpropagation
 - optimizer updates weights
 - evaluate on validation/development dataset

In [None]:
NUM_EPOCHS = 5

# Progress bar with one step per training batch across all epochs
pbar = tqdm(total=NUM_EPOCHS*len(dataloader_train), postfix=metric_dict)

# evaluate on validation set first
metric_dict.update({'accuracy': f'{100*evaluate(dataloader_validation, model):6.2f}%'})
pbar.set_postfix(metric_dict)

# run for NUM_EPOCHS epochs
for epoch in range(NUM_EPOCHS):
    # iterate over all batches of the training dataloader
    
    pbar.set_description(f"Epoch {epoch + 1}/{NUM_EPOCHS}")
    for i, batch in enumerate(dataloader_train):
        # extract input and labels and move data to device since our model is on the device
        token_ids = batch["token_ids"].to(device=DEVICE)
        labels = batch["label_ids"].to(device=DEVICE)

        # forward + backward + optimize
        outputs = model(token_ids)
        
        # The loss expects the class dimension second: input as (batch, classes, sequence)
        # and target as (batch, sequence) containing the class index
        loss = criterion(outputs.permute(0,2,1), labels)
        # otherwise use view function to get rid of sequence dimension by effectively concatenating all sequence items
        # loss = criterion(outputs.view(-1, len(ctoi)), labels.view(-1))

        # zero the parameter gradients so that gradients of earlier batches do not accumulate
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # advance the progress bar and show the loss of this batch
        pbar.update()
        metric_dict.update({'loss': f'{loss.item():6.3f}'})
        pbar.set_postfix(metric_dict)
        
    # evaluate on validation set after each epoch
    metric_dict.update({'accuracy': f'{100*evaluate(dataloader_validation, model):6.2f}%'})
    pbar.set_postfix(metric_dict)

## Predict a random sample from the validation set

In [None]:
def map_indices_to_labels(indices: list):
    """Translate class indices back to their labels via ``itoc``.

    Indices without an entry in ``itoc`` are mapped to ``None``.
    """
    labels = map_list_using_dict(itoc, indices)
    return labels

In [None]:
# Randomly select a sample from the validation dataset
sample = random.choice(dataset_validation_tokenized)
print(sample)
# build input vector and add batch dimension
sample_tensor = torch.tensor(sample["token_ids"]).unsqueeze(dim=0).to(DEVICE)

# forward / predict
with torch.no_grad():
    # get rid of batch dimension (is set to 1)
    outputs = model(sample_tensor).squeeze(dim=0)

# Pick the highest-scoring class of every token and translate it back to its label
predictions = [itoc[output.argmax(dim=0).item()] for output in outputs]
print("Input:", ' '.join(sample["words"]))
print(f"Prediction:   {predictions}")
print(f"Ground truth: {sample['labels']}")
# Share of tokens whose predicted label equals the ground-truth label
accuracy = sum([1 for pred, gt in zip(predictions, sample["labels"]) if pred == gt]) / len(sample["labels"])
print(f"Accuracy: {accuracy*100:.2f}%")

## Interactive prediction

Note that we did not have to tokenize our data so far since tokens were given.\
For tokenizing text, you can again use the tokenization from the sentiment analysis task, but it has some problems:

In [None]:
def tokenize_simple(text: str):
    """Lower-case ``text`` and split it on whitespace."""
    lowered = text.lower()
    return lowered.split()


print(tokenize_simple("This is a simple text."))

Punctuation is not properly split, but for POS tagging to work correctly, we need punctuation as separate tokens too.\
We can extract words and punctuation using a regular expression (regex):

In [None]:
import re

def tokenize_regex(text: str):
    """Split lower-cased ``text`` into word and punctuation tokens.

    A token is either a run of word characters/apostrophes or a single one of
    the punctuation marks ``. , ! ? ;``; everything else is dropped.
    """
    token_pattern = r"[\w']+|[.,!?;]"
    return [match.group() for match in re.finditer(token_pattern, text.lower())]


print(tokenize_regex("This is a simple text."))

There are also packages like spacy that help you with tokenization.\
We have to install it first and then download some files for the tokenizer:

In [None]:
# Install spaCy with pip from inside the notebook
%pip install spacy

In [None]:
# Download spaCy's English resources ("en_core_web_sm")
# In a shell, replace `%run` by `python`: python -m spacy download en_core_web_sm
%run -m spacy download en_core_web_sm

In [None]:
import spacy
# Load the downloaded English spaCy pipeline once and reuse it for every call
nlp = spacy.load("en_core_web_sm")


def tokenize(text: str):
    """Lower-case ``text``, run it through ``nlp`` and return the token texts."""
    doc = nlp(text.lower())
    return [token.text for token in doc]


print(tokenize("This is a simple text."))

We render a nice text box:

In [None]:
from ipywidgets import widgets
from IPython.display import display

# Text input widget, pre-filled with an example sentence
sentence_widget = widgets.Text(
    value="This movie is terrible",
    placeholder="Type something",
    description="Sentence:",
    disabled=False,
)
# Render the widget as the output of this cell
display(sentence_widget)

### Task
Prepare the input, and feed it through the model.

In [None]:
# Read the sentence currently entered in the text box
text = sentence_widget.value

# convert text to token ids
tokens = tokenize(text)
token_ids = map_tokens_to_indices(tokens)

if not token_ids:
    # An empty input would produce an empty (non-integer) tensor that the
    # embedding layer cannot process
    print("Please enter a sentence first.")
else:
    # build input vector and add batch dimension
    input_tensor = torch.tensor(token_ids).unsqueeze(dim=0).to(DEVICE)

    # forward / predict
    with torch.no_grad():
        # get rid of batch dimension (is set to 1)
        outputs = model(input_tensor).squeeze(dim=0)

    # print prediction: one line per token with its highest-scoring label
    predicted_labels = map_indices_to_labels(outputs.argmax(dim=1).tolist())
    for token, label in zip(tokens, predicted_labels):
        print(f"{token}\t{label}")