# PyTorch POS Tagging

## Requirements
- PyTorch
- tqdm
- spacy

In [None]:
%pip install spacy
# %conda install spacy # or install using conda

In [None]:
# download resources for english
# `run` has to be replaced by `python` if run in a shell
%run -m spacy download en_core_web_sm

In [7]:
import os
import random

from tqdm.auto import tqdm
import torch
import torchtext
from torchtext.vocab import GloVe

print("Torch Version: ", torch.__version__)
print("Torchtext Version: ", torchtext.__version__)

Torch Version:  2.1.2
Torchtext Version:  0.16.2


### Some global settings

In [8]:
EMB_CACHE = os.path.expanduser("./glove/")
DATASET_ROOT = os.path.expanduser("./")
BATCH_SIZE = 16 # make sure that batches fit into your device's memory but note that the batch size influences your training (it is a hyperparameter)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # 'cuda' for GPU (optional specify device id) and 'cpu' for CPU

## The dataset is adapted from the UDPOS where the format has been slightly changed

In [9]:
class POSTaggingDataset(torchtext.legacy.data.TabularDataset):

    # Universal Dependencies English Web Treebank by Universal Dependencies contributors
    # Modified by Maximilian Schmidt for use at the IMS, University of Stuttgart
    # License: http://creativecommons.org/licenses/by-sa/4.0/
    urls = ['file:./udpos/en-ud-v2']
    dirname = 'en-ud-v2'
    name = 'udpos'

    @classmethod
    def splits(cls, text_field, label1_field, label2_field, id_field, root=".data", train="train.jsonl",
               validation="dev.jsonl",
               test="test.jsonl", **kwargs):
        """Downloads and loads the Universal Dependencies Version 2 POS Tagged
        data.
        """

        fields = {'text': text_field}
        if label1_field is not None:
            fields.update(label1=label1_field)
        if label2_field is not None:
            fields.update(label2=label2_field)
        if id_field is not None:
            fields.update(id=id_field)

        return super(POSTaggingDataset, cls).splits(
            fields=fields, root=root, train=train, validation=validation,
            format='json', test=test, **kwargs)
            

ModuleNotFoundError: Package `portalocker` is required to be installed to use this datapipe.Please use `pip install 'portalocker>=2.0.0'` or`conda install -c conda-forge 'portalocker>=2/0.0'`to install the package

## Our neural network consists of one fully connected linear layer

The softmax is part of the loss function in PyTorch, so you can omit this in the forward function.

The embedding layer
- maps from indices to vectors
- is not trained (freezed)

In [49]:
class Net(torch.nn.Module):
    # this resembles a really simple neural network: an embedding layer followed by a fully
    # connected linear layer such that predictions are computed for each token in the sequence
    # and batch independently
    def __init__(self, embedding_vectors, num_classes):
        super().__init__()
        # PyTorch's embedding layer maps from indices to embeddings, freeze will tell PyTorch to
        # not train this layer, i.e. not modifying any weight
        self.embedding = torch.nn.Embedding.from_pretrained(embedding_vectors, freeze=True)
        # a fully connected linear layer mapping the embedded vector to a vector of fixed size
        # (num_classes in this case)
        self.fc = torch.nn.Linear(embedding_vectors.size(1), num_classes)

    def forward(self, inputs):
        # simple forwarding through our model
        # PyTorch takes care of keeping track of the operations for the backward pass
        emmedded_inputs = self.embedding(inputs)
        outputs = self.fc(emmedded_inputs)
        return outputs

### Set up our fields as placeholder for the actual data

- text (input)
- label (gold label / ground truth)

### Split into training, validation & test dataset and build vocabulary for *training* dataset (only)

In [50]:
# set up fields
TEXT = torchtext.legacy.data.Field(sequential=True, lower=True, include_lengths=True, batch_first=True, tokenize='spacy')
LABEL = torchtext.legacy.data.Field(sequential=True, use_vocab=True, batch_first=True, unk_token=None)

# make splits for data
train, val, test = POSTaggingDataset.splits(root=DATASET_ROOT, text_field=('Text',TEXT), label1_field=None, label2_field=('Label',LABEL), id_field=None)

# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300, cache=EMB_CACHE))
LABEL.build_vocab(train)

### GloVe
GloVe embeddings were trained with a special objective.
Word pairs share the same underlying concept: Vector differences should be roughly equal.

<img src="https://nlp.stanford.edu/projects/glove/images/man_woman.jpg" width=500/>\
source: https://nlp.stanford.edu/projects/glove/



### Create iterator such that each iteration returns a batch from shuffled data

In [51]:
# make iterator for splits
train_iter, val_iter, test_iter = torchtext.legacy.data.Iterator.splits((train, val, test), batch_size=BATCH_SIZE, device=DEVICE, sort=False)

vocab = TEXT.vocab
classes = LABEL.vocab.itos
print(f"Available classes: {len(classes)}\n{classes}")

Available classes: 51
['<pad>', 'NN', 'IN', 'DT', 'NNP', 'PRP', 'JJ', 'RB', '.', 'VB', 'NNS', ',', 'CC', 'VBP', 'VBD', 'VBZ', 'CD', 'VBN', 'VBG', 'MD', 'TO', 'PRP$', '-RRB-', '-LRB-', 'WDT', 'WRB', ':', 'WP', 'UH', '``', "''", 'RP', 'HYPH', 'POS', 'NNPS', 'JJR', 'JJS', 'NFP', 'EX', 'ADD', 'GW', 'RBR', '$', 'PDT', 'RBS', 'SYM', 'FW', 'LS', 'AFX', 'WP$', 'XX']


## Set up model, loss and optimizer
- Cross Entropy is Softmax + Negative Log Likelihood
- As optimizer we use Adam (adapts the learning rate per weight)

(run this only once as Jupyter keeps the model (including the weights) and the optimizer in memory)

In [52]:
# set up model and optimizer
model = Net(vocab.vectors, len(classes)).to(DEVICE)
criterion = torch.nn.CrossEntropyLoss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
metric_dict = {'loss': '------', 'accuracy': '------'}

## Evaluation function comparing prediction with gold label

In [53]:
def evaluate(data_iter, net):
    correct_count = 0
    total_count = 0
    for i, batch in enumerate(data_iter):
        # extract input and labels
        (inputs, inputs_lengths), labels = batch.Text, batch.Label

        # predict only
        with torch.no_grad():
            outputs = net(inputs)
        outputs_classes = outputs.argmax(dim=2)

        # compute amount of correct predictions
        # sequence lengths within the batch might be different, so we need to take care of that

        total_count += inputs_lengths.sum()
        # iterate over each sample of the batch
        batch_size = outputs_classes.size(0)
        for i in range(batch_size):
            for j in range(inputs_lengths[i]):
                correct_count += int(outputs_classes[i][j] == labels[i][j])
    return correct_count/total_count.float().item()

## The actual training loop

- runs several epochs
- in each epoch
 - forward the batch
 - computes the loss for the output of the whole batch
 - reduces (e.g. average, sum) the loss
 - computes derivatives of weights by backpropagation
 - optimizer updates weights
 - evaluate on validation/development dataset

In [54]:
NUM_EPOCHS = 5

# a nice progress bar to make the waiting time much better
pbar = tqdm(total=NUM_EPOCHS*len(train), postfix=metric_dict)

# run for NUM_EPOCHS epochs
for epoch in range(NUM_EPOCHS):
    # run for every data (in batches) of our iterator
    
    pbar.set_description(f"Epoch {epoch + 1}/{NUM_EPOCHS}")
    for i, batch in enumerate(train_iter):
        # extract input and labels
        (inputs, inputs_lengths), labels = batch.Text, batch.Label

        # forward + backward + optimize
        outputs = model(inputs)
        
        # 2D loss function expects input as (batch, prediction, sequence) and target as (batch, sequence) containing the class index
        loss = criterion(outputs.permute(0,2,1), labels)
        # otherwise use view function to get rid of sequence dimension by effectively concatenating all sequence items
        # loss = criterion(outputs.view(-1, len(classes)), labels.view(-1))

        # zero the parameter gradients
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # print statistics
        pbar.update(labels.size(0))
        metric_dict.update({'loss': f'{loss.item():6.3f}'})
        pbar.set_postfix(metric_dict)
        
    # evaluate on validation set after each epoch
    metric_dict.update({'accuracy': f'{100*evaluate(val_iter, model):6.2f}%'})
    pbar.set_postfix(metric_dict)

## Randomly predict sample from test set

In [55]:
def map_list(list_: list, mapping: dict):
    return [mapping[item] for item in list_]

def tokens_to_index(tokens: list, vocabulary: dict):
    return map_list(tokens, vocabulary)

def indices_to_class(indices: list, classes: dict):
    return map_list(indices, classes)

In [56]:
# get any tokenizer
tokenizer = torchtext.data.get_tokenizer('spacy', language='en')

In [57]:
sample_idx = random.randint(1, len(test))
sample = test[sample_idx]
# map tokens to index using vocabulary
sample_tokens_indexed = tokens_to_index(sample.Text, vocab)
# build input vector and add batch dimension
sample_tensor = torch.tensor(sample_tokens_indexed).unsqueeze(dim=0).to(DEVICE)

# forward / predict
with torch.no_grad():
    # get rid of batch dimension (is set to 1)
    outputs = model(sample_tensor).squeeze(dim=0)

print("Input:\t\t    ", ' '.join(sample.Text))
print("Prediction:\t    ", indices_to_class(outputs.argmax(dim=1), classes))
print("Expected prediction:", sample.Label)

Input:		     one little boy stands up and offers that , " if my best friend who lives next door was playing in the street when a car came along and killed him , that would be a tragedy " .
Prediction:	     ['CD', 'JJ', 'NN', 'VBZ', 'RP', 'CC', 'NN', 'DT', ',', "''", 'IN', 'PRP$', 'JJS', 'NN', 'WP', 'NNS', 'NN', 'NN', 'VBD', 'VBG', 'IN', 'DT', 'NN', 'WRB', 'DT', 'NN', 'VBD', 'RB', 'CC', 'NNP', 'PRP', ',', 'DT', 'MD', 'VB', 'DT', 'NN', "''", '.']
Expected prediction: ['CD', 'JJ', 'NN', 'VBZ', 'RB', 'CC', 'VBZ', 'IN', ',', '``', 'IN', 'PRP$', 'JJS', 'NN', 'WP', 'VBZ', 'JJ', 'NN', 'VBD', 'VBG', 'IN', 'DT', 'NN', 'WRB', 'DT', 'NN', 'VBD', 'RB', 'CC', 'VBD', 'PRP', ',', 'DT', 'MD', 'VB', 'DT', 'NN', "''", '.']


### Interactive prediction

In [58]:
text = input("Please enter your text: ")

# map tokens to index using vocabulary
tokens = tokenizer(text)
tokens_indexed = tokens_to_index(tokens, vocab)
# build input vector and add batch dimension
tensor = torch.tensor(tokens_indexed).unsqueeze(dim=0).to(DEVICE)

# forward / predict
with torch.no_grad():
    # get rid of batch dimension (is set to 1)
    outputs = model(tensor).squeeze(dim=0)

print("Input:", tokens)
print("Prediction:", indices_to_class(outputs.argmax(dim=1), classes))

Please enter your text: Our class today takes place online since there is a strike in public transport.
Input: ['Our', 'class', 'today', 'takes', 'place', 'online', 'since', 'there', 'is', 'a', 'strike', 'in', 'public', 'transport', '.']
Prediction: ['<pad>', 'NN', 'NN', 'VBZ', 'NN', 'NN', 'IN', 'EX', 'VBZ', 'DT', 'NN', 'IN', 'JJ', 'NN', '.']


# Logging Tools: Tensorboard

* [Tensorboard](https://www.tensorflow.org/tensorboard) is an extremely useful logging tool originally developed for the tensorflow framework
* It provides a webserver to log and display (live!)
    * scalar plots
    * image data
    * distributions 
    * histograms
    * model graphs
* Logging e.g. gradients / network weights can help you find bugs in your training process
* Pytorch now contains bindings to this tool as well

![Tensorboard Example](images/tensorboard_example.png)

* To install the package, execute

In [None]:
%pip install tensorboard
# %conda install tensorboard

PyTorch 1.9 needs setuptools<=59.5.0 for tensorboard to work.\
In this case run the following (note that `<` is escaped here as it refers to input redirection for the magic command otherwise; you don't need `\` if it is run as a command in your shell (i.e. not as a magic command)):

In [None]:
%pip install "setuptools<=59.5.0"
# %conda install "setuptools\<=59.5.0"

* To add tensorboard logging to your training routines, you only need to add a `torch.utils.tensorboard.SummaryWriter` instance
* You can add data by calling `add_*`-functions on the `SummaryWriter` instance
    * e.g. in the picture above (scalar plots): `add_scalar`
* You can specify a log directory via the `log_dir` parameter in the `SummaryWriter`'s constructor
    * if none specified, a default folder called `runs` will be created in your execution directory
* Each time you create an instance of `SummaryWriter`, a new file will be added to the log directory
    * This prevents overwriting your old logs by accident
    * Each file will be displayed in the column to the left (underneath the `Runs`-section)
    * Individual log files can be toggled as not to clutter your view too much
        * Having multiple ones active at the same time gives you an easy way to compare diffrent runs

In [None]:
# dummy training routine
from torch.utils.tensorboard import SummaryWriter
import math

writer = SummaryWriter()
for epoch in range(10):
    # ... your training code
    epoch_accuracy = epoch/10 # dummy accuracy
    epoch_loss = math.log10(10)-math.log10(epoch+1) # dummy loss

    # Append a new data point to log "accuracy" to category "train"
    # - the '/' token creates the grouping as you can see in the picture above: 
    #     e.g. the top group is 'epoch', the 2 graphs are called 'acc' and 'loss'
    #          the bottom group is 'eval', the 2 graphs are called 'acc' and 'loss'
    # Here: first value after the tag is the y axis, second value is the x axis
    writer.add_scalar("train/accuracy", epoch_accuracy, epoch)
    writer.add_scalar("train/loss", epoch_loss, epoch)
    writer.add_scalar("othergroup/epoch", epoch, epoch)

* To visualize your logs (live) in your notebook, you can load a jupyter extension

In [None]:
%load_ext tensorboard

* And then include the visualization in your notebook by using (replace `runs` with the folder your logging to)

In [None]:
%tensorboard --logdir runs

NOTE: If you want to run tensorboard as a standalone-server
* Then start the server from the command line with the command `tensorboard --logdir runs`
    * replace `runs` with the folder you chose to log to
* In a webbrowser, navigate to [localhost:6006](http://localhost:6006)
* If you're training on a remote server, you can e.g. use ssh port forwarding to run tensorboard on the server and access the webpage from your machine
* If the default port `6006` should not be free, you can specify a different one when starting tensorboard, e.g. `tensorboard --logdir=runs  --port=6007`
    * Change the port in the webbrowser URL accordingly

* Some alternatives
    * [CometML](https://www.comet.ml/docs/python-sdk/pytorch/)
    * [WandB](https://wandb.ai/site/experiment-tracking)

# Training Frameworks

* Until now: we wrote the training and evaluation loops ourselves
* However, a lot of the code is repetitive
* Frameworks help you avoid repetitive code and some solve common problems like
    * Training and evaluation loops
    * Multi-GPU / Multi-Node training
    * Early stopping
    * Creating model checkpoints
    * Hyper parameter search
    * ...
    

* Some examples:
    * [Pytorch Ignite](https://pytorch.org/ignite/index.html) ("high-level library to help with training and evaluating neural networks ")
    * [Pytorch Lightning](https://www.pytorchlightning.ai) ("The ultimate PyTorch research framework")
    * [Skorch](https://github.com/skorch-dev/skorch) ("scikit-learn compatible neural network library that wraps PyTorch")
    * Huggingface libraries ("State-of-the-art Natural Language Processing"): [Transformers](https://huggingface.co/transformers/index.html), [Datasets](https://huggingface.co/docs/datasets/)

# Example: Pytorch Ignite (taken from [website](https://pytorch.org/ignite/index.html))

```python
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss

model = Net()
train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.8)
criterion = nn.NLLLoss()

trainer = create_supervised_trainer(model, optimizer, criterion)

val_metrics = {
    "accuracy": Accuracy(),
    "nll": Loss(criterion)
}
evaluator = create_supervised_evaluator(model, metrics=val_metrics)

@trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
def log_training_loss(trainer):
    print(f"Epoch[{trainer.state.epoch}] Loss: {trainer.state.output:.2f}")

@trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(trainer):
    evaluator.run(train_loader)
    metrics = evaluator.state.metrics
    print(f"Training Results - Epoch: {trainer.state.epoch}  Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['nll']:.2f}")

@trainer.on(Events.EPOCH_COMPLETED)
def log_validation_results(trainer):
    evaluator.run(val_loader)
    metrics = evaluator.state.metrics
    print(f"Validation Results - Epoch: {trainer.state.epoch}  Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['nll']:.2f}")

trainer.run(train_loader, max_epochs=100)

```

# Tips  & Tricks

* Log the magnitude of your gradients. If you encounter problems with exploding gradients, you can try to clip the gradient values to a specific range using [torch.nn.utils.clip_grad_value_](https://pytorch.org/docs/stable/generated/torch.nn.utils.clip_grad_value_.html) or the norm over all parameters using [torch.nn.utils.clip_grad_norm_](https://pytorch.org/docs/stable/generated/torch.nn.utils.clip_grad_norm_.html#torch.nn.utils.clip_grad_norm_) 
* [Dropout](https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html)
* Ensembling
* Try [more sophisticated initialisations](https://pytorch.org/docs/stable/nn.init.html), e.g. Xavier 
* [Batch normalization](https://pytorch.org/docs/stable/nn.html#normalization-layers) (can reduce dependence on initialization, higher learning rates)
* Dataset augmentation (e.g. preprocess pictures with rotations, color shifts, mirroring, ...)
* Look for existing datasets, architectures and pre-trained models (github, pytorch model zoo, huggingface model hub, fastai, ...)
* Transfer-learning / fine-tuning pretrained models for your data