# Practical machine learning and deep learning. Lab 6

# Many-to-many NLP task. Part-of-speech tagging

# [Competition](https://www.kaggle.com/t/91cc39068ce7426aae9721f25d4d8fec)

## Goal

Your goal is to implement Neural Network for tagging the part-of-speech entities.

## Submission

Submission format is described at competition page.

> Remember, you can use any structure of the solution. The template classes/function in this file is just the tip for you. 

In [1]:
!pip uninstall torchtext
!pip install torchtext==0.13.1

[0mCollecting torchtext==0.13.1
  Downloading torchtext-0.13.1-cp310-cp310-manylinux1_x86_64.whl.metadata (6.9 kB)
Collecting torch==1.12.1 (from torchtext==0.13.1)
  Downloading torch-1.12.1-cp310-cp310-manylinux1_x86_64.whl.metadata (22 kB)
Downloading torchtext-0.13.1-cp310-cp310-manylinux1_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torch-1.12.1-cp310-cp310-manylinux1_x86_64.whl (776.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.3/776.3 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 2.4.0
    Uninstalling torch-2.4.0:
      Successfully uninstalled torch-2.4.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following

In [2]:
import pandas as pd
import torch
import warnings

warnings.filterwarnings('ignore')

## Data reading and preprocessing

In [3]:
train = pd.read_csv('/kaggle/input/pmldl-week-6-part-of-speech-tagging/train.csv')
test = pd.read_csv('/kaggle/input/pmldl-week-6-part-of-speech-tagging/test.csv')

In [4]:
train.head()

Unnamed: 0,sentence_id,entity_id,entity,tag
0,0,0,It,PRON
1,0,1,is,VERB
2,0,2,true,ADJ
3,0,3,that,ADP
4,0,4,his,DET


In [5]:
test.head()

Unnamed: 0,id,sentence_id,entity_id,entity
0,0,0,0,In
1,1,0,1,another
2,2,0,2,setback
3,3,0,3,yesterday
4,4,0,4,","


First, let's divide dataset on train and validation. And split the dataframe according to random split.

In [6]:
from sklearn.model_selection import train_test_split
VALIDATION_RATIO = 0.2
train_split, val_split = train_test_split(range(train['sentence_id'].max()), test_size=VALIDATION_RATIO, random_state=420)

And then split the original dataframe by ids that we splitted.

In [7]:
train_dataframe = train[train['sentence_id'].isin(train_split)]
val_dataframe = train[train['sentence_id'].isin(val_split)]

In [8]:
pos_tags = ['ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRT', 'PRON', 'VERB', '.', 'X']
cat2idx = {tag: i for i, tag in enumerate(pos_tags)}
idx2cat = {v: k for k, v in cat2idx.items()}

UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

For working with datasets more efficiently, let's create separate classes for datasets. 



In [9]:
import torch
torch.manual_seed(420)
from torchtext.vocab import build_vocab_from_iterator


class PosTaggingDataset(torch.utils.data.Dataset):
    def __init__(self, dataframe: pd.DataFrame, vocab = None, max_size=100):
        self.dataframe = dataframe
        self._preprocess()
        self.vocab = vocab or self._create_vocab()

    def _preprocess(self):
        # fill missing values in entities
        self.dataframe['entity'].fillna('', inplace=True)

        # Fill missing tag to `other` - `X`
        self.dataframe['tag'].fillna('X', inplace=True)

        # Clean entities column
        self.dataframe['entity'] = self.dataframe['entity'].apply(self._clean_entity)
        
        # Split the dataset, so that we will have 
        # full sentences and full tags by the same index
        

        self.sentences = self.dataframe.groupby('sentence_id')['entity'].apply(list).tolist()
        self.tags = self.dataframe.groupby('sentence_id')['tag'].apply(list).tolist()
        
    def _clean_entity(self, token: str) -> str:
        token = token.lower()
        token = token.strip()
        return token
    
    def _create_vocab(self):
        # creates vocabulary that is used for encoding 
        # the sequence of tokens (splitted sentence)
        def token_generator():
            for sentence in self.sentences:
                yield sentence
                
        vocab = build_vocab_from_iterator(token_generator(), specials=special_symbols)
        vocab.set_default_index(vocab["<unk>"])
        return vocab

    def _get_sentence(self, index: int) -> list:
        # retrieves sentence from dataset by index
        sentence = self.sentences[index]
        result = [self.vocab[token] for token in sentence]
        return result

    def _get_labels(self, index: int) -> list:
        # retrieves tags from dataset by index
        tags = self.tags[index]
        return [cat2idx.get(tag, cat2idx['X']) for tag in tags]

    def __getitem__(self, index) -> tuple[list, list]:
        return self._get_sentence(index), self._get_labels(index)
    
    def __len__(self) -> int:
        return len(self.sentences)

In [10]:
# Create train dataset
train_dataset = PosTaggingDataset(train_dataframe)
val_dataset = PosTaggingDataset(val_dataframe, vocab= train_dataset.vocab)

And now we are able to create dataloader faster, because we created torch datasets

In [11]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

batch_size = 128
max_size = 50

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

def collate_batch(batch: list):
    # Collate list of samples into tensor batch
    # As an input we have list of pair from dataset:
    # [([ent1, ent2, ...], [tag1, tag2, ...]), ([ent1, ent2, ...], [tag1, tag2, ...]), ...]
    # as an output, we want to have tensor of entities and tensor of tags 
    sentences_batch, postags_batch = [], []
    for _sent, _postags in batch:
        _sent = _sent[:max_size]
        _postags = _postags[:max_size]
        if len(_sent) < max_size:
            _sent.extend([PAD_IDX] *  (max_size - len(_sent)))
        if len(_postags) < max_size:
            _postags.extend([0] *  (max_size - len(_postags)))
        sentences_batch.append(torch.tensor(_sent))
        postags_batch.append(torch.tensor(_postags))

    # Remember, that if we want to perform many to many mapping with our network with recurrent units, 
    # we want pass first item from all sequences as first input, thus
    # we want to have tensor with shape (max_size, ...., batch_size)
    return torch.stack(sentences_batch, dim=0).int().T.to(device), torch.stack(postags_batch, dim=0).T.long().to(device)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

cuda


In [12]:
# just to check that all shapes are correct

for batch in train_dataloader:
    inp, out = batch
    print(inp.shape)
    print(out.shape)
    break

torch.Size([50, 128])
torch.Size([50, 128])


## Creating the network

For the many-to-many or seq2seq netoworks, we want to have recurrent units in the network. This gives the ability for network to learn the hidden features and pass the knowledge from one token to other. 

### Embeddings

For embeddings you can use `nn.Embedding` for creating your own features or use pretrained embedding (like GloVe or FastText or Bert).

### Recurrent

For processing sequences you can use recurrent units like `LSTM`.

### Linear

Add simple nn.Linear. ~~This is basic stuff what do you want~~

### Regularization

Remeber to set up Dropout and Batch Normalization for regularization purposes.

In [13]:
import torch.nn as nn

class POSTagger(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx, dropout_rate, use_pretrained_embeddings=False, embeddings=None):
        
        super().__init__()
        
        if use_pretrained_embeddings and embeddings is not None:
            self.embedding = nn.Embedding.from_pretrained(embedings, freeze=False, padding_idx=pad_idx)
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=False)
        
#         self.batch_norm = nn.BatchNorm1d(hidden_dim * 2)
        
        self.dropout = nn.Dropout(dropout_rate)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
    def forward(self, text):

        # text shape= [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        lstm_out, (hidden, cell) = self.lstm(embedded)
        lstm_out = self.dropout(lstm_out)
#         lstm_out = lstm_out.permute(1, 2, 0)  # shape: [batch size, hidden_dim * 2, sent len]
#         lstm_out = self.batch_norm(lstm_out)  # apply batch norm across the feature dimension (hidden_dim * 2)
#         lstm_out = lstm_out.permute(2, 0, 1) 
        predictions = self.fc(lstm_out) 
        
        # predictions shape = [sent len, batch size, output dim]
        return predictions

## Training

As for training you should take into account that the shape of your output and shape of the labels. Perform required transformations and use loss function that fits your task.

> Do not forget about tqdm and logging, you want normal training not some unreadable ~~sht~~ logs. 

In [14]:
from tqdm.autonotebook import tqdm

def train_one_epoch(
    model,
    loader,
    optimizer,
    loss_fn
):
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        desc=f"Epoch {epoch}: train",
        leave=True,
    )
    model.train()
    train_loss = 0.0
    total = 0
    for i, batch in loop:
        texts, labels = batch
        
        outputs = model(texts)
        outputs = outputs.view(-1, outputs.shape[-1]) 
        labels = labels.reshape(-1)
        loss = loss_fn(outputs, labels)
        
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item() * labels.size(0)
        total += labels.size(0)

        loop.set_postfix({"loss": train_loss/total})

def val_one_epoch(
    model,
    loader,
    loss_fn,
    best_so_far=0.0,
    ckpt_path='best.pt'
):
    
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        desc=f"Epoch {epoch}: val",
        leave=True,
    )
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        model.eval()  # evaluation mode
        for i, batch in loop:
            texts, labels = batch
            outputs = model(texts)
            
            outputs = outputs.view(-1, outputs.shape[-1])  # Shape: [sent len * batch size, num classes]
            labels = labels.reshape(-1) 
            optimizer.zero_grad()
            loss = loss_fn(outputs, labels)
            val_loss += loss.item() * labels.size(0)
            total += labels.size(0)
            
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            loop.set_postfix({"loss": val_loss/total, "acc": correct / total})
            
        current_acc = correct / total
        if current_acc > best_so_far:
            print(f"Validation accuracy improved from {best_so_far:.4f} to {current_acc:.4f}. Saving model...")
            torch.save(model.state_dict(), ckpt_path)
            best_so_far = current_acc

    return best_so_far

In [15]:
INPUT_DIM = len(train_dataset.vocab)
OUTPUT_DIM = len(pos_tags)

embedding_dim = 100  # Size of word embeddings
hidden_dim = 128  # Number of hidden units in the LSTM

pad_idx = PAD_IDX  # Padding index, defined earlier
dropout_rate = 0.1 

model = POSTagger(INPUT_DIM, embedding_dim, hidden_dim, OUTPUT_DIM, pad_idx, dropout_rate)

model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

In [16]:
best_so_far = -float('inf')
num_epochs = 10
for epoch in range(num_epochs):
    train_one_epoch(model, train_dataloader, optimizer, loss_fn)
    best_so_far = val_one_epoch(model, val_dataloader, loss_fn, best_so_far=best_so_far)

Epoch 0: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 0: val:   0%|          | 0/91 [00:00<?, ?it/s]

Validation accuracy improved from -inf to 0.9678. Saving model...


Epoch 1: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 1: val:   0%|          | 0/91 [00:00<?, ?it/s]

Validation accuracy improved from 0.9678 to 0.9736. Saving model...


Epoch 2: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 2: val:   0%|          | 0/91 [00:00<?, ?it/s]

Validation accuracy improved from 0.9736 to 0.9776. Saving model...


Epoch 3: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 3: val:   0%|          | 0/91 [00:00<?, ?it/s]

Validation accuracy improved from 0.9776 to 0.9797. Saving model...


Epoch 4: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 4: val:   0%|          | 0/91 [00:00<?, ?it/s]

Validation accuracy improved from 0.9797 to 0.9813. Saving model...


Epoch 5: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 5: val:   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 6: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 6: val:   0%|          | 0/91 [00:00<?, ?it/s]

Validation accuracy improved from 0.9813 to 0.9814. Saving model...


Epoch 7: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 7: val:   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 8: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 8: val:   0%|          | 0/91 [00:00<?, ?it/s]

Validation accuracy improved from 0.9814 to 0.9824. Saving model...


Epoch 9: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 9: val:   0%|          | 0/91 [00:00<?, ?it/s]

Validation accuracy improved from 0.9824 to 0.9828. Saving model...


# Predictions

Write prediction. That's it. No more instructions, you already made it 3 times.

In [17]:
# you can use the same dataset class
test['tag'] = 'X'
test_dataset = PosTaggingDataset(test, vocab=train_dataset.vocab)

In [18]:
batch_size = 32

# remebder that for training we can use pads but for testing we need to write 
# exact length of the sentence into the seubmission
def collate_batch(batch: list):
    sentences_batch, sentences_lengths = [], []
    
    # Get the max sentence length in the current batch
    max_size = max([len(sent) for sent, _ in batch])
    
    for _sent, _ in batch:
        sentence_length = len(_sent)
        sentences_lengths.append(sentence_length)  # Store the original length
        
        # Pad sentence to the max length in the batch
        if sentence_length < max_size:
            _sent = _sent + [PAD_IDX] * (max_size - sentence_length)  # Add padding
        
        # Append the padded sentence as a tensor
        sentences_batch.append(torch.tensor(_sent))
    
    # Stack the padded sentences and transpose to shape [max_size, batch_size]
    # Sentences will have shape [batch_size, max_size], and we transpose for [max_size, batch_size]
    sentences_batch = torch.stack(sentences_batch, dim=0).int().T.to(device)
    
    return sentences_batch, sentences_lengths

test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [19]:
def predict(model, loader):
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        desc="Predictions",
        leave=True,
    )
    
    predictions = []
    
    # Disable gradient calculations, since we're only doing inference
    with torch.no_grad():
        model.eval()  # Set the model to evaluation mode
        for i, batch in loop:
            texts, lengths = batch  # Get input data (texts) and original sentence lengths
            texts = texts.to(device)  # Move the input to the correct device (GPU/CPU)
            
            # Forward pass through the model
            outputs = model(texts)  # Model outputs are raw logits
            
            # Get the predicted class (POS tags) by taking argmax across the class dimension
            # Outputs have shape [sequence length, batch size, num_classes]
            preds = torch.argmax(outputs, dim=-1)  # Get predictions: shape [sequence length, batch size]
            
            # Transpose to match original input shape [batch size, sequence length]
            preds = preds.T  # Shape [batch size, sequence length]
            
            # Collect predictions, flattening them into a single list, removing padding based on original sentence lengths
            for j, length in enumerate(lengths):
                sentence_preds = preds[j][:length].cpu().numpy().tolist()  # Keep only valid tokens (no padding)
                predictions.extend(sentence_preds)  # Add each sentence's predictions to the flat list

    return predictions

In [20]:
ckpt = torch.load("best.pt")
model.load_state_dict(ckpt)

predictions = predict(model, test_dataloader)
predictions[:10]

Predictions:   0%|          | 0/452 [00:00<?, ?it/s]

[1, 4, 5, 5, 10, 5, 7, 5, 5, 9]

In [21]:
results = pd.Series(predictions).apply(lambda x: idx2cat[x])
results.to_csv('submission.csv', index_label='id')

In [22]:
results

0          ADP
1          DET
2         NOUN
3         NOUN
4            .
          ... 
303020    NOUN
303021     PRT
303022    VERB
303023    NOUN
303024       .
Length: 303025, dtype: object