<a href="https://colab.research.google.com/github/Dimildizio/DS_course/blob/main/Neural_networks/NLP/Language_modeling/LSTM_language_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LSTM language modeling

## Imports

In [None]:
!pip install datasets

In [None]:
import string
from collections import Counter
from typing import List

import matplotlib.pyplot as plt
import nltk
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

import seaborn
seaborn.set(palette='summer')

In [None]:
# `sent_tokenize` / `word_tokenize` rely on NLTK's Punkt tokenizer data.
# Recent NLTK releases load it from the `punkt_tab` resource while older ones
# use `punkt`, so fetch both to work with whichever version is installed.
# (`nltk.download` reports an unknown resource without raising.)
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

## Load dataset

In [None]:
# Load the 'imdb' dataset with `datasets.load_dataset`; the preprocessing
# steps described below are meant to draw their training sentences from it.
dataset = load_dataset('imdb')

## Preprocessing and creating vocab

> Preprocess

> Get vocab or `set` of strings:

  1. Split train samples into separate sentences using `sent_tokenize` from `nltk`. Each sentence becomes a single training sample.

  2. Drop sentences whose word count is **above** `word_threshold`.

  3. Count each word in resulting sentences (Document Frequency). (use `word_tokenize` to split into separate words)

  4. Create `vocab` as a `set` containing `<unk>, <bos>, <eos>, <pad>` plus the `vocab_size` most frequent words.

Get separate sentences and put them in list

In [None]:
word_threshold = 32

# One training sample per sentence: split every train review into sentences
# with `sent_tokenize`, then drop the sentences that contain more than
# `word_threshold` whitespace-separated words.
sentences = [
    sentence
    for review in tqdm(dataset['train']['text'], desc='Splitting reviews')
    for sentence in sent_tokenize(review)
    if len(sentence.split()) <= word_threshold
]

Count frequency

In [None]:
# Document frequency: for every word, the number of sentences it occurs in.
# Sentences are normalised the same way `WordDataset.__getitem__` normalises
# its input (lowercased, ASCII punctuation removed) so that vocabulary
# look-ups made by the dataset can actually hit the counted words.
_punct_table = str.maketrans('', '', string.punctuation)

words = Counter()
for sentence in tqdm(sentences, desc='Counting words'):
    # `set(...)` so a word repeated inside one sentence is counted once.
    words.update(set(word_tokenize(sentence.lower().translate(_punct_table))))

Add vocab_size of the most frequent words into vocab

In [None]:
vocab_size = 40000

# Service tokens first, then the `vocab_size` most frequent words from the
# `words` counter (fewer if the counter holds fewer distinct words).
vocab = {'<unk>', '<bos>', '<eos>', '<pad>'}
vocab.update(word for word, _ in words.most_common(vocab_size))

Bathe in tests

In [None]:
# The vocabulary must contain every service token ...
for service_token in ('<unk>', '<bos>', '<eos>', '<pad>'):
    assert service_token in vocab
# ... and, apart from those four, exactly `vocab_size` entries.
assert len(vocab) == 4 + vocab_size

In [None]:
# Report the number of entries currently held in `vocab`.
print("Total words in vocab:", len(vocab))

### Prepare dataset

Create `__getitem__` (return data sample by input idx) in `WordDataset`.

Add service tokens for the beginning and the end of the sequence, tokenize the sentence using `word_tokenize`, and map each token to its index from `word2idx`.

In [None]:
# Sort before enumerating: a `set` of strings has no stable iteration order
# across interpreter runs (string hashing is randomised per process), so
# without the sort each session would assign different ids to the same words
# and previously trained weights could not be reused.
word2idx = {word: idx for idx, word in enumerate(sorted(vocab))}
idx2word = {idx: word for word, idx in word2idx.items()}

In [None]:
class WordDataset:
  """Map-style dataset that converts text samples into lists of token ids.

  Each sample is lowercased, stripped of ASCII punctuation, tokenized with
  `word_tokenize`, mapped through the module-level `word2idx` (unknown words
  become `<unk>`) and wrapped in `<bos>` ... `<eos>`.

  Args:
    sent: indexable collection of samples. A sample is either a plain string
      (one sentence) or a mapping with 'text' and 'label' entries.
  """

  # Order matters: it mirrors the attribute unpacking in `__init__`.
  _SERVICE_TOKENS = ('<unk>', '<bos>', '<eos>', '<pad>')
  # Built once here instead of on every `__getitem__` call.
  _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

  def __init__(self, sent):
    self.data = sent
    self.unk_id, self.bos_id, self.eos_id, self.pad_id = [
        word2idx[tag] for tag in self._SERVICE_TOKENS]

  def __getitem__(self, idx: int) -> dict:
    """Return {'text': token ids, 'label': label} for sample `idx`.

    'label' is None when the underlying sample is a plain string.
    """
    sample = self.data[idx]
    if isinstance(sample, str):
      raw_text, label = sample, None
    else:
      raw_text, label = sample['text'], sample['label']

    processed_txt = raw_text.lower().translate(self._PUNCT_TABLE)
    tok_sent = [self.bos_id]
    tok_sent += [word2idx.get(word, self.unk_id) for word in word_tokenize(processed_txt)]
    tok_sent += [self.eos_id]

    return {'text': tok_sent, 'label': label}

  def __len__(self) -> int:
    """Number of samples in the dataset."""
    return len(self.data)


In [None]:
def collate_fn_with_padding(input_batch: List[List[int]],
                            pad_id=word2idx['<pad>']) -> dict:
    """Pad a batch of token-id sequences and split it into inputs/targets.

    Args:
        input_batch: samples of one batch. Each sample is either a list of
            token ids or a dict whose 'text' entry holds that list (the form
            `WordDataset.__getitem__` returns).
        pad_id: id appended to shorter sequences.

    Returns:
        Dict with 'input_ids' (every sequence without its last position) and
        'target_ids' (every sequence without its first position), both
        LongTensors on `device`.

    Raises:
        ValueError: if `input_batch` is empty.
    """
    token_seqs = [
        sample['text'] if isinstance(sample, dict) else sample
        for sample in input_batch
    ]
    max_seq_len = max(len(seq) for seq in token_seqs)

    # Build fresh lists instead of appending in place so the sequences handed
    # in by the caller are left untouched.
    padded = [
        list(seq) + [pad_id] * (max_seq_len - len(seq)) for seq in token_seqs
    ]

    sequences = torch.LongTensor(padded).to(device)

    # Targets are the inputs shifted left by one position.
    return {'input_ids': sequences[:, :-1], 'target_ids': sequences[:, 1:]}


In [None]:
# 80 % train; the held-out 20 % is then halved into eval and test.
# The second split must operate on the held-out part only: splitting
# `sentences` again would put training sentences into eval/test.
train_sentences, holdout_sentences = train_test_split(sentences, test_size=0.2)
eval_sentences, test_sentences = train_test_split(holdout_sentences, test_size=0.5)

train_dataset = WordDataset(train_sentences)
eval_dataset = WordDataset(eval_sentences)
test_dataset = WordDataset(test_sentences)

batch_size = 64

train_dataloader = DataLoader(
    train_dataset, collate_fn=collate_fn_with_padding, batch_size=batch_size)

eval_dataloader = DataLoader(
    eval_dataset, collate_fn=collate_fn_with_padding, batch_size=batch_size)

test_dataloader = DataLoader(
    test_dataset, collate_fn=collate_fn_with_padding, batch_size=batch_size)

## Model architecture and training

#### Baseline

In [None]:
class LSTMModel(nn.Module):
    """Embedding -> LSTM -> tanh -> Linear -> ReLU -> Linear language model.

    Args:
        vocab_size: number of tokens; size of the embedding table and of the
            output logits.
        vocab_dim: embedding size, also used as the LSTM hidden size.
        rnn_layers: number of stacked LSTM layers.
        bi: whether the LSTM is bidirectional.

    NOTE(review): with `bi=True` the backward direction reads the tokens to
    the right of each position. If the targets are the inputs shifted by one
    (next-token prediction), the model can see the token it has to predict;
    confirm this is intended or pass `bi=False`.
    """

    def __init__(self, vocab_size, vocab_dim, rnn_layers=2, bi=True):
        super().__init__()
        # A bidirectional LSTM concatenates both directions on the last axis.
        num_directions = 2 if bi else 1
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=vocab_dim)
        self.rnn = nn.LSTM(input_size=vocab_dim, hidden_size=vocab_dim,
                           num_layers=rnn_layers, batch_first=True,
                           bidirectional=bi)
        self.fc_1 = nn.Linear(vocab_dim * num_directions, vocab_dim)
        self.fc_2 = nn.Linear(vocab_dim, vocab_size)

    def forward(self, x):
        """Return logits of shape (batch, seq_len, vocab_size).

        `x` holds integer token ids; the LSTM is `batch_first`, so it is
        expected as (batch, seq_len).
        """
        embedding = self.embedding(x)
        x, _ = self.rnn(embedding)
        x = torch.tanh(x)
        x = self.fc_1(x)
        x = F.relu(x)
        x = self.fc_2(x)
        return x

### Evaluation func (1 point)

Perplexity is the exponential of the cross-entropy loss: $\mathrm{PPL} = \exp(\mathcal{L}_{\mathrm{CE}})$.

In [None]:
def evaluate(model, criterion, dataloader) -> float:
    """Return the mean per-batch perplexity of `model` over `dataloader`.

    Puts the model in eval mode (it is left in that mode) and runs without
    gradient tracking.

    Args:
        model: callable mapping `batch['input_ids']` to logits; assumed to
            return (batch, seq_len, vocab) as `LSTMModel` does.
        criterion: loss taking (N, vocab) logits and (N,) targets.
        dataloader: iterable of dicts with 'input_ids' and 'target_ids'.

    Returns:
        Average of exp(loss) over the batches, or NaN if there are none.
    """
    model.eval()
    batch_perplexities = []
    with torch.no_grad():
        for batch in dataloader:
            logits = model(batch['input_ids'])
            # Merge batch and time axes so the logits line up with the
            # flattened targets.
            loss = criterion(logits.flatten(start_dim=0, end_dim=1),
                             batch['target_ids'].flatten())
            batch_perplexities.append(torch.exp(loss).item())

    if not batch_perplexities:
        return float('nan')

    return sum(batch_perplexities) / len(batch_perplexities)

### Train func (1 point)

In [None]:

def get_epoch(model, optimizer, criterion, epoch, loader, name='Train', scheduler=False):
  """Run one pass over `loader` and return (mean loss, mean perplexity).

  Args:
    model: callable mapping 'input_ids' to logits with batch and time as the
      two leading axes.
    optimizer: optimizer stepped after every training batch.
    criterion: loss taking (N, vocab) logits and (N,) targets.
    epoch: epoch number, used only in the progress-bar label.
    loader: sized iterable of dicts with 'input_ids' and 'target_ids'.
    name: 'Train' trains the model; any other value only evaluates it.
    scheduler: optional LR scheduler, stepped once per training batch.

  Returns:
    Tuple of per-batch averages (loss, exp(loss)); (NaN, NaN) when the
    loader is empty.
  """
  is_train = name == 'Train'
  if is_train:
    model.train()
  else:
    model.eval()

  avg_loss = 0
  avg_perplexity = 0
  # Skip autograd bookkeeping entirely when only evaluating.
  with torch.set_grad_enabled(is_train):
    for batch in tqdm(loader, desc=f'{name} Epoch: {epoch}'):
      xs, ys = batch['input_ids'], batch['target_ids']
      logits = model(xs)
      loss = criterion(logits.flatten(start_dim=0, end_dim=1), ys.flatten())
      if is_train:
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # The schedule must only advance with optimizer updates, never
        # during validation.
        if scheduler:
          scheduler.step()
      avg_loss += loss.item()
      avg_perplexity += torch.exp(loss).item()

  num_batches = len(loader)
  if num_batches == 0:
    return float('nan'), float('nan')
  return avg_loss / num_batches, avg_perplexity / num_batches


def fit(model, optimizer, criterion, train_loader, eval_loader, scheduler, epochs=None):
  """Train `model` for several epochs, validating after each one.

  Args:
    model, optimizer, criterion: forwarded to `get_epoch`.
    train_loader: batches used for training.
    eval_loader: batches used for validation.
    scheduler: LR scheduler (or a falsy value) forwarded to the training pass
      only, so validation never advances the schedule.
    epochs: number of epochs; when None, `CFG.epochs` is used (`CFG` must
      then be defined by the surrounding notebook).

  Returns:
    Four lists with one entry per epoch: train loss, validation loss,
    train perplexity, validation perplexity.
  """
  if epochs is None:
    epochs = CFG.epochs

  hist_loss_train, hist_loss_val, hist_perp_train, hist_perp_val = [], [], [], []
  for epoch in tqdm(range(epochs)):
    train_loss, train_perp = get_epoch(model, optimizer, criterion, epoch, train_loader, scheduler=scheduler)
    hist_loss_train.append(train_loss)
    hist_perp_train.append(train_perp)

    val_loss, val_perp = get_epoch(model, optimizer, criterion, epoch, eval_loader, name='Val')
    hist_loss_val.append(val_loss)
    hist_perp_val.append(val_perp)

    t_loss = round(train_loss, 5)
    v_loss = round(val_loss, 5)
    t_perplexity = round(train_perp, 5)
    v_perplexity = round(val_perp, 5)

    print(f"Epoch: {epoch+1}.\n\tTrain loss: {t_loss}\n\ttest_loss: {v_loss}"
          f"\n\ttrain_perplexity: {t_perplexity}\n\ttest_perplexity: {v_perplexity}")

  return hist_loss_train, hist_loss_val, hist_perp_train, hist_perp_val