<a href="https://colab.research.google.com/github/Dimildizio/DS_course/blob/main/Neural_networks/NLP/Language_modeling/LSTM_language_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LSTM language modeling

## Imports

In [None]:
!pip install datasets

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import numpy as np
import matplotlib.pyplot as plt

from tqdm.auto import tqdm
from datasets import load_dataset
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split
import nltk

from collections import Counter
from typing import List

import seaborn
seaborn.set(palette='summer')

In [None]:
# Newer NLTK releases make sent_tokenize/word_tokenize look up 'punkt_tab',
# while older ones read the 'punkt' package, so fetch both.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

## Load dataset

In [None]:
# Load the dataset registered under the 'imdb' identifier.
dataset = load_dataset(path='imdb')

## Preprocessing and creating vocab

> Preprocess

> Get vocab or `set` of strings:

  1. split train samples into separate sentences using `sent_tokenize` from `nltk`. Each separate sentence would be a single instance of training samples.

  2. Drop sentences whose word count is **above** `word_threshold`

  3. Count each word in resulting sentences (Document Frequency). (use `word_tokenize` to split into separate words)

  4. Create a `vocab` object of type `set` containing `<unk>, <bos>, <eos>, <pad>` and the `vocab_size` most frequent words.

Get separate sentences and put them in list

In [None]:
sentences = []
word_threshold = 32

# Every training review is split into sentences; a sentence becomes a training
# sample only if it has at most `word_threshold` word tokens.
for review in tqdm(dataset['train']['text'], desc='Splitting reviews'):
    for sentence in sent_tokenize(review):
        if len(word_tokenize(sentence)) <= word_threshold:
            sentences.append(sentence)

Count frequency

In [None]:
words = Counter()

# Document frequency: a sentence adds at most one count per distinct token,
# no matter how many times the token repeats inside it.
for sentence in tqdm(sentences, desc='Counting words'):
    words.update(set(word_tokenize(sentence)))

Add vocab_size of the most frequent words into vocab

In [None]:
vocab_size = 40000

# The service tokens are always present; the remaining entries are the
# `vocab_size` most frequent words from the `words` counter.
vocab = {'<unk>', '<bos>', '<eos>', '<pad>'}
vocab.update(word for word, _ in words.most_common(vocab_size))

Bathe in tests

In [None]:
# All four service tokens must be in the vocabulary ...
assert all(token in vocab for token in ('<unk>', '<bos>', '<eos>', '<pad>'))
# ... and they account for exactly four entries on top of the frequent words.
assert len(vocab) - 4 == vocab_size

In [None]:
# Report the final vocabulary size (service tokens included).
print(f"Total words in vocab: {len(vocab)}")

### Prepare dataset

Create `__getitem__` (return data sample by input idx) in `WordDataset`.

Add service tokens for the beginning and the end of the sequence, tokenize the sentence using `word_tokenize`, and map each token to its index from `word2ind`.

In [None]:
# Sort before enumerating: a set of strings iterates in an order that can
# change between interpreter runs, which would assign every word a different
# index after a kernel restart and invalidate any saved model.
word2ind = {word: index for index, word in enumerate(sorted(vocab))}
ind2word = {index: word for word, index in word2ind.items()}

In [None]:
class WordDataset:
    """Map-style dataset that turns raw sentences into lists of token ids.

    Token ids are looked up in the module-level `word2ind` mapping.
    """

    def __init__(self, sentences):
        """Store the sentences and cache the ids of the service tokens.

        Args:
            sentences: indexable collection of raw sentence strings.
        """
        self.data = sentences
        self.unk_id = word2ind['<unk>']
        self.bos_id = word2ind['<bos>']
        self.eos_id = word2ind['<eos>']
        self.pad_id = word2ind['<pad>']

    def __getitem__(self, idx: int) -> List[int]:
        """Return sentence `idx` encoded as `<bos>`, word ids, `<eos>`.

        The sentence is split with `word_tokenize`; any token missing from
        `word2ind` is replaced with the `<unk>` id.
        """
        tokenized_sentence = [self.bos_id]
        tokenized_sentence += [
            word2ind.get(word, self.unk_id)
            for word in word_tokenize(self.data[idx])
        ]
        tokenized_sentence.append(self.eos_id)

        return tokenized_sentence

    def __len__(self) -> int:
        """Return the number of stored sentences."""
        return len(self.data)

In [None]:
def collate_fn_with_padding(
    input_batch: List[List[int]], pad_id=word2ind['<pad>']) -> dict:
    """Pad a batch of token-id lists to equal length and build LM inputs/targets.

    Args:
        input_batch: token-id sequences, one list per sample.
        pad_id: id appended to shorter sequences; defaults to the `<pad>` id
            taken from `word2ind` when the function is defined.

    Returns:
        A dict with two LongTensors placed on `device`:
        'input_ids' is every padded sequence without its last position and
        'target_ids' is the same sequence without its first position, i.e.
        the inputs shifted one step to the left.
    """
    max_seq_len = max(len(sequence) for sequence in input_batch)

    # Build fresh padded lists rather than appending to the caller's
    # sequences, so the samples handed in are left unmodified.
    padded_batch = [
        list(sequence) + [pad_id] * (max_seq_len - len(sequence))
        for sequence in input_batch
    ]

    sequences = torch.LongTensor(padded_batch).to(device)

    return {
        'input_ids': sequences[:, :-1],
        'target_ids': sequences[:, 1:],
    }

In [None]:
# Fixed seed so the three subsets are the same after every kernel restart.
split_seed = 42

# 80% of the sentences go to train; the held-out 20% is then halved into eval
# and test. The second split must start from the held-out part (not from the
# full `sentences` list), otherwise eval and test would overlap with train.
train_sentences, eval_sentences = train_test_split(
    sentences, test_size=0.2, random_state=split_seed)
eval_sentences, test_sentences = train_test_split(
    eval_sentences, test_size=0.5, random_state=split_seed)

train_dataset = WordDataset(train_sentences)
eval_dataset = WordDataset(eval_sentences)
test_dataset = WordDataset(test_sentences)

batch_size = 128

train_dataloader = DataLoader(
    train_dataset, collate_fn=collate_fn_with_padding, batch_size=batch_size)

eval_dataloader = DataLoader(
    eval_dataset, collate_fn=collate_fn_with_padding, batch_size=batch_size)

test_dataloader = DataLoader(
    test_dataset, collate_fn=collate_fn_with_padding, batch_size=batch_size)

## Model architecture and training