<a href="https://colab.research.google.com/github/AlexeyTri/PyTorchTutorials_2025/blob/main/14_NLP_RNN_ATTENTION_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%bash
pip install -q torchmetrics

   ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 983.2/983.2 kB 11.0 MB/s eta 0:00:00


In [2]:
import torch
import torchmetrics

In [3]:
if torch.cuda.is_available():
    device = 'cuda'
if torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

In [4]:
def evaluate_tm(model, dataloader, metric):
    model.eval()
    metric.reset()
    for X_batch, y_batch in dataloader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        pred = model(X_batch)
        metric.update(pred, y_batch)
    return metric.compute()

In [5]:
def train(model, optimizer, loss_fn, metric, train_loader, valid_loader, n_epochs, patience=2, factor=0.5, epoch_callback=None):
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='max',factor=factor, patience=patience)
    history = {"train_losses": [], "train_metrics": [], "valid_metrics": []}
    for epoch in range(n_epochs):
        model.train()
        metric.reset()
        total_loss = 0.
        if epoch_callback is not None:
            epoch_callback(model, epoch)
        for index, (X_batch, y_batch) in enumerate(train_loader):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            loss = loss_fn(y_pred, y_batch)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            metric.update(y_pred, y_batch)
            train_metric = metric.compute().item()
            print(f"\rBatch {index + 1}/ {len(train_loader)}", end="")
            print(f", loss={total_loss/(index+1):.4f}", end="")
            print(f", {train_metric=:.2%}", end="")
        history["train_losses"].append(total_loss/len(train_loader))
        history["train_metrics"].append(train_metric)
        val_metric = evaluate_tm(model, valid_loader, metric=metric)
        history["valid_metrics"].append(val_metric.item())
        scheduler.step(val_metric)
        print(f"\rEpoch {epoch+1}/{n_epochs},"
              f"train loss: {history["train_losses"][-1]:.4f},"
              f"train metrics: {history["train_metrics"][-1]:.2%}, "
              f"valid metrics: {history["valid_metrics"][-1]:.2%} ")
    return history

In [6]:
import gc

def del_vars(variable_names=[]):
    for name in variable_names:
        try:
            del globals()[name]
        except KeyError:
            pass  # ignore variables that have already been deleted
    gc.collect()
    if device == "cuda":
        torch.cuda.empty_cache()

# Generating Shakespearean Text Using a Character RNN

**Creating the Training Dataset**

In [7]:
from pathlib import Path
import urllib.request

def dowload_shkspeare_text():
    path_file = Path("datasets/shakespeare/shakespeare.txt")
    if not path_file.is_file():
        path_file.parent.mkdir(parents=True, exist_ok=True)
        url = "https://homl.info/shakespeare"
        urllib.request.urlretrieve(url, path_file)
    return path_file.read_text()

In [8]:
shakespear_text = dowload_shkspeare_text()

In [9]:
print(shakespear_text[:80])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


In [10]:
vocab = sorted(set(shakespear_text.lower()))

In [11]:
"".join(vocab)

"\n !$&',-.3:;?abcdefghijklmnopqrstuvwxyz"

In [12]:
char_to_idx = {char: index for index, char in enumerate(vocab)}
index_to_char = {index: char for index, char in enumerate(vocab)}

In [13]:
char_to_idx["a"], index_to_char[13]

(13, 'a')

In [14]:
def encode_text(text):
    return torch.tensor([char_to_idx[char] for char in text.lower()])
def decode_text(char_ids):
    return "".join([index_to_char[char_id.item()] for char_id in char_ids])

In [15]:
encoded = encode_text("Hello world")
encoded

tensor([20, 17, 24, 24, 27,  1, 35, 27, 30, 24, 16])

In [16]:
decode_text(encoded)

'hello world'

In [17]:
from torch.utils.data import DataLoader, Dataset

class CharDataset(Dataset):
    def __init__(self, text, window_length):
        self.encode_text = encode_text(text)
        self.window_length = window_length

    def __len__(self):
        return len(self.encode_text) - self.window_length


    def __getitem__(self, idx):
        if idx >= len(self.encode_text):
            raise IndexError("length over len array")
        end = idx + self.window_length
        window = self.encode_text[idx : end]
        target = self.encode_text[idx+1 : end+1]
        return window, target


In [18]:
to_be_dataset = CharDataset("To be or not to be", window_length=7)
for x, y in to_be_dataset:
    print(f"x: {x}, y: {y}")
    print(f"decode x: {decode_text(x)}, decode y: {decode_text(y)}")


x: tensor([32, 27,  1, 14, 17,  1, 27]), y: tensor([27,  1, 14, 17,  1, 27, 30])
decode x: to be o, decode y: o be or
x: tensor([27,  1, 14, 17,  1, 27, 30]), y: tensor([ 1, 14, 17,  1, 27, 30,  1])
decode x: o be or, decode y:  be or 
x: tensor([ 1, 14, 17,  1, 27, 30,  1]), y: tensor([14, 17,  1, 27, 30,  1, 26])
decode x:  be or , decode y: be or n
x: tensor([14, 17,  1, 27, 30,  1, 26]), y: tensor([17,  1, 27, 30,  1, 26, 27])
decode x: be or n, decode y: e or no
x: tensor([17,  1, 27, 30,  1, 26, 27]), y: tensor([ 1, 27, 30,  1, 26, 27, 32])
decode x: e or no, decode y:  or not
x: tensor([ 1, 27, 30,  1, 26, 27, 32]), y: tensor([27, 30,  1, 26, 27, 32,  1])
decode x:  or not, decode y: or not 
x: tensor([27, 30,  1, 26, 27, 32,  1]), y: tensor([30,  1, 26, 27, 32,  1, 32])
decode x: or not , decode y: r not t
x: tensor([30,  1, 26, 27, 32,  1, 32]), y: tensor([ 1, 26, 27, 32,  1, 32, 27])
decode x: r not t, decode y:  not to
x: tensor([ 1, 26, 27, 32,  1, 32, 27]), y: tensor([26, 

In [19]:
window_length = 50
batch_size = 512

train_size = CharDataset(shakespear_text[: 1_000_000], window_length=window_length)
valid_size = CharDataset(shakespear_text[1_000_000 : 1_060_000], window_length=window_length)
test_size = CharDataset(shakespear_text[1_060_000 : ], window_length=window_length)

train_loader = DataLoader(train_size, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_size, batch_size=batch_size)
test_loader = DataLoader(test_size, batch_size=batch_size)




**Embeddings**

$emb = sqrt(n)$

In [20]:
import torch.nn as nn

torch.manual_seed(42)

embed = nn.Embedding(5, 3)
embed(torch.tensor([[0, 1], [2, 1], [3, 4]]))

tensor([[[ 0.3367,  0.1288,  0.2345],
         [ 0.2303, -1.1229, -0.1863]],

        [[ 2.2082, -0.6380,  0.4617],
         [ 0.2303, -1.1229, -0.1863]],

        [[ 0.2674,  0.5349,  0.8094],
         [ 1.1103, -1.6898, -0.9890]]], grad_fn=<EmbeddingBackward0>)

In [21]:
line = nn.Linear(5, 3, bias=False)
line.forward(torch.tensor([0., 0., 0., 1., 0.]))

tensor([ 0.2764,  0.1202, -0.1955], grad_fn=<SqueezeBackward4>)

In [22]:
line.weight.T.std(dim=1)

tensor([0.0956, 0.1953, 0.2319, 0.2404, 0.1897], grad_fn=<StdBackward0>)

In [23]:
embed.weight.std(dim=1)

tensor([0.1039, 0.6930, 1.4353, 0.2710, 1.4571], grad_fn=<StdBackward0>)

**Building and Training the Char-RNN Model**

In [24]:
class ShakespearModel(nn.Module):
    def __init__(self, vocab_size, n_layers=2, embed_dim=10, hidden_dim=128, dropout=0.1):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers=n_layers, batch_first=True, dropout=dropout)
        self.linear = nn.Linear(hidden_dim, vocab_size)

    def forward(self, X):
        embeddings = self.embed(X)
        outputs, _states = self.gru(embeddings)
        return self.linear(outputs).permute(0, 2, 1)

In [25]:
torch.manual_seed(42)
model = ShakespearModel(len(vocab)).to(device)

In [27]:
n_epochs = 5
xentropy = nn.CrossEntropyLoss()
optimizer = torch.optim.NAdam(model.parameters())
accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=len(vocab)).to(device)

history = train(model=model, optimizer=optimizer, loss_fn=xentropy, metric=accuracy, train_loader=train_loader, valid_loader=valid_loader, n_epochs=n_epochs)

Epoch 1/5,train loss: 1.5975,train metrics: 51.42%, valid metrics: 51.81% 
Epoch 2/5,train loss: 1.3933,train metrics: 56.47%, valid metrics: 52.01% 
Epoch 3/5,train loss: 1.3648,train metrics: 57.19%, valid metrics: 53.26% 
Epoch 4/5,train loss: 1.3511,train metrics: 57.53%, valid metrics: 52.94% 
Epoch 5/5,train loss: 1.3428,train metrics: 57.74%, valid metrics: 53.94% 


In [28]:
torch.save(model.state_dict(), "my_shakespeare_model.pt")

In [26]:
model.load_state_dict(torch.load("/content/my_shakespeare_model.pt"))


<All keys matched successfully>

In [27]:
model.eval()
text = "To be or not to b"
encoded_text = encode_text(text).unsqueeze(dim=0).to(device)
with torch.no_grad():
    Y_logits = model(encoded_text)
    predicted_char_id = Y_logits[0, :, -1].argmax().item()
    predict_char = index_to_char[predicted_char_id]

predict_char

'e'

**Generating Shakespearean Text**

In [28]:
import torch.nn.functional as F

def next_char(model, text, temperature=1):
    encoded_text = encode_text(text).unsqueeze(dim=0).to(device)
    with torch.no_grad():
        Y_logits = model(encoded_text)
        Y_probas = F.softmax(Y_logits[0, : , -1]/temperature, dim=-1)
        predict_char_id = torch.multinomial(Y_probas, num_samples=1).item()
        return index_to_char[predict_char_id]

In [29]:
def extend_char(model, text, n_chars=80, temperature=1):
    for _ in range(n_chars):
        text += next_char(model, text, temperature)
    return text

In [30]:
print(extend_char(model, "To be or not to b", temperature=0.01))

To be or not to be so so should be so so should be so so should be so so should be so so should b


In [31]:
print(extend_char(model, "To be or not to b", temperature=0.4))

To be or not to be the son of the noble lords, and will not will i may not shall be i would not k


In [32]:
print(extend_char(model, "To be or not to b", temperature=100))

To be or not to bepvcvri,zgp&?:fkp:s,ixm.o;vtz ;rx'.&v'h'feehj3nne$!pdh?qjasq'&.y3t,lwb;hlozcqt. 


In [33]:
Out.clear()  # clear Jupyter's `Out` variable which saves all the cell outputs
del_vars(["accuracy", "embed", "encoded", "encoded_text", "optimizer", "probs",
          "samples", "x", "y", "shakespeare_text", "stateful_test_loader",
          "stateful_train_loader", "Y_logits", "stateful_valid_loader",
          "test_loader", "train_loader", "valid_loader", "xentropy"])

**Sentiment Analysis**



```
Loading the IMDB Dataset
```



In [34]:
!pip install datasets



In [138]:
from datasets import load_dataset

imdb_dataset = load_dataset("imdb")
split = imdb_dataset["train"].train_test_split(train_size=0.8, seed=42)
imdb_train_set, imdb_valid_set = split["train"], split["test"]
imdb_test_set = imdb_dataset["test"]

In [40]:
imdb_train_set[1]["text"], imdb_train_set[1]["label"]

("'The Rookie' was a wonderful movie about the second chances life holds for us and also puts an emotional thought over the audience, making them realize that your dreams can come true. If you loved 'Remember the Titans', 'The Rookie' is the movie for you!! It's the feel good movie of the year and it is the perfect movie for all ages. 'The Rookie' hits a major home run!",
 1)

In [41]:
imdb_train_set[16]["text"], imdb_train_set[16]["label"]

("Lillian Hellman's play, adapted by Dashiell Hammett with help from Hellman, becomes a curious project to come out of gritty Warner Bros. Paul Lukas, reprising his Broadway role and winning the Best Actor Oscar, plays an anti-Nazi German underground leader fighting the Fascists, dragging his American wife and three children all over Europe before finding refuge in the States (via the Mexico border). They settle in Washington with the wife's wealthy mother and brother, though a boarder residing in the manor is immediately suspicious of the newcomers and spends an awful lot of time down at the German Embassy playing poker. It seems to take forever for this drama to find its focus, and when we realize what the heart of the material is (the wise, honest, direct refugees teaching the clueless, head-in-the-sand Americans how the world has suddenly changed), it seems a little patronizing--the viewer is quite literally put in the relatives' place, being lectured to. Lukas has several speeches

In [42]:
!pip install tokenizers



In [43]:
import tokenizers

In [46]:
bpe_model = tokenizers.models.BPE(unk_none="<unk>")
bpe_tokenizer = tokenizers.Tokenizer(bpe_model)
bpe_tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Whitespace()

special_token = ["<pad>", "<unk>"]
bpe_trainer = tokenizers.trainers.BpeTrainer(vocab_size=1000, show_progress=True, special_tokens=special_token)
train_reviews = [review["text"].lower() for review in imdb_train_set]

bpe_tokenizer.train_from_iterator(train_reviews, bpe_trainer)

In [51]:
some_review = "what an awesome movie! üòä"

bpe_encod = bpe_tokenizer.encode(some_review)
bpe_encod

Encoding(num_tokens=7, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [52]:
bpe_encod.tokens, bpe_encod.ids

(['what', 'an', 'aw', 'es', 'ome', 'movie', '!'],
 [303, 139, 373, 149, 240, 211, 4])

In [54]:
bpe_tokenizer.get_vocab()['what']

303

In [56]:
bpe_tokenizer.id_to_token(303)

'what'

In [59]:
bpe_tokenizer.decode([303, 139])

'what an'

In [61]:
bpe_encod.offsets # —Å–º–µ—â–µ–Ω–∏–µ –∫–∞–∂–¥–æ–≥–æ —Ç–æ–∫–µ–Ω–∞ –≤ —Å—Ç—Ä–æ–∫–µ

[(0, 4), (5, 7), (8, 10), (10, 12), (12, 15), (16, 21), (21, 22)]

In [62]:
bpe_tokenizer.encode_batch(train_reviews[:3])

[Encoding(num_tokens=281, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=114, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=285, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

In [63]:
bpe_tokenizer.enable_padding(pad_id=0, pad_token="<pad>")
bpe_tokenizer.enable_truncation(max_length=500)

In [69]:
bpe_encodings = bpe_tokenizer.encode_batch(train_reviews[:3])
bpe_batch_ids = torch.tensor([encoding.ids for encoding in bpe_encodings])
bpe_batch_ids

tensor([[159, 402, 176, 246,  61, 782, 156, 737, 252,  42, 239,  51, 154, 460,
         917,  17, 272, 156, 737, 576, 215, 976, 275,  42, 199,  44, 554,  42,
         192, 585,  57, 160, 259, 170, 157, 143, 138, 159, 402,  11, 589, 152,
           5, 819, 168, 230,   5, 521, 924, 981, 962, 250,  61,  10,  60, 426,
         526, 959,  60, 138, 199, 150, 319,  15, 363, 141, 957, 694,  47, 696,
          61, 875, 138, 960, 337, 414, 140, 157, 385, 174, 433, 161, 221, 145,
         213,  17, 549,  15, 151,  10,  60,  55, 416, 146, 407, 144, 182, 303,
         151, 141,  17, 138, 547, 538, 528, 768,  54, 335,  42, 203,  44, 270,
          46, 153, 876, 141, 919, 233, 522, 172, 141, 719, 162, 807, 279,  17,
         138,  45,  66,  55, 188, 989, 156, 378, 698, 301, 296, 689, 212, 558,
         926, 148,  17,  44, 270,  46, 141,  47, 279, 302, 171, 152, 787,  15,
         153, 522, 172, 766, 205, 156, 234, 677, 161, 139, 513, 146, 370, 251,
         219, 162, 197, 162, 166,  50, 265,  47, 266

In [70]:
attention_mask = torch.tensor([encoding.attention_mask for encoding in bpe_encodings])
attention_mask

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [88]:
attention_mask.sum(dim=1), attention_mask.shape

(tensor([281, 114, 285]), torch.Size([3, 285]))

–í –ø—Ä–∏–º–µ—Ä–µ –¥–ª—è –î–ó –∑–∞–º–µ–Ω–∏ BpeTrainer –Ω–∞ WordPiceTrainer

In [89]:
import transformers

In [92]:
gpt2_tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
gpt2_encoding = gpt2_tokenizer(train_reviews[:3], truncation=True, max_length=500)

In [93]:
gpt2_token_ids = gpt2_encoding["input_ids"][0][:10]
gpt2_token_ids

[14247, 35030, 1690, 423, 257, 1688, 8046, 13, 484, 1690]

In [95]:
gpt2_tokenizer.decode(gpt2_token_ids)

'stage adaptations often have a major fault. they often'

In [139]:
bert_tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")
bert_encoding = bert_tokenizer(train_reviews[:3], padding=True,
                               truncation=True, max_length=500,
                               return_tensors="pt")

In [99]:
bert_encoding["input_ids"][:3]

tensor([[  101,  2754, 17241,  2411,  2031,  1037,  2350,  6346,  1012,  2027,
          2411,  2272,  2041,  2559,  2066,  1037,  2143,  4950,  2001,  3432,
          2872,  2006,  1996,  2754,  1006,  2107,  2004,  1000,  2305,  2388,
          1000,  1007,  1012, 11430, 11320, 11368,  1005,  1055,  3257,  7906,
          1996,  2143,  4142,  1010,  2029,  2003,  2926,  3697,  2144,  1996,
          3861,  3253,  2032,  2053,  2613,  4119,  1012,  2145,  1010,  2009,
          1005,  1055,  3835,  2000,  2298,  2012,  2005,  2054,  2009,  2003,
          1012,  1996,  6370,  2090,  2745, 19881,  1998,  5696, 20726,  2003,
          3243,  8235,  1012,  1996, 10949,  1997,  2037,  3276,  2024, 11341,
          1012, 19881,  2003, 10392,  2004,  2467,  1010,  1998, 20726,  4152,
          2028,  1997,  2010,  2261,  9592,  2000,  2428,  2552,  1012,  1026,
          7987,  1013,  1028,  1026,  7987,  1013,  1028,  1045, 18766,  2008,
          1045,  1005,  2310,  2196,  2464, 11209, 2

In [100]:
bert_encoding["attention_mask"]

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [104]:
hf_tokenizer = transformers.PreTrainedTokenizerFast(tokenizer_object=bpe_tokenizer)
hf_encodings = hf_tokenizer(train_reviews[:3], padding=True, truncation=True, max_length=500, return_tensors="pt")
hf_encodings["input_ids"]

tensor([[159, 402, 176, 246,  61, 782, 156, 737, 252,  42, 239,  51, 154, 460,
         917,  17, 272, 156, 737, 576, 215, 976, 275,  42, 199,  44, 554,  42,
         192, 585,  57, 160, 259, 170, 157, 143, 138, 159, 402,  11, 589, 152,
           5, 819, 168, 230,   5, 521, 924, 981, 962, 250,  61,  10,  60, 426,
         526, 959,  60, 138, 199, 150, 319,  15, 363, 141, 957, 694,  47, 696,
          61, 875, 138, 960, 337, 414, 140, 157, 385, 174, 433, 161, 221, 145,
         213,  17, 549,  15, 151,  10,  60,  55, 416, 146, 407, 144, 182, 303,
         151, 141,  17, 138, 547, 538, 528, 768,  54, 335,  42, 203,  44, 270,
          46, 153, 876, 141, 919, 233, 522, 172, 141, 719, 162, 807, 279,  17,
         138,  45,  66,  55, 188, 989, 156, 378, 698, 301, 296, 689, 212, 558,
         926, 148,  17,  44, 270,  46, 141,  47, 279, 302, 171, 152, 787,  15,
         153, 522, 172, 766, 205, 156, 234, 677, 161, 139, 513, 146, 370, 251,
         219, 162, 197, 162, 166,  50, 265,  47, 266

**Building and Training a Sentiment Analysis Model**

In [123]:
def collate_fn(batch, tokenizer=bert_tokenizator):
    reviews = [review["text"] for review in batch]
    labels = [review["label"] for review in batch]
    encodings = tokenizer(reviews, padding=True, truncation=True, max_length=200, return_tensors='pt')
    labels = torch.tensor(labels, dtype=torch.float32)
    return encodings, labels


In [140]:
def collate_fn(batch, tokenizer=bert_tokenizator):
    reviews = [review["text"] for review in batch]
    labels = [[review["label"]] for review in batch]
    encodings = tokenizer(reviews, padding=True, truncation=True,
                          max_length=200, return_tensors="pt")
    labels = torch.tensor(labels, dtype=torch.float32)
    return encodings, labels

batch_size = 256
imdb_train_loader = DataLoader(imdb_train_set, batch_size=batch_size,
                               collate_fn=collate_fn, shuffle=True)
imdb_valid_loader = DataLoader(imdb_valid_set, batch_size=batch_size,
                               collate_fn=collate_fn)
imdb_test_loader = DataLoader(imdb_test_set, batch_size=batch_size,
                              collate_fn=collate_fn)

In [152]:
class SentimentAnalysisModel(nn.Module):
    def __init__(self, vocab_size, n_layers=2, embed_dim=128, hidden_dim=64,
                 pad_id=0, dropout=0.2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim,
                                  padding_idx=pad_id)
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers=n_layers,
                          batch_first=True, dropout=dropout)
        self.output = nn.Linear(hidden_dim, 1)

    def forward(self, encodings):
        embeddings = self.embed(encodings["input_ids"])
        _outputs, hidden_states = self.gru(embeddings)
        return self.output(hidden_states[-1])

In [113]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

sequence = torch.tensor([[1,2,0,0], [5,6,7,8]])
packed = pack_padded_sequence(sequence, lengths=(2,4), enforce_sorted=False, batch_first=True)
packed

PackedSequence(data=tensor([5, 1, 6, 2, 7, 8]), batch_sizes=tensor([2, 2, 1, 1]), sorted_indices=tensor([1, 0]), unsorted_indices=tensor([1, 0]))

In [114]:
padded, lengths = pad_packed_sequence(packed, batch_first=True)
padded, lengths

(tensor([[1, 2, 0, 0],
         [5, 6, 7, 8]]),
 tensor([2, 4]))

In [145]:
class SentimentAnalysisModelPackedSeq(nn.Module):
    def __init__(self, vocab_size, n_layers=2, embed_dim=128,
                 hidden_dim=64, pad_id=0, dropout=0.2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim,
                                  padding_idx=pad_id)
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers=n_layers,
                          batch_first=True, dropout=dropout)
        self.output = nn.Linear(hidden_dim, 1)

    def forward(self, encodings):
        embeddings = self.embed(encodings["input_ids"])
        lengths = encodings["attention_mask"].sum(dim=1)                      # <= line added
        packed = pack_padded_sequence(embeddings, lengths=lengths.cpu(),      # <= line added
                                      batch_first=True, enforce_sorted=False) # <= line added
        _outputs, hidden_states = self.gru(packed)                            # <= line changed
        return self.output(hidden_states[-1])

In [157]:
torch.manual_seed(42)

vocab_size = bert_tokenizer.vocab_size
imdb_model_ps = SentimentAnalysisModelPackedSeq(vocab_size).to(device)

n_epochs = 1
xentropy = nn.BCEWithLogitsLoss()
optimizer = torch.optim.NAdam(imdb_model_ps.parameters())
accuracy = torchmetrics.Accuracy(task="binary").to(device)

history = train(imdb_model_ps, optimizer, xentropy, accuracy,
                imdb_train_loader, imdb_valid_loader, n_epochs)

Epoch 1/1,train loss: 0.6726,train metrics: 58.27%, valid metrics: 59.66% 


**Bidirectional RNNs**