#  <font color='#FFE15B'><b> Language Modeling </b></font>

# 🔴 **Import Libs**

In [None]:
!pip install torchtext==0.15.2

In [None]:
!pip install -q torchdata==0.4.1

In [3]:
!pip install -q torchmetrics

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/841.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.3/841.5 kB[0m [31m6.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━[0m [32m655.4/841.5 kB[0m [31m9.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m841.5/841.5 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install -q portalocker>=2.0.0

In [1]:
import numpy as np
import matplotlib.pyplot as plt

import torchtext

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset

from torch import optim
from torch.nn import functional as F

import tqdm
import torchmetrics as tm

In [5]:
!python --version
print(torch.__version__)
print(torchtext.__version__)

Python 3.10.12
2.0.1+cu117
0.15.2+cpu


In [6]:
for lib in [np, torch, torchtext, tqdm]:
  print(lib.__name__, '-->', lib.__version__)

numpy --> 1.25.2
torch --> 2.0.1+cu117
torchtext --> 0.15.2+cpu
tqdm --> 4.66.2


# 🔴 **Utils**

In [2]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [3]:
def num_trainable_params(model):
  nums = sum(p.numel() for p in model.parameters() if p.requires_grad)/1e6
  return nums

# 🔴 **Dataset**

## 🟠 **Load the Dataset**

🔰 In this session you should load WikiText2 dataset.

In [None]:
# n * b * l >> no need to dataloader
# n * l     >> need dataloader

In [88]:
from torchtext.datasets import WikiText2
train_iter, valid_iter, test_iter = WikiText2(root='/content/datasets/WikiText2',
                                              split=('train', 'valid', 'test'))

In [89]:
train_iter

ShardingFilterIterDataPipe

In [117]:
class CustomDataset(torch.utils.data.IterableDataset):

  def __init__(self, file_path):
    self.file_path = file_path

  def parse_file(self):
    with open(self.file_path, 'r') as file:
      for line in file:
        yield line.strip()

  def __iter__(self):
    return self.parse_file()

In [118]:
train_dataset = CustomDataset('/content/datasets/WikiText2/wiki.train.tokens')
valid_dataset = CustomDataset('/content/datasets/WikiText2/wiki.valid.tokens')
test_dataset  = CustomDataset('/content/datasets/WikiText2/wiki.test.tokens')

In [119]:
next(train_dataset)

TypeError: 'CustomDataset' object is not an iterator

In [90]:
train_iter = iter(valid_iter)
train_iter

<generator object ShardingFilterIterDataPipe.__iter__ at 0x7b566d9dd0e0>

In [None]:
next(train_iter)

## 🟠 **Build vocabulary and save it**

🔰 In this section we need to:

*   Define a tokenizer using `basic_english`
*   Tokenize the dataset and collect tokens
*   Build the vocabulary using `build_vocab_from_iterator`
*   Manually insert special tokens and set the default index


In [92]:
from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer("basic_english")

In [1]:
txt = ['@sajad hi sajad! 1 n2 3 #45', 'how are are you?']

tokenizer = get_tokenizer('basic_english')

[tokenizer(line) for line in txt]
# list(map(tokenizer, txt))

NameError: name 'get_tokenizer' is not defined

In [94]:
from torchtext.vocab import build_vocab_from_iterator

vocab = build_vocab_from_iterator(map(tokenizer, txt), specials=['<unk>'], min_freq=1)
vocab.set_default_index(vocab['<unk>'])
vocab.get_stoi()

{'sajad': 11,
 'n2': 10,
 'how': 9,
 '@sajad': 7,
 '?': 6,
 '3': 5,
 '1': 4,
 '#45': 3,
 '!': 2,
 'are': 1,
 'you': 12,
 'hi': 8,
 '<unk>': 0}

In [95]:
vocab(['ebi', 'hi', 'qwerty'])

[0, 8, 0]

In [96]:
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])

In [97]:
len(vocab)

1

## 🟠 EDA

### 🟡 Let's explore the WikiText2 dataset!

### 🟡 Calculate basic statistics such as the number of documents, total words, average document length, etc.

### 🟡 Analyze the most common and least common words in the dataset.

### 🟡  Please proceed with further exploration of the dataset. what do you suggest?

## 🟠 Transform the data

🛑 Make sure to perform the transformations on train, validation and test datasets.

🔰 Reshape the dataset into an `N x B x L` or `M x L` format, where `N` represents the number of batches, `B` is the batch size, `L` is the length of a sample within each batch, and `M` is equal to `N x B`.

In [None]:
def data_process(raw_text_iter, batch_size, seq_len):

    return inputs, targets

## 🟠 Custom dataset

🔰 Write a custom dataset class for LanguageModelDataset.

In [None]:
class LanguageModelDataset(Dataset):

  def __init__(self, inputs, targets):
    pass

  def __len__(self):
    pass

  def __getitem__(self, idx):
    pass


## 🟠 Define a dataloader if needed

🔰 Write dataloaders for the training, validation, and test sets.

# 🔴 **Model**

🔰 Use the following template to create a custom model.

Your model should consist of three parts:

*   an embedding layer
*   an LSTM layer
*   a fully connected layer

In [None]:
class LanguageModel(nn.Module):

  def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate):
    pass

  def forward(self, src):
    pass


# 🔴 **Config**

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

🔰 Define the optimizer, loss function, metrics and other necessary parameters in this section, and ensure the model is sent to the appropriate device.

# 🔴 **Train ➰**

🔰 This is the template for train function, change it if needed.

In [None]:
def train_one_epoch(model, train_loader, loss_fn, optimizer, metric, epoch=None):
  model.train()
  loss_train = AverageMeter()
  metric.reset()

  with tqdm.tqdm(train_loader, unit='batch') as tepoch:
    for inputs, targets in tepoch:
      if epoch:
        tepoch.set_description(f'Epoch {epoch}')

      inputs = inputs.to(device)
      targets = targets.to(device)

      outputs = model(inputs)

      loss = loss_fn(outputs, targets)

      loss.backward()

      optimizer.step()
      optimizer.zero_grad()

      loss_train.update(loss.item(), n=len(targets))
      metric.update(outputs, targets)

      tepoch.set_postfix(loss=loss_train.avg, metric=metric.compute().item())

  return model, loss_train.avg, metric.compute().item()

# 🔴 **Evaluation**

🔰 This is the template for evaluation function, change it if needed.

In [None]:
def evaluate(model, test_loader, loss_fn, metric):
  model.eval()
  loss_eval = AverageMeter()
  metric.reset()

  with torch.inference_mode():
    for inputs, targets in test_loader:
      inputs = inputs.to(device)
      targets = targets.to(device)

      outputs = model(inputs)

      loss = loss_fn(outputs, targets)
      loss_eval.update(loss.item(), n=len(targets))

      metric(outputs, targets)

  return loss_eval.avg, metric.compute().item()

# 🔴 **Training Process 〽️**

## 🟠 Finding Hyper-parameters

### 🟡 **Step 1:** Calculate the loss for an untrained model using a few batches.


In [None]:
model =

inputs, targets = next(iter(train_set))
inputs = inputs.to(device)
targets = targets.to(device)

with torch.no_grad():
  outputs = model(inputs)
  loss = loss_fn(outputs, targets)

print(loss)

### 🟡 **Step 2:** Try to train and overfit the model on a small subset of the dataset.

In [None]:
model =
optimizer = torch.optim.SGD(model.parameters(), lr=, momentum=0.9)

In [None]:
num_epochs = ...
for epoch in range(num_epochs):
  model, _, _ = train_one_epoch(model, ..., loss_fn, optimizer, metric, epoch)

### 🟡 **Step 3:** Train the model for a limited number of epochs, experimenting with various learning rates.

In [None]:
num_epochs =

for lr in [...]:
  print(f'LR={lr}')

  model =
  optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=1e-4, momentum=0.9)

  for epoch in range(num_epochs):
    model, _, _ = train_one_epoch(model, train_set, loss_fn, optimizer, metric, epoch)

  print()

### 🟡 Step 4: Create a small grid using the weight decay and the best learning rate.





In [None]:
num_epochs =

for lr in [...]:
  for wd in [...]:
    print(f'LR={lr}, WD={wd}')

    model =
    optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=wd)

    for epoch in range(num_epochs):
      model, loss, _ = train_one_epoch(model, train_loader, loss_fn, optimizer, epoch)

    print()

### 🟡 Step 5: Train model for longer epochs using the best model from step 4.





In [None]:
model =

In [None]:
lr =
wd =
optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=wd, momentum=0.9, nesterov=True)

In [None]:
loss_train_hist = []
loss_valid_hist = []

metric_train_hist = []
metric_valid_hist = []

best_loss_valid = torch.inf
epoch_counter = 0

In [None]:
num_epochs =

for epoch in range(num_epochs):
  # Train
  model, loss_train, metric_train = train_one_epoch(model,
                                                 train_set,
                                                 loss_fn,
                                                 optimizer,
                                                 metric,
                                                 epoch)
  # Validation
  loss_valid, metric_valid = evaluate(model,
                                     valid_set,
                                     loss_fn,
                                     metric)

  loss_train_hist.append(loss_train)
  loss_valid_hist.append(loss_valid)

  metric_train_hist.append(metric_train)
  metric_valid_hist.append(metric_valid)

  if loss_valid < best_loss_valid:
    torch.save(model, f'model.pt')
    best_loss_valid = loss_valid
    print('Model Saved!')

  print(f'Valid: Loss = {loss_valid:.4}, Metric = {metric_valid:.4}')
  print()

  epoch_counter += 1

## 🟠 Main Loop

🔰 Define model.

In [None]:
model =

🔰 Define optimizer and Set learning rate and weight decay.

In [None]:
lr =
wd =
optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=wd, momentum=0.9, nesterov=True)

🔰 Write code to train the model for `num_epochs` epoches.

In [None]:
loss_train_hist = []
loss_valid_hist = []

metric_train_hist = []
metric_valid_hist = []

best_loss_valid = torch.inf
epoch_counter = 0

In [None]:
num_epochs =

for epoch in range(num_epochs):
  # Train
  model, loss_train, metric_train = train_one_epoch(model,
                                                 train_set,
                                                 loss_fn,
                                                 optimizer,
                                                 metric,
                                                 epoch)
  # Validation
  loss_valid, metric_valid = evaluate(model,
                                     valid_set,
                                     loss_fn,
                                     metric)

  loss_train_hist.append(loss_train)
  loss_valid_hist.append(loss_valid)

  metric_train_hist.append(metric_train)
  metric_valid_hist.append(metric_valid)

  if loss_valid < best_loss_valid:
    torch.save(model, f'model.pt')
    best_loss_valid = loss_valid
    print('Model Saved!')

  print(f'Valid: Loss = {loss_valid:.4}, Metric = {metric_valid:.4}')
  print()

  epoch_counter += 1

## 🟠 Plot

🔰 Plot learning curves

In [None]:
plt.figure(figsize=(8, 6))

plt.plot(range(epoch_counter), loss_train_hist, 'r-', label='Train')
plt.plot(range(epoch_counter), loss_valid_hist, 'b-', label='Validation')

plt.xlabel('Epoch')
plt.ylabel('loss')
plt.grid(True)
plt.legend()

# 🔴 **Test**

🔰 Test your model using data from the test set

# 🔴 **Generate**

🔰 Your mission is to write a `generate` function and use a desired sentence to evaluate the model

In [None]:
model_path = 'model.pt'
model = torch.load(model_path)
model.eval()

In [None]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, seed=None):
    pass