In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch.utils.data as data
from functools import partial
from torch.utils.data import DataLoader
from tqdm import tqdm
import copy
from torch.utils.tensorboard import SummaryWriter
import torch
import torch.nn as nn
from typing import *
from torch.optim import Optimizer

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Output locations (relative paths) and compute device shared by every experiment.
tensorboard_folder = 'tensorboard'  # root folder; train() logs each run under <folder>/<exp_name>
models_folder = 'models'  # train() saves the best checkpoint as <folder>/<exp_name>.pt
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Mini-batch size passed to train() by every experiment cell below.
batch_size = 256

In [None]:
# One on/off flag per experiment cell, indexed in notebook order:
# [0] baseline RNN, [1] LSTM, [2] LSTM + dropout, [3] LSTM + AdamW,
# [4] LSTM + weight tying, [5] + variational dropout, [6] NT-AvSGD.
# Only the experiments whose flag is truthy are trained.
run_exp = [0,0,0,0,0,0,1]

# Data loading

In [None]:
# Loading the corpus

def read_file(path, eos_token="<eos>"):
    """Read a corpus file, one sentence per line.

    Args:
        path: path of the text file to read.
        eos_token: marker appended (after a single space) to every line.

    Returns:
        A list with one string per line of the file: the line stripped of
        surrounding whitespace, followed by " " and ``eos_token``.
    """
    with open(path, "r") as corpus_file:
        return [raw_line.strip() + " " + eos_token for raw_line in corpus_file]

# Vocabulary builder: maps every token to an integer id
def get_vocab(corpus, special_tokens=[]):
    """Assign consecutive integer ids to tokens in order of first appearance.

    Args:
        corpus: iterable of sentences (strings); tokens are obtained with
            ``str.split()``.
        special_tokens: tokens that receive the first ids, in the given order.
            The default list is never mutated.

    Returns:
        dict mapping token -> id.
    """
    token_to_id = {}
    next_id = 0
    for special in special_tokens:
        token_to_id[special] = next_id
        next_id += 1

    # Flatten the corpus into a single stream of tokens.
    all_words = (word for sentence in corpus for word in sentence.split())
    for word in all_words:
        if word in token_to_id:
            continue
        token_to_id[word] = next_id
        next_id += 1
    return token_to_id

In [None]:
# If you are using Colab, run these lines
#!wget -P dataset/PennTreeBank https://raw.githubusercontent.com/BrownFortress/NLU-2024-Labs/main/labs/dataset/PennTreeBank/ptb.test.txt
#!wget -P dataset/PennTreeBank https://raw.githubusercontent.com/BrownFortress/NLU-2024-Labs/main/labs/dataset/PennTreeBank/ptb.valid.txt
#!wget -P dataset/PennTreeBank https://raw.githubusercontent.com/BrownFortress/NLU-2024-Labs/main/labs/dataset/PennTreeBank/ptb.train.txt

--2024-04-08 15:41:31--  https://raw.githubusercontent.com/BrownFortress/NLU-2024-Labs/main/labs/dataset/PennTreeBank/ptb.test.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 449945 (439K) [text/plain]
Saving to: ‘dataset/PennTreeBank/ptb.test.txt’


2024-04-08 15:41:32 (18.2 MB/s) - ‘dataset/PennTreeBank/ptb.test.txt’ saved [449945/449945]

--2024-04-08 15:41:32--  https://raw.githubusercontent.com/BrownFortress/NLU-2024-Labs/main/labs/dataset/PennTreeBank/ptb.valid.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 399782 (

In [None]:

# Load the three Penn Treebank splits; read_file returns one "<sentence> <eos>" string per line.
train_raw = read_file("dataset/PennTreeBank/ptb.train.txt")
dev_raw = read_file("dataset/PennTreeBank/ptb.valid.txt")
test_raw = read_file("dataset/PennTreeBank/ptb.test.txt")


In [None]:
# The vocabulary is computed on the training set only.
# Two special tokens are registered first: padding (id 0) and end of sentence (id 1).
# NOTE(review): the cells visible here use `lang.word2id` instead of `vocab` — confirm whether this is still needed.
vocab = get_vocab(train_raw, ["<pad>", "<eos>"])

In [None]:
# Vocabulary holder: builds and stores both lookup directions
# (word -> id and id -> word)
class Lang():
    """Vocabulary built from a corpus.

    Attributes:
        word2id: dict token -> integer id (special tokens first, then corpus
            tokens in order of first appearance).
        id2word: the inverse mapping, id -> token.
    """
    def __init__(self, corpus, special_tokens=[]):
        self.word2id = self.get_vocab(corpus, special_tokens)
        self.id2word = dict((idx, word) for word, idx in self.word2id.items())

    def get_vocab(self, corpus, special_tokens=[]):
        """Return a token -> id dict for ``special_tokens`` followed by the corpus tokens."""
        mapping = {}
        next_id = 0
        for special in special_tokens:
            mapping[special] = next_id
            next_id += 1
        # Corpus tokens get the following ids, in order of first appearance.
        for word in (w for sentence in corpus for w in sentence.split()):
            if word in mapping:
                continue
            mapping[word] = next_id
            next_id += 1
        return mapping

# Vocabulary used by the datasets, the models and the losses; "<pad>" gets id 0 and "<eos>" id 1.
lang = Lang(train_raw, ["<pad>", "<eos>"])

In [None]:
class PennTreeBank (data.Dataset):
    """Next-token-prediction dataset built from whitespace-tokenised sentences.

    For each sentence the source is every token but the last and the target
    is every token but the first, so ``target[i]`` is the token that follows
    ``source[i]``.
    """
    # Mandatory methods are __init__, __len__ and __getitem__
    def __init__(self, corpus, lang):
        self.source = []
        self.target = []

        for sentence in corpus:
            tokens = sentence.split()
            self.source.append(tokens[:-1])  # first token up to the second-last one
            self.target.append(tokens[1:])   # second token up to the last one

        self.source_ids = self.mapping_seq(self.source, lang)
        self.target_ids = self.mapping_seq(self.target, lang)

    def __len__(self):
        return len(self.source)

    def __getitem__(self, idx):
        # Ids are stored as plain lists and converted to tensors on access.
        return {
            'source': torch.LongTensor(self.source_ids[idx]),
            'target': torch.LongTensor(self.target_ids[idx]),
        }

    # Auxiliary methods

    def mapping_seq(self, data, lang):
        """Map each token sequence in ``data`` to the ids stored in ``lang.word2id``.

        When an out-of-vocabulary token is met, a warning is printed and the
        rest of that sequence is dropped (the ids collected so far are kept).
        """
        mapped = []
        for tokens in data:
            ids = []
            for token in tokens:
                if token not in lang.word2id:
                    print('OOV found!')
                    print('You have to deal with that') # PennTreeBank has no OOV, but "Trust is good, control is better!"
                    break
                ids.append(lang.word2id[token])
            mapped.append(ids)
        return mapped

In [None]:
# One dataset per split; all three are encoded with the training vocabulary `lang`.
train_dataset = PennTreeBank(train_raw, lang)
dev_dataset = PennTreeBank(dev_raw, lang)
test_dataset = PennTreeBank(test_raw, lang)

In [None]:
def collate_fn(data, pad_token):
    """Collate a list of dataset samples into one padded batch.

    Args:
        data: list of sample dicts holding 1-D tensors under "source" and
            "target". The list is sorted in place, longest source first.
        pad_token: id used to fill the positions after the end of a sequence.

    Returns:
        dict with every key of the samples. "source" and "target" are padded
        LongTensors moved to the global ``device``; "number_tokens" is the
        total number of (unpadded) target tokens in the batch.
    """
    def merge(sequences):
        '''
        Pad a list of sequences into a (batch, max_len) matrix and
        return it together with the original lengths.
        '''
        lengths = [len(seq) for seq in sequences]
        longest = max(lengths)
        # Keep at least one column even when every sequence is empty.
        max_len = longest if longest != 0 else 1
        # Start from a matrix full of pad_token, then copy each sequence in.
        padded_seqs = torch.LongTensor(len(sequences), max_len).fill_(pad_token)
        for row, (seq, seq_len) in enumerate(zip(sequences, lengths)):
            padded_seqs[row, :seq_len] = seq
        # Detach the batch from any computational graph.
        return padded_seqs.detach(), lengths

    # Sort the samples by source length, longest first.
    data.sort(key=lambda sample: len(sample["source"]), reverse=True)
    new_item = {key: [sample[key] for sample in data] for key in data[0].keys()}

    source, _ = merge(new_item["source"])
    target, lengths = merge(new_item["target"])

    new_item["source"] = source.to(device)
    new_item["target"] = target.to(device)
    new_item["number_tokens"] = sum(lengths)
    return new_item


# Train Loop

In [None]:
# Both losses ignore the padding positions.
# Training uses the default reduction; evaluation sums the loss so that
# eval_loop can divide by the number of tokens itself.
criterion_train = nn.CrossEntropyLoss(ignore_index=lang.word2id["<pad>"])
criterion_eval = nn.CrossEntropyLoss(ignore_index=lang.word2id["<pad>"], reduction='sum')

In [None]:
import math
def train_loop(data, optimizer, criterion, model, clip=5):
    """Run one training epoch and return the token-weighted average loss.

    Args:
        data: iterable of batches (dicts with "source", "target" and
            "number_tokens").
        optimizer: optimizer updating the parameters of ``model``.
        criterion: loss called as ``criterion(output, target)``.
        model: network called as ``model(batch["source"])``.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum over batches of (loss * number_tokens), divided by the total
        number of tokens.
    """
    model.train()
    weighted_loss = 0
    seen_tokens = 0

    for batch in data:
        optimizer.zero_grad()  # reset the gradients of the previous batch
        batch_loss = criterion(model(batch['source']), batch['target'])
        n_tokens = batch["number_tokens"]
        weighted_loss += batch_loss.item() * n_tokens
        seen_tokens += n_tokens
        batch_loss.backward()  # compute the gradients
        # Clip the gradient norm to avoid exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()  # update the weights

    return weighted_loss / seen_tokens

def eval_loop(data, eval_criterion, model):
    """Evaluate ``model`` on ``data`` without tracking gradients.

    Args:
        data: iterable of batches (dicts with "source", "target" and
            "number_tokens").
        eval_criterion: loss called as ``eval_criterion(output, target)``;
            its values are added up and divided by the token count here.
        model: network called as ``model(batch["source"])``; it is switched
            to eval mode and left in that mode.

    Returns:
        ``(ppl, loss)`` where ``loss`` is the accumulated loss divided by the
        total number of tokens and ``ppl`` is ``exp(loss)``.
    """
    model.eval()
    total_loss = 0
    total_tokens = 0
    with torch.no_grad():  # no computational graph is needed for evaluation
        for batch in data:
            logits = model(batch['source'])
            total_loss += eval_criterion(logits, batch['target']).item()
            total_tokens += batch["number_tokens"]

    mean_loss = total_loss / total_tokens
    return math.exp(mean_loss), mean_loss

def init_weights(mat):
    """Initialise the recurrent and linear layers found inside ``mat``.

    Every module returned by ``mat.modules()`` is visited, so calling this
    once on the root model is enough (``model.apply(init_weights)`` also
    works, it just re-initialises nested layers more than once).

    * ``nn.LSTM`` / ``nn.GRU`` / ``nn.RNN``: the stacked ``weight_ih`` and
      ``weight_hh`` matrices are split into one chunk per gate (4 for LSTM,
      3 for GRU, 1 for a vanilla RNN). Input-to-hidden chunks get Xavier
      uniform values, hidden-to-hidden chunks an orthogonal matrix, biases 0.
    * ``nn.Linear``: weights uniform in [-0.01, 0.01], bias 0.01.

    Args:
        mat: an ``nn.Module``; its parameters are modified in place.
    """
    # Number of gate blocks stacked along dim 0 of weight_ih / weight_hh.
    gates_per_layer = {nn.LSTM: 4, nn.GRU: 3, nn.RNN: 1}

    for m in mat.modules():
        n_gates = gates_per_layer.get(type(m))
        if n_gates is not None:
            for name, param in m.named_parameters():
                if 'weight_ih' in name:
                    init_fn = torch.nn.init.xavier_uniform_
                elif 'weight_hh' in name:
                    init_fn = torch.nn.init.orthogonal_
                elif 'bias' in name:
                    param.data.fill_(0)
                    continue
                else:
                    # Any other parameter keeps its default initialisation.
                    continue
                # Initialise each gate block on its own so that every block
                # has the intended distribution / orthogonality.
                rows = param.shape[0] // n_gates
                for idx in range(n_gates):
                    init_fn(param[idx*rows:(idx+1)*rows])
        elif type(m) in [nn.Linear]:
            torch.nn.init.uniform_(m.weight, -0.01, 0.01)
            if m.bias is not None:
                m.bias.data.fill_(0.01)

In [None]:
def train(model, optimizer, exp_name, clip=5, epochs=100, patience=3, batch_size=64):
    """Train ``model`` with early stopping, then test and save the best version.

    After every epoch the model is evaluated on the dev set; the copy with
    the lowest dev perplexity is kept. Training stops after ``epochs`` epochs
    or once ``patience`` consecutive evaluations bring no improvement. The
    best copy is evaluated on the test set and its ``state_dict`` is saved to
    ``<models_folder>/<exp_name>.pt``. Losses and perplexities are logged to
    TensorBoard under ``<tensorboard_folder>/<exp_name>``.

    Args:
        model: network to train (updated in place).
        optimizer: optimizer bound to the parameters of ``model``.
        exp_name: experiment name, used for the log folder and checkpoint file.
        clip: maximum gradient norm used in ``train_loop``.
        epochs: maximum number of training epochs.
        patience: non-improving dev evaluations tolerated before stopping.
        batch_size: batch size of the train/dev/test loaders.
    """
    import os  # local import: only needed to create the checkpoint folder

    pad_collate = partial(collate_fn, pad_token=lang.word2id["<pad>"])
    train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=pad_collate, shuffle=True)
    dev_loader = DataLoader(dev_dataset, batch_size=batch_size, collate_fn=pad_collate)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=pad_collate)

    # Create the checkpoint folder up front so that a missing directory
    # cannot make torch.save fail after the whole training run.
    os.makedirs(models_folder, exist_ok=True)

    writer = SummaryWriter(tensorboard_folder+'/'+exp_name)
    try:
        pat = patience
        best_ppl = math.inf
        best_model = None

        # epochs + 1 so that exactly `epochs` epochs are run (1..epochs).
        pbar = tqdm(range(1, epochs + 1))
        # If the PPL is too high try to change the learning rate
        for epoch in pbar:
            loss = train_loop(train_loader, optimizer, criterion_train, model, clip)

            ppl_dev, loss_dev = eval_loop(dev_loader, criterion_eval, model)
            pbar.set_description("PPL: %f" % ppl_dev)

            writer.add_scalar('Loss/train', loss, epoch)
            writer.add_scalar('Loss/dev', loss_dev, epoch)
            writer.add_scalar('PPL/dev', ppl_dev, epoch)

            if ppl_dev < best_ppl:  # the lower, the better
                best_ppl = ppl_dev
                best_model = copy.deepcopy(model)
                pat = patience
            else:
                pat -= 1

            if pat <= 0:  # early stopping with patience
                break

        # No evaluation ever improved on +inf (e.g. NaN perplexity or no
        # epoch run): fall back to the current weights instead of crashing.
        if best_model is None:
            best_model = copy.deepcopy(model)

        best_model.to(device)
        final_ppl, _ = eval_loop(test_loader, criterion_eval, best_model)
        print('Test ppl: ', final_ppl)

        writer.add_scalar('PPL/test', final_ppl, 0)

        # Save the best model
        torch.save(best_model.state_dict(), models_folder+'/'+exp_name+'.pt')
    finally:
        # Flush and close the TensorBoard log even if training fails.
        writer.close()

# Baseline

In [None]:
# Experiment name: used below as the TensorBoard run folder and as the checkpoint file name.
name = "baseline_higherLR"

## Model

In [None]:
class LM_RNN(nn.Module):
    """Baseline language model: embedding -> vanilla RNN -> linear projection.

    Args:
        emb_size: size of the token embeddings.
        hidden_size: size of the RNN hidden state.
        output_size: vocabulary size (number of embeddings and of output classes).
        pad_index: id of the padding token (``padding_idx`` of the embedding).
        out_dropout, emb_dropout: accepted for interface compatibility; not
            used by this model.
        n_layers: number of stacked RNN layers.
    """
    def __init__(self, emb_size, hidden_size, output_size, pad_index=0, out_dropout=0.1,
                 emb_dropout=0.1, n_layers=1):
        super().__init__()
        # Token ids -> dense vectors
        self.embedding = nn.Embedding(num_embeddings=output_size, embedding_dim=emb_size,
                                      padding_idx=pad_index)
        # PyTorch's RNN layer: https://pytorch.org/docs/stable/generated/torch.nn.RNN.html
        self.rnn = nn.RNN(input_size=emb_size, hidden_size=hidden_size, num_layers=n_layers,
                          batch_first=True, bidirectional=False)
        self.pad_token = pad_index
        # Projects every hidden state onto the vocabulary
        self.output = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, input_sequence):
        """Return the logits with the class dimension moved to position 1,
        i.e. shaped (batch, output_size, seq_len) for a (batch, seq_len) input."""
        embedded = self.embedding(input_sequence)
        hidden_states, _ = self.rnn(embedded)
        logits = self.output(hidden_states)
        return logits.permute(0, 2, 1)

## Params

In [None]:
# Baseline hyper-parameters: vanilla RNN trained with plain SGD.
hid_size = 300  # size of the recurrent hidden state
emb_size = 300  # size of the token embeddings

lr = 2  # SGD learning rate
clip = 5  # maximum gradient norm used in train_loop
patience = 3  # non-improving dev evaluations tolerated before early stopping
n_epochs = 100  # passed to train() as `epochs`

vocab_len = len(lang.word2id)

model = LM_RNN(emb_size, hid_size, vocab_len, pad_index=lang.word2id["<pad>"]).to(device)
# NOTE(review): Module.apply visits every sub-module and init_weights itself
# loops over mat.modules(), so nested layers are initialised more than once;
# a single init_weights(model) call looks sufficient — confirm before changing.
model.apply(init_weights)

optimizer = optim.SGD(model.parameters(), lr=lr)
# Re-created here so the cell is self-contained; both losses ignore padding.
criterion_train = nn.CrossEntropyLoss(ignore_index=lang.word2id["<pad>"])
criterion_eval = nn.CrossEntropyLoss(ignore_index=lang.word2id["<pad>"], reduction='sum')

## Train

In [None]:
if run_exp[0]:
    train(model=model, optimizer=optimizer, exp_name=name, clip=clip, epochs=n_epochs, patience=patience, batch_size=batch_size)

    # Record the settings of this run next to its TensorBoard logs.
    # Each setting is written exactly once, one "key: value" per line.
    group = optimizer.param_groups[0]
    run_params = [
        ('lr', group['lr']),
        ('weight_decay', group['weight_decay']),
        ('optimizer', optimizer),
        ('emb_size', emb_size),
        ('hid_size', hid_size),
        ('clip', clip),
        ('patience', patience),
        ('n_epochs', n_epochs),
        ('batch_size', batch_size),
    ]
    with open(tensorboard_folder+'/'+name+'/params.txt', 'w') as f:
        f.writelines('{}: {}\n'.format(key, value) for key, value in run_params)

# Part 1

In this part, you have to modify the baseline LM_RNN by adding a set of techniques that might improve the performance. Add one modification at a time, incrementally. If adding a modification decreases the performance, you can remove it and move forward with the others. However, in the report, you have to describe and comment on this unsuccessful experiment. For each of your experiments, you have to print the performance expressed as Perplexity (PPL).
<br>
One of the important tasks of training a neural network is  hyperparameter optimization. Thus, you have to play with the hyperparameters to minimise the PPL and thus print the results achieved with the best configuration (in particular <b>the learning rate</b>).
These are two links to the state-of-the-art papers which use vanilla RNN [paper1](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5947611), [paper2](https://www.fit.vutbr.cz/research/groups/speech/publi/2010/mikolov_interspeech2010_IS100722.pdf).

**Mandatory requirements**: For the following experiments the perplexity must be below 250 (***PPL < 250***).

1. Replace RNN with a Long Short-Term Memory (LSTM) network --> [link](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html)
2. Add two dropout layers: --> [link](https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html)
    - one after the embedding layer,
    - one before the last linear layer
3. Replace SGD with AdamW --> [link](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html)

## LSTM base

In [None]:
# Experiment name: used below as the TensorBoard run folder and as the checkpoint file name.
name = "LSTM_base_higherLR"

### Model

In [None]:
class LM_LSTM(nn.Module):
    """Language model: embedding -> LSTM -> linear projection.

    Args:
        emb_size: size of the token embeddings.
        hidden_size: size of the LSTM hidden state.
        output_size: vocabulary size (number of embeddings and of output classes).
        pad_index: id of the padding token (``padding_idx`` of the embedding).
        out_dropout, emb_dropout: accepted for interface compatibility; not
            used by this model.
        n_layers: number of stacked LSTM layers.
    """
    def __init__(self, emb_size, hidden_size, output_size, pad_index=0, out_dropout=0.1,
                 emb_dropout=0.1, n_layers=1):
        super().__init__()
        # Token ids -> dense vectors
        self.embedding = nn.Embedding(num_embeddings=output_size, embedding_dim=emb_size,
                                      padding_idx=pad_index)
        # PyTorch's LSTM layer: https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
        self.rnn = nn.LSTM(input_size=emb_size, hidden_size=hidden_size, num_layers=n_layers,
                           batch_first=True, bidirectional=False)
        self.pad_token = pad_index
        # Projects every hidden state onto the vocabulary
        self.output = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, input_sequence):
        """Return the logits with the class dimension moved to position 1,
        i.e. shaped (batch, output_size, seq_len) for a (batch, seq_len) input."""
        embedded = self.embedding(input_sequence)
        hidden_states, _ = self.rnn(embedded)
        logits = self.output(hidden_states)
        return logits.permute(0, 2, 1)

### Params

In [None]:
# Hyper-parameters of the "LSTM base" experiment: same settings as the
# baseline, with the recurrent layer replaced by an LSTM.
hid_size = 300  # size of the LSTM hidden state
emb_size = 300  # size of the token embeddings

lr = 2  # SGD learning rate
clip = 5  # maximum gradient norm used in train_loop
patience = 3  # non-improving dev evaluations tolerated before early stopping
n_epochs = 100  # passed to train() as `epochs`

vocab_len = len(lang.word2id)

# This experiment must train LM_LSTM (not the baseline LM_RNN), so that the
# run logged under the LSTM name really is an LSTM.
model = LM_LSTM(emb_size, hid_size, vocab_len, pad_index=lang.word2id["<pad>"]).to(device)
model.apply(init_weights)

optimizer = optim.SGD(model.parameters(), lr=lr)
criterion_train = nn.CrossEntropyLoss(ignore_index=lang.word2id["<pad>"])
criterion_eval = nn.CrossEntropyLoss(ignore_index=lang.word2id["<pad>"], reduction='sum')


### Train

In [None]:
if run_exp[1]:
    train(model=model, optimizer=optimizer, exp_name=name, clip=clip, epochs=n_epochs, patience=patience, batch_size=batch_size)

    # Record the settings of this run next to its TensorBoard logs.
    # Each setting is written exactly once, one "key: value" per line.
    group = optimizer.param_groups[0]
    run_params = [
        ('lr', group['lr']),
        ('weight_decay', group['weight_decay']),
        ('optimizer', optimizer),
        ('emb_size', emb_size),
        ('hid_size', hid_size),
        ('clip', clip),
        ('patience', patience),
        ('n_epochs', n_epochs),
        ('batch_size', batch_size),
    ]
    with open(tensorboard_folder+'/'+name+'/params.txt', 'w') as f:
        f.writelines('{}: {}\n'.format(key, value) for key, value in run_params)

## LSTM dropout

In [None]:
# Experiment name: used below as the TensorBoard run folder and as the checkpoint file name.
name = "LSTM_dropout_higherLR"

### Model

In [None]:
class LM_LSTM_drop(nn.Module):
    """LSTM language model with dropout after the embedding and before the output layer.

    Args:
        emb_size: size of the token embeddings.
        hidden_size: size of the LSTM hidden state.
        output_size: vocabulary size (number of embeddings and of output classes).
        pad_index: id of the padding token (``padding_idx`` of the embedding).
        out_dropout: dropout probability applied to the LSTM outputs.
        emb_dropout: dropout probability applied to the embeddings.
        n_layers: number of stacked LSTM layers.
    """
    def __init__(self, emb_size, hidden_size, output_size, pad_index=0, out_dropout=0.1,
                 emb_dropout=0.1, n_layers=1):
        super().__init__()
        # Token ids -> dense vectors
        self.embedding = nn.Embedding(num_embeddings=output_size, embedding_dim=emb_size,
                                      padding_idx=pad_index)
        # PyTorch's LSTM layer: https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
        self.rnn = nn.LSTM(input_size=emb_size, hidden_size=hidden_size, num_layers=n_layers,
                           batch_first=True, bidirectional=False)
        self.pad_token = pad_index
        self.emb_dropout = nn.Dropout(p=emb_dropout)
        self.out_dropout = nn.Dropout(p=out_dropout)
        # Projects every hidden state onto the vocabulary
        self.output = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, input_sequence):
        """Return the logits with the class dimension moved to position 1,
        i.e. shaped (batch, output_size, seq_len) for a (batch, seq_len) input."""
        embedded = self.emb_dropout(self.embedding(input_sequence))
        hidden_states, _ = self.rnn(embedded)
        logits = self.output(self.out_dropout(hidden_states))
        return logits.permute(0, 2, 1)

### Params

In [None]:
# Hyper-parameters of the "LSTM dropout" experiment (SGD).
hid_size = 300  # size of the LSTM hidden state
emb_size = 300  # size of the token embeddings

lr = 2  # SGD learning rate
clip = 5  # maximum gradient norm used in train_loop
patience = 3  # non-improving dev evaluations tolerated before early stopping
n_epochs = 100  # passed to train() as `epochs`
emb_dropout = 0.25  # dropout probability after the embedding layer
out_dropout = 0.25  # dropout probability before the output layer

vocab_len = len(lang.word2id)

model = LM_LSTM_drop(emb_size, hid_size, vocab_len, pad_index=lang.word2id["<pad>"], emb_dropout=emb_dropout, out_dropout=out_dropout).to(device)
model.apply(init_weights)

optimizer = optim.SGD(model.parameters(), lr=lr)
# Both losses ignore padding; the eval loss is summed and normalised in eval_loop.
criterion_train = nn.CrossEntropyLoss(ignore_index=lang.word2id["<pad>"])
criterion_eval = nn.CrossEntropyLoss(ignore_index=lang.word2id["<pad>"], reduction='sum')


### Train

In [None]:
if run_exp[2]:
    train(model=model, optimizer=optimizer, exp_name=name, clip=clip, epochs=n_epochs, patience=patience, batch_size=batch_size)

    # Record the settings of this run next to its TensorBoard logs.
    # Each setting is written exactly once, one "key: value" per line.
    group = optimizer.param_groups[0]
    run_params = [
        ('lr', group['lr']),
        ('weight_decay', group['weight_decay']),
        ('optimizer', optimizer),
        ('emb_dropout', emb_dropout),
        ('out_dropout', out_dropout),
        ('emb_size', emb_size),
        ('hid_size', hid_size),
        ('clip', clip),
        ('patience', patience),
        ('n_epochs', n_epochs),
        ('batch_size', batch_size),
    ]
    with open(tensorboard_folder+'/'+name+'/params.txt', 'w') as f:
        f.writelines('{}: {}\n'.format(key, value) for key, value in run_params)

## LSTM AdamW

In [None]:
# Experiment name: used below as the TensorBoard run folder and as the checkpoint file name.
name = "LSTM_AdamW"

### Params

In [None]:
# Hyper-parameters of the "LSTM AdamW" experiment: the dropout LSTM trained with AdamW.
hid_size = 300  # size of the LSTM hidden state
emb_size = 300  # size of the token embeddings

clip = 5  # maximum gradient norm used in train_loop
patience = 3  # non-improving dev evaluations tolerated before early stopping
n_epochs = 100  # passed to train() as `epochs`
emb_dropout = 0.25  # dropout probability after the embedding layer
out_dropout = 0.25  # dropout probability before the output layer

vocab_len = len(lang.word2id)

model = LM_LSTM_drop(emb_size, hid_size, vocab_len, pad_index=lang.word2id["<pad>"], emb_dropout=emb_dropout, out_dropout=out_dropout).to(device)
model.apply(init_weights)

# No learning rate is passed: AdamW runs with its library defaults, and any
# `lr` variable left over from an earlier cell is not used here.
optimizer = optim.AdamW(model.parameters())
criterion_train = nn.CrossEntropyLoss(ignore_index=lang.word2id["<pad>"])
criterion_eval = nn.CrossEntropyLoss(ignore_index=lang.word2id["<pad>"], reduction='sum')

### Train

In [None]:
if run_exp[3]:
    train(model=model, optimizer=optimizer, exp_name=name, clip=clip, epochs=n_epochs, patience=patience, batch_size=batch_size)

    # Record the settings of this run next to its TensorBoard logs.
    # Each setting is written exactly once, one "key: value" per line.
    group = optimizer.param_groups[0]
    run_params = [
        ('lr', group['lr']),
        ('weight_decay', group['weight_decay']),
        ('optimizer', optimizer),
        ('emb_dropout', emb_dropout),
        ('out_dropout', out_dropout),
        ('emb_size', emb_size),
        ('hid_size', hid_size),
        ('clip', clip),
        ('patience', patience),
        ('n_epochs', n_epochs),
        ('batch_size', batch_size),
    ]
    with open(tensorboard_folder+'/'+name+'/params.txt', 'w') as f:
        f.writelines('{}: {}\n'.format(key, value) for key, value in run_params)

# Part 2

**Mandatory requirements**: For the following experiments the perplexity must be below 250 (***PPL < 250***) and it should be lower than the one achieved in Part 1.1 (i.e. base LSTM).

Starting from the `LM_RNN` in which you replaced the RNN with a LSTM model, apply the following regularisation techniques:
- Weight Tying
- Variational Dropout (no DropConnect)
- Non-monotonically Triggered AvSGD

These techniques are described in [this paper](https://openreview.net/pdf?id=SyyGPP0TZ).

## LSTM Weight tying

In [None]:
# Experiment name: used below as the TensorBoard run folder and as the checkpoint file name.
name = "LSTM_WT_higherLR"

### Model

In [None]:
class LM_LSTM_WT(nn.Module):
    """LSTM language model with weight tying.

    The output projection shares its weight matrix with the embedding layer,
    which is only possible when ``emb_size == hidden_size``.

    Args:
        emb_size: size of the token embeddings.
        hidden_size: size of the LSTM hidden state (must equal ``emb_size``).
        output_size: vocabulary size (number of embeddings and of output classes).
        pad_index: id of the padding token (``padding_idx`` of the embedding).
        out_dropout, emb_dropout: accepted for interface compatibility; not
            used by this model.
        n_layers: number of stacked LSTM layers.

    Raises:
        ValueError: if ``emb_size`` and ``hidden_size`` differ.
    """
    def __init__(self, emb_size, hidden_size, output_size, pad_index=0, out_dropout=0.1,
                 emb_dropout=0.1, n_layers=1):
        super(LM_LSTM_WT, self).__init__()
        # The tied matrix is (output_size, emb_size) for the embedding and must be
        # (output_size, hidden_size) for the projection: fail here with a clear
        # message instead of with a shape error inside forward().
        if emb_size != hidden_size:
            raise ValueError(
                "Weight tying requires emb_size == hidden_size, got {} and {}".format(emb_size, hidden_size))
        # Token ids to vectors
        self.embedding = nn.Embedding(output_size, emb_size, padding_idx=pad_index)
        # PyTorch's LSTM layer: https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
        self.rnn = nn.LSTM(emb_size, hidden_size, n_layers, bidirectional=False, batch_first=True)
        self.pad_token = pad_index
        # Linear layer to project the hidden layer to our output space
        self.output = nn.Linear(hidden_size, output_size)
        # Weight tying: both layers now hold the very same Parameter object
        self.output.weight = self.embedding.weight

    def forward(self, input_sequence):
        """Return the logits with the class dimension moved to position 1,
        i.e. shaped (batch, output_size, seq_len) for a (batch, seq_len) input."""
        emb = self.embedding(input_sequence)
        rnn_out, _  = self.rnn(emb)
        output = self.output(rnn_out).permute(0,2,1)
        return output

### Params

In [None]:
# Hyper-parameters of the "LSTM weight tying" experiment (SGD).
# hid_size and emb_size are equal: the model shares one matrix between the
# embedding and the output projection.
hid_size = 300
emb_size = 300

lr = 2  # SGD learning rate
clip = 5  # maximum gradient norm used in train_loop
patience = 3  # non-improving dev evaluations tolerated before early stopping
n_epochs = 100  # passed to train() as `epochs`

vocab_len = len(lang.word2id)

model = LM_LSTM_WT(emb_size, hid_size, vocab_len, pad_index=lang.word2id["<pad>"]).to(device)
# NOTE(review): init_weights re-draws nn.Linear weights, and the output
# weight is tied to the embedding, so this also overwrites the embedding
# matrix (including the padding row) — confirm this is intended.
model.apply(init_weights)

optimizer = optim.SGD(model.parameters(), lr=lr)
criterion_train = nn.CrossEntropyLoss(ignore_index=lang.word2id["<pad>"])
criterion_eval = nn.CrossEntropyLoss(ignore_index=lang.word2id["<pad>"], reduction='sum')

### Train

In [None]:
if run_exp[4]:
    train(model=model, optimizer=optimizer, exp_name=name, clip=clip, epochs=n_epochs, patience=patience, batch_size=batch_size)

    # Record the settings of this run next to its TensorBoard logs.
    # Each setting is written exactly once, one "key: value" per line.
    group = optimizer.param_groups[0]
    run_params = [
        ('lr', group['lr']),
        ('weight_decay', group['weight_decay']),
        ('optimizer', optimizer),
        ('emb_size', emb_size),
        ('hid_size', hid_size),
        ('clip', clip),
        ('patience', patience),
        ('n_epochs', n_epochs),
        ('batch_size', batch_size),
    ]
    with open(tensorboard_folder+'/'+name+'/params.txt', 'w') as f:
        f.writelines('{}: {}\n'.format(key, value) for key, value in run_params)

## LSTM WT Variational Dropout

In [None]:
# Experiment name: used below as the TensorBoard run folder and as the checkpoint file name.
name = "LSTM_WT_VD_higherLR"

### Model

In [None]:
class VariationalDropout(nn.Module):
    """Dropout that reuses one mask along dimension 1 of the input.

    A Bernoulli mask of shape ``(x.size(0), 1, x.size(2))`` is sampled per
    call and broadcast over dimension 1, so the same units are dropped at
    every position of that dimension. Kept values are rescaled by
    ``1 / (1 - p)``. In eval mode, or when ``p <= 0``, the input is returned
    unchanged.
    """
    def __init__(self, dropout_probability: float,):
        super().__init__()
        self.p = dropout_probability

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Nothing to drop outside training or with a non-positive probability.
        if self.p <= 0. or not self.training:
            return x

        keep_prob = 1 - self.p
        n_batch, n_features = x.size(0), x.size(2)

        # One mask per batch element and feature, shared along dimension 1.
        mask = x.new_empty(n_batch, 1, n_features, requires_grad=False).bernoulli_(keep_prob)

        return x.mul(mask.expand_as(x)).div(1.0 - self.p)

class LM_LSTM_WT_VD(nn.Module):
    """LSTM language model with weight tying and variational dropout.

    The output projection shares its weight matrix with the embedding layer
    (which requires ``emb_size == hidden_size``); ``VariationalDropout`` is
    applied to the embeddings and to the LSTM outputs.

    Args:
        emb_size: size of the token embeddings.
        hidden_size: size of the LSTM hidden state (must equal ``emb_size``).
        output_size: vocabulary size (number of embeddings and of output classes).
        pad_index: id of the padding token (``padding_idx`` of the embedding).
        out_dropout: dropout probability applied to the LSTM outputs.
        emb_dropout: dropout probability applied to the embeddings.
        n_layers: number of stacked LSTM layers.

    Raises:
        ValueError: if ``emb_size`` and ``hidden_size`` differ.
    """
    def __init__(self, emb_size, hidden_size, output_size, pad_index=0, out_dropout=0.1,
                 emb_dropout=0.1, n_layers=1):
        super(LM_LSTM_WT_VD, self).__init__()
        # The tied matrix is (output_size, emb_size) for the embedding and must be
        # (output_size, hidden_size) for the projection: fail here with a clear
        # message instead of with a shape error inside forward().
        if emb_size != hidden_size:
            raise ValueError(
                "Weight tying requires emb_size == hidden_size, got {} and {}".format(emb_size, hidden_size))
        # Token ids to vectors
        self.embedding = nn.Embedding(output_size, emb_size, padding_idx=pad_index)
        # PyTorch's LSTM layer: https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
        self.rnn = nn.LSTM(emb_size, hidden_size, n_layers, bidirectional=False, batch_first=True)
        self.pad_token = pad_index
        # Linear layer to project the hidden layer to our output space
        self.output = nn.Linear(hidden_size, output_size)
        # Weight tying: both layers now hold the very same Parameter object
        self.output.weight = self.embedding.weight
        # Variational dropout on the embeddings
        self.emb_dropout = VariationalDropout(emb_dropout)
        # Variational dropout on the LSTM outputs
        self.out_dropout = VariationalDropout(out_dropout)


    def forward(self, input_sequence):
        """Return the logits with the class dimension moved to position 1,
        i.e. shaped (batch, output_size, seq_len) for a (batch, seq_len) input."""
        emb = self.embedding(input_sequence)
        emb_drop = self.emb_dropout(emb)
        rnn_out, _  = self.rnn(emb_drop)
        out_drop = self.out_dropout(rnn_out)
        output = self.output(out_drop).permute(0,2,1)
        return output

### Params

In [None]:
# Hyper-parameters of the "LSTM WT + variational dropout" experiment (SGD).
# hid_size and emb_size are equal because the model ties the two matrices.
hid_size = 300
emb_size = 300

lr = 2  # SGD learning rate
clip = 5  # maximum gradient norm used in train_loop
patience = 3  # non-improving dev evaluations tolerated before early stopping
n_epochs = 100  # passed to train() as `epochs`
emb_dropout = 0.2  # variational dropout probability on the embeddings
out_dropout = 0.2  # variational dropout probability on the LSTM outputs

vocab_len = len(lang.word2id)

model = LM_LSTM_WT_VD(emb_size, hid_size, vocab_len, pad_index=lang.word2id["<pad>"], emb_dropout=emb_dropout, out_dropout=out_dropout).to(device)
model.apply(init_weights)

optimizer = optim.SGD(model.parameters(), lr=lr)
criterion_train = nn.CrossEntropyLoss(ignore_index=lang.word2id["<pad>"])
criterion_eval = nn.CrossEntropyLoss(ignore_index=lang.word2id["<pad>"], reduction='sum')

### Train

In [None]:
if run_exp[5]:
    train(model=model, optimizer=optimizer, exp_name=name, clip=clip, epochs=n_epochs, patience=patience, batch_size=batch_size)

    # Record the settings of this run next to its TensorBoard logs.
    # Each setting is written exactly once, one "key: value" per line.
    group = optimizer.param_groups[0]
    run_params = [
        ('lr', group['lr']),
        ('weight_decay', group['weight_decay']),
        ('optimizer', optimizer),
        ('emb_dropout', emb_dropout),
        ('out_dropout', out_dropout),
        ('emb_size', emb_size),
        ('hid_size', hid_size),
        ('clip', clip),
        ('patience', patience),
        ('n_epochs', n_epochs),
        ('batch_size', batch_size),
    ]
    with open(tensorboard_folder+'/'+name+'/params.txt', 'w') as f:
        f.writelines('{}: {}\n'.format(key, value) for key, value in run_params)

PPL: 146.082096:  73%|███████▎  | 72/99 [33:30<12:34, 27.96s/it]

## LSTM Non-monotonically Triggered AvSGD

In [None]:
# Experiment name: used below as the TensorBoard run folder and as the checkpoint file name.
name = "LSTM_NT_AvSGD_higherLR"

### Optimizer

In [None]:
# put into functions
class NT_AvSGD(optim.ASGD):
    """Non-monotonically Triggered Averaged SGD on top of ``torch.optim.ASGD``.

    The optimizer starts with ``t0 = inf``, i.e. with averaging switched off.
    Every ``L`` optimisation steps the model is evaluated on the dev set;
    once the new perplexity is worse than the best one logged more than ``n``
    evaluations ago, averaging is switched on by setting ``t0`` to the
    current step. The averaged weights (ASGD's ``ax`` buffers) can then be
    swapped in with :meth:`average` and swapped out with :meth:`restore`.

    Relies on the notebook globals ``dev_dataset``, ``batch_size``,
    ``collate_fn``, ``lang``, ``eval_loop`` and ``criterion_eval``.

    Args:
        model: the network being trained; its parameters are optimised.
        lr: learning rate.
        L: number of steps between two dev evaluations.
        n: non-monotone interval (how many recent evaluations are ignored
            when looking for the best previous perplexity).

    Attributes:
        T: step at which averaging was triggered (0 while not triggered).
        k: number of optimisation steps performed so far.
        t: number of dev evaluations logged so far.
        logs: dev perplexities, one per evaluation.
        temp: raw weights saved by ``average()`` for ``restore()``.
    """
    def __init__(self, model, lr=1, L=100, n=5):
        super(NT_AvSGD, self).__init__(model.parameters(), lr=lr, t0=float('inf'))
        self.temp = {}
        self.logs = []
        self.dev_loader = DataLoader(dev_dataset, batch_size=batch_size, collate_fn=partial(collate_fn, pad_token=lang.word2id["<pad>"]))
        self.T = 0
        self.t = 0
        self.k = 0
        self.L = L
        self.n = n
        self.mu = 1
        self.model = model

    def step(self, closure=None):
        """Perform one ASGD step and, every ``L`` steps, check the trigger.

        Returns whatever ``ASGD.step`` returns (the closure's loss, if any).
        """
        loss = super(NT_AvSGD, self).step(closure)
        with torch.no_grad():
            # Validate only every L steps and only until averaging starts.
            if self.k % self.L == 0 and self.T == 0:
                ppl_dev, _ = eval_loop(self.dev_loader, criterion_eval, self.model)
                self.model.train()  # eval_loop leaves the model in eval mode
                self._check_trigger(ppl_dev)
            self.k += 1
        return loss

    def _check_trigger(self, ppl_dev):
        """Log one dev perplexity and switch averaging on if it stopped improving."""
        if self.t > self.n and ppl_dev > min(self.logs[:self.t - self.n]):
            self.T = self.k
            # ASGD averages with mu = 1 / max(1, step - t0): t0 must be the
            # current step so that the average starts from here with equal
            # weights, whatever the number of steps already performed.
            for group in self.param_groups:
                group['t0'] = self.k
            print("averaging at k " + str(self.k))
        self.logs.append(ppl_dev)
        self.t += 1

    def average(self):
        """Replace the model weights with their ASGD average (if triggered)."""
        if self.T == 0:
            print("No need to average")
            return
        with torch.no_grad():
            # Use the running average `ax` maintained by ASGD.
            for prm in self.model.parameters():
                self.temp[prm] = prm.data.clone()
                prm.data = self.state[prm]['ax'].clone()

    def restore(self):
        """Put back the raw weights saved by the last call to ``average()``."""
        # Nothing was swapped if averaging never started or average() was not called.
        if self.T == 0 or not self.temp:
            print("No need to restore")
            return
        with torch.no_grad():
            for prm in self.model.parameters():
                prm.data = self.temp[prm].clone()



### Params

In [None]:
# Hyper-parameters of the NT-AvSGD experiment (weight-tied LSTM with variational dropout).
hid_size = 300
emb_size = 300

lr = 1  # learning rate passed to NT_AvSGD
clip = 5  # maximum gradient norm used in train_loop
patience = 10  # non-improving dev evaluations tolerated before early stopping
n_epochs = 100  # passed to train() as `epochs`
L = 165  # optimisation steps between two dev evaluations inside NT_AvSGD.step
n=5  # non-monotone interval of the NT_AvSGD trigger
emb_dropout = 0.2  # variational dropout probability on the embeddings
out_dropout = 0.2  # variational dropout probability on the LSTM outputs

vocab_len = len(lang.word2id)

model = LM_LSTM_WT_VD(emb_size, hid_size, vocab_len, pad_index=lang.word2id["<pad>"], emb_dropout=emb_dropout, out_dropout=out_dropout).to(device)
model.apply(init_weights)

# NT_AvSGD takes the model itself (not model.parameters()): it evaluates it on the dev set.
optimizer = NT_AvSGD(lr=lr, L=L, n=n, model = model)
criterion_train = nn.CrossEntropyLoss(ignore_index=lang.word2id["<pad>"])
criterion_eval = nn.CrossEntropyLoss(ignore_index=lang.word2id["<pad>"], reduction='sum')

### Train

In [None]:
if (run_exp[6]):
  # train() already evaluates its best (non-averaged) checkpoint on the test
  # set, logs it as 'PPL/test' at step 0 and saves it to models/<name>.pt.
  train(model=model, optimizer=optimizer, exp_name=name, clip=clip, epochs=n_epochs, patience=patience, batch_size=batch_size)
  # Swap the averaged weights into `model` (no-op if averaging never triggered).
  # NOTE(review): `model` holds the weights of the last epoch, not those of the
  # best dev checkpoint selected inside train() — confirm this is intended.
  optimizer.average()

  #calculate test PPL
  test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=partial(collate_fn, pad_token=lang.word2id["<pad>"]))
  ppl_test, _ = eval_loop(test_loader, criterion_eval, model)
  print('Test ppl: ', ppl_test)
  # Logged at step 1 so that it does not overwrite the value written by train() at step 0.
  writer = SummaryWriter(tensorboard_folder+'/'+name)
  writer.add_scalar('PPL/test', ppl_test, 1)
  writer.close()

  #save model
  # Same path as the checkpoint written by train(): the file is overwritten.
  torch.save(model.state_dict(), models_folder+'/'+name+'.pt')

  #save all parameters (lr, emb_size, hid_size, optimizer, clip, patience, n_epochs)
  with open(tensorboard_folder+'/'+name+'/params.txt', 'w') as f:
      f.write('lr: {}\n'.format(lr))
      f.write('emb_size: {}\n'.format(emb_size))
      f.write('hid_size: {}\n'.format(hid_size))
      f.write('emb_dropout: {}\n'.format(emb_dropout))
      f.write('out_dropout: {}\n'.format(out_dropout))
      f.write('optimizer: {}\n'.format(optimizer))
      f.write('clip: {}\n'.format(clip))
      f.write('patience: {}\n'.format(patience))
      f.write('L: {}\n'.format(L))
      f.write('n: {}\n'.format(n))
      f.write('n_epochs: {}\n'.format(n_epochs))
      f.write('batch_size: {}\n'.format(batch_size))

# Results

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard
# NOTE(review): this log directory is a hard-coded Google Drive path, while the
# runs above are written under `tensorboard_folder` ('tensorboard') and the
# Drive mount cell is commented out — confirm the two locations match.
%tensorboard --logdir 'drive/MyDrive/MASTER_AIS/labs/NLU-2024-Labs/assignment_1/tensorboard'