[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/brownfortress/NLU-2024-labs/blob/main/labs/04_neural_LM.ipynb)

In [1]:
# Run this if you are on Colab
# !python -m spacy download en_core_web_lg


In [2]:
import spacy
import itertools
import numpy as np
DEVICE = 'cuda:0'  # it can be changed with 'cpu' if you do not have a gpu
nlp = spacy.load('en_core_web_lg')
EPOCHS = 5

In [3]:
import sys
sys.path.append('../src/')

from model import MultiModel
from model import NTAvSGD
from model import V_Dropout
from model import AverageOfGradientsSGD
import utils as ut

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
import numpy as np

# RNN Elman version
# We are not going to use this since for efficiency purposes it's better to use the RNN layer provided by pytorch  

class RNN_cell(nn.Module):
    def __init__(self,  hidden_size, input_size, output_size, vocab_size, dropout=0.1):
        super(RNN_cell, self).__init__()
        
        self.W = nn.Linear(input_size, hidden_size, bias=False)
        self.U = nn.Linear(hidden_size, hidden_size)
        self.V = nn.Linear(hidden_size, hidden_size)
        self.vocab_size = vocab_size
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, prev_hidden, word):
        input_emb = self.W(word)
        prev_hidden_rep = self.U(prev_hidden)
        # ht = σ(Wx + Uht-1 + b)
        hidden_state = self.sigmoid(input_emb + prev_hidden_rep)
        # yt = σ(Vht + b)
        output = self.output(hidden_state)
        return hidden_state, output

In [5]:
class LM_RNN(nn.Module):
    def __init__(self, emb_size, hidden_size, output_size, pad_index=0, out_dropout=0.1,
                 emb_dropout=0.1, n_layers=1):
        super(LM_RNN, self).__init__()
        # Token ids to vectors, we will better see this in the next lab 
        self.embedding = nn.Embedding(output_size, emb_size, padding_idx=pad_index)#matrix vocabulary*dimension of the input
        # Pytorch's RNN layer: https://pytorch.org/docs/stable/generated/torch.nn.RNN.html
        self.rnn = nn.RNN(emb_size, hidden_size, n_layers, bidirectional=False)    
        self.pad_token = pad_index
        # Linear layer to project the hidden layer to our output space 
        self.output = nn.Linear(hidden_size, output_size)
        
    def forward(self, input_sequence):
        emb = self.embedding(input_sequence)#sentence
        rnn_out, _  = self.rnn(emb)
        output = self.output(rnn_out).permute(0,2,1)
        return output 

In [6]:
# Loading the corpus 

def read_file(path, eos_token="<eos>"):
    output = []
    with open(path, "r") as f:
        for line in f.readlines():
            output.append(line.strip() + " " + eos_token)
    return output

# Vocab with tokens to ids
def get_vocab(corpus, special_tokens=[]):
    output = {}
    i = 0 
    for st in special_tokens:
        output[st] = i
        i += 1
    for sentence in corpus:
        for w in sentence.split():
            if w not in output:
                output[w] = i
                i += 1
    return output

In [7]:

train_raw = read_file("dataset/PennTreeBank/ptb.train.txt")
dev_raw = read_file("dataset/PennTreeBank/ptb.valid.txt")
test_raw = read_file("dataset/PennTreeBank/ptb.test.txt")


In [8]:
# Vocab is computed only on training set 
# We add two special tokens end of sentence and padding
#The dataset was already cutoffed
vocab = get_vocab(train_raw, ["<pad>", "<eos>"])

In [9]:
len(vocab)

10001

In [10]:
# This class computes and stores our vocab 
# Word to ids and ids to word
class Lang():
    def __init__(self, corpus, special_tokens=[]):
        self.word2id = self.get_vocab(corpus, special_tokens)
        self.id2word = {v:k for k, v in self.word2id.items()}
    def get_vocab(self, corpus, special_tokens=[]):
        output = {}
        i = 0 
        for st in special_tokens:
            output[st] = i
            i += 1
        for sentence in corpus:
            for w in sentence.split():
                if w not in output:
                    output[w] = i
                    i += 1
        return output
    

In [11]:
lang = Lang(train_raw, ["<pad>", "<eos>"])

In [12]:
import torch
import torch.utils.data as data


class PennTreeBank (data.Dataset):
    # Mandatory methods are __init__, __len__ and __getitem__
    def __init__(self, corpus, lang):
        self.source = []
        self.target = []

        for sentence in corpus:
            # We get from the first token till the second-last token
            self.source.append(sentence.split()[0:-1])
            # We get from the second token till the last token
            self.target.append(sentence.split()[1:])
            # See example in section 6.2

        self.source_ids = self.mapping_seq(self.source, lang)
        self.target_ids = self.mapping_seq(self.target, lang)

    def __len__(self):
        return len(self.source)

    def __getitem__(self, idx):
        src = torch.LongTensor(self.source_ids[idx])
        trg = torch.LongTensor(self.target_ids[idx])
        sample = {'source': src, 'target': trg}
        return sample

    # Auxiliary methods

    # Map sequences of tokens to corresponding computed in Lang class
    def mapping_seq(self, data, lang):
        res = []
        for seq in data:
            tmp_seq = []
            for x in seq:
                if x in lang.word2id:
                    tmp_seq.append(lang.word2id[x])
                else:
                    print('OOV found!')
                    # PennTreeBank doesn't have OOV but "Trust is good, control is better!"
                    print('You have to deal with that')
                    break
            res.append(tmp_seq)
        return res
    

def init_weights(mat):
    for m in mat.modules():
        if type(m) in [nn.GRU, nn.LSTM, nn.RNN]:
            for name, param in m.named_parameters():
                if 'weight_ih' in name:
                    for idx in range(4):
                        mul = param.shape[0]//4
                        torch.nn.init.xavier_uniform_(
                            param[idx*mul:(idx+1)*mul])
                elif 'weight_hh' in name:
                    for idx in range(4):
                        mul = param.shape[0]//4
                        torch.nn.init.orthogonal_(param[idx*mul:(idx+1)*mul])
                elif 'bias' in name:
                    param.data.fill_(0)
        else:
            if type(m) in [nn.Linear]:
                torch.nn.init.uniform_(m.weight, -0.01, 0.01)
                if m.bias != None:
                    m.bias.data.fill_(0.01)


In [13]:
train_dataset = PennTreeBank(train_raw, lang)
dev_dataset = PennTreeBank(dev_raw, lang)
test_dataset = PennTreeBank(test_raw, lang)

In [14]:
from functools import partial
from torch.utils.data import DataLoader

def collate_fn(data, pad_token):
    def merge(sequences):
        '''
        merge from batch * sent_len to batch * max_len 
        '''
        lengths = [len(seq) for seq in sequences]
        max_len = 1 if max(lengths)==0 else max(lengths)
        # Pad token is zero in our case
        # So we create a matrix full of PAD_TOKEN (i.e. 0) with the shape 
        # batch_size X maximum length of a sequence
        padded_seqs = torch.LongTensor(len(sequences),max_len).fill_(pad_token)
        for i, seq in enumerate(sequences):
            end = lengths[i]
            padded_seqs[i, :end] = seq # We copy each sequence into the matrix
        padded_seqs = padded_seqs.detach()  # We remove these tensors from the computational graph
        return padded_seqs, lengths
    
    # Sort data by seq lengths

    data.sort(key=lambda x: len(x["source"]), reverse=True) 
    new_item = {}
    for key in data[0].keys():
        new_item[key] = [d[key] for d in data]

    source, _ = merge(new_item["source"])
    target, lengths = merge(new_item["target"])
    
    new_item["source"] = source.to(DEVICE)
    new_item["target"] = target.to(DEVICE)
    new_item["number_tokens"] = sum(lengths)
    return new_item

# Dataloader instantiation
# You can reduce the batch_size if the GPU memory is not enough
train_loader = DataLoader(train_dataset, batch_size=128, collate_fn=partial(collate_fn, pad_token=lang.word2id["<pad>"]),  shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=128, collate_fn=partial(
    collate_fn, pad_token=lang.word2id["<pad>"]))
test_loader = DataLoader(test_dataset, batch_size=128, collate_fn=partial(
    collate_fn, pad_token=lang.word2id["<pad>"]))

# Mandatory Exam Exercise
## Part 1 (4 points)
In this, you have to modify the baseline LM_RNN by adding a set of techniques that might improve the performance. In this, you have to add one modification at a time incrementally. If adding a modification decreases the performance, you can remove it and move forward with the others. However, in the report, you have to provide and comment on this unsuccessful experiment.  For each of your experiments, you have to print the performance expressed with Perplexity (PPL).
<br>
One of the important tasks of training a neural network is  hyperparameter optimization. Thus, you have to play with the hyperparameters to minimise the PPL and thus print the results achieved with the best configuration (in particular <b>the learning rate</b>). 
These are two links to the state-of-the-art papers which use vanilla RNN [paper1](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5947611), [paper2](https://www.fit.vutbr.cz/research/groups/speech/publi/2010/mikolov_interspeech2010_IS100722.pdf). 

**Mandatory requirements**: For the following experiments the perplexity must be below 250 (***PPL < 250***).

1. Replace RNN with a Long-Short Term Memory (LSTM) network --> [link](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html)#while using SGD the learning rate could be above 0 to perform
2. Add two dropout layers: --> [link](https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html)
    - one after the embedding layer, 
    - one before the last linear layer
3. Replace SGD with AdamW --> [link](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html)

## Part 2 (11 points)
**Mandatory requirements**: For the following experiments the perplexity must be below 250 (***PPL < 250***) and it should be lower than the one achieved in Part 1.1 (i.e. base LSTM).

Starting from the `LM_RNN` in which you replaced the RNN with a LSTM model, apply the following regularisation techniques:
- Weight Tying 
- Variational Dropout (no DropConnect)
- Non-monotonically Triggered AvSGD 

These techniques are described in [this paper](https://openreview.net/pdf?id=SyyGPP0TZ).


In [15]:
#Tensor board logging system
from torch.utils.tensorboard import SummaryWriter
def log_values(writer, step, ppl, prefix):
  writer.add_scalar(f"{prefix}/ppl", ppl, step)


In [16]:
import math
def avg_train_loop(data, optimizer, criterion, model, VDROP, clip=5):
    torch.cuda.empty_cache()
    model.train()
    loss_array = []
    number_of_tokens = []
    
    for sample in data:#data is the list of batches
        optimizer.zero_grad() # Zeroing the gradient

        if VDROP:
            model.reset_dropout(sample["source"])
        output = model(sample['source'])
        loss = criterion(output, sample['target'])
        loss_array.append(loss.item() * sample["number_tokens"])
        number_of_tokens.append(sample["number_tokens"])
        loss.backward() # Compute the gradient, zeroing the computational graph calculated by pytorch
        # clip the gradient to avoid explosive gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)  
        optimizer.step()  # Update the weights    weight_decay_range, patience_range, emb_drop_range,

        
    return sum(loss_array)/sum(number_of_tokens)

In [17]:
def eval_loop(data, eval_criterion, model):
    model.eval()
    loss_to_return = []
    loss_array = []
    number_of_tokens = []
    # softmax = nn.Softmax(dim=1) # Use Softmax if you need the actual probability
    with torch.no_grad(): # It used to avoid the creation of computational graph
        for sample in data:
            #torch.cuda.empty_cache()
            output = model(sample['source'])
            loss = eval_criterion(output, sample['target'])
            loss_array.append(loss.item())
            number_of_tokens.append(sample["number_tokens"])
            
    ppl = math.exp(sum(loss_array) / sum(number_of_tokens))
    loss_to_return = sum(loss_array) / sum(number_of_tokens)
    return ppl, loss_to_return



In [18]:
import matplotlib.pyplot as plt
from tqdm import tqdm
import copy


def train_model(
        param_string,
    n_epochs=100,
    patience_set=3,
    losses_train=[],
    losses_dev=[],
    sampled_epochs=[],
    best_ppl=math.inf,
    best_model=None,
    modelM=None,
    optimizer=None,
    NM_ASGD=False,
    ASGD_optim = NTAvSGD,
    Logging_interval=1,
    non_monotone_interval=5,
    lr=.5,
    VDROP=False,
    clip=5,
    criterion_train=None,
    weight_decay_Av =.05,
    criterion_eval = None,
    exp_name=None,
):


    patience = patience_set
    writer = SummaryWriter(log_dir=f"runs/{param_string}")
    pbar = tqdm(range(1, n_epochs))
    # If the PPL is too high try to change the learning rate
    loss_log = []
    T = 0
    t = 0
    for epoch in pbar:
        loss = avg_train_loop(train_loader, optimizer,
                              criterion_train, modelM, VDROP, clip)
        # The epoch is treated as the k in NT-AvSGD
        if epoch % 1 == 0:
            sampled_epochs.append(epoch)
            losses_train.append(np.asarray(loss).mean())
            ppl_dev, loss_dev = eval_loop(dev_loader, criterion_eval, modelM)
            losses_dev.append(np.asarray(loss_dev).mean())
            # 4-9: of NT-AvSGD
            if NM_ASGD and epoch % Logging_interval == 0 and T == 0:
                if (len(loss_log) > 0):
                    if t > non_monotone_interval and ppl_dev > min(loss_log):
                        T = epoch
                        optimizer = ASGD_optim(
                            modelM.parameters(), lr=lr, weight_decay=weight_decay_Av)
                        print(f"swiching to ASGD at epoch{epoch}")
                        patience = patience_set
                loss_log.append(ppl_dev)
                t = t+1

            # tensorboard logger
            log_values(writer, epoch, ppl_dev, exp_name+"PPL")

            pbar.set_description("PPL: %f Patience:%d" % (ppl_dev, patience))

            if ppl_dev < best_ppl:  # the lower, the better
                best_ppl = ppl_dev
                best_model = copy.deepcopy(modelM).to('cpu')
                patience = patience_set
            else:
                patience -= 1

            if patience <= 0:  # Early stopping with patience
                break  # Not nice but it keeps the code clean

    best_model.to(DEVICE)
    final_ppl,  _ = eval_loop(test_loader, criterion_eval, best_model)
    print('Test ppl: ', final_ppl)
    return final_ppl,modelM

In [19]:
""" class LSTM_DROP(nn.Module):
    def __init__(self, emb_size, hidden_size, output_size, pad_index=0, out_dropout=0.1,
                 emb_dropout=0.1, n_layers=1):
        super(LSTM_DROP, self).__init__()
        # Token ids to vectors, we will better see this in the next lab
        # matrix vocabulary*dimension of the input
        self.embedding = nn.Embedding(
            output_size, emb_size, padding_idx=pad_index)
        self.drop1 = nn.Dropout(p=emb_dropout)

        # Pytorch's RNN layer: https://pytorch.org/docs/stable/generated/torch.nn.RNN.html
        self.rnn = nn.LSTM(emb_size, hidden_size,
                           n_layers, bidirectional=False)
        self.pad_token = pad_index
        # Linear layer to project the hidden layer to our output space
        self.drop2 = nn.Dropout(p=emb_dropout)
        self.output = nn.Linear(hidden_size, output_size)

    def forward(self, input_sequence):
        emb = self.embedding(input_sequence)  # sentence
        drop1_out = self.drop1(emb)
        rnn_out, _ = self.rnn(drop1_out)
        drop2_out = self.drop2(rnn_out)
        output = self.output(drop2_out).permute(0, 2, 1)
        return output """

" class LSTM_DROP(nn.Module):\n    def __init__(self, emb_size, hidden_size, output_size, pad_index=0, out_dropout=0.1,\n                 emb_dropout=0.1, n_layers=1):\n        super(LSTM_DROP, self).__init__()\n        # Token ids to vectors, we will better see this in the next lab\n        # matrix vocabulary*dimension of the input\n        self.embedding = nn.Embedding(\n            output_size, emb_size, padding_idx=pad_index)\n        self.drop1 = nn.Dropout(p=emb_dropout)\n\n        # Pytorch's RNN layer: https://pytorch.org/docs/stable/generated/torch.nn.RNN.html\n        self.rnn = nn.LSTM(emb_size, hidden_size,\n                           n_layers, bidirectional=False)\n        self.pad_token = pad_index\n        # Linear layer to project the hidden layer to our output space\n        self.drop2 = nn.Dropout(p=emb_dropout)\n        self.output = nn.Linear(hidden_size, output_size)\n\n    def forward(self, input_sequence):\n        emb = self.embedding(input_sequence)  # sentenc

In [20]:
""" class LSTM_SIMPLE(nn.Module):
    def __init__(self, emb_size, hidden_size, output_size, pad_index=0, out_dropout=0.1,
                 emb_dropout=0.1, n_layers=1):
        super(LSTM_SIMPLE, self).__init__()
        # Token ids to vectors, we will better see this in the next lab
        # matrix vocabulary*dimension of the input
        self.embedding = nn.Embedding(
            output_size, emb_size, padding_idx=pad_index)
        # Pytorch's RNN layer: https://pytorch.org/docs/stable/generated/torch.nn.RNN.html
        self.rnn = nn.LSTM(emb_size, hidden_size,
                           n_layers, bidirectional=False)
        self.pad_token = pad_index
        # Linear layer to project the hidden layer to our output space
        self.output = nn.Linear(hidden_size, output_size)

    def forward(self, input_sequence):
        emb = self.embedding(input_sequence)  # sentence
        rnn_out, _ = self.rnn(emb)
        output = self.output(rnn_out).permute(0, 2, 1)
        return output """

" class LSTM_SIMPLE(nn.Module):\n    def __init__(self, emb_size, hidden_size, output_size, pad_index=0, out_dropout=0.1,\n                 emb_dropout=0.1, n_layers=1):\n        super(LSTM_SIMPLE, self).__init__()\n        # Token ids to vectors, we will better see this in the next lab\n        # matrix vocabulary*dimension of the input\n        self.embedding = nn.Embedding(\n            output_size, emb_size, padding_idx=pad_index)\n        # Pytorch's RNN layer: https://pytorch.org/docs/stable/generated/torch.nn.RNN.html\n        self.rnn = nn.LSTM(emb_size, hidden_size,\n                           n_layers, bidirectional=False)\n        self.pad_token = pad_index\n        # Linear layer to project the hidden layer to our output space\n        self.output = nn.Linear(hidden_size, output_size)\n\n    def forward(self, input_sequence):\n        emb = self.embedding(input_sequence)  # sentence\n        rnn_out, _ = self.rnn(emb)\n        output = self.output(rnn_out).permute(0, 2, 1)\

In [21]:
import torch.optim as optim
# Don't forget to experiment with a lower training batch size
# Increasing the back propagation steps can be seen as a regularization step

# With SGD try with an higher learning rate (> 1 for instance)
vocab_len = len(lang.word2id)

criterion_train = nn.CrossEntropyLoss(ignore_index=lang.word2id["<pad>"])
criterion_eval = nn.CrossEntropyLoss(ignore_index=lang.word2id["<pad>"], reduction='sum')

In [22]:
# Let's check the results on tensorboard
# NOTE: remember to set the smoothing to zero
%load_ext tensorboard
%tensorboard --logdir=runs --host localhost --port 8088

In [23]:
def train_grid_search(lr_range, clip_range, hid_size_range, emb_size_range, weight_decay_range, patience_range, emb_drop_range, out_drop_range, vocab_len, model_type="None", dropout_type="None", NM_ASGD=False, optimizer=None, n_epochs=100, exp_name="Pollo", ASGD_optim=NTAvSGD, Weight_tying=False):
    best_model = None
    best_accuracy = 0
    best_params = {}
    optimizer_class = optimizer
    model = None
    # Iterate over all combinations of the parameter ranges
    for lr, clip, hid_size, emb_size, weight_decay, patience, emb_drop, out_drop in itertools.product(lr_range, clip_range, hid_size_range, emb_size_range, weight_decay_range, patience_range, emb_drop_range, out_drop_range):
        if Weight_tying and emb_drop !=out_drop:
            continue

        print(
            f"Training with parameters: lr={lr}, clip={clip}, hid_size={hid_size}, emb_size={emb_size}, weight_decay={weight_decay}, patience={patience}, emb_drop={emb_drop}, out_drop={out_drop}")

        # Initialize the model with current parameters
        model = MultiModel(emb_size, hid_size, vocab_len, model_type=model_type,
                           pad_index=lang.word2id["<pad>"], dropout_type=dropout_type,
                           emb_dropout=emb_drop, out_dropout=out_drop,
                           Weight_tying=Weight_tying).to(DEVICE)

        model.apply(init_weights)

        # Set up the optimizer with current parameters
        optimizer_obj = optimizer_class(model.parameters(), lr=lr,
                                        weight_decay=weight_decay)

        if dropout_type == "Variational":
            VDROP= True;
        else: VDROP = False
        # Train the model
        accuracy, model = train_model(exp_name+"/"+f"LR={lr}, C={clip}, HS={hid_size}, ES={emb_size}, WD={weight_decay}, P={patience}, E_D={emb_drop}, O_D={out_drop},M_T={model_type},D_T={dropout_type},ASGD={NM_ASGD}, VDROP={VDROP},WT={Weight_tying}", clip=clip, modelM=model,
                                      optimizer=optimizer_obj, NM_ASGD=NM_ASGD, VDROP=VDROP,
                                      criterion_eval=criterion_eval,
                                      criterion_train=criterion_train, patience_set=patience, weight_decay_Av=weight_decay,
                                      n_epochs=n_epochs, exp_name=exp_name,
                                      ASGD_optim=ASGD_optim)

        # Check if the current model is the best so far
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model
            best_params = {
                'lr': lr,
                'clip': clip,
                'hid_size': hid_size,
                'emb_size': emb_size,
                'weight_decay': weight_decay,
                'patience': patience,
                'emb_drop': emb_drop,
                'out_drop': out_drop
            }

            torch.save({
                'model_state_dict': model.state_dict(),
            }, "models/"+exp_name+".model")

    print(f"Best model parameters: {best_params}")
    print(f"Best model accuracy: {best_accuracy}")
    return best_model, best_params

In [24]:
""" lr = 1
clip = 5
hid_size = 400
emb_size = 250
patience = 3
weight_decay = .0001

model = MultiModel(emb_size, hid_size, vocab_len, model_type="RNN",
                   pad_index=lang.word2id["<pad>"], dropout_type="None").to(DEVICE)

model.apply(init_weights)
optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)
train_model("LM_RNN_SGD", modelM=model,
            optimizer=optimizer, patience_set=patience,clip=clip,
            criterion_train=criterion_train,criterion_eval = criterion_eval) """

' lr = 1\nclip = 5\nhid_size = 400\nemb_size = 250\npatience = 3\nweight_decay = .0001\n\nmodel = MultiModel(emb_size, hid_size, vocab_len, model_type="RNN",\n                   pad_index=lang.word2id["<pad>"], dropout_type="None").to(DEVICE)\n\nmodel.apply(init_weights)\noptimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)\ntrain_model("LM_RNN_SGD", modelM=model,\n            optimizer=optimizer, patience_set=patience,clip=clip,\n            criterion_train=criterion_train,criterion_eval = criterion_eval) '

In [28]:
lr_range = [1.0,2.0,3.0]
clip_range = [1, 5, 10]
hid_size_range = [100, 250, 500]
emb_size_range = [100, 250, 500]
weight_decay_range = [0.0001, 0.001]
patience_range = [3]
emb_drop_range = [0]
out_drop_range = [0]

best_model, best_params = train_grid_search(
    lr_range, clip_range, hid_size_range, emb_size_range,
    weight_decay_range, patience_range, emb_drop_range,
    out_drop_range, vocab_len, exp_name="LM_RNN_SGD", model_type="RNN", optimizer=optim.SGD, dropout_type="None",n_epochs=EPOCHS)

Training with parameters: lr=1.0, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.1, out_drop=0.1


PPL: 386.034632 Patience:3: 100%|██████████| 4/4 [00:42<00:00, 10.56s/it]


Test ppl:  368.63091810206885
Training with parameters: lr=1.0, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.1, out_drop=0.2


PPL: 387.536501 Patience:3: 100%|██████████| 4/4 [00:42<00:00, 10.56s/it]


Test ppl:  375.8707549809068
Training with parameters: lr=1.0, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.1, out_drop=0.3


PPL: 364.914401 Patience:3: 100%|██████████| 4/4 [00:42<00:00, 10.54s/it]


Test ppl:  352.994516929359
Training with parameters: lr=1.0, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.2, out_drop=0.1


PPL: 371.213559 Patience:3: 100%|██████████| 4/4 [00:42<00:00, 10.57s/it]


Test ppl:  357.7792336850881
Training with parameters: lr=1.0, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.2, out_drop=0.2


PPL: 364.813845 Patience:3: 100%|██████████| 4/4 [00:42<00:00, 10.62s/it]


Test ppl:  353.62477385568144
Training with parameters: lr=1.0, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.2, out_drop=0.3


PPL: 372.399378 Patience:3: 100%|██████████| 4/4 [00:42<00:00, 10.57s/it]


Test ppl:  355.96619282554155
Training with parameters: lr=1.0, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.3, out_drop=0.1


PPL: 378.572292 Patience:3: 100%|██████████| 4/4 [00:42<00:00, 10.66s/it]


Test ppl:  364.1622613049077
Training with parameters: lr=1.0, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.3, out_drop=0.2


PPL: 368.830257 Patience:3: 100%|██████████| 4/4 [00:42<00:00, 10.64s/it]


Test ppl:  355.0932445496852
Training with parameters: lr=1.0, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.3, out_drop=0.3


PPL: 372.050661 Patience:3: 100%|██████████| 4/4 [00:42<00:00, 10.63s/it]


Test ppl:  358.41027857362803
Training with parameters: lr=1.0, clip=1, hid_size=100, emb_size=100, weight_decay=0.001, patience=3, emb_drop=0.1, out_drop=0.1


PPL: 462.363150 Patience:3: 100%|██████████| 4/4 [00:43<00:00, 10.97s/it]


Test ppl:  445.24979270006804
Training with parameters: lr=1.0, clip=1, hid_size=100, emb_size=100, weight_decay=0.001, patience=3, emb_drop=0.1, out_drop=0.2


PPL: 464.278840 Patience:3: 100%|██████████| 4/4 [01:06<00:00, 16.73s/it]


Test ppl:  450.0608611160987
Training with parameters: lr=1.0, clip=1, hid_size=100, emb_size=100, weight_decay=0.001, patience=3, emb_drop=0.1, out_drop=0.3


PPL: 454.214314 Patience:2: 100%|██████████| 4/4 [01:04<00:00, 16.21s/it]


Test ppl:  441.2352995326771
Training with parameters: lr=1.0, clip=1, hid_size=100, emb_size=100, weight_decay=0.001, patience=3, emb_drop=0.2, out_drop=0.1


PPL: 452.679389 Patience:3: 100%|██████████| 4/4 [00:36<00:00,  9.24s/it]


Test ppl:  435.0991051347404
Training with parameters: lr=1.0, clip=1, hid_size=100, emb_size=100, weight_decay=0.001, patience=3, emb_drop=0.2, out_drop=0.2


PPL: 469.079423 Patience:2: 100%|██████████| 4/4 [00:33<00:00,  8.28s/it]


Test ppl:  453.33604376453087
Training with parameters: lr=1.0, clip=1, hid_size=100, emb_size=100, weight_decay=0.001, patience=3, emb_drop=0.2, out_drop=0.3


  0%|          | 0/4 [00:00<?, ?it/s]


KeyboardInterrupt: 

In [31]:
""" lr = .002
clip = 5
hid_size = 400
emb_size = 250
weight_decay=.0001 """
""" model = LSTM_DROP(emb_size, hid_size, vocab_len,
                  pad_index=lang.word2id["<pad>"]).to(DEVICE) """
""" model = MultiModel(emb_size, hid_size, vocab_len, model_type="LSTM",
                   pad_index=lang.word2id["<pad>"],dropout_type="Normal").to(DEVICE)

model.apply(init_weights)
optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
train_model("LSTM_DROP_AdamW", modelM=model,
            optimizer=optimizer, patience_set=patience, clip=clip,
            criterion_train=criterion_train, criterion_eval = criterion_eval) """


lr_range = [.02,.002,.001]
clip_range = [1, 5, 10]
hid_size_range = [100, 250, 500]
emb_size_range = [100, 250, 500]
weight_decay_range = [0.0001, 0.001]
patience_range = [3]
emb_drop_range = [0.1, 0.2, 0.3]
out_drop_range = [0.1, 0.2, 0.3]

best_model, best_params = train_grid_search(
    lr_range, clip_range, hid_size_range, emb_size_range,
    weight_decay_range, patience_range, emb_drop_range,
    out_drop_range, vocab_len, exp_name="LSTM_DROP_AdamW", model_type="LSTM", optimizer=optim.AdamW, dropout_type="Normal", n_epochs=EPOCHS)

Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.1, out_drop=0.1


PPL: 227.514157 Patience:3: 100%|██████████| 4/4 [00:35<00:00,  8.76s/it]


Test ppl:  206.0682208819467
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.1, out_drop=0.2


PPL: 224.122977 Patience:3: 100%|██████████| 4/4 [00:35<00:00,  8.78s/it]


Test ppl:  205.54534527315607
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.1, out_drop=0.3


PPL: 219.958884 Patience:3: 100%|██████████| 4/4 [00:35<00:00,  8.78s/it]


Test ppl:  202.57594334611056
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.2, out_drop=0.1


PPL: 225.354042 Patience:3: 100%|██████████| 4/4 [00:35<00:00,  8.77s/it]


Test ppl:  206.3225177153071
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.2, out_drop=0.2


PPL: 221.953195 Patience:3: 100%|██████████| 4/4 [00:35<00:00,  8.84s/it]


Test ppl:  204.3492405949697
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.2, out_drop=0.3


PPL: 221.446146 Patience:3: 100%|██████████| 4/4 [00:34<00:00,  8.71s/it]


Test ppl:  204.10735333089437
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.3, out_drop=0.1


PPL: 225.825093 Patience:3: 100%|██████████| 4/4 [00:34<00:00,  8.72s/it]


Test ppl:  206.14434560066405
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.3, out_drop=0.2


PPL: 225.898250 Patience:3: 100%|██████████| 4/4 [00:34<00:00,  8.68s/it]


Test ppl:  206.17104266692974
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.3, out_drop=0.3


PPL: 225.136839 Patience:3: 100%|██████████| 4/4 [00:35<00:00,  8.78s/it]


Test ppl:  206.99345027395344
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.001, patience=3, emb_drop=0.1, out_drop=0.1


PPL: 226.833923 Patience:3: 100%|██████████| 4/4 [00:35<00:00,  8.78s/it]


Test ppl:  207.4086389997443
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.001, patience=3, emb_drop=0.1, out_drop=0.2


PPL: 220.557057 Patience:3: 100%|██████████| 4/4 [00:35<00:00,  8.85s/it]


Test ppl:  203.27759464523834
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.001, patience=3, emb_drop=0.1, out_drop=0.3


PPL: 220.669347 Patience:3: 100%|██████████| 4/4 [00:35<00:00,  8.83s/it]


Test ppl:  201.32729244678742
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.001, patience=3, emb_drop=0.2, out_drop=0.1


PPL: 223.517595 Patience:3: 100%|██████████| 4/4 [00:34<00:00,  8.74s/it]


Test ppl:  204.6592486693378
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.001, patience=3, emb_drop=0.2, out_drop=0.2


PPL: 222.700134 Patience:3: 100%|██████████| 4/4 [00:35<00:00,  8.83s/it]


Test ppl:  204.6898874579707
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.001, patience=3, emb_drop=0.2, out_drop=0.3


PPL: 221.737523 Patience:3: 100%|██████████| 4/4 [00:35<00:00,  8.82s/it]


Test ppl:  203.09780505741787
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.001, patience=3, emb_drop=0.3, out_drop=0.1


PPL: 227.222422 Patience:3: 100%|██████████| 4/4 [00:35<00:00,  8.76s/it]


Test ppl:  207.80592136961803
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.001, patience=3, emb_drop=0.3, out_drop=0.2


PPL: 224.558973 Patience:3: 100%|██████████| 4/4 [00:34<00:00,  8.63s/it]


Test ppl:  205.7196415066298
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.001, patience=3, emb_drop=0.3, out_drop=0.3


PPL: 226.128415 Patience:3: 100%|██████████| 4/4 [00:34<00:00,  8.66s/it]


Test ppl:  207.27285750950608
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=250, weight_decay=0.0001, patience=3, emb_drop=0.1, out_drop=0.1


PPL: 239.416910 Patience:3: 100%|██████████| 4/4 [00:35<00:00,  8.83s/it]


Test ppl:  216.19805408658317
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=250, weight_decay=0.0001, patience=3, emb_drop=0.1, out_drop=0.2


PPL: 236.898563 Patience:3: 100%|██████████| 4/4 [00:35<00:00,  8.86s/it]


Test ppl:  215.71614352867527
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=250, weight_decay=0.0001, patience=3, emb_drop=0.1, out_drop=0.3


PPL: 235.793552 Patience:3: 100%|██████████| 4/4 [00:35<00:00,  8.82s/it]


Test ppl:  215.4145885601564
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=250, weight_decay=0.0001, patience=3, emb_drop=0.2, out_drop=0.1


PPL: 237.708991 Patience:3: 100%|██████████| 4/4 [00:35<00:00,  8.87s/it]


Test ppl:  214.7828709425306
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=250, weight_decay=0.0001, patience=3, emb_drop=0.2, out_drop=0.2


PPL: 235.878708 Patience:3: 100%|██████████| 4/4 [00:35<00:00,  8.82s/it]


Test ppl:  216.60711304473605
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=250, weight_decay=0.0001, patience=3, emb_drop=0.2, out_drop=0.3


PPL: 235.955418 Patience:3: 100%|██████████| 4/4 [00:35<00:00,  8.85s/it]


Test ppl:  215.5352934388295
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=250, weight_decay=0.0001, patience=3, emb_drop=0.3, out_drop=0.1


PPL: 237.159264 Patience:3: 100%|██████████| 4/4 [00:35<00:00,  8.86s/it]


Test ppl:  215.27406430675967
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=250, weight_decay=0.0001, patience=3, emb_drop=0.3, out_drop=0.2


PPL: 235.243364 Patience:3: 100%|██████████| 4/4 [00:35<00:00,  8.83s/it]


Test ppl:  214.21227649692173
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=250, weight_decay=0.0001, patience=3, emb_drop=0.3, out_drop=0.3


PPL: 236.269031 Patience:3: 100%|██████████| 4/4 [00:35<00:00,  8.90s/it]


Test ppl:  214.43729379849245
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=250, weight_decay=0.001, patience=3, emb_drop=0.1, out_drop=0.1


PPL: 248.709709 Patience:3:  25%|██▌       | 1/4 [00:17<00:52, 17.59s/it]


KeyboardInterrupt: 

In [32]:
""" lr = .002
clip = 5
hid_size = 400
emb_size = 250
weight_decay = .0001
model = MultiModel(emb_size, hid_size, vocab_len, model_type="LSTM",
                   pad_index=lang.word2id["<pad>"]).to(DEVICE)
model.apply(init_weights)
optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
train_model("LSTM_SIMPLE_AdamW", modelM=model, optimizer=optimizer,clip=clip,
            criterion_train=criterion_train, criterion_eval=criterion_eval) """


lr_range = [.02, .002, .001]
clip_range = [1, 5, 10]
hid_size_range = [100, 250, 500]
emb_size_range = [100, 250, 500]
weight_decay_range = [0.0001, 0.001]
patience_range = [3]
emb_drop_range = [0.1, 0.2, 0.3]
out_drop_range = [0.1, 0.2, 0.3]

best_model, best_params = train_grid_search(
    lr_range, clip_range, hid_size_range, emb_size_range,
    weight_decay_range, patience_range, emb_drop_range,
    out_drop_range, vocab_len, exp_name="LSTM_SIMPLE_AdamW", model_type="LSTM", optimizer=optim.AdamW, dropout_type="Normal", n_epochs=EPOCHS)

Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.1, out_drop=0.1


PPL: 229.124991 Patience:3: 100%|██████████| 4/4 [00:34<00:00,  8.63s/it]


Test ppl:  208.36720312424922
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.1, out_drop=0.2


PPL: 223.902258 Patience:3: 100%|██████████| 4/4 [00:34<00:00,  8.62s/it]


Test ppl:  206.68925456044397
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.1, out_drop=0.3


PPL: 221.426062 Patience:3: 100%|██████████| 4/4 [00:34<00:00,  8.65s/it]


Test ppl:  203.04070360725035
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.2, out_drop=0.1


PPL: 226.733405 Patience:3: 100%|██████████| 4/4 [00:34<00:00,  8.53s/it]


Test ppl:  206.18938866737108
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.2, out_drop=0.2


PPL: 224.033702 Patience:3: 100%|██████████| 4/4 [00:34<00:00,  8.62s/it]


Test ppl:  205.22984994042005
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.2, out_drop=0.3


PPL: 224.452930 Patience:3: 100%|██████████| 4/4 [00:34<00:00,  8.63s/it]


Test ppl:  205.02874281819385
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.3, out_drop=0.1


PPL: 227.712145 Patience:3: 100%|██████████| 4/4 [00:34<00:00,  8.73s/it]


Test ppl:  209.51014514850854
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.3, out_drop=0.2


PPL: 233.561971 Patience:3:  50%|█████     | 2/4 [00:20<00:20, 10.41s/it]


KeyboardInterrupt: 

### Part 2 LSTM_SIMPLE with NT-AvSGD

In [28]:
""" lr = 1
clip = 5
hid_size = 400
emb_size = 250
weight_decay = .0001
non_monotone_interval = 5
emb_drop=.1
out_drop=.1
model = MultiModel(emb_size, hid_size, vocab_len, model_type="LSTM",
                         pad_index=lang.word2id["<pad>"], dropout_type="None",
                   emb_dropout=emb_drop, out_dropout=out_drop).to(DEVICE)

optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)
train_model("LSTM_SIMPLE_NT-AvSGD", modelM=model, optimizer=optimizer,
            NM_ASGD=True, clip=clip, criterion_train=criterion_train,
              non_monotone_interval=non_monotone_interval,
              weight_decay_Av = weight_decay,criterion_eval=criterion_eval
              ) """

lr_range = [.02, .002, .001]
clip_range = [1, 5, 10]
hid_size_range = [100, 250, 500]
emb_size_range = [100, 250, 500]
weight_decay_range = [0.0001, 0.001]
patience_range = [3]
emb_drop_range = [0]
out_drop_range = [0]

best_model, best_params = train_grid_search(
    lr_range, clip_range, hid_size_range, emb_size_range,
    weight_decay_range, patience_range, emb_drop_range,
    out_drop_range, vocab_len, exp_name="LSTM_SIMPLE_NT-AvSGD",
      optimizer=optim.SGD, dropout_type="None", model_type= "LSTM",n_epochs=12, NM_ASGD=True)

Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.1, out_drop=0.1


PPL: 1036.176119 Patience:3: 100%|██████████| 11/11 [01:33<00:00,  8.50s/it]


Test ppl:  1000.8666328442101
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.1, out_drop=0.2


PPL: 997.690517 Patience:3: 100%|██████████| 11/11 [01:33<00:00,  8.53s/it] 


Test ppl:  963.5641263094466
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.1, out_drop=0.3


PPL: 983.997665 Patience:3: 100%|██████████| 11/11 [01:33<00:00,  8.47s/it] 


Test ppl:  949.4518474248052
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.2, out_drop=0.1


PPL: 1006.860452 Patience:3: 100%|██████████| 11/11 [01:33<00:00,  8.47s/it]


Test ppl:  972.7105745155079
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.2, out_drop=0.2


PPL: 990.162177 Patience:3: 100%|██████████| 11/11 [01:33<00:00,  8.47s/it] 


Test ppl:  955.7165720133253
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.2, out_drop=0.3


PPL: 982.830063 Patience:3: 100%|██████████| 11/11 [01:32<00:00,  8.44s/it] 


Test ppl:  951.1954525624336
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.3, out_drop=0.1


PPL: 997.770345 Patience:3: 100%|██████████| 11/11 [01:33<00:00,  8.49s/it] 


Test ppl:  963.0751663333928
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.3, out_drop=0.2


PPL: 978.785296 Patience:3: 100%|██████████| 11/11 [01:34<00:00,  8.55s/it] 


Test ppl:  945.3003111931621
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.3, out_drop=0.3


PPL: 998.685300 Patience:3: 100%|██████████| 11/11 [01:33<00:00,  8.50s/it] 


Test ppl:  962.9482344576794
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.001, patience=3, emb_drop=0.1, out_drop=0.1


PPL: 981.050839 Patience:3: 100%|██████████| 11/11 [01:33<00:00,  8.51s/it] 


Test ppl:  946.9742691918989
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.001, patience=3, emb_drop=0.1, out_drop=0.2


PPL: 1474.334753 Patience:3:  64%|██████▎   | 7/11 [01:04<00:36,  9.24s/it]


KeyboardInterrupt: 

In [None]:
""" lr = 1
clip = 5
hid_size = 400
emb_size = 250
weight_decay = .0001
non_monotone_interval = 5
emb_drop = .1
out_drop = .1

model = MultiModel(emb_size, hid_size, vocab_len, model_type="LSTM",
                   pad_index=lang.word2id["<pad>"], dropout_type="None",
                   emb_dropout=emb_drop, out_dropout=out_drop).to(DEVICE)

optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)
train_model("LSTM_SIMPLE_AverageOfGradients", modelM=model, optimizer=optimizer,
            NM_ASGD=True, clip=clip, criterion_train=criterion_train,
            non_monotone_interval=non_monotone_interval,
            weight_decay_Av=weight_decay, criterion_eval=criterion_eval,
            ASGD_optim=AverageOfGradientsSGD
            ) """

Lr = [.02, .002, .001]
Clip = [1, 5, 10]
Hid_size = [100, 250, 500]
Emb_size = [100, 250, 500]
Weighy_decay = [0.0001, 0.001]
Patience_range = [3]
Emb_drop = [0.1, 0.2, 0.3]
Out_drop = [0.1, 0.2, 0.3]

best_model, best_params = train_grid_search(
    Lr, Clip, Hid_size, Emb_size,
    Weighy_decay, Patience_range, Emb_drop,
    Out_drop, vocab_len, exp_name="LSTM_SIMPLE_NT-AverageOfGradientsSGD",
    optimizer=optim.SGD, dropout_type="None", n_epochs=12, NM_ASGD=True,
    ASGD_optim=AverageOfGradientsSGD)

PPL: 254.694746 Patience:3:  13%|█▎        | 13/99 [03:23<22:25, 15.64s/it]


TypeError: '<' not supported between instances of 'dict' and 'float'

### Part 2 V_DROP

In [None]:
"""lr = 5
clip = 5
hid_size = 400
emb_size = 250
weight_decay = .0001
patience = 8
non_monotone_interval=5

 model_class = MultiModel(emb_size, hid_size, vocab_len,model_type="LSTM",
                   pad_index=lang.word2id["<pad>"],dropout_type="Variational",
                         emb_dropout=emb_drop, out_dropout=out_drop).to(DEVICE)
model_class.apply(init_weights)
optimizer = optim.SGD(model_class.parameters(), lr=lr,
                      weight_decay=weight_decay)

train_model("LSTM_SIMPLE_NTASGD_VDROP", clip=clip, modelM=model_class,emb_size, hid_size, vocab_len,model_type="LSTM",
                   pad_index=lang.word2id["<pad>"],dropout_type="Variational",
                         emb_dropout=emb_drop, out_dropout=out_drop
            criterion_eval=criterion_eval,
            criterion_train=criterion_train,
            patience_set=patience,weight_decay_Av=weight_decay) """


Lr = [.02, .002, .001]
Clip = [1, 5, 10]
Hid_size = [100, 250, 500]
Emb_size = [100, 250, 500]
Weighy_decay = [0.0001, 0.001]
Patience_range = [3]
Emb_drop = [0.1, 0.2, 0.3]
Out_drop = [0.1, 0.2, 0.3]

best_model, best_params = train_grid_search(
    Lr, Clip, Hid_size, Emb_size,
    Weighy_decay, Patience_range, Emb_drop,
    Out_drop, vocab_len,  exp_name="LSTM_SIMPLE_NTASGD_VDROP",
    optimizer=optim.SGD, dropout_type="Variational", model_type="LSTM", n_epochs=EPOCHS, NM_ASGD=True)

PPL: 109336.353031 Patience:7:   4%|▍         | 4/99 [01:10<27:55, 17.64s/it]


KeyboardInterrupt: 

### Part 2 Weight_tying

In [None]:
# https://discuss.pytorch.org/t/how-to-use-shared-weights-in-different-layers-of-a-model/71263


class LSTM_VDROP_Weight(nn.Module):

    def __init__(self, emb_size, hidden_size, output_size, pad_index=0, out_dropout=0.1,
                 emb_dropout=0.1, n_layers=1):
        super(LSTM_VDROP_Weight, self).__init__()
        assert (emb_size == hidden_size),"For Weight Tying emb_size and hidden size must be the same"
        # Token ids to vectors, we will better see this in the next lab
        # matrix vocabulary*dimension of the input
        self.embedding = nn.Embedding(
            output_size, emb_size, padding_idx=pad_index)
        self.drop1 = V_Dropout(p=emb_dropout)
        self.lstm = nn.LSTM(emb_size, hidden_size,
                            n_layers, bidirectional=False)
        self.pad_token = pad_index
        self.drop2 = V_Dropout(p=out_dropout)

        # Linear layer to project the hidden layer to our output space
        self.output = nn.Linear(hidden_size, output_size)
        self.output.weight.data = self.embedding.weight.data

    def forward(self, input_sequence):
        emb = self.embedding(input_sequence)  # sentence
        drop1_out = self.drop1(emb)
        lstm_out, _ = self.lstm(drop1_out)
        drop2_out = self.drop2(lstm_out)
        output = self.output(drop2_out).permute(0, 2, 1)

        return output

    def reset_dropout(self, input_sample):
        emb = self.embedding(input_sample)
        self.drop1.generate_mask(emb.to(DEVICE))
        drop1_out = self.drop1(emb)
        lstm_out, _ = self.lstm(drop1_out)
        self.drop2.generate_mask(lstm_out)

In [24]:
""" #to iterate on
lr = 1.0
clip = 5
hid_size = 250
emb_size = 250
weight_decay = .0001
patience = 8
emb_drop = 0.2
out_drop = 0.2
#end to iterate on

model = LSTM_VDROP_Weight(emb_size, hid_size, vocab_len,
                          pad_index=lang.word2id["<pad>"],
                          emb_dropout=emb_drop, out_dropout=out_drop).to(DEVICE)
model.apply(init_weights)
optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)
train_model("LSTM_NTASGD_VDROP_WEIGHT", clip=clip, modelM=model,
            optimizer=optimizer, NM_ASGD=True, VDROP=True,
            criterion_eval=criterion_eval,
            criterion_train=criterion_train, patience_set=patience, weight_decay_Av=weight_decay) """

Lr = [.02, .002, .001]
Clip = [1, 5, 10]
Hid_size = [100, 250, 500]
Emb_size = [100, 250, 500]
Weighy_decay = [0.0001, 0.001]
Patience_range = [3]
Emb_drop = [0.1, 0.2, 0.3]
Out_drop = [0.1, 0.2, 0.3]

best_model, best_params = train_grid_search(
    Lr, Clip, Hid_size, Emb_size,
    Weighy_decay, Patience_range, Emb_drop,
    Out_drop, vocab_len,  exp_name="LSTM_NTASGD_VDROP_WEIGHT",
    optimizer=optim.SGD, dropout_type="Variational", model_type="LSTM", n_epochs=EPOCHS, NM_ASGD=True, Weight_tying=True)

Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.1, out_drop=0.1


PPL: 7271.472979 Patience:3: 100%|██████████| 4/4 [00:44<00:00, 11.19s/it]


Test ppl:  7203.020650022584
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.1, out_drop=0.2


PPL: 7266.561004 Patience:3: 100%|██████████| 4/4 [00:44<00:00, 11.19s/it]


Test ppl:  7197.654075608684
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.1, out_drop=0.3


PPL: 7258.524982 Patience:3: 100%|██████████| 4/4 [00:44<00:00, 11.19s/it]


Test ppl:  7190.80653680236
Training with parameters: lr=0.02, clip=1, hid_size=100, emb_size=100, weight_decay=0.0001, patience=3, emb_drop=0.2, out_drop=0.1


PPL: 8524.265331 Patience:3:  50%|█████     | 2/4 [00:26<00:26, 13.33s/it]


KeyboardInterrupt: 

In [27]:
#%rm -r runs