# Project for Natural Language Understanding Course

Andrea Bonora 
mat. 232222

# Dataset

In [1]:
#@title Paths
#@markdown

# Zip archive holding the Penn Treebank text files; a later cell copies it
# into the working directory and unpacks it.
dataset_path = '/content/drive/MyDrive/NLU_project_Andrea_Bonora_232222/ptbdataset.zip'  #@param {type: "string"}
# Directory the notebook reads checkpoints from and writes them to
# (best_model.pt, last_model.pt, model1.pt ... model6.pt).
models_path = '/content/drive/MyDrive/NLU_project_Andrea_Bonora_232222/NLU_models'  #@param {type: "string"}
#@markdown ---


In [2]:
# Mount Google Drive so that the dataset and checkpoint paths under
# /content/drive become readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Copy the archive referenced by `dataset_path` into the current directory.
# NOTE(review): the next cell unzips /content/ptbdataset.zip, so this assumes
# the working directory is /content - confirm.
!cp -r $dataset_path ./ptbdataset.zip

In [4]:
# Unpack the archive; per the listing printed below it expands into ptbdataset/.
!unzip "/content/ptbdataset.zip"

Archive:  /content/ptbdataset.zip
  inflating: ptbdataset/ptb.char.test.txt  
  inflating: ptbdataset/ptb.char.train.txt  
  inflating: ptbdataset/ptb.char.valid.txt  
  inflating: ptbdataset/ptb.test.txt  
  inflating: ptbdataset/ptb.train.txt  
  inflating: ptbdataset/ptb.valid.txt  
  inflating: ptbdataset/README       


# Imports and Parameters

In [5]:
import argparse
import math
import os
import time
from collections.abc import Iterable

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from tqdm.notebook import tqdm

  from collections import Iterable


In [6]:
#@title Parameters
#@markdown Insert value for the following parameters

# emsize / nhid / nlayers: embedding width, LSTM hidden width and LSTM depth
#   passed to RNNModel (emsize must equal nhid when `tied` is set).
# bptt:    maximum number of time steps per training/evaluation slice.
# dropout: drop probability handed to the model.
# clip:    maximum gradient norm used when clipping.
# tied:    share the embedding matrix with the output projection.
# alpha / beta: weights of the activation and temporal-activation penalties.
emsize = 1500  #@param {type: "number"}
nhid = 1500  #@param {type: "number"}
nlayers = 2  #@param {type: "slider", min: 1, max: 5}
bptt = 35  #@param {type: "number"}
dropout = 0.65 #@param {type: "number"}
clip = 0.25 #@param {type: "number"}
tied = True #@param{type: "boolean"}
alpha = 2 #@param {type: "number"}
beta = 1 #@param {type: "number"}
#@markdown ---

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fixed seed for PyTorch's default random generator.
torch.manual_seed(1111)


<torch._C.Generator at 0x7f3c1c96c390>

# Model, Corpus, Dictionary and Positional encoding

In [7]:
class LockedDropout(nn.Module):
    """Dropout that samples one mask and reuses it along dimension 0.

    A single Bernoulli mask of shape ``(1, x.size(1), x.size(2))`` is drawn
    and broadcast over the first dimension of ``x``, so every slice
    ``x[t]`` has the same units dropped. The input must therefore have at
    least three dimensions.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x, dropout=0.5):
        """Apply the shared mask to ``x``.

        Args:
            x: tensor to regularise; indexed as ``x.size(1)`` / ``x.size(2)``.
            dropout: probability of zeroing a unit.

        Returns:
            ``x`` itself in evaluation mode or when ``dropout`` is falsy;
            otherwise a tensor shaped like ``x`` whose kept units are scaled
            by ``1 / (1 - dropout)``.
        """
        if not self.training or not dropout:
            return x
        if dropout == 1:
            # Everything is dropped. Scaling the mask by 1 / (1 - dropout)
            # would divide by zero and turn the whole output into NaN.
            return x * 0
        keep_prob = 1 - dropout
        # new_empty keeps x's dtype and device; the mask carries no gradient.
        mask = x.new_empty(1, x.size(1), x.size(2)).bernoulli_(keep_prob) / keep_prob
        return mask.expand_as(x) * x

In [25]:
class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder.

    Token ids are embedded, passed through a multi-layer LSTM and projected
    back onto the vocabulary. LockedDropout is applied to the embeddings and
    to the LSTM output, both with the rate given to the constructor.
    """

    def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        """Build the network.

        Args:
            ntoken: vocabulary size.
            ninp: embedding dimension.
            nhid: LSTM hidden dimension.
            nlayers: number of stacked LSTM layers.
            dropout: drop probability (LockedDropout and LSTM inter-layer dropout).
            tie_weights: share the embedding matrix with the decoder weight.

        Raises:
            ValueError: if ``tie_weights`` is set and ``nhid != ninp``.
        """
        super(RNNModel, self).__init__()
        self.ntoken = ntoken
        self.lockdrop = LockedDropout()
        self.dropout = dropout
        # Not used by forward(); kept so the module's attributes are unchanged.
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        """Initialise encoder/decoder weights uniformly in [-0.1, 0.1] and zero the decoder bias."""
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, input, hidden, return_h = False):
        """Run one slice of the sequence through the model.

        Args:
            input: token ids fed to the embedding layer.
            hidden: LSTM state, e.g. from ``init_hidden``.
            return_h: also return the LSTM output before and after dropout.

        Returns:
            ``(log_probs, hidden)`` where ``log_probs`` has shape
            ``(-1, ntoken)``; with ``return_h`` the tuple is extended to
            ``(log_probs, hidden, raw_output, output)``.
        """
        emb = self.lockdrop(self.encoder(input), self.dropout)
        raw_output, hidden = self.rnn(emb, hidden)
        # Use the rate this instance was built with. Reading a module-level
        # `dropout` here ignored the constructor argument and raised a
        # NameError wherever that global was not defined.
        output = self.lockdrop(raw_output, self.dropout)
        decoded = self.decoder(output)
        decoded = decoded.view(-1, self.ntoken)
        log_probs = F.log_softmax(decoded, dim=1)
        if return_h:
            return log_probs, hidden, raw_output, output
        return log_probs, hidden

    def init_hidden(self, bsz):
        """Return zeroed ``(h, c)`` tensors of shape ``(nlayers, bsz, nhid)`` matching the parameters' dtype/device."""
        weight = next(self.parameters())
        return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                weight.new_zeros(self.nlayers, bsz, self.nhid))

In [9]:
class Dictionary(object):
    """Growing vocabulary with word -> index and index -> word lookups."""

    def __init__(self):
        # idx2word[i] is the word with index i; word2idx is the inverse map.
        self.idx2word = []
        self.word2idx = {}

    def __len__(self):
        """Number of distinct words registered so far."""
        return len(self.idx2word)

    def add_word(self, word):
        """Register ``word`` if it is new and return its index either way."""
        index = self.word2idx.get(word)
        if index is None:
            # New words receive the next free position.
            index = len(self.idx2word)
            self.idx2word.append(word)
            self.word2idx[word] = index
        return index

In [10]:
class Corpus(object):
    """Loads the three PTB splits and encodes them with one shared vocabulary.

    Attributes are set in this order, so word indices are assigned by first
    appearance in train, then valid, then test:
        dictionary: the shared ``Dictionary``.
        train, valid, test: 1-D int64 tensors of token ids.
    """

    def __init__(self, path):
        """Tokenize ``ptb.train.txt``, ``ptb.valid.txt`` and ``ptb.test.txt`` found under ``path``."""
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'ptb.train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'ptb.valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'ptb.test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file.

        Every line is split on whitespace and terminated with ``'<eos>'``.
        Unseen words are added to ``self.dictionary`` as they are met.

        Args:
            path: text file to encode (read as UTF-8).

        Returns:
            A 1-D ``torch.int64`` tensor with one id per token; empty when
            the file has no lines.
        """
        assert os.path.exists(path)
        ids = []
        # One pass is enough: add_word returns the index of the word whether
        # it was just inserted or already known, so the file does not need
        # to be read a second time to look the ids up.
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                for word in line.split() + ['<eos>']:
                    ids.append(self.dictionary.add_word(word))

        # A single tensor from the flat list avoids one small tensor per
        # line and also copes with an empty file.
        return torch.tensor(ids, dtype=torch.int64)

# Utils

In [11]:
def batchify(data: Iterable, bsz:int):
    """Reshape a 1-D token stream into ``bsz`` parallel columns.

    The tail that does not fill a whole row of ``bsz`` elements is dropped.
    The stream is then cut into ``bsz`` consecutive chunks which become the
    columns of the result, and the result is moved to the module-level
    ``device``.

    NOTE(review): the annotation says Iterable, but the body needs the
    tensor methods ``size`` and ``view``.
    """
    usable = (data.size(0) // bsz) * bsz
    trimmed = data[:usable]
    columns = trimmed.view(bsz, -1).t().contiguous()
    return columns.to(device)

In [12]:
def repackage_hidden(h: torch.Tensor):
    """Detach hidden state from the graph that produced it.

    A tensor is returned detached; any other iterable is processed
    element by element and returned as a tuple.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(repackage_hidden(item) for item in h)
    return h.detach()

In [13]:
def get_batch(source: Iterable, i: int):
    """Cut one input/target pair out of ``source`` starting at row ``i``.

    The slice is at most ``bptt`` rows long (module-level setting) and is
    shortened near the end so that a target row exists for every input row.
    Targets are the inputs shifted by one row, flattened to 1-D.
    """
    stop = i + min(bptt, len(source) - 1 - i)
    inputs = source[i:stop]
    labels = source[i + 1:stop + 1].view(-1)
    return inputs, labels

In [14]:
def get_optimizer(type: str, model: nn.Module, lr: float):
    """Build an optimizer over all parameters of ``model``.

    Args:
        type: one of ``"SGD"``, ``"ASGD"`` or ``"Adam"``.
        model: module whose parameters are optimised.
        lr: learning rate.

    Returns:
        SGD / ASGD with ``weight_decay=1.2e-6``, or Adam with
        ``betas=(0.9, 0.99)``.

    Raises:
        ValueError: if ``type`` is not one of the supported names. Without
            this check the function fell through and returned ``None``,
            which only failed later at the first ``opt.step()``.
    """
    params = model.parameters()
    if type == "SGD":
        return torch.optim.SGD(params, lr=lr, weight_decay=1.2e-6)
    if type == "ASGD":
        return torch.optim.ASGD(params, lr=lr, weight_decay=1.2e-6)
    if type == "Adam":
        return torch.optim.Adam(params, lr=lr, betas=(0.9, 0.99))
    raise ValueError(
        "Unknown optimizer type {!r}; expected 'SGD', 'ASGD' or 'Adam'".format(type))

# Training and Testing

In [29]:
def evaluate(model: nn.Module, data_source: Iterable):
    """Return the average per-row negative log-likelihood on ``data_source``.

    Args:
        model: network exposing ``init_hidden(bsz)`` and returning
            ``(log_probs, hidden)`` when called with ``(data, hidden)``.
        data_source: batchified token ids; rows are time steps and columns
            are parallel streams.

    Returns:
        The loss of every ``bptt``-sized slice weighted by its number of
        rows, divided by the number of predicted rows.
    """
    # The model already emits log-probabilities, so NLLLoss is the matching
    # criterion (CrossEntropyLoss would apply log_softmax a second time).
    criterion = nn.NLLLoss()
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    # Take the batch size from the data instead of notebook globals, so the
    # function also works when the caller keeps its corpus and batch size in
    # local variables.
    hidden = model.init_hidden(data_source.size(1))
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            hidden = repackage_hidden(hidden)
            # Slices can be shorter than bptt at the end, hence the weighting.
            total_loss += len(data) * criterion(output, targets).item()

    return total_loss / (len(data_source) - 1)

In [16]:
def train(model: nn.Module,
          opt: any,
          train_data: Iterable,
          epoch: int):
    """Train ``model`` for one pass over ``train_data`` and print a summary.

    Args:
        model: network exposing ``init_hidden(bsz)`` and returning
            ``(log_probs, hidden, rnn_output, dropped_rnn_output)`` when
            called with ``return_h=True``.
        opt: optimizer that owns the model parameters; it performs the only
            weight update.
        train_data: batchified token ids; rows are time steps and columns
            are parallel streams.
        epoch: epoch number, used only in the printed summary.

    Returns:
        Mean negative log-likelihood per batch, without the regularisation
        terms (the same value that is printed).

    Uses the module-level settings ``bptt``, ``clip``, ``alpha`` and ``beta``.
    """
    # Turn on training mode which enables dropout.
    model.train()
    criterion = nn.NLLLoss()
    total_loss = 0.
    n_batches = 0

    # Batch size comes from the data rather than from a notebook global.
    hidden = model.init_hidden(train_data.size(1))

    pbar = tqdm(range(0, train_data.size(0) - 1, bptt), position=0, leave=False)
    for i in pbar:
        data, targets = get_batch(train_data, i)

        model.zero_grad()
        opt.zero_grad()

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        output, hidden, rnn_hs, dropped_rnn_hs = model(data, hidden, return_h = True)

        raw_loss = criterion(output, targets)
        loss = raw_loss

        # Activation Regularization: penalise large (dropped) LSTM outputs.
        # The model returns one tensor whose first axis is time, so the
        # penalty covers the whole tensor; slicing with [-1:] would only
        # regularise the final time step.
        if alpha:
            loss = loss + alpha * dropped_rnn_hs.pow(2).mean()
        # Temporal Activation Regularization (slowness): penalise changes
        # between consecutive time steps, i.e. differences along axis 0.
        # Skipped for a one-step slice, where there is no pair to compare.
        if beta and rnn_hs.size(0) > 1:
            loss = loss + beta * (rnn_hs[1:] - rnn_hs[:-1]).pow(2).mean()

        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        # The optimizer is the only thing that updates the weights. A second
        # manual SGD step here would apply every gradient twice, using a
        # learning rate that never follows the annealing schedule.
        opt.step()

        # Track the plain NLL so the reported perplexity is not inflated by
        # the regularisation terms.
        total_loss += raw_loss.item()
        n_batches += 1

    # Average over the number of batches actually processed.
    mean_loss = total_loss / max(n_batches, 1)
    print('| epoch {:3d} | lr {:02.2f} | loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, opt.param_groups[0]['lr'], mean_loss, math.exp(mean_loss)))
    return mean_loss

In [17]:
def test(model, test_data):
  test_loss = evaluate(model, test_data)
  print('=' * 89)
  print('| Test | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
  print('=' * 89)

# Run experiments

In [26]:
def main(lr:float,
         batch_size:int,
         eval_batch_size:int,
         epochs:int,
         pretrained:bool = True,
         training:bool = True,
         n:int = 3):

  '''
  Train (optionally) and then test the language model.

   Args:
      lr: initial learning rate
      batch_size: batch size for the training data
      eval_batch_size: batch size for the evaluation and testing data
      epochs: number of epochs to train the model
      pretrained: resume from the checkpoints in the models folder; only
        consulted when training is True. best_model.pt supplies the
        validation loss to beat, last_model.pt the weights to continue from.
      training: true to train the model, false to only test best_model.pt
      n: number of successive epochs without validation improvement after
        which the learning rate is divided by 4

   Side effects:
      Writes best_model.pt and last_model.pt under models_path while
      training, and prints progress and the final test result.
  '''

  corpus = Corpus('/content/ptbdataset')
  train_data = batchify(corpus.train, batch_size)
  val_data = batchify(corpus.valid, eval_batch_size)
  # Built here so the final test does not depend on a variable that only
  # exists after some other notebook cell has been run.
  test_data = batchify(corpus.test, eval_batch_size)
  ntokens = len(corpus.dictionary)

  best_path = models_path + '/best_model.pt'
  last_path = models_path + '/last_model.pt'

  model = RNNModel(ntokens, emsize, nhid, nlayers, dropout, tied).to(device)
  opt = get_optimizer("SGD", model, lr)
  no_imp = n
  best_val_loss = None

  if training:
    if pretrained:
      # map_location lets a checkpoint saved on GPU be loaded on a CPU runtime.
      model.load_state_dict(torch.load(best_path, map_location=device))
      best_val_loss = evaluate(model, val_data)
      print("Best loss reached:", best_val_loss)
      # The weights are loaded in place, so `opt` keeps pointing at them.
      model.load_state_dict(torch.load(last_path, map_location=device))

    for epoch in tqdm(range(1, epochs+1), position=0, leave=False):
      epoch_start_time = time.time()
      train(model, opt, train_data, epoch)
      val_loss = evaluate(model, val_data)
      print('-' * 89)
      print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
              'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                         val_loss, math.exp(val_loss)))
      print('-' * 89)

      # Save the model if the validation loss is the best we've seen so far.
      # `is None` rather than truthiness, so a loss of 0.0 is not mistaken
      # for "no best loss yet".
      if best_val_loss is None or val_loss < best_val_loss:
        torch.save(model.state_dict(), best_path)
        best_val_loss = val_loss
        no_imp = n
      else:
        # Anneal the learning rate if no improvement has been seen in the validation dataset.
        no_imp -= 1
        if no_imp == 0:
          no_imp = n
          lr /= 4
          for group in opt.param_groups:
            group['lr'] = lr

      torch.save(model.state_dict(), last_path)

  model.load_state_dict(torch.load(best_path, map_location=device))
  model.rnn.flatten_parameters()
  test(model, test_data)

In [20]:
#@title Running parameters
#@markdown Running parameters

# These values are passed to main() by the cell that follows.
# Initial learning rate handed to the SGD optimizer.
lr = 20  #@param {type: "number"}
# Number of training epochs.
epochs = 40 #@param {type: "number"}
# Number of parallel streams the training data is split into.
batch_size = 20 #@param {type: "number"}
# Same, for the validation and test data.
eval_batch_size = 10 #@param {type: "number"}
# False skips training and only tests the saved best model.
training = True #@param {type: "boolean"}
# True resumes from the saved checkpoints (only used when training is True).
pretrained = False #@param {type: "boolean"}
# Epochs without validation improvement before the learning rate is reduced.
n = 3 #@param {type: "number"}
#@markdown ---


In [None]:
# Launch the experiment with the values chosen in the form cell above.
if __name__ == '__main__':
  main(lr=lr,
       batch_size=batch_size,
       eval_batch_size=eval_batch_size,
       epochs=epochs,
       pretrained=pretrained,
       training=training,
       n=n)

In [32]:
# Rebuild the vocabulary, an empty model and the batchified test split.
corpus = Corpus('/content/ptbdataset')
ntokens = len(corpus.dictionary)
model = RNNModel(ntokens, emsize, nhid, nlayers, dropout, tied).to(device)
test_data = batchify(corpus.test, eval_batch_size)

# Load each saved checkpoint (model1.pt ... model6.pt) into the same model
# and report its test loss and perplexity.
for model_idx in range(1, 7):
    print("Model{}".format(model_idx))
    # map_location lets a checkpoint saved on GPU be loaded on a CPU runtime.
    state = torch.load('{}/model{}.pt'.format(models_path, model_idx), map_location=device)
    model.load_state_dict(state)
    model.rnn.flatten_parameters()
    test(model, test_data)



Model1
| Test | test loss  4.35 | test ppl    77.33
Model2
| Test | test loss  4.35 | test ppl    77.47
Model3
| Test | test loss  4.34 | test ppl    76.68
Model4
| Test | test loss  4.32 | test ppl    75.47
Model5
| Test | test loss  4.31 | test ppl    74.52
Model6
| Test | test loss  4.34 | test ppl    77.04
