In [28]:
# Colab-only setup: mount Google Drive at /content/gdrive so that the dataset
# directory used by the rest of the notebook is reachable from the runtime.
from google.colab import drive 
drive.mount('/content/gdrive')

In [29]:
cd /content/gdrive/Shareddrives/CS726/Vaidehi/Lambada

In [32]:
!pip install nengolib
import torch
from torch import nn
import time
import math
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import pickle
import os

class Arguments:
    """Static run configuration; the notebook reads it as ``args.<name>``."""

    # ---- data ----------------------------------------------------------
    # Directory holding the vocabulary, the raw text and the pickled corpus cache.
    data = "/content/gdrive/Shareddrives/CS726/Vaidehi/Lambada"
    corpus = False  # set this as true while running for the first time (forces a corpus rebuild)
    seq_len = 100       # training window length; eval lines are left-padded up to it
    validseqlen = 50    # stride between the starts of consecutive training windows
    batch_size = 20     # number of parallel token streams produced by batchify

    # ---- model ---------------------------------------------------------
    # NOTE(review): the model cell passes literal sizes; confirm whether
    # these three values are still meant to drive it.
    emsize = 500
    nhid = 500
    tied = True

    # ---- optimisation --------------------------------------------------
    optim = "SGD"       # name of the torch.optim class to instantiate
    lr = 0.4            # NOTE(review): the optimizer cell defines its own `lr`; confirm which one is intended
    clip = 0.4          # gradient-norm clipping threshold (values <= 0 disable clipping)
    epochs = 1

    # ---- runtime -------------------------------------------------------
    cuda = False        # move data/model to the GPU when True
    seed = 1111         # seed passed to torch.manual_seed
    log_interval = 100  # number of batches between progress reports

# Instantiate the run configuration and seed PyTorch so runs are repeatable.
args = Arguments()
torch.manual_seed(args.seed)

# A GPU is visible but the configuration asks for CPU execution: tell the user.
if torch.cuda.is_available() and not args.cuda:
    print("WARNING: You have a CUDA device, restart GPU runtime")


In [34]:
def data_generator(args):
    """Load (or build and cache) the corpus and return the three data splits.

    The corpus is read from the pickle ``<args.data>/corpus`` when that file
    exists and ``args.corpus`` is falsy; otherwise it is rebuilt from the raw
    text and the pickle is (re)written.

    Args:
        args: configuration object providing ``data``, ``corpus``,
            ``batch_size`` and ``seq_len`` (``batchify`` additionally reads
            ``cuda``).

    Returns:
        ``(train_data, val_data, test_data, corpus)`` where ``train_data`` is
        the result of ``batchify`` and ``val_data`` / ``test_data`` are lists
        of token-id lists, each left-padded with index 0 up to
        ``args.seq_len``. Lines already longer than ``args.seq_len`` are kept
        unchanged (no truncation).
    """
    cache_path = args.data + "/corpus"

    if os.path.exists(cache_path) and not args.corpus:
        # SECURITY: unpickling executes arbitrary code. Only load a cache file
        # that this notebook wrote itself.
        with open(cache_path, 'rb') as cache_file:
            corpus = pickle.load(cache_file)
    else:
        print("Creating Corpus...")
        corpus = Corpus(args.data + "/lambada-vocab-2.txt", args.data)
        with open(cache_path, 'wb') as cache_file:
            pickle.dump(corpus, cache_file)

    def left_pad(line):
        # A non-positive pad count yields an empty prefix, so long lines pass
        # through untouched.
        # NOTE(review): index 0 is the first vocabulary entry, not a dedicated
        # padding token; confirm this is intended.
        return [0] * (args.seq_len - len(line)) + line

    train_data = batchify(corpus.train, args.batch_size, args)
    val_data = [left_pad(line) for line in corpus.valid]
    test_data = [left_pad(line) for line in corpus.test]
    return train_data, val_data, test_data, corpus


class Dictionary(object):
    """Two-way mapping between words and consecutive integer ids."""

    def __init__(self):
        self.word2idx = {}   # word -> id
        self.idx2word = []   # id -> word (position in the list is the id)

    def add_word(self, word):
        """Return the id of ``word``, registering it with the next free id if new."""
        index = self.word2idx.get(word)
        if index is None:
            index = len(self.idx2word)
            self.idx2word.append(word)
            self.word2idx[word] = index
        return index

    def __len__(self):
        """Number of distinct words registered so far."""
        return len(self.idx2word)


class Corpus(object):
    """Vocabulary plus the tokenised train / validation / test splits.

    Attributes:
        dictionary: word <-> id mapping filled from the vocabulary file.
        train: 1-D ``torch.LongTensor`` with the ids of every ``.txt`` file
            found under ``<path>/train-novels/<subdir>/``.
        valid, test: lists with one list of token ids per line of the LAMBADA
            development / test plain-text files.
    """

    # Fragments that appear in the text but not in the vocabulary, mapped to
    # the vocabulary word used in their place.
    _CONTRACTIONS = {
        "n't": "not",
        "'s": "is",
        "'re": "are",
        "'ve": "have",
        "wo": "will",
    }

    def __init__(self, dict_path, path):
        self.dictionary = Dictionary()
        self.prep_dict(dict_path)
        self.train = torch.LongTensor(self.tokenize(os.path.join(path, 'train-novels')))
        self.valid = self.tokenize(os.path.join(path, 'lambada_development_plain_text.txt'), eval=True)
        self.test = self.tokenize(os.path.join(path, 'lambada_test_plain_text.txt'), eval=True)

    def prep_dict(self, dict_path):
        """Fill ``self.dictionary`` from a vocabulary file.

        The first whitespace-separated field of every non-blank line is taken
        as a word; ``<eos>`` is added afterwards if the file did not list it.
        """
        assert os.path.exists(dict_path)

        with open(dict_path, 'r') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Blank line: previously crashed with IndexError.
                    continue
                self.dictionary.add_word(fields[0])

        # add_word leaves the dictionary untouched when the word already exists.
        self.dictionary.add_word("<eos>")

        # Report distinct words rather than lines read, so duplicated
        # vocabulary lines do not inflate the figure.
        print("The dictionary captured a vocabulary of size {0}.".format(len(self.dictionary)))

    def tokenize(self, path, eval=False):
        """Tokenise one ``.txt`` file, or every ``.txt`` file one level below a folder.

        Args:
            path: a path ending in ``.txt`` (single file) or a folder whose
                sub-folders contain ``.txt`` files.
            eval: forwarded to ``_tokenize_file``; selects per-line output.

        Returns:
            The token ids (flat list, or list of per-line lists when ``eval``).
        """
        assert os.path.exists(path)

        if path.endswith(".txt"):
            ids, token, misses = self._tokenize_file(path, eval=eval)
        else:
            ids, token, misses = [], 0, 0
            # Sorted traversal keeps the corpus order reproducible across
            # machines; os.listdir order is arbitrary.
            for subdir in sorted(os.listdir(path)):
                subdir_path = path + "/" + subdir
                if not os.path.isdir(subdir_path):
                    # Stray file next to the sub-folders: nothing to descend into.
                    continue
                for filename in sorted(os.listdir(subdir_path)):
                    if not filename.endswith(".txt"):
                        continue
                    full_path = "{0}/{1}/{2}".format(path, subdir, filename)
                    delta_ids, delta_token, delta_miss = self._tokenize_file(full_path, eval=eval)
                    ids += delta_ids
                    token += delta_token
                    misses += delta_miss

        print(token, misses)
        return ids

    def _tokenize_file(self, path, eval=False):
        """Convert one text file to token ids.

        Each word is first mapped through ``_CONTRACTIONS``; if it is still
        not in the vocabulary, punctuation is stripped and the lookup is
        retried; words that remain unknown are dropped and counted as misses.

        Args:
            path: text file to read.
            eval: when False an ``<eos>`` token is appended to every line and
                all ids are concatenated into one flat list; when True no
                ``<eos>`` is added and one id list per line is returned.

        Returns:
            ``(ids, token, misses)``: the ids, the number of ids kept and the
            number of words dropped.
        """
        # Imported here because the notebook's import cell does not provide
        # `re`; without it the punctuation fallback raised NameError.
        import re

        word2idx = self.dictionary.word2idx
        ids = []
        token = 0
        misses = 0
        with open(path, 'r') as f:
            for line in f:
                words = line.split()
                if not eval:
                    words.append('<eos>')

                line_ids = []
                for word in words:
                    word = self._CONTRACTIONS.get(word, word)
                    if word not in word2idx:
                        word = re.sub(r'[^\w\s]', '', word)
                    if word not in word2idx:
                        misses += 1
                        continue
                    line_ids.append(word2idx[word])
                    token += 1

                if eval:
                    ids.append(line_ids)
                else:
                    ids += line_ids
        return ids, token, misses


def batchify(data, batch_size, args):
    """Fold a 1-D token tensor into ``batch_size`` contiguous rows.

    Trailing tokens that do not fill a complete column are discarded, so the
    result has shape ``(batch_size, data.size(0) // batch_size)``. The size is
    printed, and the tensor is moved to the GPU when ``args.cuda`` is set.
    """
    usable_tokens = (data.size(0) // batch_size) * batch_size
    rows = data[:usable_tokens].view(batch_size, -1)
    print(rows.size())
    return rows.cuda() if args.cuda else rows


def get_batch(source, i, args, seq_len=None, evaluation=False):
    """Cut one training window and its next-token target out of ``source``.

    Args:
        source: 2-D tensor indexed as ``[stream, position]``.
        i: column at which the window starts.
        args: configuration object; ``args.seq_len`` is the default window length.
        seq_len: optional window length overriding ``args.seq_len``.
        evaluation: kept for backward compatibility and ignored. It used to be
            forwarded as ``Variable(volatile=...)``, which PyTorch removed;
            wrap the caller in ``torch.no_grad()`` instead.

    Returns:
        ``(data, target)``: ``data`` is ``source[:, i:i+seq_len]`` and
        ``target`` is the single column that immediately follows the window,
        shaped ``(streams, 1)`` (un-flattened).
    """
    # Clamp so that the column right after the window still exists.
    seq_len = min(seq_len if seq_len else args.seq_len, source.size(1) - 1 - i)
    data = source[:, i:i + seq_len]
    # The model predicts from its state after consuming the whole window, so
    # the label is the token after the window. The previous `i+1:i+2` slice
    # pointed at the second token of the input itself.
    target = source[:, i + seq_len:i + seq_len + 1]
    return data, target


In [35]:
# Load the cached corpus (or build it) and materialise the three data splits.
train_data, val_data, test_data, corpus = data_generator(args)

# Vocabulary size; evaluate() uses it to reshape the model output.
n_words = len(corpus.dictionary)
print("Total # of words: {0}".format(n_words))



In [61]:
class LMUTagger(nn.Module):
    """Embeds a token sequence, runs a memory-unit cell over it and classifies the final state.

    Args:
        units: size of the cell's hidden state.
        order: size of the cell's memory state.
        theta: window parameter forwarded to the cell.
        vocab_size: number of rows of the embedding table.
        embedding_dim: embedding width (the cell's input size).
        tagset_size: number of output classes.
    """

    def __init__(self, units, order, theta, vocab_size, embedding_dim, tagset_size):
        super(LMUTagger, self).__init__()

        self.theta = theta
        self.units = units
        self.order = order

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.lmu_cell = LMUCell(input_size=embedding_dim, hidden_size=units, memory_size=order, theta=theta, nonlinearity='sigmoid', A_learnable = True, B_learnable = True)

        # uncomment the following line to use ASSVMU
        # self.lmu_cell = ASSVMU(input_size = embedding_dim, hidden_size = units, memory_size = order, theta = theta, discretizer = 'zoh',nonlinearity='sigmoid', A_learnable = False, B_learnable = False, activate=False)

        # uncomment the following line to use BMU
        # self.lmu_cell = BMU(input_size = embedding_dim, hidden_size = units, memory_size = order, theta = theta, matrix_type='pb', discretizer = 'zoh',nonlinearity='sigmoid', A_learnable = False, B_learnable = False)

        self.dense = nn.Linear(
            in_features=units,
            out_features=tagset_size,
        )

    def forward(self, inputs):
        """Return class scores computed from the hidden state after the last step.

        Args:
            inputs: integer token ids of shape ``(batch_size, n_steps)``.

        Returns:
            Tensor of shape ``(batch_size, tagset_size)``.
        """
        embeds = self.word_embeddings(inputs)  # (batch_size, n_steps, embedding_dim)

        # Allocate the recurrent state next to the activations (same device
        # and dtype) with an explicit batch dimension. The previous version
        # consulted the global `args.cuda` and relied on broadcasting a
        # batch-1 state, which breaks whenever the flag and the model's real
        # device disagree.
        batch_size = embeds.shape[0]
        h = embeds.new_zeros(batch_size, self.units)
        c = embeds.new_zeros(batch_size, self.order)

        for step in range(embeds.shape[1]):
            h, c = self.lmu_cell(embeds[:, step, :], (h, c))

        # make a prediction based on the final hidden state of the LMU
        return self.dense(h)


class LSTMTagger(nn.Module):
    """LSTM baseline: embed the tokens, run an LSTM and classify its last output.

    Args:
        embedding_dim: embedding width (LSTM input size).
        hidden_dim: LSTM hidden size.
        vocab_size: number of rows of the embedding table.
        tagset_size: number of output classes.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Score the next token for every sequence in the batch.

        Args:
            sentence: integer token ids of shape ``(batch_size, n_steps)``.

        Returns:
            Tensor of shape ``(batch_size, 1, tagset_size)``.
        """
        embeds = self.word_embeddings(sentence)  # (batch, steps, emb)

        # nn.LSTM (batch_first=False) expects (steps, batch, emb). This must
        # be a transpose: the previous `.view(steps, -1, 500)` only
        # relabelled the memory, interleaving tokens of different sequences,
        # and hard-coded the embedding width.
        time_major = embeds.transpose(0, 1).contiguous()
        lstm_out, _ = self.lstm(time_major)  # (steps, batch, hidden)

        # Last time step, whatever the sequence length (was the literal 99).
        last_output = lstm_out[-1]  # (batch, hidden)

        # Keep the historical (batch, 1, tagset) output shape for callers.
        return self.hidden2tag(last_output).unsqueeze(1)


In [57]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from nengolib.signal import Identity, cont2discrete
from nengolib.synapses import LegendreDelay
import numpy as np


class LMUCell(nn.Module):
    """Single-step memory-unit cell driven by a discretised Legendre-delay system.

    Per step, with input ``x``, hidden state ``h`` and memory ``m``::

        u     = x @ ex.T + h @ eh.T + m @ em.T
        new_m = m @ AT.T + u @ BT.T
        new_h = act(x @ Wx.T + h @ Wh.T + new_m @ Wm.T)

    Args:
        input_size: width of ``x``.
        hidden_size: width of ``h``.
        memory_size: width of ``m`` (order of the delay system).
        theta: window length passed to ``LegendreDelay``.
        nonlinearity: ``'sigmoid'``, ``'tanh'``; any other value selects ReLU.
        A_learnable, B_learnable: register ``AT`` / ``BT`` as trainable
            parameters when True, as fixed buffers otherwise.
    """

    def __init__(self, input_size, hidden_size, memory_size, theta, nonlinearity='sigmoid', A_learnable = True, B_learnable = True):
        super(LMUCell, self).__init__()

        ### SIZE
        self.k = input_size
        self.n = hidden_size
        self.d = memory_size

        ### PARAMETERS
        self.Wx = nn.Parameter(torch.Tensor(self.n,self.k))
        self.Wh = nn.Parameter(torch.Tensor(self.n,self.n))
        self.Wm = nn.Parameter(torch.Tensor(self.n,self.d))
        self.ex = nn.Parameter(torch.Tensor(1,self.k))
        self.eh = nn.Parameter(torch.Tensor(1,self.n))
        self.em = nn.Parameter(torch.Tensor(1,self.d))

        ### A,B MATRIX: continuous Legendre-delay system, discretised with zero-order hold
        realizer = Identity()
        self._realizer_result = realizer(LegendreDelay(theta=theta, order=self.d))
        self._ss = cont2discrete(self._realizer_result.realization, dt=1., method='zoh')
        self._A = self._ss.A
        self._B = self._ss.B
        self._register_matrix('AT', self._A, A_learnable)
        self._register_matrix('BT', self._B, B_learnable)

        ### NON-LINEARITY
        self.nl = nonlinearity
        if self.nl == 'sigmoid':
            self.act = nn.Sigmoid()
        elif self.nl == 'tanh':
            self.act = nn.Tanh()
        else:
            self.act = nn.ReLU()

        ### INITIALIZATION
        torch.nn.init.xavier_normal_(self.Wm)
        torch.nn.init.xavier_normal_(self.Wx)
        torch.nn.init.xavier_normal_(self.Wh)
        torch.nn.init.zeros_(self.em)
        torch.nn.init.uniform_(self.ex, -np.sqrt(3/self.d), np.sqrt(3/self.d))
        torch.nn.init.uniform_(self.eh, -np.sqrt(3/self.d), np.sqrt(3/self.d))

    def _register_matrix(self, name, value, learnable):
        """Attach a state-space matrix as a Parameter (learnable) or a buffer (fixed)."""
        tensor = torch.Tensor(value)
        if learnable:
            setattr(self, name, nn.Parameter(tensor))
        else:
            # A plain tensor attribute is invisible to the module, so
            # `.cuda()` / `.to()` left it behind. A buffer follows the module.
            # `persistent=False` keeps the state_dict keys as before, so
            # existing checkpoints still load.
            self.register_buffer(name, tensor, persistent=False)

    def forward(self,x,hm):
        '''
        Advance the cell by one step.

        x shape: (batch_size, input_size)
        hm: tuple (h, m) with
            h shape: (batch_size, hidden_size)
            m shape: (batch_size, memory_size)
        Returns (new_h, new_m) with the same shapes as (h, m).
        '''
        h,m = hm
        # scalar (per sample) signal written into the memory
        u = F.linear(x,self.ex)+F.linear(h,self.eh)+F.linear(m,self.em)
        new_m = F.linear(m,self.AT) + F.linear(u,self.BT)
        new_h = self.act(F.linear(x,self.Wx)+F.linear(h,self.Wh)+F.linear(new_m,self.Wm))

        return new_h,new_m


import torch
import torch.nn as nn
import torch.nn.functional as F
from nengolib.signal import Identity, cont2discrete
from nengolib.synapses import LegendreDelay
import numpy as np

class ASSVMU(nn.Module):
    """Memory-unit cell followed by a fully connected layer on the hidden state.

    Co-invented by Abhishek, Syomantak, Siddharth, Vaidehi, Mithilesh
    (ASSVM + MU = ASSVMU).

    Usage::

        ASSVMU(input_size, hidden_size, memory_size, theta, discretizer='zoh',
               nonlinearity='sigmoid', A_learnable=False, B_learnable=False,
               activate=False)

    ``nonlinearity`` is ``'sigmoid'`` or ``'tanh'``; anything else selects
    ReLU. With ``activate=True`` the same non-linearity is applied again after
    the fully connected layer; otherwise that output is passed through
    unchanged.
    """

    def __init__(self, input_size, hidden_size, memory_size, theta, discretizer='zoh', nonlinearity='sigmoid',
                 A_learnable=False, B_learnable=False, activate=False):
        super(ASSVMU, self).__init__()

        # sizes: input (k), hidden (n), memory (d)
        self.k, self.n, self.d = input_size, hidden_size, memory_size

        # trainable weights, allocated uninitialised and filled in further down
        weight_shapes = (
            ('Wx', (self.n, self.k)),
            ('Wh', (self.n, self.n)),
            ('Wm', (self.n, self.d)),
            ('ex', (1, self.k)),
            ('eh', (1, self.n)),
            ('em', (1, self.d)),
        )
        for attr, shape in weight_shapes:
            setattr(self, attr, nn.Parameter(torch.Tensor(*shape)))

        # continuous-time (A, B, C, D) system of order d, then discretisation
        degrees = np.arange(self.d, dtype=np.float64)
        scale = (2 * degrees + 1)[:, None] / theta
        row, col = degrees[:, None], degrees[None, :]
        A = np.where(row < col, -1, (-1.0) ** (row - col + 1)) * scale
        B = (-1.0) ** degrees[:, None] * scale
        C = np.ones((1, self.d))
        D = np.zeros((1,))
        self._ss = cont2discrete((A, B, C, D), dt=0.01, method=discretizer)
        self._A = self._ss.A
        self._B = self._ss.B

        # hidden-state non-linearity
        self.nl = nonlinearity
        activation_cls = nn.ReLU
        for label, candidate in (('sigmoid', nn.Sigmoid), ('tanh', nn.Tanh)):
            if self.nl == label:
                activation_cls = candidate
                break
        self.act = activation_cls()

        # post-processing layer; built before the weight initialisation below
        # so the random-number stream is consumed in the established order
        self.fc = nn.Linear(self.n, self.n)
        self.nn_act = self.act if activate else nn.LeakyReLU(1.0)  # slope 1.0 == identity

        # weight initialisation (order matters for reproducibility)
        for weight in (self.Wm, self.Wx, self.Wh):
            torch.nn.init.xavier_normal_(weight)
        torch.nn.init.zeros_(self.em)
        bound = np.sqrt(3/self.d)
        for encoder in (self.ex, self.eh):
            torch.nn.init.uniform_(encoder, -bound, bound)

        # discretised matrices: buffers by default, parameters when learnable
        for attr, matrix, learnable in (('AT', self._A, A_learnable), ('BT', self._B, B_learnable)):
            self.register_buffer(attr, torch.Tensor(matrix))
            if learnable:
                setattr(self, attr, nn.Parameter(getattr(self, attr)))

    def forward(self, x, hm):
        '''
        Advance the cell by one step.

        x shape: (batch_size, input_size)
        hm: tuple (h, m) with
            h shape: (batch_size, hidden_size)
            m shape: (batch_size, memory_size)
        Returns (new_h, new_m).
        '''
        hidden, memory = hm
        write_signal = F.linear(x, self.ex) + F.linear(hidden, self.eh) + F.linear(memory, self.em)
        next_memory = F.linear(memory, self.AT) + F.linear(write_signal, self.BT)
        pre_activation = F.linear(x, self.Wx) + F.linear(hidden, self.Wh) + F.linear(next_memory, self.Wm)
        next_hidden = self.nn_act(self.fc(self.act(pre_activation)))
        return next_hidden, next_memory


import torch
import torch.nn as nn
import torch.nn.functional as F
from nengolib.signal import Identity,cont2discrete
from nengolib.synapses import LegendreDelay
import numpy as np
from scipy.special import comb


class BMU(nn.Module):
    """Memory-unit cell whose state-space matrices are selected by ``matrix_type``.

    Args:
        input_size: width of the input ``x``.
        hidden_size: width of the hidden state ``h``.
        memory_size: width of the memory ``m`` (order of the system).
        theta: window length used to scale the continuous-time matrices.
        matrix_type: ``'pl'`` (Legendre Memory Unit), ``'p'`` (Pade Memory
            Unit) or ``'pb'`` (Bernstein Memory Unit).
        discretizer: method name forwarded to ``cont2discrete``.
        nonlinearity: ``'sigmoid'``, ``'tanh'``; any other value selects ReLU.
        A_learnable, B_learnable: turn ``AT`` / ``BT`` into trainable
            parameters; they are buffers otherwise.

    Raises:
        ValueError: if ``matrix_type`` is not one of the supported values.
    """

    def __init__(self, input_size, hidden_size, memory_size, theta, matrix_type='pb',discretizer = 'zoh',nonlinearity='sigmoid', A_learnable = False, B_learnable = False):
        super(BMU, self).__init__()

        builders = {
            'pl': self._legendre_system,
            'p': self._pade_system,
            'pb': self._bernstein_system,
        }
        if matrix_type not in builders:
            # Previously an unknown value fell through every branch and only
            # surfaced later as a missing `_A` attribute.
            raise ValueError(
                "unknown matrix_type {0!r}; expected one of {1}".format(matrix_type, sorted(builders)))

        ### SIZE
        self.k = input_size
        self.n = hidden_size
        self.d = memory_size

        ### PARAMETERS
        self.Wx = nn.Parameter(torch.Tensor(self.n,self.k))
        self.Wh = nn.Parameter(torch.Tensor(self.n,self.n))
        self.Wm = nn.Parameter(torch.Tensor(self.n,self.d))
        self.ex = nn.Parameter(torch.Tensor(1,self.k))
        self.eh = nn.Parameter(torch.Tensor(1,self.n))
        self.em = nn.Parameter(torch.Tensor(1,self.d))

        ### A,B MATRIX: continuous-time system, then discretisation
        A, B = builders[matrix_type](self.d, theta)
        C = np.ones((1, self.d))
        D = np.zeros((1,))
        self._ss = cont2discrete((A, B, C, D), dt=0.01, method=discretizer)
        self._A = self._ss.A
        self._B = self._ss.B

        ### NON-LINEARITY
        self.nl = nonlinearity
        if self.nl == 'sigmoid':
            self.act = nn.Sigmoid()
        elif self.nl == 'tanh':
            self.act = nn.Tanh()
        else:
            self.act = nn.ReLU()

        ### INITIALIZATION
        torch.nn.init.xavier_normal_(self.Wm)
        torch.nn.init.xavier_normal_(self.Wx)
        torch.nn.init.xavier_normal_(self.Wh)
        torch.nn.init.zeros_(self.em)
        torch.nn.init.uniform_(self.ex, -np.sqrt(3/self.d), np.sqrt(3/self.d))
        torch.nn.init.uniform_(self.eh, -np.sqrt(3/self.d), np.sqrt(3/self.d))

        ### STATE-SPACE MATRICES: buffers follow .cuda()/.to(); parameters when learnable
        self.register_buffer('AT', torch.Tensor(self._A))
        self.register_buffer('BT', torch.Tensor(self._B))
        if A_learnable:
            self.AT = nn.Parameter(self.AT)
        if B_learnable:
            self.BT = nn.Parameter(self.BT)

    @staticmethod
    def _legendre_system(order, theta):
        """Continuous-time (A, B) used for matrix_type 'pl'."""
        Q = np.arange(order, dtype=np.float64)
        R = (2 * Q + 1)[:, None] / theta
        col, row = np.meshgrid(Q, Q)
        A = np.where(row < col, -1, (-1.0) ** (row - col + 1)) * R
        B = (-1.0) ** Q[:, None] * R
        return A, B

    @staticmethod
    def _pade_system(order, theta):
        """Continuous-time (A, B) used for matrix_type 'p'."""
        Q = np.arange(order, dtype=np.float64)
        V = (order+Q+1)*(order-Q)/(Q+1)/theta
        A = np.zeros([order,order],dtype=np.float64)
        B = np.zeros([order,1],dtype=np.float64)
        A[0,:] = -V[0]
        A[1:order,0:order-1] = np.diag(V[1:order])
        B[0] = V[0]
        return A, B

    @staticmethod
    def _bernstein_system(order, theta):
        """Continuous-time (A, B) used for matrix_type 'pb'.

        Builds the matrices M and M_inv entry by entry, squashes both with
        ``10*tanh(./10)`` and returns ``(M @ A_leg @ M_inv, M @ B_leg)`` where
        ``(A_leg, B_leg)`` is the 'pl' system.
        """
        A_leg, B_leg = BMU._legendre_system(order, theta)

        M = np.zeros([order,order],dtype=np.float64)
        M_inv = np.zeros([order,order],dtype=np.float64)
        n = order-1  # degree of polynomial
        for j in range(0,n+1):
            for k in range(0,n+1):
                acc = 0.0
                for i in range(max(0,j+k-n), min(j,k)+1):
                    acc = acc+((-1.0)**(k+i))*(comb(k,i)**2)*comb(n-k,j-i)
                M[j,k] = acc/comb(n,j)

                acc = 0.0
                for i in range(0,j+1):
                    acc = acc+(-1.0)**(j+i)*comb(j,i)**2/comb(n+j,k+i)
                M_inv[j,k] = (2*j+1)/(n+j+1)*comb(n,k)*acc

        # NOTE(review): M and M_inv are no longer exact inverses after this
        # squashing; presumably it bounds large entries. Confirm intent.
        M = 10*np.tanh(M/10)
        M_inv = 10*np.tanh(M_inv/10)

        A = np.matmul(np.matmul(M,A_leg),M_inv)
        B = np.matmul(M,B_leg)
        return A, B

    def forward(self,x,hm):
        '''
        Advance the cell by one step.

        x shape: (batch_size, input_size)
        hm: tuple (h, m) with
            h shape: (batch_size, hidden_size)
            m shape: (batch_size, memory_size)
        Returns (new_h, new_m).
        '''

        h,m = hm
        u = F.linear(x,self.ex)+F.linear(h,self.eh)+F.linear(m,self.em)
        new_m = F.linear(m,self.AT) + F.linear(u,self.BT)
        new_h = self.act(F.linear(x,self.Wx)+F.linear(h,self.Wh)+F.linear(new_m,self.Wm))

        return new_h,new_m



In [58]:
# Build the tagger. The vocabulary-dependent sizes come from the loaded corpus
# (`n_words`) and the embedding width from the configuration, instead of
# repeating literals that silently go stale when the vocabulary changes.
model = LMUTagger(order=40, theta=35**2, embedding_dim=args.emsize, units=150,
                  vocab_size=n_words, tagset_size=n_words)
# model = LSTMTagger(embedding_dim=args.emsize, hidden_dim=150, vocab_size=n_words, tagset_size=n_words)

if args.cuda:
    model.cuda()

In [59]:
# Cross-entropy between the model's vocabulary scores and the target token id.
criterion = nn.CrossEntropyLoss()
# NOTE(review): this module-level learning rate (4) is what the optimizer and
# the training log use; Arguments.lr (0.4) is not read here. Confirm which
# value is intended.
lr = 4
# The optimizer class is looked up by name (args.optim) in torch.optim.
optimizer = getattr(optim, args.optim)(model.parameters(), lr=lr)


def evaluate(data_source):
    """Mean last-word cross-entropy of the global ``model`` over ``data_source``.

    For every sequence the model receives all tokens except the last one and
    is scored on predicting that last token.

    Args:
        data_source: sequence of token-id lists (one entry per example).

    Returns:
        The average loss over the scored examples, as a Python float.

    Raises:
        ValueError: if no example with at least two tokens is available.
    """
    model.eval()
    total_loss = 0.0
    scored = 0
    with torch.no_grad():
        for line in data_source:
            if len(line) < 2:
                # Need at least one context token plus the target.
                continue
            tokens = torch.LongTensor(line)
            # Withhold the target from the input. Feeding the full line (as
            # before) lets the model read the word it is asked to predict.
            data = tokens[:-1].view(1, -1)
            target = tokens[-1:].view(-1)

            if args.cuda:
                data, target = data.cuda(), target.cuda()

            output = model(data)
            # Flatten to (examples, classes) using the model's own output
            # width; this handles both (1, C) and (1, 1, C) outputs.
            logits = output.contiguous().view(-1, output.size(-1))
            total_loss += criterion(logits, target).item()
            scored += 1

    if scored == 0:
        raise ValueError("evaluate() needs at least one sequence with two or more tokens")
    return total_loss / scored


def train(start=147000):
    """Run one pass over ``train_data`` and update the global ``model`` in place.

    Windows start every ``args.validseqlen`` columns. Every
    ``args.log_interval`` batches the running loss is printed and a rolling
    checkpoint is written to ``weights/model_weights_LMU_learnable``.

    Reads the globals ``model``, ``optimizer``, ``criterion``, ``train_data``
    and ``args``, plus ``epoch`` and ``lr`` for the progress line (both must
    exist before the first report).

    Args:
        start: column of ``train_data`` at which the pass begins. The default
            keeps the offset that used to be hard-coded.
            NOTE(review): it looks like a resume point; pass 0 to train on
            the whole stream.

    Raises:
        ValueError: if ``args.validseqlen`` exceeds ``args.seq_len``.
    """
    # Configuration check, done once instead of on every batch.
    eff_history = args.seq_len - args.validseqlen
    if eff_history < 0:
        raise ValueError("Valid sequence length must be smaller than sequence length!")

    model.train()
    checkpoint_dir = "weights"
    last_col = train_data.size(1) - 1
    total_loss = 0
    batches_since_log = 0
    start_time = time.time()
    for batch_idx, i in enumerate(range(start, last_col, args.validseqlen)):
        if i + eff_history >= last_col:
            continue
        data, targets = get_batch(train_data, i, args)
        if args.cuda:
            data, targets = data.cuda(), targets.cuda()

        optimizer.zero_grad()
        output = model(data)
        final_target = targets[:, :].contiguous().view(-1)

        #when using LSTMTagger uncomment the following line
        # output = output[:, :].contiguous().view(-1, n_words)

        loss = criterion(output, final_target)
        loss.backward()
        if args.clip > 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        total_loss += loss.item()
        batches_since_log += 1

        if batch_idx % args.log_interval == 0 and batch_idx > 0:
            # Average over the batches actually accumulated. The first window
            # holds log_interval + 1 of them, so dividing by log_interval
            # overstated the loss.
            cur_loss = total_loss / batches_since_log
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.5f} | ms/batch {:5.5f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch_idx, train_data.size(1) // args.validseqlen, lr,
                elapsed * 1000 / batches_since_log, cur_loss, math.exp(cur_loss)))

            print('Save model!\n')
            # torch.save does not create missing directories.
            os.makedirs(checkpoint_dir, exist_ok=True)
            torch.save(model.state_dict(), "weights/model_weights_LMU_learnable")
            total_loss = 0
            batches_since_log = 0
            start_time = time.time()


In [62]:
# Checkpoint written at the end of an epoch whenever validation loss improves.
BEST_CHECKPOINT = "model_LMU_learnable"
# Rolling checkpoint; this is the path train() writes at every logging step.
PERIODIC_CHECKPOINT = "weights/model_weights_LMU_learnable"

best_vloss = 1e8
best_saved = False   # becomes True once BEST_CHECKPOINT was written in this run
all_vloss = []
try:
    for epoch in range(1, args.epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        test_loss = evaluate(test_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('| end of epoch {:3d} | time: {:5.2f}s | test loss {:5.2f} | '
              'test ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                        test_loss, math.exp(test_loss)))
        print('-' * 89)

        # Save the model if the validation loss is the best we've seen so far.
        if val_loss < best_vloss:
            torch.save(model.state_dict(), BEST_CHECKPOINT)
            print('Save model!\n')
            best_vloss = val_loss
            best_saved = True

        # Divide the learning rate by 10 when the validation loss is no better
        # than the worst of the previous five epochs.
        if epoch > 5 and val_loss >= max(all_vloss[-5:]):
            lr = lr / 10.
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        all_vloss.append(val_loss)

except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Report the final numbers on the best-validation weights. The rolling
# checkpoint (which may lag behind by up to log_interval batches) is only a
# fallback for runs interrupted before the first epoch finished.
final_checkpoint = BEST_CHECKPOINT if best_saved else PERIODIC_CHECKPOINT
if os.path.exists(final_checkpoint):
    load_result = model.load_state_dict(torch.load(final_checkpoint), strict=False)
    # strict=False tolerates key mismatches; surface them instead of hiding them.
    if load_result.missing_keys or load_result.unexpected_keys:
        print('WARNING: checkpoint/model key mismatch:', load_result)
else:
    print('WARNING: no checkpoint at {0}; evaluating the in-memory weights'.format(final_checkpoint))

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)