#Disclaimer:

Please try to understand the code skeleton before directly using this for the HW4P2. If you encounter anything that you are not able to understand or you feel is wrong do post on Piazza. 

#Updates:

1. In attention module check for the updated documentation. We should pass all the time-steps of Key and Value projection.
2. In method `train` look for the updated mask logic, current logic makes all the instances 1.
3. Check for the expected shape of the variable `text_input` in `train`. Though this is not a hard bound, but if you are getting a transpose shape than you need to change some other logic too. Hence we recommend you make sure the size of `text_input` is the same as mentioned.

In [None]:
### Please Install the python-levenshtein package! ###

In [1]:
import numpy as np
import torch 
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.nn.utils as utils
from torch.nn.utils.rnn import *
import pickle as pk
from torch.utils.data import DataLoader, Dataset 
import time
import math
import random
from Levenshtein import distance
from torchnlp.nn import LockedDropout
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# **Load data**

Loading all the numpy files containing the utterance information and text information

In [2]:
speech_train = np.load('train_new.npy', allow_pickle=True, encoding='bytes')
speech_valid = np.load('dev_new.npy', allow_pickle=True, encoding='bytes')
speech_test = np.load('test_new.npy', allow_pickle=True, encoding='bytes')

transcript_train = np.load('./train_transcripts.npy', allow_pickle=True,encoding='bytes')
transcript_valid = np.load('./dev_transcripts.npy', allow_pickle=True,encoding='bytes')
print("Data Loading Sucessful.....")

Data Loading Sucessful.....


# **Transform Text Data**

`transform_letter_to_index` function transforms alphabetical input to numerical input. Each letter is replaced by its corresponding index from `letter_list` .

In [3]:
letter_list = ['<sos>','<eos>', ' ', '-', "'", '.', '_', '+', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',\
             'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']

In [4]:
# Reserve 0 for padding
letter_dict = {}
for idx, letter in enumerate(letter_list):
    letter_dict[letter] = idx + 1

In [5]:
print(letter_dict)

{'<sos>': 1, '<eos>': 2, ' ': 3, '-': 4, "'": 5, '.': 6, '_': 7, '+': 8, 'A': 9, 'B': 10, 'C': 11, 'D': 12, 'E': 13, 'F': 14, 'G': 15, 'H': 16, 'I': 17, 'J': 18, 'K': 19, 'L': 20, 'M': 21, 'N': 22, 'O': 23, 'P': 24, 'Q': 25, 'R': 26, 'S': 27, 'T': 28, 'U': 29, 'V': 30, 'W': 31, 'X': 32, 'Y': 33, 'Z': 34}


In [6]:
def transform_letter_to_index(transcript):
    '''
    :param transcript :(N, ) Transcripts are the text input
    :param letter_list: Letter list defined above
    :return letter_to_index_list: Returns a list for all the transcript sentence to index
    
    '''
    transform_letter_to_index_list = []
    for sentence in transcript:
        trans = []
        trans.append(1)
        for token in sentence:
            decoded_token = token.decode()
            for char in decoded_token:
                trans.append(letter_dict[char])
            trans.append(3)
        trans.append(2)
        transform_letter_to_index_list.append(np.asarray(trans))
    return np.asarray(transform_letter_to_index_list)

In [7]:
character_text_train = transform_letter_to_index(transcript_train)
character_text_valid = transform_letter_to_index(transcript_valid)
print("Transformed data sucessfully.....")

Transformed data sucessfully.....



# **Pyramidal BiLSTM**
 

*   The length of utterance (speech input) can be hundereds to thousands of frames long.
*   Paper reports that that a direct LSTM implementation as Encoder resulted in slow convergence and inferior results even after extensive training.
*   The major reason is inability of `AttendAndSpell` operation to extract relevant information from a large number of input steps.

In [103]:
class pBLSTM(nn.Module):

    def __init__(self, input_dim, hidden_dim):
        super(pBLSTM, self).__init__()
        self.blstm = nn.LSTM(input_size=input_dim*4,hidden_size=hidden_dim,num_layers=1,bidirectional=True)
        self.ld = LockedDropout()
    def forward(self,x,lens):
        '''
        :param x :(N,T, H1) input to the pBLSTM h
        :return output: (N,T,H) encoded sequence from pyramidal Bi-LSTM 
        '''
        N, T, H1 = x.size()
        if T % 2 == 0:
            new_T = int(T / 2)
            H = int(H1 * 2)
            x = x.contiguous().view(N, new_T, H)
        else:
            x = x[:,:-1,:]
            new_T = int((T-1) / 2)
            H = int(H1 * 2)
            x = x.contiguous().view(N, new_T, H)
        lens = lens // 2
        x = x.permute(1, 0, 2)
        x = self.ld(x)
        packed_in = utils.rnn.pack_padded_sequence(x, lengths=lens, batch_first=False, enforce_sorted=False)
        outputs, _ = self.blstm(packed_in)
        padded_out, out_lens = utils.rnn.pad_packed_sequence(outputs)
        return padded_out, out_lens

# **Encoder**

*    Encoder takes the utterances as inputs and returns the key and value.
*    Key and value are nothing but simple projections of the output from pBLSTM network.

In [104]:
class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, value_size=128,key_size=128):
        super(Encoder, self).__init__()
        self.lstm = nn.LSTM(input_size=input_dim,hidden_size=hidden_dim,num_layers=1,bidirectional=True)
        self.pblstm1 = pBLSTM(hidden_dim, hidden_dim)
        self.pblstm2 = pBLSTM(hidden_dim, hidden_dim)
        self.pblstm3 = pBLSTM(hidden_dim, hidden_dim)

        self.key_network = nn.Linear(hidden_dim*2, key_size)
        self.value_network = nn.Linear(hidden_dim*2, value_size)
  
    def forward(self,x,lens):
        x = x.permute(1, 0, 2)
        rnn_inp = utils.rnn.pack_padded_sequence(x, lengths=lens, batch_first=False, enforce_sorted=False)
        outputs, _ = self.lstm(rnn_inp)

        #Use the outputs and pass it through the pBLSTM blocks

        linear_input, lens = utils.rnn.pad_packed_sequence(outputs)

        #pBLSTM Blocks

        in1 = linear_input.permute(1, 0, 2)
        o1, l1 = self.pblstm1(in1, lens)

        in2 = o1.permute(1, 0, 2)
        o2, l2 = self.pblstm2(in2, l1)

        in3 = o2.permute(1, 0, 2)
        o3, l3 = self.pblstm3(in3, l2)

        keys = self.key_network(o3)
        value = self.value_network(o3)

        return keys, value, l3


# **Attention**

*    Attention is calculated using key, value and query from Encoder and decoder.

Below are the set of operations you need to perform for computing attention.

```
energy = bmm(key, query)
attention = softmax(energy)
context = bmm(attention, value)
```



In [105]:
class Attention(nn.Module):
    def __init__(self):
        super(Attention, self).__init__()
    def forward(self, query, key, value, lens):
        '''
        :param query :(N,context_size) Query is the output of LSTMCell from Decoder
        :param key: (T,N,key_size) Key Projection from Encoder per time step
        :param value: (T,N,value_size) Value Projection from Encoder per time step
        :return output: Attended Context
        :return attention_mask: Attention mask that can be plotted  
        '''
        energy = torch.bmm(key, query.unsqueeze(2)).squeeze(2)
        mask = torch.arange(key.size(1)).unsqueeze(0).to(device) >= lens.unsqueeze(1)
        energy.masked_fill_(mask, -1e9)
        attention = nn.functional.softmax(energy, dim=1)
        out = torch.bmm(attention.unsqueeze(1), value).squeeze(1)
        
        return out, attention

# **Decoder**

*    As mentioned in Recitation-9 each forward call of decoder deals with just one time step. Thus we use LSTMCell instead of LSLTM here.
*    Output from the second LSTMCell can be used as query here for attention module.
*    In place of `value` that we get from the attention, this can be replace by context we get from the attention.
*    Methods like Gumble noise and teacher forcing can also be incorporated for improving the performance.

In [106]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, hidden_dim, value_size=128, key_size=128, isAttended=True):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim)

        self.lstm1 = nn.LSTMCell(input_size=hidden_dim+value_size, hidden_size=hidden_dim*2)
        self.lstm2 = nn.LSTMCell(input_size=hidden_dim*2, hidden_size=key_size)
        self.hidden_dim = hidden_dim
        self.key_size = key_size
        self.isAttended = isAttended
        if(isAttended):
            self.attention = Attention()
        self.character_prob = nn.Linear(hidden_dim,vocab_size)
        self.linear = nn.Linear(key_size+value_size, hidden_dim)
        self.character_prob.weight = self.embedding.weight
        self.teacher = 0.1
        self.act = nn.Hardtanh(inplace = True)
        self.query = nn.Linear(hidden_dim, key_size)

    def forward(self, key, values, text=None, lens=None, train=True):
        '''
        :param key :(T,N,key_size) Output of the Encoder Key projection layer
        :param values: (T,N,value_size) Output of the Encoder Value projection layer
        :param text: (N,text_len) Batch input of text with text_length
        :param train: Train or eval mode
        :return predictions: Returns the character perdiction probability 
        '''
        key = key.permute(1, 0, 2)
        values = values.permute(1, 0, 2)
        batch_size = key.shape[0]
#         print('batch_size:', batch_size)
        if(train):
            max_len =  text.shape[1]
            embeddings = self.embedding(text)
        else:
            max_len = 250
        predictions = []
        h1 = torch.zeros(batch_size,self.hidden_dim*2).to(device)
        hidden1 = (h1, h1)
        hidden_states = [hidden1, None]
        prediction = torch.ones(batch_size,1).to(device)
        for i in range(max_len-1):
            '''
            Here you should implement Gumble noise and teacher forcing techniques
            '''
            if(train):
                if random.random() < self.teacher:
                    char_embed = self.embedding(prediction.argmax(dim=-1))
                else:
                    char_embed = embeddings[:,i,:]
            else:
                char_embed = self.embedding(prediction.argmax(dim=-1))
            
            #When attention is True you should replace the values[i,:,:] with the context you get from attention
            query = self.query(char_embed)
            lens = lens.to(device)
            context, _ = self.attention(query, key, values, lens)
            inp = torch.cat([char_embed,context], dim=1)
            hidden_states[0] = self.lstm1(inp,hidden_states[0])

            inp_2 = hidden_states[0][0]
            hidden_states[1] = self.lstm2(inp_2,hidden_states[1])

            output = hidden_states[1][0]
            prediction = self.linear(torch.cat([output, context], dim=1))
            prediction = self.act(prediction)
            prediction = self.character_prob(prediction)
            predictions.append(prediction.unsqueeze(1))

        return torch.cat(predictions, dim=1)
    def decode(self, key, value, lens):
        max_len = 250
        batch_size = key.shape[0]
        prediction = torch.ones(batch_size,1).to(device)
        predictions = []
        
        h1 = torch.zeros(batch_size,self.hidden_dim*2).to(device)
        hidden1 = (h1, h1)
        hidden_states = [hidden1, None]
        
        for i in range(max_len):
            char_embed = self.embedding(prediction.argmax(dim = 1))
            query = self.query(char_embed)
            context, _ = self.attention(query, key, value, lens)
            inp = torch.cat([char_embed,context], dim=1)
            hidden_states[0] = self.lstm1(inp,hidden_states[0])

            inp_2 = hidden_states[0][0]
            hidden_states[1] = self.lstm2(inp_2,hidden_states[1])
            
            output = hidden_states[1][0]
            prediction = self.character_prob(torch.cat([output, context], dim=1))
            predictions.append(prediction.unsqueeze(1))
            
        return torch.cat(predictions, dim=1)

# **Sequence to Sequence Model**

*    We train an end-to-end sequence to sequence model comprising of Encoder and Decoder.

In [107]:
class Seq2Seq(nn.Module):
    def __init__(self,input_dim,vocab_size,hidden_dim,value_size=128, key_size=128,isAttended=True):
        super(Seq2Seq,self).__init__()

        self.encoder = Encoder(input_dim, hidden_dim)
        self.decoder = Decoder(vocab_size, hidden_dim)
    def forward(self,speech_input, speech_len, text_input=None,train=True):
        key, value, lens = self.encoder(speech_input, speech_len)
        if(train):
            predictions = self.decoder(key, value, text_input, lens)
        else:
            predictions = self.decoder(key, value, text=None, lens=lens, train=False)
        return predictions


# **DataLoader**

Below is the dataloader for the homework.

*    You are expected to fill in the collate function if you use this code skeleton.

In [108]:
class Speech2Text_Dataset(Dataset):
    def __init__(self, speech, text=None, train=True):
        self.speech = speech
        self.train = train
        if(text is not None):
            self.text = text
    def __len__(self):
        return self.speech.shape[0]
    def __getitem__(self, index):
        if(self.train):
            return torch.tensor(self.speech[index].astype(np.float32)), torch.tensor(self.text[index])
        else:
            return torch.tensor(self.speech[index].astype(np.float32))

In [109]:
def collate_train(batch_data):
    '''
    Complete this function.
    I usually return padded speech and text data, and length of 
    utterance and transcript from this function 
    '''
    inputs, targets = zip(*batch_data)
    lens = [len(seq) for seq in inputs]
    text_lens = [len(seq) for seq in targets]

    inputs = pad_sequence(inputs, batch_first=True)
    targets = pad_sequence(targets, batch_first=True)
    
    inp_len = torch.LongTensor(lens)
    tgt_len = torch.LongTensor(text_lens)
    return inputs, targets, inp_len, tgt_len 

def collate_test(batch_data):
    '''
    Complete this function.
    I usually return padded speech and length of 
    utterance from this function 
    '''
    inputs = batch_data
    lens = [len(seq) for seq in inputs]
    inputs = pad_sequence(inputs, batch_first=True)
    inp_len = torch.LongTensor(lens)
    return inputs, inp_len

In [110]:
Speech2Text_train_Dataset = Speech2Text_Dataset(speech_train, character_text_train)
Speech2Text_val_Dataset = Speech2Text_Dataset(speech_valid, character_text_valid)
Speech2Text_test_Dataset = Speech2Text_Dataset(speech_test, None, False)

In [111]:
train_loader = DataLoader(Speech2Text_train_Dataset, batch_size=64, shuffle=True, collate_fn=collate_train)
val_loader = DataLoader(Speech2Text_val_Dataset, batch_size=64, shuffle=True, collate_fn=collate_train)
test_loader = DataLoader(Speech2Text_test_Dataset, batch_size=64, shuffle=False, drop_last=False, collate_fn=collate_test)

# **Learning**

Defining the Sequence to Sequence model, optimizer and criterion for learning.

Train routine is also provided here which can be referenced while writing validation and test routine.

In [120]:
model = Seq2Seq(input_dim=40,vocab_size=len(letter_list)+1,hidden_dim=256)
model.load_state_dict(torch.load('LAS-30-epoch.pth'))
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(reduction='none').to(device)

In [113]:
def train(model,train_loader, num_epochs, criterion, optimizer):
    for epochs in range(num_epochs):
        model.train()
        loss_sum = 0
        since = time.time()
        for (batch_num, collate_output) in enumerate(train_loader):
#             with torch.autograd.set_detect_anomaly(True):
                optimizer.zero_grad()
                speech_input, text_input, speech_len, text_len = collate_output
                speech_input = speech_input.to(device)
                '''
                Please check if the text_input is of the (Batch_size, Sequence_length) i.e. (B,L)
                '''
                text_input = text_input.to(device)
                speech_len = speech_len.to(device)
                text_len = text_len.to(device)
                predictions = model(speech_input, speech_len ,text_input)
                
                text_input = text_input[:, 1:]
                
                mask = torch.zeros(text_input.size()).to(device)

                for i in range(len(text_len)):
                    mask[i,:text_len[i]-1] = 1
#                 print(mask)
                mask = mask.view(-1)


                predictions = predictions.contiguous().view(-1, predictions.size(-1))
                '''
                If you do not have text_input as (B,L) but have (L,B) instead then make
                sure that you uncomment the next line of code
                '''
                #text_input = text_input.T
                text_input = text_input.contiguous().view(-1)

                loss = criterion(predictions, text_input)
                masked_loss = torch.sum(loss*mask)

                masked_loss.backward()

                torch.nn.utils.clip_grad_norm_(model.parameters(), 2)
                optimizer.step()

                current_loss = float(masked_loss.item())/int(torch.sum(mask).item())

                loss_sum += current_loss
                if batch_num > 0 and batch_num % 10 == 0:
                    loss_sum /= 10
                    print('batch ', batch_num,'/',len(train_loader), 'train_loss_perplexity', math.e**loss_sum, 'time: ', time.time() - since)
                    loss_sum = 0
                torch.cuda.empty_cache()
        
        model.eval()
        lev_dis = 0
        batch = 0
        for (batch_num, collate_output) in enumerate(val_loader):
            with torch.no_grad():
                speech_input, text_input, speech_len, text_len = collate_output
                speech_input = speech_input.to(device)
                '''
                Please check if the text_input is of the (Batch_size, Sequence_length) i.e. (B,L)
                '''
                text_input = text_input.to(device)
                speech_len = speech_len.to(device)
                text_len = text_len.to(device)

                predictions = model(speech_input, speech_len ,text_input)
                text_input = text_input[:, 1:]
                true_list = []
                pred_list = []
                p = predictions.argmax(dim=2)
                
                for i, sentence in enumerate(p):
                    pred_list.append(transform_index_to_letter(sentence)[:text_len[i]-1])

                text_input = text_input.to('cpu')
                for i, s in enumerate(text_input):
                    true_list.append(transform_index_to_letter(s)[:text_len[i]-1])
                dis = 0
                for i in range(len(pred_list)):
                    true = true_list[i]
                    pred = pred_list[i]
                    dis += distance(true, pred)
                dis /= len(pred_list)
                lev_dis += dis
                batch += 1
                n = random.randint(0,len(pred_list)-1)
                print('True: ', true_list[n])
                print('Pred: ', pred_list[n])
        lev_dis /= batch
        print('Levenshtein Distance:', dis)

In [86]:
def transform_index_to_letter(prediction, letter_list=letter_list):
    letter_dict = {}
    letter_dict[0] = ' '
    for idx, letter in enumerate(letter_list):
        letter_dict[idx+1] = letter
    out_str = ''
    for char in prediction:
        out_str += letter_dict[char.item()]
    return out_str

In [87]:
model.decoder.teacher = 0

In [88]:
train(model, train_loader, 5, criterion, optimizer)

batch  10 / 387 train_loss_perplexity 29.11802437805604 time:  8.671548843383789
batch  20 / 387 train_loss_perplexity 10.10487535358183 time:  16.66004776954651
batch  30 / 387 train_loss_perplexity 8.15636179960623 time:  24.486927032470703
batch  40 / 387 train_loss_perplexity 7.049046903302497 time:  32.33364772796631
batch  50 / 387 train_loss_perplexity 6.2250877627705075 time:  40.244149923324585
batch  60 / 387 train_loss_perplexity 5.713295311259314 time:  48.233057260513306
batch  70 / 387 train_loss_perplexity 5.285324955333307 time:  56.53564143180847
batch  80 / 387 train_loss_perplexity 5.081817128564116 time:  64.49775838851929
batch  90 / 387 train_loss_perplexity 4.721187666704778 time:  72.25717306137085
batch  100 / 387 train_loss_perplexity 4.520794176206726 time:  80.12900710105896
batch  110 / 387 train_loss_perplexity 4.416464887590484 time:  88.2008605003357
batch  120 / 387 train_loss_perplexity 4.27381105714761 time:  96.00108337402344
batch  130 / 387 train_l

In [89]:
torch.save(model.state_dict(), 'LAS-5-epoch.pth')

In [90]:
model.decoder.teacher = 0.1

In [91]:
train(model, train_loader, 5, criterion, optimizer)

batch  10 / 387 train_loss_perplexity 2.3581700585252277 time:  8.314899921417236
batch  20 / 387 train_loss_perplexity 2.0775786481949163 time:  16.348105430603027
batch  30 / 387 train_loss_perplexity 2.027573615972097 time:  24.38331961631775
batch  40 / 387 train_loss_perplexity 1.9230409560061048 time:  32.565017223358154
batch  50 / 387 train_loss_perplexity 1.9492814177094742 time:  40.48361802101135
batch  60 / 387 train_loss_perplexity 1.905889980799231 time:  48.553017139434814
batch  70 / 387 train_loss_perplexity 1.8561671369281039 time:  56.67060995101929
batch  80 / 387 train_loss_perplexity 1.9193070157712666 time:  64.6816954612732
batch  90 / 387 train_loss_perplexity 1.9365772758631583 time:  72.78937339782715
batch  100 / 387 train_loss_perplexity 1.9198928754854805 time:  80.60376524925232
batch  110 / 387 train_loss_perplexity 1.902931689865976 time:  88.47177982330322
batch  120 / 387 train_loss_perplexity 1.8011857494645418 time:  96.5673987865448
batch  130 / 38

In [92]:
torch.save(model.state_dict(), 'LAS-10-epoch.pth')

In [93]:
model.decoder.teacher = 0.2

In [94]:
train(model, train_loader, 5, criterion, optimizer)

batch  10 / 387 train_loss_perplexity 1.621909294881963 time:  8.527288436889648
batch  20 / 387 train_loss_perplexity 1.4962674499872137 time:  16.376285791397095
batch  30 / 387 train_loss_perplexity 1.5651894490781864 time:  24.267054080963135
batch  40 / 387 train_loss_perplexity 1.624692412972851 time:  32.29254603385925
batch  50 / 387 train_loss_perplexity 1.5453588380980627 time:  40.254080057144165
batch  60 / 387 train_loss_perplexity 1.5031560954942496 time:  47.89552617073059
batch  70 / 387 train_loss_perplexity 1.4999955020472298 time:  56.01031851768494
batch  80 / 387 train_loss_perplexity 1.5282920253236403 time:  63.83223628997803
batch  90 / 387 train_loss_perplexity 1.523282177654867 time:  71.64073896408081
batch  100 / 387 train_loss_perplexity 1.5535640436477438 time:  79.6159029006958
batch  110 / 387 train_loss_perplexity 1.5833870568552917 time:  87.4384171962738
batch  120 / 387 train_loss_perplexity 1.5033641062282435 time:  95.53398203849792
batch  130 / 38

In [95]:
torch.save(model.state_dict(), 'LAS-15-epoch.pth')

In [96]:
model.decoder.teacher = 0.3

In [97]:
train(model, train_loader, 5, criterion, optimizer)

batch  10 / 387 train_loss_perplexity 1.532315504028755 time:  8.830554485321045
batch  20 / 387 train_loss_perplexity 1.4329500701185356 time:  16.903461456298828
batch  30 / 387 train_loss_perplexity 1.4045435095092076 time:  25.008704900741577
batch  40 / 387 train_loss_perplexity 1.4933929684344331 time:  32.9380784034729
batch  50 / 387 train_loss_perplexity 1.5394514076263361 time:  40.98968029022217
batch  60 / 387 train_loss_perplexity 1.4752354570834392 time:  49.06547284126282
batch  70 / 387 train_loss_perplexity 1.5046152677279023 time:  56.934473276138306
batch  80 / 387 train_loss_perplexity 1.4692845462667852 time:  65.23782706260681
batch  90 / 387 train_loss_perplexity 1.4368036807428575 time:  73.05849289894104
batch  100 / 387 train_loss_perplexity 1.473867625642001 time:  80.89753460884094
batch  110 / 387 train_loss_perplexity 1.4898636858011955 time:  88.92157125473022
batch  120 / 387 train_loss_perplexity 1.4759772168445224 time:  96.89956736564636
batch  130 / 

In [98]:
torch.save(model.state_dict(), 'LAS-20-epoch.pth')

In [99]:
model.decoder.teacher = 0.4

In [100]:
train(model, train_loader, 5, criterion, optimizer)

batch  10 / 387 train_loss_perplexity 1.5324195435214614 time:  8.863358974456787
batch  20 / 387 train_loss_perplexity 1.470640014189124 time:  16.755496501922607
batch  30 / 387 train_loss_perplexity 1.4958315896761405 time:  24.83281898498535
batch  40 / 387 train_loss_perplexity 1.4835670622918475 time:  32.55494427680969
batch  50 / 387 train_loss_perplexity 1.475747728440349 time:  40.47198462486267
batch  60 / 387 train_loss_perplexity 1.4593516452239472 time:  48.7489595413208
batch  70 / 387 train_loss_perplexity 1.4908319474696765 time:  56.84985542297363
batch  80 / 387 train_loss_perplexity 1.5266335756935105 time:  64.81219029426575
batch  90 / 387 train_loss_perplexity 1.5876961622185353 time:  72.38242888450623
batch  100 / 387 train_loss_perplexity 1.4821727188064817 time:  80.43511009216309
batch  110 / 387 train_loss_perplexity 1.4645724468874763 time:  88.27211046218872
batch  120 / 387 train_loss_perplexity 1.4716423471154396 time:  96.51493406295776
batch  130 / 38

In [101]:
torch.save(model.state_dict(), 'LAS-25-epoch.pth')

In [114]:
train(model, train_loader, 5, criterion, optimizer)

batch  10 / 387 train_loss_perplexity 1.884056721918299 time:  9.114539384841919
batch  20 / 387 train_loss_perplexity 1.7669485318169862 time:  17.442318439483643
batch  30 / 387 train_loss_perplexity 1.6625823720698127 time:  25.357105016708374
batch  40 / 387 train_loss_perplexity 1.6288852563389806 time:  33.20398736000061
batch  50 / 387 train_loss_perplexity 1.6095192206142965 time:  41.245516300201416
batch  60 / 387 train_loss_perplexity 1.591926023957701 time:  49.214025020599365
batch  70 / 387 train_loss_perplexity 1.6003114704112085 time:  57.544026136398315
batch  80 / 387 train_loss_perplexity 1.5738548650380417 time:  65.49876737594604
batch  90 / 387 train_loss_perplexity 1.5877648683600274 time:  73.39217758178711
batch  100 / 387 train_loss_perplexity 1.5024083196000122 time:  81.45594072341919
batch  110 / 387 train_loss_perplexity 1.557779297739585 time:  89.77702784538269
batch  120 / 387 train_loss_perplexity 1.5399389465143911 time:  97.73204803466797
batch  130 

In [115]:
torch.save(model.state_dict(), 'LAS-30-epoch.pth')

In [121]:
train(model, train_loader, 5, criterion, optimizer)

batch  10 / 387 train_loss_perplexity 1.319925677688084 time:  9.14450216293335
batch  20 / 387 train_loss_perplexity 1.317949192340466 time:  17.553000926971436
batch  30 / 387 train_loss_perplexity 1.3041235236382636 time:  25.886341094970703
batch  40 / 387 train_loss_perplexity 1.2808773715254478 time:  34.262948513031006
batch  50 / 387 train_loss_perplexity 1.3123712992273076 time:  42.44439625740051
batch  60 / 387 train_loss_perplexity 1.3138519103213828 time:  50.55785799026489
batch  70 / 387 train_loss_perplexity 1.300087474489127 time:  58.67414617538452
batch  80 / 387 train_loss_perplexity 1.273887010033168 time:  66.70822262763977
batch  90 / 387 train_loss_perplexity 1.2805902662536777 time:  74.80969762802124
batch  100 / 387 train_loss_perplexity 1.2952806680024846 time:  82.97666001319885
batch  110 / 387 train_loss_perplexity 1.2875896113435017 time:  91.066002368927
batch  120 / 387 train_loss_perplexity 1.301233346361249 time:  99.19137978553772
batch  130 / 387 t

In [122]:
torch.save(model.state_dict(), 'LAS-35-epoch.pth')

In [137]:
model.eval()
test_list = []
for speech, speech_len in test_loader:
    speech = speech.to(device)
    speech_len = speech_len.to(device)
#     key, value, lens = model.encoder(speech, speech_len,)
#     lens = lens.to(device)
    predictions = model(speech, speech_len, text_input=None,train=False)
    p = predictions.argmax(dim=2)
    for s in p:
        test_list.append(transform_index_to_letter(s).split('<eos>')[0])

In [140]:
import pandas as pd
output_df = pd.DataFrame()
output_df['Id'] = np.asarray(range(len(test_list)))
output_df['Predicted'] = np.asarray(test_list)

In [141]:
output_df.to_csv('submission.csv', index=None)