In [1]:
!pip install python-Levenshtein

Processing ./.cache/pip/wheels/de/c2/93/660fd5f7559049268ad2dc6d81c4e39e9e36518766eaf7e342/python_Levenshtein-0.12.0-cp36-cp36m-linux_x86_64.whl
Installing collected packages: python-Levenshtein
Successfully installed python-Levenshtein-0.12.0


In [22]:
import numpy as np
import torch 
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn.utils.rnn import *
import pickle as pk
from torch.utils.data import DataLoader, Dataset
from Levenshtein import distance 
import time
import random
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


# **Load data**

Loading all the numpy files containing the utterance information and text information

In [2]:
speech_train = np.load('train_new.npy', allow_pickle=True, encoding='bytes')
speech_valid = np.load('dev_new.npy', allow_pickle=True, encoding='bytes')
speech_test = np.load('test_new.npy', allow_pickle=True, encoding='bytes')

transcript_train = np.load('./train_transcripts.npy', allow_pickle=True, encoding='bytes')
transcript_valid = np.load('./dev_transcripts.npy', allow_pickle=True, encoding='bytes')
print("Data Loading Sucessful.....")

Data Loading Sucessful.....


In [86]:
print(speech_test.shape)

(523,)


# **Transform Text Data**

`transform_letter_to_index` function transforms alphabetical input to numerical input. Each letter is replaced by its corresponding index from `letter_list` .

In [4]:
letter_list = ['<eos>', ' ', "'", '+', '-', '.', '_','A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',\
             'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']

char_to_index = {
    '<eos>': 0, 
    ' ': 1, 
    "'": 2, 
    '+': 3, 
    '-': 4, 
    '.': 5, 
    '_': 6, 
    'A': 7, 
    'B': 8, 
    'C': 9, 
    'D': 10, 
    'E': 11, 
    'F': 12, 
    'G': 13, 
    'H': 14, 
    'I': 15, 
    'J': 16, 
    'K': 17, 
    'L': 18, 
    'M': 19, 
    'N': 20, 
    'O': 21, 
    'P': 22, 
    'Q': 23, 
    'R': 24, 
    'S': 25, 
    'T': 26, 
    'U': 27, 
    'V': 28, 
    'W': 29, 
    'X': 30, 
    'Y': 31, 
    'Z': 32
    }

In [5]:
def transform_letter_to_index(transcript, char_to_index):
    '''
    :param transcript :(N, ) Transcripts are the text input
    :param letter_list: Letter list defined above
    :return letter_to_index_list: Returns a list for all the transcript sentence to index
    '''
    letter_to_index_list = []
    # loop through each transcript
    for trans in transcript:
        index_list = []
        #append <eos> at start of each sentence
        index_list.append(0)
        #loop through each word
        for i, word in enumerate(trans):
            word = word.decode()
            for char in word:
                index_list.append(char_to_index[char])
            if i != len(trans)-1:
                index_list.append(1)
            else:
                index_list.append(0)
        letter_to_index_list.append(index_list)

    assert(len(transcript) == len(letter_to_index_list))

    return letter_to_index_list

In [6]:
character_text_train = transform_letter_to_index(transcript_train, char_to_index)
character_text_valid = transform_letter_to_index(transcript_valid, char_to_index)
print("Transformed data sucessfully.....")

Transformed data sucessfully.....



# **Pyramidal BiLSTM**
 

*   The length of utterance (speech input) can be hundereds to thousands of frames long.
*   Paper reports that that a direct LSTM implementation as Encoder resulted in slow convergence and inferior results even after extensive training.
*   The major reason is inability of `AttendAndSpell` operation to extract relevant information from a large number of input steps.

In [7]:
class pBLSTM(nn.Module):

    def __init__(self, input_dim, hidden_dim):
        super(pBLSTM, self).__init__()
      
        self.blstm = nn.LSTM(input_size=input_dim*2, hidden_size=hidden_dim, num_layers=1, bidirectional=True)
  
    def forward(self, x, x_lens):
        '''
        :param x :(N,T) input to the pBLSTM
        :return output: (N,T,H) encoded sequence from pyramidal Bi-LSTM 
        '''

        inputs = x.transpose(0, 1) #shape: B x L x _
        batch_size = inputs.shape[0]

        #concatenate input in consecutive timestamp
        inputs_downsample = []
        for i in range(batch_size):
            length = x_lens[i]
            input_downsample = []
            if((length % 2) == 1):
                inputs_downsample.append(inputs[i][:length-1])
            else:
                inputs_downsample.append(inputs[i][:length])

            inputs_downsample[i] = inputs_downsample[i].reshape(length // 2, -1)
    
        out_lens = torch.LongTensor([i//2 for i in x_lens])

        inputs_downsample = pad_sequence(inputs_downsample).to(device) 
        inputs = inputs_downsample #.transpose(0, 1)
       
        packed_input = pack_padded_sequence(inputs, lengths=out_lens, batch_first=False, enforce_sorted=False)
        output_packed, _ = self.blstm(packed_input)
        output_padded, lens = pad_packed_sequence(output_packed)
        
        return output_padded, out_lens

# **Encoder**

*    Encoder takes the utterances as inputs and returns the key and value.
*    Key and value are nothing but simple projections of the output from pBLSTM network.

In [8]:
class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, value_size=128,key_size=128):
        super(Encoder, self).__init__()
        self.key_size = key_size
        self.value_size = value_size
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=1, bidirectional=True)
        #Here you need to define the blocks of pBLSTMs
        self.pblstm1 = pBLSTM(hidden_dim*2, hidden_dim)
        self.pblstm2 = pBLSTM(hidden_dim*2, hidden_dim)
        self.pblstm3 = pBLSTM(hidden_dim*2, hidden_dim)

        self.key_network = nn.Linear(hidden_dim*2, value_size)
        self.value_network = nn.Linear(hidden_dim*2, key_size)
  
    def forward(self, x, lens):
        batch_size = x.shape[1]
        packed_input = pack_padded_sequence(x, lengths=lens, batch_first=False, enforce_sorted=False)
        out, _ = self.lstm(packed_input)
        out_padded, _ = pad_packed_sequence(out) #shape: L x B x _

        #Use the outputs and pass it through the pBLSTM blocks
        outputs, out_lens = self.pblstm1(out_padded, lens)
        outputs, out_lens = self.pblstm2(outputs, out_lens)
        outputs, out_lens = self.pblstm3(outputs, out_lens)

        output_flatten = outputs.reshape(outputs.size(0) * outputs.size(1), outputs.size(2)) # (L * N) x H
        linear_input = output_flatten

        keys = self.key_network(linear_input)
        keys = keys.view(-1, batch_size, self.key_size) # output shape L x N x D

        value = self.value_network(linear_input)
        value = value.view(-1, batch_size, self.value_size) # output shape L x N x D

        return keys, value, out_lens

# **Attention**

*    Attention is calculated using key, value and query from Encoder and decoder.

Below are the set of operations you need to perform for computing attention.

```
energy = bmm(key, query)
attention = softmax(energy)
context = bmm(attention, value)
```



In [9]:
class Attention(nn.Module):
    def __init__(self):
        super(Attention, self).__init__()

    def forward(self, query, key, value, lens):
        '''
        :param query :(N,context_size) Query is the output of LSTMCell from Decoder
        :param key: (N,key_size) Key Projection from Encoder per time step
        :param value: (N,value_size) Value Projection from Encoder per time step
        :return output: Attended Context
        :return attention_mask: Attention mask that can be plotted  
        '''

        key = key.transpose(0,1)  #(N, T, H)
        # Input/output shape of bmm: (N, T, H), (N, H, 1) -> (N, T, 1)
        energy = torch.bmm(key, query.unsqueeze(2)).squeeze(2) # (N, T)
        attention = nn.functional.softmax(energy, dim=1) # (N, T)

        # Create an (N, T) boolean mask for all padding positions
        # Make use of broadcasting: (1, T), (N, 1) -> (N, T)
        mask = torch.arange(key.size(1)).unsqueeze(0) >= lens.unsqueeze(1) # (N, T)
        mask = mask.to(device)

        attention_masked = attention.clone().masked_fill_(mask.to(device), 0)

        sum_attention = attention_masked.sum(dim=1) # (N)
        normalized_attention = attention_masked / sum_attention.unsqueeze(1)

        value = value.transpose(0,1)  #(N, T, H)  
        # Compute attention-weighted sum of context vectors
        # Input/output shape of bmm: (N, 1, T), (N, T, H) -> (N, 1, H)
        context = torch.bmm(attention.unsqueeze(1), value).squeeze(1)

        return context, attention

# **Decoder**

*    As mentioned in Recitation-9 each forward call of decoder deals with just one time step. Thus we use LSTMCell instead of LSLTM here.
*    Output from the second LSTMCell can be used as query here for attention module.
*    In place of `value` that we get from the attention, this can be replace by context we get from the attention.
*    Methods like Gumble noise and teacher forcing can also be incorporated for improving the performance.

In [66]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, hidden_dim, value_size=128, key_size=128, isAttended=True):
        super(Decoder, self).__init__()
        self.key_size = key_size
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, hidden_dim)    
        self.lstm1 = nn.LSTMCell(input_size=hidden_dim+value_size, hidden_size=hidden_dim)
        self.lstm2 = nn.LSTMCell(input_size=hidden_dim, hidden_size=key_size)
        self.isAttended = isAttended
        if(isAttended):
            self.attention = Attention()
            self.character_prob = nn.Linear(key_size+value_size,vocab_size)

    def forward(self, key, values, lens, teacher_forcing=None, text=None, train=True):
        '''
        :param key :(T,N,key_size) Output of the Encoder Key projection layer
        :param values: (T,N,value_size) Output of the Encoder Value projection layer
        :param text: (N,text_len) Batch input of text with text_length
        :param train: Train or eval mode
        :return predictions: Returns the character perdiction probability 
        '''
        batch_size = key.shape[1]
        if(train):
            text = text.transpose(0, 1) # N x max_len
            max_len =  text.shape[1]
            embeddings = self.embedding(text) # N x T x D
        else:
            max_len = 250
            
        predictions = []
        
        #initialize hidden states
        h1 = torch.zeros(batch_size, self.hidden_dim).to(device)
        h2 = torch.zeros(batch_size, self.key_size).to(device)
        hid1 = (h1, h1)
        hid2 = (h2, h2)
        hidden_states = [hid1, hid2]
        
        prediction = torch.zeros(batch_size, 1).to(device)
        
        for i in range(max_len - 1):
            '''
            Here you should implement Gumble noise and teacher forcing techniques
            '''
            #train: use the embedding of current word as input
            if(train):
                #print('****** teacher forcing: {}******',teacher_forcing)
                if(random.random() < teacher_forcing):
                    char_embed = self.embedding(prediction.argmax(dim=-1))
                else:
                    char_embed = embeddings[:,i,:] 
                
            else:
                #eval: use the embedding of last prediction as input
                char_embed = self.embedding(prediction.argmax(dim=-1))
                assert(char_embed.shape[0] == batch_size)
          
            #attention(query, key, value, lens)
            if hidden_states[0] == None:
                last_hiddent_state = torch.zeros(batch_size, 128)
            else:
                last_hiddent_state = hidden_states[0][-1]
            context, attention = self.attention(last_hiddent_state.to(device), key, values, lens)

            inp = torch.cat([char_embed, context], dim=1)

            hidden_states[0] = self.lstm1(inp, hidden_states[0])
          
            inp_2 = hidden_states[0][0]
            hidden_states[1] = self.lstm2(inp_2, hidden_states[1])

            output = hidden_states[1][0]
            prediction = self.character_prob(torch.cat([output, context], dim=1))  # 64 x 33
            predictions.append(prediction.unsqueeze(1)) # 64 x max_len x 33

        return torch.cat(predictions, dim=1)
    

# **Sequence to Sequence Model**

*    We train an end-to-end sequence to sequence model comprising of Encoder and Decoder.

In [67]:
class Seq2Seq(nn.Module):
    def __init__(self,input_dim,vocab_size,hidden_dim,value_size=128, key_size=128,isAttended=True):
        super(Seq2Seq,self).__init__()
        self.encoder = Encoder(input_dim, hidden_dim)
        self.decoder = Decoder(vocab_size, hidden_dim)
    def forward(self, teacher_forcing, speech_input, speech_len, text_input=None, train=True):
        key, value, lens = self.encoder(speech_input, speech_len)
        if(train):
            predictions = self.decoder(key, value, lens, teacher_forcing, text_input, train=True)
        else:
            predictions = self.decoder(key, value, lens, teacher_forcing=None, text=None, train=False)
        return predictions

# **DataLoader & Helper functions**

Below is the dataloader for the homework.

*    You are expected to fill in the collate function if you use this code skeleton.

In [68]:
class Speech2Text_Dataset(Dataset):
    def __init__(self, speech, text=None, train=True):
        self.speech = speech
        self.train = train
        if(text is not None):
            self.text = text
    def __len__(self):
        return self.speech.shape[0]
    def __getitem__(self, index):
        if(self.train):
            return torch.tensor(self.speech[index].astype(np.float32)), torch.tensor(self.text[index])
        else:
            return torch.tensor(self.speech[index].astype(np.float32))

In [69]:
def collate_train(batch_data):
    inputs, targets = zip(*batch_data)
    X_lens = torch.LongTensor([len(seq) for seq in inputs])
    Y_lens = torch.LongTensor([len(seq) for seq in targets])

    X = pad_sequence(inputs)
    Y = pad_sequence(targets)

    return X, Y, X_lens, Y_lens

def collate_test(batch_data):
    inputs = batch_data
    X_lens = torch.LongTensor([len(seq) for seq in inputs])
    X = pad_sequence(inputs)

    return X, X_lens

In [70]:
Speech2Text_train_Dataset = Speech2Text_Dataset(speech_train, character_text_train)
Speech2Text_eval_Dataset = Speech2Text_Dataset(speech_valid, character_text_valid)
Speech2Text_test_Dataset = Speech2Text_Dataset(speech_test, None, False)

In [71]:
train_loader = DataLoader(Speech2Text_train_Dataset, batch_size=64, shuffle=True, collate_fn=collate_train)
eval_loader = DataLoader(Speech2Text_eval_Dataset, batch_size=64, shuffle=True, collate_fn=collate_train)

In [83]:
test_loader = DataLoader(Speech2Text_test_Dataset, batch_size=64, shuffle=False, drop_last=False, collate_fn=collate_test)

In [72]:
def greedy(outputs):
    for output in outputs:
        chars = ''
        char_list = torch.argmax(output, dim=1).cpu()
        #print(char_list.shape)
        for i in range(len(char_list)):
            if i == 0:
                break
            else:
                chars += letter_list[char_list[i]]
    return chars

def index_to_char(char_list):
    chars =''
    for i in char_list:
        if i == 0:
            break
        else:
            chars += letter_list[i]
    return chars

# **Learning**

Defining the Sequence to Sequence model, optimizer and criterion for learning.

Train routine is also provided here which can be referenced while writing validation and test routine.

In [73]:
model = Seq2Seq(input_dim=40, vocab_size=len(letter_list), hidden_dim=128)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss(reduction='none').to(device)

In [74]:
print(model)

Seq2Seq(
  (encoder): Encoder(
    (lstm): LSTM(40, 128, bidirectional=True)
    (pblstm1): pBLSTM(
      (blstm): LSTM(512, 128, bidirectional=True)
    )
    (pblstm2): pBLSTM(
      (blstm): LSTM(512, 128, bidirectional=True)
    )
    (pblstm3): pBLSTM(
      (blstm): LSTM(512, 128, bidirectional=True)
    )
    (key_network): Linear(in_features=256, out_features=128, bias=True)
    (value_network): Linear(in_features=256, out_features=128, bias=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(33, 128)
    (lstm1): LSTMCell(256, 128)
    (lstm2): LSTMCell(128, 128)
    (attention): Attention()
    (character_prob): Linear(in_features=256, out_features=33, bias=True)
  )
)


In [42]:
#model = torch.load('hw4_epoch1.pth')
#model = model.to(device)

In [103]:
def train(model,train_loader, num_epochs, criterion, optimizer, one_epoch):
    model.train()
    for epochs in range(num_epochs):
        if (one_epoch):
            teacher_forcing_para = 0.6
        else:
            if epochs < 4:
                teacher_forcing_para = 0.1
            elif epochs < 9:
                teacher_forcing_para = 0.2
            elif epochs < 14:
                teacher_forcing_para = 0.3
            elif epochs < 19:
                teacher_forcing_para = 0.4
            elif epochs < 24:
                teacher_forcing_para = 0.5
            
        print('*** Training Epoch {} with teacher forcing {} :***'.format(epochs+1, teacher_forcing_para))
        loss_sum = 0
        since = time.time()
        for (batch_num, collate_output) in enumerate(train_loader):
            #with torch.autograd.set_detect_anomaly(True):
            optimizer.zero_grad()
            speech_input, text_input, speech_len, text_len = collate_output
            speech_input = speech_input.to(device)
            text_input = text_input.to(device)

            predictions = model(teacher_forcing_para, speech_input, speech_len ,text_input, train=True) # 64 x max_len x 33

            text_input = text_input.transpose(0 ,1)

            # label[1:]
            text_array = text_input.cpu().detach().numpy()
            labels = []
            for i in range(text_array.shape[0]):
                labels.append(text_array[i][1:]) 
            labels = np.array(labels)
            labels = torch.from_numpy(labels).to(device)
              
            mask = torch.zeros(labels.size()).to(device) # 64 x max_len

            for length in text_len:
                mask[:,:length-1] = 1
            
            mask = mask.view(-1).to(device)

            predictions = predictions.contiguous().view(-1, predictions.size(-1))
            labels = labels.contiguous().view(-1)

            loss = criterion(predictions, labels)
            masked_loss = torch.sum(loss * mask)

            masked_loss.backward()

            torch.nn.utils.clip_grad_norm(model.parameters(), 2)
            optimizer.step()

            current_loss = float(masked_loss.item())/int(torch.sum(mask).item())

            if batch_num % 50 == 49:
                print('Batch: {}\tTraining loss: {:.4f}'.format(batch_num+1, current_loss))
                print('Batch: {}\tTraining perplexity: {:.4f}'.format(batch_num+1, np.exp(current_loss)))
              
        #eval        
        model.eval()
        print('****** Start validation ******')
        total_distance = []

        for (batch_num, collate_output) in enumerate(eval_loader):
            batch_distance = 0

            speech_input, text_input, speech_len, text_len = collate_output
            speech_input = speech_input.to(device)
            text_input = text_input.to(device)
            predictions = model(teacher_forcing_para, speech_input, speech_len ,text_input, train = False) #torch.Size([64, 249, 33])
          
            #ground truth label
            text_input = text_input.transpose(0 ,1) #torch.Size([64, 217])
            text_array = text_input.cpu().detach().numpy()
            labels = []
            for i in range(text_array.shape[0]):
                labels.append(text_array[i][1:]) 
            labels_array = np.array(labels) #torch.Size([64, 216])
            #labels = torch.from_numpy(labels).to(device)
          
            for i, length in enumerate(text_len):
                label = labels[i][:length-1]
                pred = torch.argmax(predictions[i], dim=1).cpu().numpy()
                pred = pred[:length-1]
                assert(len(pred) == len(label))

                ground_truth = index_to_char(label)
                #print('****** groudth truth ******')
                #print(ground_truth)
                pred_word = index_to_char(pred)
                #print('****** pred word ******')
                #print(pred_word)
              
                #distance per sequence
                dist = distance(pred_word, ground_truth)
                batch_distance += dist
                total_distance.append(dist)

        print("\n Epoch average distance :", sum(total_distance) / len(total_distance) )
        model.train()

In [76]:
train(model, train_loader, 66, criterion, optimizer, one_epoch=False)

*** Training Epoch 1 with teacher forcing 0.1 :***




Batch: 1	Training loss: 3.4716
Batch: 1	Training perplexity: 32.1885
Batch: 26	Training loss: 1.8376
Batch: 26	Training perplexity: 6.2817
Batch: 51	Training loss: 1.5600
Batch: 51	Training perplexity: 4.7589
Batch: 76	Training loss: 1.3712
Batch: 76	Training perplexity: 3.9400
Batch: 101	Training loss: 1.5034
Batch: 101	Training perplexity: 4.4971
Batch: 126	Training loss: 1.4460
Batch: 126	Training perplexity: 4.2462
Batch: 151	Training loss: 1.3958
Batch: 151	Training perplexity: 4.0381
Batch: 176	Training loss: 1.3993
Batch: 176	Training perplexity: 4.0524
Batch: 201	Training loss: 1.3903
Batch: 201	Training perplexity: 4.0159
Batch: 226	Training loss: 1.2900
Batch: 226	Training perplexity: 3.6328
Batch: 251	Training loss: 1.2832
Batch: 251	Training perplexity: 3.6081
Batch: 276	Training loss: 1.2119
Batch: 276	Training perplexity: 3.3598
Batch: 301	Training loss: 1.2285
Batch: 301	Training perplexity: 3.4161
Batch: 326	Training loss: 1.2263
Batch: 326	Training perplexity: 3.4087
B

Batch: 176	Training loss: 1.0145
Batch: 176	Training perplexity: 2.7581
Batch: 201	Training loss: 1.0243
Batch: 201	Training perplexity: 2.7852
Batch: 226	Training loss: 0.9984
Batch: 226	Training perplexity: 2.7139
Batch: 251	Training loss: 1.0902
Batch: 251	Training perplexity: 2.9750
Batch: 276	Training loss: 0.9941
Batch: 276	Training perplexity: 2.7024
Batch: 301	Training loss: 0.9822
Batch: 301	Training perplexity: 2.6703
Batch: 326	Training loss: 1.1381
Batch: 326	Training perplexity: 3.1209
Batch: 351	Training loss: 0.9558
Batch: 351	Training perplexity: 2.6007
Batch: 376	Training loss: 0.9817
Batch: 376	Training perplexity: 2.6689
****** Start validation ******

 Batch average distance : 74.8125

 Batch average distance : 69.59375

 Batch average distance : 77.578125

 Batch average distance : 76.375

 Batch average distance : 72.25

 Batch average distance : 67.859375

 Batch average distance : 79.046875

 Batch average distance : 75.34375

 Batch average distance : 73.828125

Batch: 351	Training loss: 0.8178
Batch: 351	Training perplexity: 2.2655
Batch: 376	Training loss: 0.9446
Batch: 376	Training perplexity: 2.5717
****** Start validation ******

 Batch average distance : 77.25

 Batch average distance : 65.4375

 Batch average distance : 68.53125

 Batch average distance : 72.25

 Batch average distance : 70.859375

 Batch average distance : 67.125

 Batch average distance : 71.34375

 Batch average distance : 68.3125

 Batch average distance : 73.828125

 Batch average distance : 67.515625

 Batch average distance : 66.390625

 Batch average distance : 70.59375

 Batch average distance : 65.375

 Batch average distance : 69.0

 Batch average distance : 70.140625

 Batch average distance : 72.296875

 Batch average distance : 62.46875

 Batch average distance : 71.11111111111111

 Epoch average distance : 69.36528028933093
*** Training Epoch 10 with teacher forcing 0.3 :***
Batch: 1	Training loss: 1.0743
Batch: 1	Training perplexity: 2.9279
Batch: 26	Tra


 Batch average distance : 56.75

 Batch average distance : 61.515625

 Batch average distance : 55.796875

 Batch average distance : 67.03125

 Batch average distance : 61.0625

 Batch average distance : 56.28125

 Batch average distance : 55.40625

 Batch average distance : 59.46875

 Batch average distance : 60.55555555555556

 Epoch average distance : 58.91048824593128
*** Training Epoch 14 with teacher forcing 0.3 :***
Batch: 1	Training loss: 0.8742
Batch: 1	Training perplexity: 2.3969
Batch: 26	Training loss: 0.8549
Batch: 26	Training perplexity: 2.3512
Batch: 51	Training loss: 0.8284
Batch: 51	Training perplexity: 2.2897
Batch: 76	Training loss: 0.6976
Batch: 76	Training perplexity: 2.0089
Batch: 101	Training loss: 0.8740
Batch: 101	Training perplexity: 2.3965
Batch: 126	Training loss: 0.7294
Batch: 126	Training perplexity: 2.0739
Batch: 151	Training loss: 0.6069
Batch: 151	Training perplexity: 1.8347
Batch: 176	Training loss: 0.6978
Batch: 176	Training perplexity: 2.0093
Batch:

Batch: 51	Training loss: 0.5513
Batch: 51	Training perplexity: 1.7355
Batch: 76	Training loss: 0.5269
Batch: 76	Training perplexity: 1.6937
Batch: 101	Training loss: 0.4243
Batch: 101	Training perplexity: 1.5285
Batch: 126	Training loss: 0.5311
Batch: 126	Training perplexity: 1.7009
Batch: 151	Training loss: 0.4519
Batch: 151	Training perplexity: 1.5713
Batch: 176	Training loss: 0.4324
Batch: 176	Training perplexity: 1.5409
Batch: 201	Training loss: 0.5739
Batch: 201	Training perplexity: 1.7752
Batch: 226	Training loss: 0.4738
Batch: 226	Training perplexity: 1.6060
Batch: 251	Training loss: 0.5079
Batch: 251	Training perplexity: 1.6618
Batch: 276	Training loss: 0.4104
Batch: 276	Training perplexity: 1.5074
Batch: 301	Training loss: 0.4564
Batch: 301	Training perplexity: 1.5783
Batch: 326	Training loss: 0.5161
Batch: 326	Training perplexity: 1.6755
Batch: 351	Training loss: 0.4272
Batch: 351	Training perplexity: 1.5330
Batch: 376	Training loss: 0.5888
Batch: 376	Training perplexity: 1.8

Batch: 226	Training loss: 0.3945
Batch: 226	Training perplexity: 1.4836
Batch: 251	Training loss: 0.4524
Batch: 251	Training perplexity: 1.5721
Batch: 276	Training loss: 0.3727
Batch: 276	Training perplexity: 1.4517
Batch: 301	Training loss: 0.4107
Batch: 301	Training perplexity: 1.5078
Batch: 326	Training loss: 0.4042
Batch: 326	Training perplexity: 1.4981
Batch: 351	Training loss: 0.4985
Batch: 351	Training perplexity: 1.6462
Batch: 376	Training loss: 0.4475
Batch: 376	Training perplexity: 1.5644
****** Start validation ******

 Batch average distance : 18.609375

 Batch average distance : 20.171875

 Batch average distance : 19.875

 Batch average distance : 18.109375

 Batch average distance : 23.515625

 Batch average distance : 20.15625

 Batch average distance : 17.390625

 Batch average distance : 21.859375

 Batch average distance : 22.65625

 Batch average distance : 22.109375

 Batch average distance : 20.046875

 Batch average distance : 25.875

 Batch average distance : 20

****** Start validation ******

 Batch average distance : 18.390625

 Batch average distance : 19.015625

 Batch average distance : 18.546875

 Batch average distance : 17.046875

 Batch average distance : 20.375

 Batch average distance : 17.265625

 Batch average distance : 14.671875

 Batch average distance : 19.890625

 Batch average distance : 19.03125

 Batch average distance : 17.265625

 Batch average distance : 16.8125

 Batch average distance : 15.8125

 Batch average distance : 15.4375

 Batch average distance : 17.375

 Batch average distance : 16.859375

 Batch average distance : 17.609375

 Batch average distance : 13.53125

 Batch average distance : 14.277777777777779

 Epoch average distance : 17.299276672694393
*** Training Epoch 27 with teacher forcing 0.5 :***
Batch: 1	Training loss: 0.3120
Batch: 1	Training perplexity: 1.3662
Batch: 26	Training loss: 0.2819
Batch: 26	Training perplexity: 1.3257
Batch: 51	Training loss: 0.4038
Batch: 51	Training perplexity: 1.4976
Ba


 Batch average distance : 14.859375

 Batch average distance : 15.15625

 Batch average distance : 18.484375

 Batch average distance : 13.8125

 Batch average distance : 11.555555555555555

 Epoch average distance : 15.742314647377938
*** Training Epoch 31 with teacher forcing 0.5 :***
Batch: 1	Training loss: 0.2080
Batch: 1	Training perplexity: 1.2312
Batch: 26	Training loss: 0.2018
Batch: 26	Training perplexity: 1.2236
Batch: 51	Training loss: 0.2088
Batch: 51	Training perplexity: 1.2322
Batch: 76	Training loss: 0.2185
Batch: 76	Training perplexity: 1.2442
Batch: 101	Training loss: 0.1883
Batch: 101	Training perplexity: 1.2072
Batch: 126	Training loss: 0.2183
Batch: 126	Training perplexity: 1.2439
Batch: 151	Training loss: 0.1951
Batch: 151	Training perplexity: 1.2154
Batch: 176	Training loss: 0.2463
Batch: 176	Training perplexity: 1.2793
Batch: 201	Training loss: 0.2224
Batch: 201	Training perplexity: 1.2491
Batch: 226	Training loss: 0.2598
Batch: 226	Training perplexity: 1.2967
B

Batch: 76	Training loss: 0.1691
Batch: 76	Training perplexity: 1.1842
Batch: 101	Training loss: 0.2214
Batch: 101	Training perplexity: 1.2479
Batch: 126	Training loss: 0.1868
Batch: 126	Training perplexity: 1.2054
Batch: 151	Training loss: 0.2710
Batch: 151	Training perplexity: 1.3113
Batch: 176	Training loss: 0.2076
Batch: 176	Training perplexity: 1.2307
Batch: 201	Training loss: 0.1725
Batch: 201	Training perplexity: 1.1882
Batch: 226	Training loss: 0.2745
Batch: 226	Training perplexity: 1.3159
Batch: 251	Training loss: 0.2144
Batch: 251	Training perplexity: 1.2391
Batch: 276	Training loss: 0.3005
Batch: 276	Training perplexity: 1.3505
Batch: 301	Training loss: 0.2443
Batch: 301	Training perplexity: 1.2768
Batch: 326	Training loss: 0.2388
Batch: 326	Training perplexity: 1.2697
Batch: 351	Training loss: 0.2491
Batch: 351	Training perplexity: 1.2828
Batch: 376	Training loss: 0.1661
Batch: 376	Training perplexity: 1.1807
****** Start validation ******

 Batch average distance : 14.3125


Batch: 251	Training loss: 0.2185
Batch: 251	Training perplexity: 1.2443
Batch: 276	Training loss: 0.1975
Batch: 276	Training perplexity: 1.2183
Batch: 301	Training loss: 0.1856
Batch: 301	Training perplexity: 1.2039
Batch: 326	Training loss: 0.1806
Batch: 326	Training perplexity: 1.1979
Batch: 351	Training loss: 0.1586
Batch: 351	Training perplexity: 1.1719
Batch: 376	Training loss: 0.1643
Batch: 376	Training perplexity: 1.1786
****** Start validation ******

 Batch average distance : 17.65625

 Batch average distance : 13.578125

 Batch average distance : 14.125

 Batch average distance : 13.46875

 Batch average distance : 15.96875

 Batch average distance : 14.484375

 Batch average distance : 13.75

 Batch average distance : 15.1875

 Batch average distance : 13.203125

 Batch average distance : 11.953125

 Batch average distance : 15.515625

 Batch average distance : 13.421875

 Batch average distance : 12.953125

 Batch average distance : 12.65625

 Batch average distance : 14.26


 Batch average distance : 11.640625

 Batch average distance : 13.4375

 Batch average distance : 12.1875

 Batch average distance : 15.03125

 Batch average distance : 12.5

 Batch average distance : 13.6875

 Batch average distance : 11.796875

 Batch average distance : 12.25

 Batch average distance : 14.25

 Batch average distance : 12.375

 Batch average distance : 13.6875

 Batch average distance : 13.765625

 Batch average distance : 14.390625

 Batch average distance : 14.765625

 Batch average distance : 11.109375

 Batch average distance : 14.84375

 Batch average distance : 14.055555555555555

 Epoch average distance : 13.217902350813743
*** Training Epoch 44 with teacher forcing 0.5 :***
Batch: 1	Training loss: 0.1273
Batch: 1	Training perplexity: 1.1357
Batch: 26	Training loss: 0.1599
Batch: 26	Training perplexity: 1.1733
Batch: 51	Training loss: 0.1332
Batch: 51	Training perplexity: 1.1425
Batch: 76	Training loss: 0.1204
Batch: 76	Training perplexity: 1.1280
Batch: 101	T


 Batch average distance : 16.6875

 Batch average distance : 10.53125

 Batch average distance : 12.333333333333334

 Epoch average distance : 13.181735985533454
*** Training Epoch 48 with teacher forcing 0.5 :***
Batch: 1	Training loss: 0.1184
Batch: 1	Training perplexity: 1.1257
Batch: 26	Training loss: 0.0958
Batch: 26	Training perplexity: 1.1005
Batch: 51	Training loss: 0.1036
Batch: 51	Training perplexity: 1.1092
Batch: 76	Training loss: 0.1009
Batch: 76	Training perplexity: 1.1061
Batch: 101	Training loss: 0.1055
Batch: 101	Training perplexity: 1.1113
Batch: 126	Training loss: 0.1181
Batch: 126	Training perplexity: 1.1253
Batch: 151	Training loss: 0.1486
Batch: 151	Training perplexity: 1.1602
Batch: 176	Training loss: 0.1666
Batch: 176	Training perplexity: 1.1813
Batch: 201	Training loss: 0.1535
Batch: 201	Training perplexity: 1.1659
Batch: 226	Training loss: 0.1236
Batch: 226	Training perplexity: 1.1316
Batch: 251	Training loss: 0.1265
Batch: 251	Training perplexity: 1.1348
Bat

Batch: 101	Training loss: 0.1012
Batch: 101	Training perplexity: 1.1065
Batch: 126	Training loss: 0.1134
Batch: 126	Training perplexity: 1.1200
Batch: 151	Training loss: 0.0834
Batch: 151	Training perplexity: 1.0870
Batch: 176	Training loss: 0.1087
Batch: 176	Training perplexity: 1.1148
Batch: 201	Training loss: 0.1531
Batch: 201	Training perplexity: 1.1654
Batch: 226	Training loss: 0.0876
Batch: 226	Training perplexity: 1.0915
Batch: 251	Training loss: 0.0945
Batch: 251	Training perplexity: 1.0991
Batch: 276	Training loss: 0.0865
Batch: 276	Training perplexity: 1.0903
Batch: 301	Training loss: 0.1351
Batch: 301	Training perplexity: 1.1446
Batch: 326	Training loss: 0.1294
Batch: 326	Training perplexity: 1.1381
Batch: 351	Training loss: 0.1062
Batch: 351	Training perplexity: 1.1121
Batch: 376	Training loss: 0.1394
Batch: 376	Training perplexity: 1.1496
****** Start validation ******

 Batch average distance : 12.5625

 Batch average distance : 13.328125

 Batch average distance : 14.140

KeyboardInterrupt: 

In [94]:
numEpochs = 1
learningRate = 3e-5
optimizer = torch.optim.Adam(model.parameters(), lr=learningRate)
train(model, train_loader, numEpochs, criterion, optimizer, one_epoch=True)

*** Training Epoch 1 with teacher forcing 0.5 :***




Batch: 1	Training loss: 0.0481
Batch: 1	Training perplexity: 1.0493
Batch: 50	Training loss: 0.0773
Batch: 50	Training perplexity: 1.0804
Batch: 99	Training loss: 0.0794
Batch: 99	Training perplexity: 1.0827
Batch: 148	Training loss: 0.0834
Batch: 148	Training perplexity: 1.0869
Batch: 197	Training loss: 0.0718
Batch: 197	Training perplexity: 1.0745
Batch: 246	Training loss: 0.0990
Batch: 246	Training perplexity: 1.1041
Batch: 295	Training loss: 0.0674
Batch: 295	Training perplexity: 1.0697
Batch: 344	Training loss: 0.0664
Batch: 344	Training perplexity: 1.0687
****** Start validation ******

 Epoch average distance : 12.021699819168173


In [97]:
numEpochs = 1
learningRate = 3e-5
optimizer = torch.optim.Adam(model.parameters(), lr=learningRate)
train(model, train_loader, numEpochs, criterion, optimizer, one_epoch=True)

*** Training Epoch 1 with teacher forcing 0.5 :***




Batch: 1	Training loss: 0.0433
Batch: 1	Training perplexity: 1.0442
Batch: 50	Training loss: 0.0532
Batch: 50	Training perplexity: 1.0547
Batch: 99	Training loss: 0.0604
Batch: 99	Training perplexity: 1.0622
Batch: 148	Training loss: 0.0683
Batch: 148	Training perplexity: 1.0707
Batch: 197	Training loss: 0.0525
Batch: 197	Training perplexity: 1.0539
Batch: 246	Training loss: 0.0688
Batch: 246	Training perplexity: 1.0712
Batch: 295	Training loss: 0.0651
Batch: 295	Training perplexity: 1.0672
Batch: 344	Training loss: 0.0719
Batch: 344	Training perplexity: 1.0746
****** Start validation ******

 Epoch average distance : 12.105786618444846


In [100]:
numEpochs = 1
learningRate = 3e-5
optimizer = torch.optim.Adam(model.parameters(), lr=learningRate)
train(model, train_loader, numEpochs, criterion, optimizer, one_epoch=True)

*** Training Epoch 1 with teacher forcing 0.6 :***




Batch: 49	Training loss: 0.1011
Batch: 49	Training perplexity: 1.1064
Batch: 99	Training loss: 0.0613
Batch: 99	Training perplexity: 1.0632
Batch: 149	Training loss: 0.1271
Batch: 149	Training perplexity: 1.1355
Batch: 199	Training loss: 0.0808
Batch: 199	Training perplexity: 1.0841
Batch: 249	Training loss: 0.0652
Batch: 249	Training perplexity: 1.0674
Batch: 299	Training loss: 0.0687
Batch: 299	Training perplexity: 1.0711
Batch: 349	Training loss: 0.0610
Batch: 349	Training perplexity: 1.0629
****** Start validation ******

 Epoch average distance : 12.12748643761302


In [104]:
numEpochs = 10
learningRate = 3e-5
optimizer = torch.optim.Adam(model.parameters(), lr=learningRate)
train(model, train_loader, numEpochs, criterion, optimizer, one_epoch=True)

*** Training Epoch 1 with teacher forcing 0.6 :***




Batch: 50	Training loss: 0.0643
Batch: 50	Training perplexity: 1.0664
Batch: 100	Training loss: 0.0581
Batch: 100	Training perplexity: 1.0598
Batch: 150	Training loss: 0.0754
Batch: 150	Training perplexity: 1.0783
Batch: 200	Training loss: 0.0752
Batch: 200	Training perplexity: 1.0781
Batch: 250	Training loss: 0.0633
Batch: 250	Training perplexity: 1.0654
Batch: 300	Training loss: 0.0776
Batch: 300	Training perplexity: 1.0806
Batch: 350	Training loss: 0.1298
Batch: 350	Training perplexity: 1.1386
****** Start validation ******

 Epoch average distance : 12.269439421338156
*** Training Epoch 2 with teacher forcing 0.6 :***
Batch: 50	Training loss: 0.0575
Batch: 50	Training perplexity: 1.0592
Batch: 100	Training loss: 0.0774
Batch: 100	Training perplexity: 1.0805
Batch: 150	Training loss: 0.0596
Batch: 150	Training perplexity: 1.0615
Batch: 200	Training loss: 0.0490
Batch: 200	Training perplexity: 1.0502
Batch: 250	Training loss: 0.0626
Batch: 250	Training perplexity: 1.0646
Batch: 300	T

# ***Moving Forward....***

We have provided a skeleton to begin with, so that you have a clear picture when writing the code. Apart from the methods given here following methods should be implemented.
  
*    Validation and test methods.
*    Methods to convert indexes to characters
*    Methods for calculating the perplexity/Levenstine distance for gauging the training routine.
*    For visualizing the gradient flow (refer to FAQs) and attention graph (refer to Recitation-9) methods are already given.

In [77]:
torch.save(model, 'hw4_epoch53.pth')

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [105]:
# test

generated_sentence = []

def predict(model, test_loader):
    
    teacher_forcing_para=None
  
    with torch.no_grad():
        model.eval()
        model.to(device)
        
        print('****** Testing ******')

        for (batch_num, collate_output) in enumerate(test_loader):

            speech_input, speech_len = collate_output
            speech_input = speech_input.to(device)
            predictions = model(teacher_forcing_para, speech_input, speech_len ,text_input=None, train=False)
          
            for i in range(len(speech_len)):
                pred = torch.argmax(predictions[i], dim=1).cpu().numpy()
                pred_word = index_to_char(pred)
                generated_sentence.append(pred_word)
        assert(len(generated_sentence) == 523)
                
predict(model, test_loader)

****** Testing ******


In [106]:
import pandas as pd
result = pd.DataFrame()
result['Id'] = list(range(523))
result['Predicted'] = generated_sentence
result.to_csv('zichenli_hw4p2_4.csv', encoding='utf-8', index=False)