In [1]:
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import *

import numpy as np
import time

import shakespeare_data as sh

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
DEVICE

'cuda'

In [2]:
# Single-character symbol for every class index used by this notebook.
# Index 0 is the blank symbol; the trailing comment on each entry is the phoneme name.
# UtterancesDataset adds 1 to every raw label - presumably so that raw label k lines up
# with PHONEME_MAP[k + 1] and index 0 stays reserved for the blank (TODO confirm).
PHONEME_MAP = [
    ' ',  # "+BLANK+"
    '_',  # "+BREATH+"
    '+',  # "+COUGH+"
    '~',  # "+NOISE+"
    '!',  # "+SMACK+"
    '-',  # "+UH+"
    '@',  # "+UM+"
    'a',  # "AA"
    'A',  # "AE"
    'h',  # "AH"
    'o',  # "AO"
    'w',  # "AW"
    'y',  # "AY"
    'b',  # "B"
    'c',  # "CH"
    'd',  # "D"
    'D',  # "DH"
    'e',  # "EH"
    'r',  # "ER"
    'E',  # "EY"
    'f',  # "F"
    'g',  # "G"
    'H',  # "HH"
    'i',  # "IH"
    'I',  # "IY"
    'j',  # "JH"
    'k',  # "K"
    'l',  # "L"
    'm',  # "M"
    'n',  # "N"
    'G',  # "NG"
    'O',  # "OW"
    'Y',  # "OY"
    'p',  # "P"
    'R',  # "R"
    's',  # "S"
    'S',  # "SH"
    '.',  # "SIL"
    't',  # "T"
    'T',  # "TH"
    'u',  # "UH"
    'U',  # "UW"
    'v',  # "V"
    'W',  # "W"
    '?',  # "Y"
    'z',  # "Z"
    'Z',  # "ZH"
]

In [6]:
# File names of the feature / label arrays (NumPy .npy files).
path_train = 'wsj0_train.npy'
path_train_lables = 'wsj0_train_merged_labels.npy'
path_valid = 'wsj0_dev.npy'
path_valid_lables = 'wsj0_dev_merged_labels.npy'
path_test = 'wsj0_test.npy'


def _load_npy(path):
    """Load one .npy file with the pickle / bytes settings used throughout this notebook."""
    # NOTE(review): allow_pickle=True can execute code embedded in the file;
    # only point this at trusted data.
    return np.load(path, allow_pickle=True, encoding='bytes')


# Same load order as before: train features, train labels, dev features, dev labels, test features.
train_data = _load_npy(path_train)
train_labels = _load_npy(path_train_lables)
valid_data = _load_npy(path_valid)
valid_labels = _load_npy(path_valid_lables)
test = _load_npy(path_test)

class UtterancesDataset(Dataset):
    """Pairs each utterance with its label sequence.

    Indexing returns ``(utterance, labels + 1)``: every label is shifted up by
    one, which leaves index 0 unused by real labels.
    """

    def __init__(self,utterances, phonemes):
        # Parallel containers: utterances[i] belongs with phonemes[i].
        self.utterances, self.phonemes = utterances, phonemes

    def __len__(self):
        # The number of utterances defines the dataset size.
        return len(self.utterances)

    def __getitem__(self,index):
        features = self.utterances[index]
        # `+ 1` builds a new shifted object; the stored labels are left untouched.
        shifted_labels = self.phonemes[index] + 1
        return features, shifted_labels

def collate_lines(seq_list):
    """Collate ``(utterance, labels)`` pairs into padded batch tensors on DEVICE.

    Returns a 4-tuple:
        padded inputs  - padded with pad_sequence's default layout (sequence axis first),
        padded targets - padded with batch_first=True,
        input lengths  - LongTensor of the unpadded utterance lengths,
        target lengths - LongTensor of the unpadded label-sequence lengths.
    """
    utterances, labels = zip(*seq_list)

    # Record the true lengths before padding hides them.
    input_lens = torch.LongTensor([len(utt) for utt in utterances])
    target_lens = torch.LongTensor([len(lab) for lab in labels])

    padded_inputs = pad_sequence([torch.tensor(utt) for utt in utterances])
    padded_targets = pad_sequence([torch.tensor(lab) for lab in labels], batch_first=True)

    return (
        padded_inputs.to(DEVICE),
        padded_targets.to(DEVICE),
        input_lens.to(DEVICE),
        target_lens.to(DEVICE),
    )
    


In [5]:
# Inspect the stored label sequence of the first validation utterance
# (raw values; UtterancesDataset returns them shifted by +1).
valid_labels[0]

array([36, 15,  8, 19, 23, 27, 18, 26, 32, 33,  8, 14, 40, 34, 22, 44,  8,
       26, 22, 37, 17,  8, 41, 37, 40, 37, 22, 19,  9, 33, 43,  8, 29, 22,
       28, 28, 30, 41, 16, 27, 12, 17,  7, 28, 14, 14, 22, 34, 16, 27, 12,
       17,  0, 36])

In [7]:
# Training batches are reshuffled on every pass over the loader.
train_dataset = UtterancesDataset(train_data, train_labels)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=50, collate_fn=collate_lines)

# Evaluation data gains nothing from shuffling; a fixed order keeps the batch
# composition (and therefore the averaged per-batch loss) reproducible between runs.
valid_dataset = UtterancesDataset(valid_data, valid_labels)
valid_loader = DataLoader(valid_dataset, shuffle=False, batch_size=10, collate_fn=collate_lines)


In [9]:
# Smoke test: pull a single batch from the training loader and print its padded shapes.
i = 0
for inputs, targets, input_lengths, target_lengths in train_loader:
    for batch_tensor in (inputs, targets):
        print(batch_tensor.size())
    i += 1
    break  # one batch is enough
        

torch.Size([1519, 50, 40])
torch.Size([50, 148])


In [22]:
class Model(nn.Module):
    """Bidirectional LSTM followed by two linear layers, emitting per-frame log-probabilities.

    Args:
        out_phonems: number of output classes (width of the final linear layer).
        input_vocab_length: size of each input frame's feature vector (LSTM ``input_size``).
        hidden_size: hidden units per LSTM direction.
        nlayers: number of stacked LSTM layers.
    """

    def __init__(self,out_phonems, input_vocab_length, hidden_size, nlayers):
        super().__init__()
        # Attribute names double as state_dict keys, so saved checkpoints depend on them.
        self.lstm = nn.LSTM(
            input_size=input_vocab_length,
            hidden_size=hidden_size,
            num_layers=nlayers,
            bidirectional=True,
        )
        # Forward and backward hidden states are concatenated, hence hidden_size * 2.
        self.output1 = nn.Linear(hidden_size * 2, 150)
        self.output2 = nn.Linear(150, out_phonems)

    def forward(self, inputs, input_lens):
        """Run the network on a padded batch.

        Args:
            inputs: padded batch with the sequence axis first (the LSTM is built
                without ``batch_first``).
            input_lens: unpadded length of every sequence; a list or a tensor on
                any device.

        Returns:
            ``(out, out_lens)``: log-softmax scores over the last axis for every
            padded frame, and the per-sequence lengths reported by
            ``pad_packed_sequence``.
        """
        # pack_padded_sequence requires the lengths on the CPU; the collate
        # functions in this notebook hand them over on DEVICE, so move them back.
        if isinstance(input_lens, torch.Tensor):
            input_lens = input_lens.cpu()

        # enforce_sorted=False: batches are not sorted by length beforehand.
        packed_input = rnn.pack_padded_sequence(inputs, input_lens, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output_padded, out_lens = rnn.pad_packed_sequence(packed_output)

        hidden = self.output1(output_padded)
        out = self.output2(hidden).log_softmax(2)
        return out, out_lens

In [23]:
import phoneme_list as pl

# Number of output classes: one per PHONEME_MAP entry (the blank at index 0 included).
phoneme_length = len(PHONEME_MAP)
# Size of axis 1 of the first training utterance; passed to Model as the LSTM input size
# (presumably the per-frame feature dimension - TODO confirm).
input_vocab_length = train_data[0].shape[1]


In [24]:
# Network: 4 stacked bidirectional LSTM layers with 500 hidden units per direction.
# nn.Module.to returns the module itself, so construction and placement can be chained.
model = Model(
    out_phonems=phoneme_length,
    input_vocab_length=input_vocab_length,
    hidden_size=500,
    nlayers=4,
).to(DEVICE)

learningRate = 1e-3

# Adam with a small weight decay on all model parameters.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learningRate,
    weight_decay=1e-6,
)

In [25]:
def train_epoch_packed(model, optimizer, train_loader, val_loader, learning_Rate):
    """Train for one epoch with CTC loss, then report the loss on ``val_loader``.

    Args:
        model: module called as ``model(inputs, input_lengths)`` and returning
            ``(log_probs, output_lengths)``.
        optimizer: torch optimizer over ``model``'s parameters.
        train_loader: sized iterable of
            ``(inputs, targets, input_lengths, target_lengths)`` batches.
        val_loader: sized iterable with the same batch layout; evaluated only.
        learning_Rate: learning rate applied to every optimizer parameter group
            for this epoch.

    Returns:
        float: mean of the per-batch training losses (0.0 for an empty loader).
    """
    # Default reduction ('mean'). The module has no parameters or buffers, so it
    # needs no device placement.
    criterion = nn.CTCLoss()

    # Optimizers read the rate from param_groups; setting an arbitrary attribute
    # on the optimizer object (as this function used to) changes nothing.
    for group in optimizer.param_groups:
        group['lr'] = learning_Rate

    model.train()
    print("Training", len(train_loader), "number of batches")

    training_loss = 0.0
    for inputs, targets, input_lengths, target_lengths in train_loader:
        optimizer.zero_grad()
        outputs, output_lengths = model(inputs, input_lengths)
        loss = criterion(outputs, targets, output_lengths, target_lengths)
        loss.backward()
        optimizer.step()
        training_loss += loss.item()

    # max(..., 1) avoids a ZeroDivisionError when a loader yields no batches.
    Avg_training_loss = training_loss / max(len(train_loader), 1)
    print("Avg Training loss",Avg_training_loss)

    # Evaluation pass: eval mode, and no autograd graph (saves memory and time).
    optimizer.zero_grad()
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, targets, input_lengths, target_lengths in val_loader:
            outputs, output_lengths = model(inputs, input_lengths)
            val_loss += criterion(outputs, targets, output_lengths, target_lengths).item()
    # Leave the model in training mode, as it was when this function used to return.
    model.train()

    val_lptf = val_loss / max(len(val_loader), 1)
    print("\nValidation Loss", val_lptf)

    return Avg_training_loss

In [26]:
# Epochs 0-14: start from the base learning rate and multiply it by 0.95 after every epoch.
learrate = learningRate
for epoch in range(15):
    model.zero_grad()
    print("\nEpoch:",epoch)
    # Score the held-out validation loader; passing train_loader twice (as before)
    # made the reported "validation" loss a second pass over the training data.
    avg_loss = train_epoch_packed(model, optimizer, train_loader, valid_loader, learrate)
    learrate = learrate*0.95

    # One checkpoint per epoch so training can be resumed or an earlier epoch restored.
    torch.save({
                'epoch': epoch,
                'model_state_dict':model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': avg_loss,
                }, "model_params_new" + str(epoch) + ".tar")


Epoch: 0
Training 495 number of batches


KeyboardInterrupt: 

In [27]:
# Epochs 15-19: continue training, multiplying the learning rate by 0.90 after every epoch.
for epoch in range(15, 20):
    model.zero_grad()
    print("\nEpoch:",epoch)
    # Score the held-out validation loader; passing train_loader twice (as before)
    # made the reported "validation" loss a second pass over the training data.
    avg_loss = train_epoch_packed(model, optimizer, train_loader, valid_loader, learrate)
    learrate = learrate*0.90

    # One checkpoint per epoch so training can be resumed or an earlier epoch restored.
    torch.save({
                'epoch': epoch,
                'model_state_dict':model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': avg_loss,
                }, "model_params_new" + str(epoch) + ".tar")
                
    


Epoch: 15
Training 495 number of batches


KeyboardInterrupt: 

In [None]:
# Epochs 20-24: continue training, multiplying the learning rate by 0.85 after every epoch.
for epoch in range(20, 25):
    model.zero_grad()
    print("\nEpoch:",epoch)
    # Score the held-out validation loader; passing train_loader twice (as before)
    # made the reported "validation" loss a second pass over the training data.
    avg_loss = train_epoch_packed(model, optimizer, train_loader, valid_loader, learrate)
    learrate = learrate*0.85

    # One checkpoint per epoch so training can be resumed or an earlier epoch restored.
    torch.save({
                'epoch': epoch,
                'model_state_dict':model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': avg_loss,
                }, "model_params_new" + str(epoch) + ".tar")
    

In [None]:
# Writes checkpoints for epochs 25-27 without running any training in this cell.
# The later checkpoint-loading cell reads "model_params_new27.tar", which this loop produces.
for e in range(3):
    
    epoch = e + 25
    
    model.zero_grad()
    print("\nEpoch:",epoch)
    # NOTE(review): the training call below is commented out, so the model is not updated
    # here and `avg_loss` still holds the value from the last epoch that actually ran.
    # Confirm whether these three epochs were meant to train.
    #avg_loss = train_epoch_packed(model, optimizer, train_loader, train_loader, learrate)
    # NOTE(review): 0.05 is a far steeper decay than the 0.95 / 0.90 / 0.85 factors used
    # by the earlier training cells - confirm it is not a typo.
    learrate = learrate*0.05
    
    torch.save({
                'epoch': epoch,
                'model_state_dict':model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': avg_loss,
                }, "model_params_new" + str(epoch) + ".tar")
    

In [None]:
# Restore the weights and optimizer state saved for epoch 27.
# map_location=DEVICE lets a checkpoint written on a GPU machine load on a CPU-only one too.
checkpoint = torch.load("model_params_new27.tar", map_location=DEVICE)
# Use the notebook-wide DEVICE instead of model.cuda(), which raises when CUDA is unavailable.
model.to(DEVICE)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Fetch the ctcdecode sources and install the package from the checkout
# (CTCBeamDecoder is imported from it in a later cell).
# --recursive also pulls the repository's git submodules.
!git clone --recursive https://github.com/parlance/ctcdecode.git
%cd ctcdecode
# NOTE(review): wget is presumably needed by ctcdecode's build script - confirm.
!pip3 install wget
!pip3 install .
%cd ..

In [None]:
# Test-set utterances for inference.
# NOTE(review): the same file (path_test) is already loaded into `test` in the data-loading cell.
test_data = np.load(path_test, allow_pickle = True, encoding='bytes')

In [None]:
class Test_UtterancesDataset(Dataset):
    """Label-free dataset that serves one utterance per index, for inference."""

    def __init__(self,utterances):
        self.utterances = utterances

    def __len__(self):
        # Dataset size equals the number of stored utterances.
        return len(self.utterances)

    def __getitem__(self,index):
        # The stored item is returned as-is; padding happens in the collate function.
        return self.utterances[index]

def collate_lines_test(seq_list):
    """Collate unlabeled utterances into ``(padded inputs, lengths)``, both on DEVICE.

    Inputs are padded with pad_sequence's default layout (sequence axis first);
    lengths are the unpadded utterance lengths as a LongTensor.
    """
    # Record the true lengths before padding hides them.
    lengths = torch.LongTensor([len(utt) for utt in seq_list])
    padded = pad_sequence([torch.tensor(utt) for utt in seq_list])
    return padded.to(DEVICE), lengths.to(DEVICE)
    
# Wrap the test utterances for batched inference.
test_dataset = Test_UtterancesDataset(test_data)

# shuffle=False: batches follow the dataset order, so results collected batch by batch stay in that order.
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=20, collate_fn = collate_lines_test)

In [None]:
import torch
from ctcdecode import CTCBeamDecoder

# Beam-search CTC decoder. The label strings are placeholders: the decoding cell uses the
# returned integer indices and maps them through PHONEME_MAP itself.
# log_probs_input=True matches the model, which ends in log_softmax.
# NOTE(review): assumes the decoder's default blank index is 0, matching PHONEME_MAP[0] - confirm.
decoder = CTCBeamDecoder(['$'] * phoneme_length, beam_width=1000, log_probs_input=True)

In [None]:
# Decode every test batch and collect one phoneme string per utterance, in loader order.
out_phonemes = []

# Kept under its original name, but note this is the number of batches, not a batch size.
batch_size = len(test_loader)

print("No of batches", batch_size)

# Inference only: make sure any mode-dependent layers run in evaluation mode.
model.eval()

for b, (inputs, input_lengths) in enumerate(test_loader):
    print("Batch: ", b)

    with torch.no_grad():
        out, out_lens = model(inputs, input_lengths)

    # The model output has the sequence axis first; the decoder is given it batch-first.
    test_Y, _, _, test_Y_lens = decoder.decode(out.transpose(0,1), out_lens)

    for utt in range(test_Y.shape[0]):
        # Beam 0 is treated as the best hypothesis; its valid length comes from test_Y_lens.
        best_seq = test_Y[utt, 0, :test_Y_lens[utt, 0]]
        # A separate name for the class index avoids shadowing the utterance index.
        best_pron = ''.join(PHONEME_MAP[int(idx)] for idx in best_seq)
        out_phonemes.append(best_pron)


In [None]:
# One decoded string per test utterance, in the order they were appended.
out_np = np.array(out_phonemes)

In [None]:
# Write the decoded strings to output.csv using the '%s' format (no header or id column is added here).
np.savetxt('output.csv', out_np, delimiter=',', fmt='%s') 