In [1]:
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import *

import numpy as np
import time

import shakespeare_data as sh

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE

'cuda'

In [2]:
PHONEME_MAP = [
    ' ',  # "+BLANK+"
    '_',  # "+BREATH+"
    '+',  # "+COUGH+"
    '~',  # "+NOISE+"
    '!',  # "+SMACK+"
    '-',  # "+UH+"
    '@',  # "+UM+"
    'a',  # "AA"
    'A',  # "AE"
    'h',  # "AH"
    'o',  # "AO"
    'w',  # "AW"
    'y',  # "AY"
    'b',  # "B"
    'c',  # "CH"
    'd',  # "D"
    'D',  # "DH"
    'e',  # "EH"
    'r',  # "ER"
    'E',  # "EY"
    'f',  # "F"
    'g',  # "G"
    'H',  # "HH"
    'i',  # "IH"
    'I',  # "IY"
    'j',  # "JH"
    'k',  # "K"
    'l',  # "L"
    'm',  # "M"
    'n',  # "N"
    'G',  # "NG"
    'O',  # "OW"
    'Y',  # "OY"
    'p',  # "P"
    'R',  # "R"
    's',  # "S"
    'S',  # "SH"
    '.',  # "SIL"
    't',  # "T"
    'T',  # "TH"
    'u',  # "UH"
    'U',  # "UW"
    'v',  # "V"
    'W',  # "W"
    '?',  # "Y"
    'z',  # "Z"
    'Z',  # "ZH"
]

In [3]:
path_train = 'wsj0_train.npy'
path_train_lables = 'wsj0_train_merged_labels.npy'
path_valid = 'wsj0_dev.npy'
path_valid_lables = 'wsj0_dev_merged_labels.npy'
path_test = 'wsj0_test.npy'

train_data = np.load(path_train, allow_pickle = True, encoding='bytes')
train_labels = np.load(path_train_lables, allow_pickle = True, encoding= 'bytes')
valid_data = np.load(path_valid, allow_pickle = True, encoding='bytes')
valid_labels = np.load(path_valid_lables, allow_pickle = True, encoding= 'bytes')
test = np.load(path_test, allow_pickle = True, encoding='bytes')

class UtterancesDataset(Dataset):
    def __init__(self,utterances, phonemes):
        self.utterances = utterances
        self.phonemes = phonemes
        
    def __getitem__(self,index):
        utterance = self.utterances[index]
        phoneme = self.phonemes[index] + 1
        return utterance, phoneme
    
    def __len__(self):
        return len(self.utterances)

def collate_lines(seq_list):
    inputs,targets = zip(*seq_list) 
    input_lens = [len(seq) for seq in inputs]
    target_lens = [len(seq) for seq in targets]
    inputs = [torch.tensor(l) for l in inputs]   
    inputs = pad_sequence(inputs)
    targets = [torch.tensor(l) for l in targets]
    targets = pad_sequence(targets,batch_first=True)
    return inputs.to(DEVICE), targets.to(DEVICE), torch.LongTensor(input_lens).to(DEVICE), torch.LongTensor(target_lens).to(DEVICE)
    


In [4]:
train_dataset = UtterancesDataset(train_data, train_labels)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=50, collate_fn = collate_lines)

valid_dataset = UtterancesDataset(valid_data, valid_labels)
valid_loader = DataLoader(valid_dataset, shuffle=True, batch_size=10, collate_fn = collate_lines)


In [5]:
class Model(nn.Module):
    def __init__(self,out_phonems, embed_size, hidden_size, nlayers):
        super(Model, self).__init__()
        self.lstm = nn.LSTM(input_size = embed_size,hidden_size=hidden_size,num_layers=nlayers, bidirectional=True)
        self.output = nn.Linear(hidden_size * 2, out_phonems)
    
    def forward(self, inputs, input_lens):
        packed_input = pack_padded_sequence(inputs, input_lens, enforce_sorted = False)
        output_packed = self.lstm(packed_input)[0]
        output_padded, out_lens = rnn.pad_packed_sequence(output_packed) # unpacked output (padded). Also gives you the lengths
        out = self.output(output_padded).log_softmax(2)
        return out, out_lens

In [6]:
import phoneme_list as pl

phoneme_length = len(PHONEME_MAP)
input_vocab_length = train_data[0].shape[1]

print("phonemen length ", phoneme_length)
print("input_vocab_length ", input_vocab_length)

phonemen length  47
input_vocab_length  40


In [7]:
model = Model(phoneme_length, input_vocab_length, 500, 4)
model.to(DEVICE)

learningRate = 1e-3

optimizer = torch.optim.Adam(model.parameters(), lr=learningRate, weight_decay=1e-6)

In [8]:
def train_epoch_packed(model, optimizer, train_loader, val_loader, learning_Rate):
    criterion = nn.CTCLoss() # sum instead of averaging, to take into account the different lengths
    criterion = criterion.to(DEVICE)
    model.train()
    batch_id=0
    before = time.time()
    print("Training", len(train_loader), "number of batches")
    
    optimizer.learning_rate = learning_Rate
    
    training_loss = 0.0
    ntimeframes = 0
    for inputs,targets,input_lengths,target_lengths in train_loader: # lists, presorted, preloaded on GPU
        batch_id+=1
        optimizer.zero_grad()
        outputs, output_lengths = model(inputs, input_lengths)
        loss = criterion(outputs,targets,output_lengths,target_lengths) # criterion of the concatenated output
        training_loss = training_loss + loss.item()
        
        loss.backward()
        optimizer.step()
        
        ntimeframes += np.sum(np.array([len(l) for l in inputs]))
        
        '''
        if batch_id % 100 == 0:
            print("Avg Training loss",training_loss/100)
            training_loss = 0.0
        '''
        
    Avg_training_loss = training_loss / len(train_loader)
    
    print("Avg Training loss",Avg_training_loss)
    
    '''
    val_loss = 0
    batch_id=0
    ntimeframes = 0
    batch_size = len(val_loader)
    
    optimizer.zero_grad()
    model.zero_grad()
    for inputs,targets,input_lengths,target_lengths in val_loader:
        ntimeframes += np.sum(np.array([len(l) for l in inputs]))
        batch_id+=1
        outputs, output_lengths = model(inputs, input_lengths)
        loss = criterion(outputs,targets,output_lengths,target_lengths)
        val_loss+=loss.item()
        
    val_lptf = val_loss / batch_size
    print("\nValidation Loss", val_lptf)
    '''
    
    
    return Avg_training_loss

In [9]:
learrate = learningRate
for epoch in range(15):
    model.zero_grad()
    print("\nEpoch:",epoch)
    avg_loss = train_epoch_packed(model, optimizer, train_loader, train_loader, learrate)
    learrate = learrate*0.95
    
    torch.save({
                'epoch': epoch,
                'model_state_dict':model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': avg_loss,
                }, "model_params_new" + str(epoch) + ".tar")    
    



Epoch: 0
Training 495 number of batches
Avg Training loss 3.120216463069723

Epoch: 1
Training 495 number of batches
Avg Training loss 1.0763789976486051

Epoch: 2
Training 495 number of batches
Avg Training loss 0.6918405371482926

Epoch: 3
Training 495 number of batches
Avg Training loss 0.5604746231527039

Epoch: 4
Training 495 number of batches
Avg Training loss 0.4747085527338163

Epoch: 5
Training 495 number of batches
Avg Training loss 0.42564266721407573

Epoch: 6
Training 495 number of batches
Avg Training loss 0.38394945649185563

Epoch: 7
Training 495 number of batches
Avg Training loss 0.35110473596688474

Epoch: 8
Training 495 number of batches
Avg Training loss 0.3215599137123185

Epoch: 9
Training 495 number of batches
Avg Training loss 0.2880862435909233

Epoch: 10
Training 495 number of batches
Avg Training loss 0.2635759257005923

Epoch: 11
Training 495 number of batches
Avg Training loss 0.24890248977174662

Epoch: 12
Training 495 number of batches
Avg Training loss

In [10]:
print(learrate)

0.00046329123015975297


In [12]:
for e in range(5):
    
    epoch = e + 15
    
    model.zero_grad()
    print("\nEpoch:",epoch)
    avg_loss = train_epoch_packed(model, optimizer, train_loader, train_loader, learrate)
    learrate = learrate*0.6
    
    torch.save({
                'epoch': epoch,
                'model_state_dict':model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': avg_loss,
                }, "model_params_new" + str(epoch) + ".tar")  


Epoch: 15
Training 495 number of batches
Avg Training loss 0.19142433463624028

Epoch: 16
Training 495 number of batches
Avg Training loss 0.17291257328758336

Epoch: 17
Training 495 number of batches
Avg Training loss 0.16373422216586392

Epoch: 18
Training 495 number of batches
Avg Training loss 0.15360904227603567

Epoch: 19
Training 495 number of batches
Avg Training loss 0.15097569159486077


!git clone --recursive https://github.com/parlance/ctcdecode.git
%cd ctcdecode
!pip3 install wget
!pip3 install .
%cd ..

In [13]:
test_data = np.load(path_test, allow_pickle = True, encoding='bytes')

In [14]:
class Test_UtterancesDataset(Dataset):
    def __init__(self,utterances):
        self.utterances = utterances
        
    def __getitem__(self,index):
        utterance = self.utterances[index]
        return utterance
    
    def __len__(self):
        return len(self.utterances)

def collate_lines_test(seq_list):
    inputs = seq_list
    input_lens = [len(seq) for seq in inputs]
    inputs = [torch.tensor(l) for l in inputs]
    inputs = pad_sequence(inputs)
    return inputs.to(DEVICE), torch.LongTensor(input_lens).to(DEVICE)
    
test_dataset = Test_UtterancesDataset(test_data)

test_loader = DataLoader(test_dataset, shuffle=False, batch_size=1, collate_fn = collate_lines_test)

In [15]:
import torch
from ctcdecode import CTCBeamDecoder

decoder = CTCBeamDecoder(['$'] * phoneme_length, beam_width=1500, log_probs_input=True)

In [16]:
out_phonemes = []

for inputs,input_lengths in test_loader:
    with torch.no_grad():
        out, out_lens = model(inputs, input_lengths)
        
    test_Y, _, _, test_Y_lens = decoder.decode(out.transpose(0,1), out_lens)
    
    best_seq = test_Y[0, 0, :test_Y_lens[0, 0]]
    best_pron = ''.join(PHONEME_MAP[i] for i in best_seq)
    out_phonemes.append(best_pron)


In [17]:
phon_npp = np.array(out_phonemes)

print(phon_npp)

['.tREvOlAstmhnThgRIdinpRinshphltUfelitsinmhndWotrtRAnsprtEShnstIvhdoRiGhndpyptAbRikEShnbiznhshzfrhnhndisklOzhd.'
 '.DhkhmbyndbiznhshzhkwnhdfrtU@HhndrdTrtIfyvmil?hndalrzhvdREvrz+EtHhnrdnytITRImil?hndalrzinnyntInEtIfyvbRevhn?U!.'
 '.inseptembr.DhkhmphnIRisIvdnyntIsikspYntEt!mil?hndalrzezitseRhvdAmijrzfrmhbRIchvkantRAkthWoRd+RidlEtidtUhkOldaRthSip.'
 '.AzmizkAndOlhnmistrlOsIet.swTAfrkhzsentRhlghvrnmhnt.izlykhbiglembriGtAGkk_.'
 '.hWyt.?uRhkRAtsrinDhdRybrzsIt!hndblAksRydlIfiRWhtDhtAGksWephnzkhndUtUgwm!.'
 '.menIWonttistoRmDhtAGk~hndtEkidOHWRO.'
 '.bhtifDhistRIOvhDrAfrkhnEShnzizendEgyd_Wyts.indenzhndsmolblAktRybSudfiRbhk!.'
 '.!gREnzhndsYbIiGz!mOstkoRnhndWItf?UcrzpRysizWrstRoGgr@.'
 '.WIf?UcrzWrshpoRthdbykhnsrnnAtWItmEbIvolnrhmhl.ifkOldsnApcfalrDispAstWIkenzWoRm.AnhWhssed+.'
 '..DhWoRmiGtRAndmEHAvmelthdDisnOkhvranshmkRos.'
 '.oRnf?UcrcrsIvdshmstRiGk.onikspektEShnhvDIAgRhkhlcrdipaRtmhntWudREdIzsOvIetimpoRtfoRkAs_Anhlhs.'
 '.AftrDhklOzhvtREdiG!DhfoRkAstWrinkRIst.'
 '.sYbInf?UcrzpRyshzWrmikst

In [18]:
out_np = np.array(out_phonemes)

In [19]:
print(out_np)

['.tREvOlAstmhnThgRIdinpRinshphltUfelitsinmhndWotrtRAnsprtEShnstIvhdoRiGhndpyptAbRikEShnbiznhshzfrhnhndisklOzhd.'
 '.DhkhmbyndbiznhshzhkwnhdfrtU@HhndrdTrtIfyvmil?hndalrzhvdREvrz+EtHhnrdnytITRImil?hndalrzinnyntInEtIfyvbRevhn?U!.'
 '.inseptembr.DhkhmphnIRisIvdnyntIsikspYntEt!mil?hndalrzezitseRhvdAmijrzfrmhbRIchvkantRAkthWoRd+RidlEtidtUhkOldaRthSip.'
 '.AzmizkAndOlhnmistrlOsIet.swTAfrkhzsentRhlghvrnmhnt.izlykhbiglembriGtAGkk_.'
 '.hWyt.?uRhkRAtsrinDhdRybrzsIt!hndblAksRydlIfiRWhtDhtAGksWephnzkhndUtUgwm!.'
 '.menIWonttistoRmDhtAGk~hndtEkidOHWRO.'
 '.bhtifDhistRIOvhDrAfrkhnEShnzizendEgyd_Wyts.indenzhndsmolblAktRybSudfiRbhk!.'
 '.!gREnzhndsYbIiGz!mOstkoRnhndWItf?UcrzpRysizWrstRoGgr@.'
 '.WIf?UcrzWrshpoRthdbykhnsrnnAtWItmEbIvolnrhmhl.ifkOldsnApcfalrDispAstWIkenzWoRm.AnhWhssed+.'
 '..DhWoRmiGtRAndmEHAvmelthdDisnOkhvranshmkRos.'
 '.oRnf?UcrcrsIvdshmstRiGk.onikspektEShnhvDIAgRhkhlcrdipaRtmhntWudREdIzsOvIetimpoRtfoRkAs_Anhlhs.'
 '.AftrDhklOzhvtREdiG!DhfoRkAstWrinkRIst.'
 '.sYbInf?UcrzpRyshzWrmikst

In [20]:
np.savetxt('test_116.csv', out_np, delimiter=',', fmt='%s') 