# 08. CTC-for-End2End-Speech-Recognition

* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture12.pdf
* https://github.com/SeanNaren/warp-ctc/tree/pytorch_bindings/pytorch_binding

In [2]:
import nltk
import librosa
import os
import random
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
from warpctc_pytorch import CTCLoss

In [21]:
# GPU execution is not wired up yet, so CUDA stays off; swap in the commented
# expression to opt in.
USE_CUDA = False#torch.cuda.is_available()

# Resolve the tensor constructors once for the selected device. Looking the
# attributes up on a single namespace (instead of inside conditional
# expressions) means a misspelled type name fails immediately rather than only
# when CUDA is eventually switched on.
_tensor_namespace = torch.cuda if USE_CUDA else torch
FloatTensor = _tensor_namespace.FloatTensor
IntTensor = _tensor_namespace.IntTensor
ByteTensor = _tensor_namespace.ByteTensor

In [22]:
def prepare_sequence(seq, to_index):
    """Convert the symbols of ``seq`` into a tensor of integer ids.

    Symbols that have no entry in ``to_index`` are dropped silently; the
    remaining ones keep their original order.

    Args:
        seq: iterable of symbols (e.g. characters).
        to_index: mapping from symbol to integer id.

    Returns:
        A ``Variable`` wrapping an ``IntTensor`` holding the ids.
    """
    known_ids = [to_index[symbol] for symbol in seq if symbol in to_index]
    return Variable(IntTensor(known_ids))

# Data Loading and Preprocessing

In [7]:
# Accumulates [wav_path, transcript_path] pairs for the corpus.
paths = list()

In [8]:
# Pair every TIMIT recording with the transcript file stored next to it.
for path in nltk.corpus.timit.abspaths():
    wav_path = path.path
    if wav_path.endswith('.wav'):
        # Swap only the extension. os.path.splitext works with any path
        # separator and with paths that contain no directory part, unlike
        # splitting on '/' and re-joining by hand.
        transcript_path = os.path.splitext(wav_path)[0] + '.txt'
        paths.append([wav_path, transcript_path])

In [9]:
# Accumulates [mfcc_feature, character_label] pairs, one per recording.
data = list()

In [11]:
# Build one [feature, label] entry per recording.
for path in paths:
    wav_path, transcript_path = path

    # Resample to 16 kHz and extract 26 MFCCs per frame. Keyword arguments are
    # accepted by both old and new librosa releases (newer ones reject the
    # positional form).
    y, sr = librosa.load(wav_path, sr=16000)
    feature = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=26)

    # Read only the first line, and close the file deterministically.
    with open(transcript_path, 'r') as transcript_file:
        first_line = transcript_file.readline()

    # split() already discards the trailing newline, so nothing is sliced off
    # the raw line (slicing would eat a real character when the newline is
    # missing). The first two fields are skipped -- presumably the start/end
    # sample offsets of the TIMIT transcript format; confirm against the data.
    words = first_line.lower().split()[2:]

    # Words are joined with '_' and exploded into characters; the final
    # character is dropped (presumably the sentence-ending punctuation).
    label = list('_'.join(words))[:-1]
    data.append([feature, label])

In [12]:
# Output alphabet: the blank symbol first, then a-z, then '_' as the word separator.
char_vocab = ['<blank>'] + list('abcdefghijklmnopqrstuvwxyz') + ['_']

# Forward and reverse lookup tables; ids follow the order of char_vocab.
char2index = {symbol: position for position, symbol in enumerate(char_vocab)}
index2char = dict(enumerate(char_vocab))

In [13]:
index2char[0] # sanity check: index 0 is reserved for the blank symbol '<blank>'

'<blank>'

In [23]:
# Fraction of the examples used for training; the remainder is held out for testing.
TRAIN_FRACTION = 0.9
# Fixed seed so the split is the same on every run.
SPLIT_SEED = 0

# Shuffle a copy with a private, seeded RNG instead of shuffling `data` in
# place: re-running this cell then yields the same split, and neither `data`
# nor the global `random` state is modified.
shuffled = random.Random(SPLIT_SEED).sample(data, len(data))
split_at = int(len(shuffled) * TRAIN_FRACTION)
train = shuffled[:split_at]
test = shuffled[split_at:]

In [24]:
# Turn each list of [feature, label] pairs into two parallel tuples.
train_X, train_y = zip(*train)
test_X, test_y = zip(*test)

In [25]:
# Inputs: each feature matrix is transposed before being wrapped as a float
# tensor (presumably turning (n_mfcc, frames) into (frames, n_mfcc) -- confirm).
train_X = [Variable(FloatTensor(mfcc.T)) for mfcc in train_X]

# Targets: character ids with an extra leading dimension of size 1.
train_y = [prepare_sequence(chars, char2index).unsqueeze(0) for chars in train_y]

# One (input, target) tuple per training example.
train_data = [(frames, target) for frames, target in zip(train_X, train_y)]

In [26]:
# Test inputs get the same transpose-and-wrap treatment as the training inputs.
test_X = [Variable(FloatTensor(mfcc.T)) for mfcc in test_X]

# Test targets are left as raw character lists rather than index tensors.
test_data = [(frames, chars) for frames, chars in zip(test_X, test_y)]

In [27]:
class SimpleCTC(nn.Module):
    """Single-layer GRU followed by a linear layer, emitting per-frame log-probabilities.

    Args:
        input_size: number of features in each input frame.
        hidden_size: size of the GRU hidden state.
        output_size: number of output classes per frame.
    """

    def __init__(self,input_size,hidden_size,output_size):

        super(SimpleCTC, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.GRU(input_size,hidden_size,1,batch_first=True)
        self.linear = nn.Linear(hidden_size,output_size)

    def init_hidden(self,inputs):
        """Return an all-zero initial GRU state of shape (1, batch, hidden_size).

        The state is allocated from ``inputs`` itself, so it always has the
        same tensor type and device as the batch it will be used with.
        """
        zeros = inputs.data.new(1, inputs.size(0), self.hidden_size).zero_()
        return Variable(zeros)

    def forward(self, inputs):
        """Compute per-frame log-probabilities for a batch of sequences.

        Args:
            inputs: batch-first tensor of shape (batch, seq_len, input_size).

        Returns:
            Contiguous tensor of shape (seq_len, batch, output_size) whose
            last dimension is log-softmax normalised.
        """
        hidden = self.init_hidden(inputs)
        out,hidden = self.rnn(inputs,hidden)
        batch_size, seq_len = out.size(0), out.size(1)

        # Project every frame of every sequence in one matrix multiply.
        flat = out.contiguous().view(batch_size * seq_len, -1)
        log_probs = F.log_softmax(self.linear(flat), dim=1)

        # expected shape of seqLength x batchSize x alphabet_size
        # Restore the (batch, seq) axes and swap them explicitly; inserting an
        # axis at position `batch_size` is only correct when batch_size == 1.
        return log_probs.view(batch_size, seq_len, -1).transpose(0, 1).contiguous()

In [58]:
HIDDEN_SIZE = 100  # width of the GRU hidden state
STEP = 100         # number of passes over the training data
LR = 0.01          # learning rate handed to the optimizer

In [59]:
# The loss has no dependency on the model, so it is created first.
ctc_loss = CTCLoss()

# 26 input features per frame; one output class per vocabulary entry.
model = SimpleCTC(26, HIDDEN_SIZE, len(char2index))
model = model.cuda() if USE_CUDA else model

optimizer = optim.Adam(model.parameters(), lr=LR)

In [60]:
# Train one example at a time and report the mean loss after every pass.
for step in range(STEP):
    losses=[]
    for pair in train_data:
        model.zero_grad()

        # Add a batch axis of size 1 to the input.
        probs = model(pair[0].unsqueeze(0))

        # Targets were stored with a leading axis of size 1; flatten it away.
        # NOTE(review): the warp-ctc binding takes labels and both length
        # tensors as CPU IntTensors even when the activations are on the GPU
        # (see its README), so they are pinned to the CPU here instead of
        # following the device-dependent IntTensor alias.
        labels = pair[1].squeeze(0).cpu()
        label_sizes = Variable(torch.IntTensor([labels.size(0)]))
        probs_sizes = Variable(torch.IntTensor([probs.size(0)]))

        loss = ctc_loss(probs, labels, probs_sizes, label_sizes)
        losses.append(loss.data.tolist()[0])
        loss.backward()
        optimizer.step()

    # `losses` is re-created at the top of every pass, so no reset is needed here.
    print("[%d/%d] loss : %0.2f" %(step,STEP,np.mean(losses)))

[0/100] loss : 143.75
[1/100] loss : 139.72
[2/100] loss : 138.25
[3/100] loss : 137.55
[4/100] loss : 136.45
[5/100] loss : 135.93
[6/100] loss : 135.86
[7/100] loss : 134.72
[8/100] loss : 134.87
[9/100] loss : 134.79
[10/100] loss : 134.40
[11/100] loss : 133.43
[12/100] loss : 133.35
[13/100] loss : 133.55
[14/100] loss : 133.18
[15/100] loss : 132.26
[16/100] loss : 133.09
[17/100] loss : 131.91
[18/100] loss : 131.81
[19/100] loss : 132.07
[20/100] loss : 132.04
[21/100] loss : 132.29
[22/100] loss : 131.60
[23/100] loss : 132.52
[24/100] loss : 131.50
[25/100] loss : 131.23
[26/100] loss : 130.80
[27/100] loss : 132.71
[28/100] loss : 132.01
[29/100] loss : 133.07
[30/100] loss : 131.83
[31/100] loss : 131.94
[32/100] loss : 131.82
[33/100] loss : 131.07
[34/100] loss : 130.75


KeyboardInterrupt: 

# TODO 

* batch, gpu
* decode

This notebook still needs more testing.