We attempt in thisto create an end to end ASR model with CTC loss using keras

In [2]:
import sys 
import os
import numpy as np
sys.path.insert(1, os.path.abspath('../models'))
import data_preparation 
import pandas as pd
from itertools import groupby
from helpers import *

In [3]:
path2data = os.path.abspath('../../Datasets/tedlium/TEDLIUM_release1')
path2features = os.path.abspath('../../Datasets/train_mfcc')
filename = 'ids_labels.p'
n_mfcc = 26
n_mels = 40
hop_length = 160
frame_length = 320
batch_size = 32
epoch_length = 32
shuffle = True

In [11]:
df = pd.read_pickle(os.path.join(path2features, filename))

In [12]:
df

Unnamed: 0,filename,nb_frames,labels,offset,duration


In [5]:
data_prep_params = {'path2data': path2data,
                    'path2features': path2features,
                    'pickle_filename': filename,
                    'n_mfcc': n_mfcc,
                    'n_mels': n_mels,
                    'hop_length': hop_length,
                    'frame_length': frame_length
                    }

data_prep = data_preparation.DataPrep(**data_prep_params)

In [None]:
data_prep.process_tedelium(category='train')

reading STM file list: 100%|██████████| 774/774 [00:03<00:00, 230.41it/s]


Number of tasks to be performed :  56803


[Parallel(n_jobs=11)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=11)]: Done  28 tasks      | elapsed:    4.1s
[Parallel(n_jobs=11)]: Done 178 tasks      | elapsed:    9.9s
[Parallel(n_jobs=11)]: Done 428 tasks      | elapsed:   19.0s
[Parallel(n_jobs=11)]: Done 778 tasks      | elapsed:   31.5s
[Parallel(n_jobs=11)]: Done 1228 tasks      | elapsed:   49.1s
[Parallel(n_jobs=11)]: Done 1778 tasks      | elapsed:  1.1min
[Parallel(n_jobs=11)]: Done 2428 tasks      | elapsed:  1.5min
[Parallel(n_jobs=11)]: Done 3178 tasks      | elapsed:  2.0min
[Parallel(n_jobs=11)]: Done 4028 tasks      | elapsed:  2.4min
[Parallel(n_jobs=11)]: Done 4978 tasks      | elapsed:  3.0min
[Parallel(n_jobs=11)]: Done 6028 tasks      | elapsed:  3.6min
[Parallel(n_jobs=11)]: Done 7178 tasks      | elapsed:  4.3min
[Parallel(n_jobs=11)]: Done 8661 tasks      | elapsed:  5.2min
[Parallel(n_jobs=11)]: Done 10423 tasks      | elapsed:  6.2min
[Parallel(n_jobs=11)]: Done 11873 tasks     

In [7]:
data_generator_params = {'path2features': path2features,
                         'pickle_filename': filename,
                         'batch_size': batch_size,
                         'mfcc_features': n_mfcc,
                         'epoch_length': epoch_length,
                         'shuffle': shuffle
                        }

data_gen = data_preparation.DataGenerator(**data_generator_params)

In [8]:
from keras import backend as K
from keras.layers import Dense, SimpleRNN, LSTM, CuDNNLSTM, Bidirectional, TimeDistributed, Conv1D, ZeroPadding1D, GRU
from keras.layers import Lambda, Input, Dropout, Masking
from keras.models import Model
from keras.optimizers import Adam

In [9]:
# Lambda implementation of CTC loss, using ctc_batch_cost from TensorFlow backend
# CTC implementation from Keras example found at https://github.com/keras-team/keras/blob/master/examples/image_ocr.py
def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    # the 2 is critical here since the first couple outputs of the RNN
    # tend to be garbage:
    # print "y_pred_shape: ", y_pred.shape
    y_pred = y_pred[:, 2:, :]
    # print "y_pred_shape: ", y_pred.shape
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)


# Returns clipped relu, clip value set to 20.
def clipped_relu(value):
    return K.relu(value, max_value=20)

**Models**

In [10]:
def base_model(units, input_dim=26, output_dim=29):
    
    dtype = 'float32'
    
    # Kernel and bias initializers for fully connected dense layers
    kernel_init_dense = 'random_normal'
    bias_init_dense = 'random_normal'
    
    input_data = Input(name='the_input',shape=(None, input_dim), dtype=dtype)
    
    x = GRU(units, activation='tanh', return_sequences=True)(input_data)
    x = GRU(units, activation='tanh', return_sequences=True, dropout=0.3)(x)
    
    # Output layer with softmax
    y_pred = TimeDistributed(Dense(units=output_dim, kernel_initializer=kernel_init_dense,
                                   bias_initializer=bias_init_dense, activation='softmax'), name='softmax')(x)

    # ---- CTC ----
    # y_input layers (transcription data) for CTC loss
    labels = Input(name='the_labels', shape=[None], dtype=dtype)        # transcription data (batch_size * y_seq_size)
    input_length = Input(name='input_length', shape=[1], dtype=dtype)   # unpadded len of all x_sequences in batch
    label_length = Input(name='label_length', shape=[1], dtype=dtype)   # unpadded len of all y_sequences in batch

    # Lambda layer with ctc_loss function due to Keras not supporting CTC layers
    loss_out = Lambda(function=ctc_lambda_func, name='ctc', output_shape=(1,))(
                      [y_pred, labels, input_length, label_length])

    network_model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)
    
    return network_model

In [29]:
def deep_rnn(units, input_dim=26, output_dim=29, dropout=0.2, numb_of_dense=1, n_layers=1):
    """
    :param units: Hidden units per layer
    :param input_dim: Size of input dimension (number of features), default=26
    :param output_dim: Output dim of final layer of model (input to CTC layer), default=29
    :param dropout: Dropout rate, default=0.2
    :param numb_of_dense: Number of fully connected layers before recurrent, default=3
    :param n_layers: Number of simple RNN layers, default=3
    :return: network_model: deep_rnn
    Default model contains:
     1 layer of masking
     3 layers of fully connected clipped ReLu (DNN) with dropout 20 % between each layer
     3 layers of RNN with 20% dropout
     1 layers of fully connected clipped ReLu (DNN) with dropout 20 % between each layer
     1 layer of softmax
    """

    # Input data type
    dtype = 'float32'

    # Kernel and bias initializers for fully connected dense layers
    kernel_init_dense = 'random_normal'
    bias_init_dense = 'random_normal'

    # Kernel and bias initializers for recurrent layer
    kernel_init_rnn = 'glorot_uniform'
    bias_init_rnn = 'zeros'

    # ---- Network model ----
    # x_input layer, dim: (batch_size * x_seq_size * mfcc_features)
    input_data = Input(name='the_input',shape=(None, input_dim), dtype=dtype)

    # Masking layer
    x = Masking(mask_value=0., name='masking')(input_data)

    # Default 3 fully connected layers DNN ReLu
    # Default dropout rate 20 % at each FC layer
    for i in range(0, numb_of_dense):
        x = TimeDistributed(Dense(units=units, kernel_initializer=kernel_init_dense, bias_initializer=bias_init_dense,
                                  activation=clipped_relu), name='fc_'+str(i+1))(x)
        x = TimeDistributed(Dropout(dropout), name='dropout_'+str(i+1))(x)

    # Deep RNN network with a default of 3 layers
    for i in range(0, n_layers):
        x = SimpleRNN(units, activation='relu', kernel_initializer=kernel_init_rnn, bias_initializer=bias_init_rnn,
                      dropout=dropout, return_sequences=True, name=('deep_rnn_'+ str(i+1)))(x)

    # 1 fully connected layer DNN ReLu with default 20% dropout
    x = TimeDistributed(Dense(units=units, kernel_initializer=kernel_init_dense, bias_initializer=bias_init_dense,
                              activation='relu'), name='fc_' + str(numb_of_dense + 1))(x)
    x = TimeDistributed(Dropout(dropout), name='dropout_' + str(numb_of_dense + 1))(x)

    # Output layer with softmax
    y_pred = TimeDistributed(Dense(units=output_dim, kernel_initializer=kernel_init_dense,
                                   bias_initializer=bias_init_dense, activation='softmax'), name='softmax')(x)

    # ---- CTC ----
    # y_input layers (transcription data) for CTC loss
    labels = Input(name='the_labels', shape=[None], dtype=dtype)        # transcription data (batch_size * y_seq_size)
    input_length = Input(name='input_length', shape=[1], dtype=dtype)   # unpadded len of all x_sequences in batch
    label_length = Input(name='label_length', shape=[1], dtype=dtype)   # unpadded len of all y_sequences in batch

    # Lambda layer with ctc_loss function due to Keras not supporting CTC layers
    loss_out = Lambda(function=ctc_lambda_func, name='ctc', output_shape=(1,))(
                      [y_pred, labels, input_length, label_length])

    network_model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)

    return network_model

In [12]:
def blstm(units, input_dim=26, output_dim=29, dropout=0.2, numb_of_dense=3, cudnn=False, n_layers=1):
    """
    :param units: Hidden units per layer
    :param input_dim: Size of input dimension (number of features), default=26
    :param output_dim: Output dim of final layer of model (input to CTC layer), default=29
    :param dropout: Dropout rate, default=0.2
    :param numb_of_dense: Number of fully connected layers before recurrent, default=3
    :param cudnn: Whether to use the CuDNN optimized LSTM (GPU only), default=False
    :param n_layers: Number of stacked BLSTM layers, default=1
    :return: network_model: blstm
    Default model contains:
     1 layer of masking
     3 layers of fully connected clipped ReLu (DNN) with dropout 20 % between each layer
     1 layer of BLSTM
     1 layers of fully connected clipped ReLu (DNN) with dropout 20 % between each layer
     1 layer of softmax
    """

    # Input data type
    dtype = 'float32'

    # Kernel and bias initializers for fully connected dense layers
    kernel_init_dense = 'random_normal'
    bias_init_dense = 'random_normal'

    # Kernel and bias initializers for recurrent layer
    kernel_init_rnn = 'glorot_uniform'
    bias_init_rnn = 'random_normal'

    # ---- Network model ----
    # x_input layer, dim: (batch_size * x_seq_size * features)
    input_data = Input(name='the_input', shape=(None, input_dim), dtype=dtype)

    if cudnn:
        # CuDNNLSTM does not support masking
        x = input_data
    else:
        # Masking layer
        x = Masking(mask_value=0., name='masking')(input_data)

    # Default 3 fully connected layers DNN ReLu
    # Default dropout rate 20 % at each FC layer
    for i in range(0, numb_of_dense):
        x = TimeDistributed(Dense(units=units, kernel_initializer=kernel_init_dense, bias_initializer=bias_init_dense,
                                  activation=clipped_relu), name='fc_'+str(i+1))(x)
        x = TimeDistributed(Dropout(dropout), name='dropout_'+str(i+1))(x)

    # Bidirectional RNN (with ReLu)
    # If running on GPU, use the CuDNN optimised LSTM model
    if cudnn:
        for i in range(0, n_layers):
            x = Bidirectional(CuDNNLSTM(units, kernel_initializer=kernel_init_rnn, bias_initializer=bias_init_rnn,
                                        unit_forget_bias=True, return_sequences=True),
                              merge_mode='sum', name=('CuDNN_bi_lstm' + str(i+1)))(x)
    else:
        for i in range(0, n_layers):
            x = Bidirectional(LSTM(units, activation='relu', kernel_initializer=kernel_init_rnn, dropout=dropout,
                                   bias_initializer=bias_init_rnn, return_sequences=True),
                              merge_mode='sum', name=('bi_lstm' + str(i+1)))(x)

    # 1 fully connected layer DNN ReLu with default 20% dropout
    x = TimeDistributed(Dense(units=units, kernel_initializer=kernel_init_dense, bias_initializer=bias_init_dense,
                              activation='relu'), name='fc_4')(x)
    x = TimeDistributed(Dropout(dropout), name='dropout_4')(x)

    # Output layer with softmax
    y_pred = TimeDistributed(Dense(units=output_dim, kernel_initializer=kernel_init_dense,
                                   bias_initializer=bias_init_dense, activation='softmax'), name='softmax')(x)

    # ---- CTC ----
    # y_input layers (transcription data) for CTC loss
    labels = Input(name='the_labels', shape=[None], dtype=dtype)       # transcription data (batch_size * y_seq_size)
    input_length = Input(name='input_length', shape=[1], dtype=dtype)  # unpadded len of all x_sequences in batch
    label_length = Input(name='label_length', shape=[1], dtype=dtype)  # unpadded len of all y_sequences in batch

    # Lambda layer with ctc_loss function due to Keras not supporting CTC layers
    loss_out = Lambda(function=ctc_lambda_func, name='ctc', output_shape=(1,))(
                      [y_pred, labels, input_length, label_length])

    network_model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)

    return network_model

In [19]:
def cnn_blstm(units, input_dim=26, output_dim=29, dropout=0.2, seq_padding=2176, cudnn=False, n_layers=1):
    """
    :param units: Hidden units per layer
    :param input_dim: Size of input dimension (number of features), default=26
    :param output_dim: Output dim of final layer of model (input to CTC layer), default=29
    :param dropout: Dropout rate, default=0.2
    :param seq_padding: length of sequence zero padding before conv layers, default=2176
    :param cudnn: Whether to use the CuDNN optimized LSTM (only for GPU), default=False
    :param n_layers: Number of stacked BLSTM layers, default=1
    :return: network_model: cnn_blstm
    Model contains:
     3 layers of CNN Conv1D
     3 layers of BLSTM
     1 layers of fully connected clipped ReLu (DNN) with dropout 20 % between each layer
     1 layer of softmax
    """

    # Input data type
    dtype = 'float32'

    activation_conv = clipped_relu

    # Kernel and bias initializers for fully connected dense layer
    kernel_init_dense = 'random_normal'
    bias_init_dense = 'random_normal'

    # Kernel and bias initializers for convolution layers
    kernel_init_conv = 'glorot_uniform'
    bias_init_conv = 'random_normal'

    # Kernel and bias initializers for recurrent layer
    kernel_init_rnn = 'glorot_uniform'
    bias_init_rnn = 'random_normal'

    # ---- Network model ----
    input_data = Input(name='the_input', shape=(None, input_dim), dtype=dtype)

    # Pad on sequence dim so all sequences are equal length
    x = ZeroPadding1D(padding=(0, seq_padding))(input_data)

    # 3 x 1D convolutional layers with strides: 1, 1, 2
    x = Conv1D(filters=units, kernel_size=5, strides=1, activation=activation_conv,
               kernel_initializer=kernel_init_conv, bias_initializer=bias_init_conv, name='conv_1')(x)
    x = TimeDistributed(Dropout(dropout), name='dropout_1')(x)

    x = Conv1D(filters=units, kernel_size=5, strides=1, activation=activation_conv,
               kernel_initializer=kernel_init_conv, bias_initializer=bias_init_conv, name='conv_2')(x)
    x = TimeDistributed(Dropout(dropout), name='dropout_2')(x)

    x = Conv1D(filters=units, kernel_size=5, strides=2, activation=activation_conv,
               kernel_initializer=kernel_init_conv, bias_initializer=bias_init_conv, name='conv_3')(x)
    x = TimeDistributed(Dropout(dropout), name='dropout_3')(x)

    # Bidirectional LSTM
    if cudnn:
        for i in range(0, n_layers):
            x = Bidirectional(CuDNNLSTM(units, kernel_initializer=kernel_init_rnn, bias_initializer=bias_init_rnn,
                                        unit_forget_bias=True, return_sequences=True),
                              merge_mode='sum', name='CuDNN_bi_lstm'+str(i+1))(x)
    else:
        for i in range(0, n_layers):
            x = Bidirectional(LSTM(units, activation='relu', kernel_initializer=kernel_init_rnn, dropout=dropout,
                                   bias_initializer=bias_init_rnn, return_sequences=True),
                              merge_mode='sum', name='bi_lstm'+str(i+1))(x)

    # 1 fully connected layer DNN ReLu with default 20% dropout
    x = TimeDistributed(Dense(units=units, kernel_initializer=kernel_init_dense, bias_initializer=bias_init_dense,
                              activation='relu'), name='fc_4')(x)
    x = TimeDistributed(Dropout(dropout), name='dropout_4')(x)

    # Output layer with softmax
    y_pred = TimeDistributed(Dense(units=output_dim, kernel_initializer=kernel_init_dense,
                                   bias_initializer=bias_init_dense, activation='softmax'), name='softmax')(x)

    # ---- CTC ----
    # y_input layers (transcription data) for CTC loss
    labels = Input(name='the_labels', shape=[None], dtype=dtype)       # transcription data (batch_size * y_seq_size)
    input_length = Input(name='input_length', shape=[1], dtype=dtype)  # unpadded len of all x_sequences in batch
    label_length = Input(name='label_length', shape=[1], dtype=dtype)  # unpadded len of all y_sequences in batch

    # Lambda layer with ctc_loss function due to Keras not supporting CTC layers
    loss_out = Lambda(function=ctc_lambda_func, name='ctc', output_shape=(1,))(
                      [y_pred, labels, input_length, label_length])

    network_model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)

    return network_model

**Training**

In [30]:
loss = {'ctc': lambda y_true, y_pred: y_pred}
training_generator = data_gen
model = deep_rnn(units=256, input_dim=n_mfcc, output_dim=29, dropout=0.2, numb_of_dense=3, n_layers=3)
#model = blstm(units=256, input_dim=n_mfcc, output_dim=29, dropout=0.2, numb_of_dense=1, cudnn=False, n_layers=1)
#model = base_model(units=64, input_dim=n_mfcc, output_dim=29)
model_train_params = {'generator': training_generator,
                      'epochs': 6,
                      'verbose': 1,
                      #'validation_data': validation_generator,
                      'workers': -1,
                      'shuffle': shuffle}

optimizer = Adam(lr=0.0001, epsilon=1e-8, clipnorm=2.0)

model.compile(loss=loss, optimizer=optimizer)
model.fit_generator(**model_train_params)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f98b341ceb8>

In [19]:
model.fit_generator(**model_train_params)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7fbe986920f0>

In [18]:
model.fit_generator(**model_train_params)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f4079295e80>

In [14]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
the_input (InputLayer)          (None, None, 26)     0                                            
__________________________________________________________________________________________________
masking (Masking)               (None, None, 26)     0           the_input[0][0]                  
__________________________________________________________________________________________________
fc_1 (TimeDistributed)          (None, None, 256)    6912        masking[0][0]                    
__________________________________________________________________________________________________
dropout_1 (TimeDistributed)     (None, None, 256)    0           fc_1[0][0]                       
__________________________________________________________________________________________________
fc_2 (Time

**Prediction**

In [21]:
# Creates a test function that takes preprocessed sound input and outputs predictions
# Used to calculate WER (word error rate) while training the network
input_data = model.get_layer('the_input').input
y_pred = model.get_layer('ctc').input[0]
test_func = K.function([input_data], [y_pred])

In [22]:
def max_decode(test_func, x_data):
    """
    Calculate network probabilities with test_func and decode with max decode/greedy decode
    :param test_func: Keras function that takes preprocessed audio input and outputs network predictions
    :param x_data: preprocessed audio data
    :return: decoded: max decoded network output
    """
    y_pred = test_func([x_data])[0]
    decoded = []
    print(y_pred.shape)
    for i in range(0, y_pred.shape[0]):

        decoded_batch = []
        for j in range(0,y_pred.shape[1]):
            decoded_batch.append(np.argmax(y_pred[i][j][:-1]))
        #print(decoded_batch)
        temp = [k for k, g in groupby(decoded_batch)]
        #print(temp)
        temp[:] = [x for x in temp if x != [28]]
        #print(temp)
        decoded.append(temp)

    return decoded



def predict_on_batch(data_gen, test_func, batch_index):
    """
    Produce a sample of predictions at given batch index from data in data_gen
    :param data_gen: DataGenerator to produce input data
    :param test_func: Keras function that takes preprocessed audio input and outputs network predictions
    :param batch_index: which batch to use as input data
    :return: List containing original transcripts and predictions
    """
    input_data, _ = data_gen.__getitem__(batch_index)

    x_data = input_data.get("the_input")
    y_data = input_data.get("the_labels")

    res = max_decode(test_func, x_data)
    predictions = []

    for i in range(y_data.shape[0]):
        original = "".join(index2str(y_data[i].astype('int')))
        predicted = "".join(index2str(res[i]))
        predictions.append([original,predicted])

    return predictions

In [23]:
input_data, _ = data_gen.__getitem__(1)

x_data = input_data.get("the_input")
y_data = input_data.get("the_labels")
y_pred = test_func([x_data])[0]
predictions = predict_on_batch(data_gen, test_func, 1)

(32, 726, 29)


In [24]:
x_data.shape

(32, 726, 12)

In [25]:
predictions

[['it s a waste of money what i m telling you is that these expensive complicated choices it s not simply that they don t help they actually hurt they actually make us worse',
  ' '],
 ['nepal his mother was incarcerated for the price of for the crime of being wealthy he was smuggled into the jail at the ti at the age of two to hide beneath her skirt tails because she couldn t bear to be without him the sister who had done that brave deed was put into an education camp one day she inadvertently stepped on an armband that of mao and for that transgression she was given seven years of hard labor',
  ' '],
 ['we have to design so they can go together we design unique elements into this you may have read that we put watermarks in think of this we have a four letter genetic code a c g and t triplets of that letter of those letters code for roughly twenty amino acids that there s a single letter',
  ' '],
 ['now if you begin to look at the idea that these cultures could create different real

In [23]:
x_data.shape

(32, 2652, 26)

In [37]:
y_data.shape

(32, 526)

In [31]:
x_data[0][100]

array([-160.77806228,   92.01941971,  -23.9410947 ,   18.6731909 ,
         -7.71552052,  -10.06911783,  -10.4417597 ,  -10.54981956,
         12.89189723,    3.71269653,   -6.12233873,    0.42868154,
         -1.12947846,    7.82820096,  -13.72863764,    3.45903755,
         -5.76475135,   -2.92914885,    2.75763579,   -3.04279983,
         -0.39132807,    0.83946294,   -7.74940364,    2.94824848,
         -3.00777463,   -2.28515724])

In [20]:
predictions

[['the short answer to all those questions is yes yes i m afraid of all those things and i always have been and i m afraid of many many more things besides that you know people can t even guess at like',
  'r'],
 ['blaise that is truly incredible congratulations', ' r r r'],
 ['assumption being that sub saharan africa had no religious beliefs well of course they did and voodoo is simply the distillation of these very',
  ' r'],
 ['each of these has four different variations you get to choose which one you want to use and you can inject these sounds into a sequence that you can assemble into the pattern that you want',
  ' r r'],
 ['you will never be pleasantly surprised because your expectations my expectations have gone through the roof the secret to happiness this is what you all came for the secret to happiness is low expectations',
  ' r'],
 ['this is the last one', ' r'],
 ['thank you so', ' r r r r r r'],
 ['she s okay the question is whether we will be and one reason is this eno

In [103]:
index2str(df.iloc[1].labels)

'you know one of the intense pleasures of travel and one of the delights of ethnographic research is the opportunity to live amongst those who have not forgotten the old ways who still feel their past in the wind touch it in stones polished by rain taste it in the bitter leaves of plants just to know that jaguar shamans still journey beyond the milky way or the'

In [39]:
df.head(35)

Unnamed: 0,filename,nb_frames,labels,offset,duration
0,WadeDavis_2003.sph.wav-12.51,2202,"[26, 16, 22, 1, 12, 15, 16, 24, 1, 16, 15, 6, ...",12.51,22.01
1,WadeDavis_2003.sph.wav-34.52,643,"[21, 9, 6, 1, 14, 26, 21, 9, 20, 1, 16, 7, 1, ...",34.52,6.42
2,WadeDavis_2003.sph.wav-40.94,2159,"[3, 22, 5, 5, 9, 10, 20, 21, 20, 1, 20, 21, 10...",40.94,21.58
3,WadeDavis_2003.sph.wav-62.52,1291,"[2, 15, 5, 1, 16, 7, 1, 4, 16, 22, 19, 20, 6, ...",62.52,12.9
4,WadeDavis_2003.sph.wav-75.42,1481,"[24, 6, 1, 2, 13, 13, 1, 5, 2, 15, 4, 6, 1, 24...",75.42,14.8
5,WadeDavis_2003.sph.wav-90.22,860,"[16, 19, 1, 21, 9, 6, 1, 24, 2, 19, 19, 10, 16...",90.22,8.59
6,WadeDavis_2003.sph.wav-98.81,884,"[16, 19, 1, 2, 1, 4, 2, 1, 4, 2, 19, 2, 23, 2,...",98.81,8.83
7,WadeDavis_2003.sph.wav-107.65,1567,"[16, 19, 1, 10, 15, 5, 6, 6, 5, 1, 2, 1, 26, 2...",107.65,15.66
8,WadeDavis_2003.sph.wav-123.31,3993,"[15, 16, 24, 1, 21, 16, 8, 6, 21, 9, 6, 19, 1,...",123.31,39.92
9,WadeDavis_2003.sph.wav-163.23,1682,"[3, 10, 16, 20, 17, 9, 6, 19, 6, 1, 9, 2, 20, ...",163.23,16.81


In [105]:
pwd

'/aimlx/kws/Notebooks'

In [110]:
import os

file = "/aimlx/Datasets/tedlium/TEDLIUM_release1/dev/sph/AlGore_2009.sph.wav"
os.system("aplay " + file)

32512