In [1]:
import os
# Backend and device selection. These variables are set before `import keras`
# in the next cell, whose output reports the Theano backend and the cuda0 device.
# Use Theano as the Keras backend.
os.environ['KERAS_BACKEND'] = 'theano'
# Disabled alternative: expose only CUDA device 0 to this process.
#os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Ask Theano to run on the GPU named cuda0.
os.environ['THEANO_FLAGS'] = 'device=cuda0'

In [2]:
import keras
import numpy as np

Using Theano backend.
Using cuDNN version 5110 on context None
Mapped name None to device cuda0: GeForce GTX 1080 Ti (0000:02:00.0)


### Data and weight loaders

In [3]:
from glob import glob
from random import Random
import json

# Seeded generator used by pick_sample_files to shuffle candidates. Its state
# advances on every call, so re-running a sampling cell without re-running this
# one yields a different selection.
rng = Random(42)

In [4]:
import kenlm
import beamsearch
# Re-import beamsearch so edits to the module take effect without restarting the kernel
# (`reload` is the Python 2 builtin).
reload(beamsearch)
from utils import argmax_decode, word_error_rate, for_tf_or_th
from beamsearch import beam_decode, beam_decode_u

# Language model whose `score(sentence)` is used by best_lm_alternative to rank
# candidate transcriptions.
lm = kenlm.Model('data/lm/lm.binary')

def iterate_weights(model_path):
    """Yield the path of each ``*.h5`` file located directly inside ``model_path``."""
    # Joining with '' guarantees a trailing separator before the wildcard is appended.
    pattern = os.path.join(model_path, '') + '*.h5'
    for weight_file in glob(pattern):
        yield weight_file

def pick_sample_files(desc_file, count, min_duration, max_duration):
    """Return up to ``count`` randomly ordered entries of ``desc_file`` within a duration range.

    ``desc_file`` is read as one JSON object per line. Entries whose ``'duration'``
    lies in ``[min_duration, max_duration]`` (both inclusive) are shuffled in place
    with the module-level ``rng`` and the first ``count`` of them are returned.
    """
    with open(desc_file) as desc:
        entries = [json.loads(row) for row in desc]

    in_range = []
    for entry in entries:
        if min_duration <= entry['duration'] <= max_duration:
            in_range.append(entry)

    rng.shuffle(in_range)
    return in_range[:count]

def test_generator(datagen, test_samples, batch_size=64, normalize=True):
    """Yield zero-padded feature batches built from ``test_samples``.

    Args:
        datagen: object providing ``featurize(path)`` and ``normalize(features)``;
            each featurized sample is indexed as a 2-D array (frames x feature dim).
        test_samples: list of dicts with the keys ``'text'``, ``'duration'`` and ``'key'``
            (``'key'`` is the path handed to ``datagen.featurize``).
        batch_size: maximum number of samples per yielded batch.
        normalize: when True, every feature array is passed through ``datagen.normalize``.

    Yields:
        dict with ``'x'`` (float32 array of shape ``(batch, longest sample in batch, feature dim)``,
        zero-padded along axis 1), ``'y'`` (reference texts), ``'path'`` and ``'duration'``.

    All samples are featurized up front, on the first ``next()``, before any batch is yielded.
    """
    texts = [s['text'] for s in test_samples]
    durations = [s['duration'] for s in test_samples]
    paths = [s['key'] for s in test_samples]
    features = [datagen.featurize(p) for p in paths]
    if normalize:
        features = [datagen.normalize(f) for f in features]

    # Stepping by batch_size yields ceil(len / batch_size) slices without float arithmetic.
    for start in range(0, len(features), batch_size):
        stop = start + batch_size
        batch_features = features[start:stop]
        max_length = max(f.shape[0] for f in batch_features)
        batch_array = np.zeros((len(batch_features), max_length, features[0].shape[1]), dtype='float32')
        # Copy each sample into the top of its slot; the remaining frames stay zero.
        for fi, feature in enumerate(batch_features):
            batch_array[fi, :feature.shape[0], :] = feature
        yield {
            'x': batch_array,
            'y': texts[start:stop],
            'path': paths[start:stop],
            'duration': durations[start:stop],
        }

def best_lm_alternative(true_sentence, wer, predictions, verbose=False):
    """Pick the candidate sentence the language model scores highest.

    ``predictions`` is an indexable sequence of ``(sentence, probability)`` tuples.
    Only the sentence is used; each one is scored with the module-level ``lm``.
    ``wer`` is the error rate already known for the first candidate and is returned
    unchanged when the language model keeps that candidate.

    Returns ``(best_sentence, best_wer)``.
    """
    top_sentence, top_score = None, np.finfo('float32').min
    for sentence, _prob in predictions:
        score = lm.score(sentence)
        if score > top_score:
            top_sentence, top_score = sentence, score

    # Language model agrees with the first candidate: reuse the WER supplied by the caller.
    if top_sentence == predictions[0][0]:
        if verbose:
            print("language model didn't change prediction")
        return top_sentence, wer

    # A different candidate won, so its WER is recomputed against the reference.
    new_wer = word_error_rate([true_sentence], [top_sentence], decoded=True)[0]
    if verbose:
        print("language model changed prediction, WER changed from {old_wer} to {new_wer}".format(
            old_wer=wer, new_wer=new_wer
        ))
    return top_sentence, new_wer

def evaluate(batch_generator, output_fn, learning_phase=False, use_lm=False, beam_width=12):
    """Decode every batch, print per-utterance results and report the mean WER.

    Args:
        batch_generator: iterable of dicts with the keys ``'x'``, ``'y'``, ``'path'`` and
            ``'duration'`` (the structure ``test_generator`` yields).
        output_fn: callable invoked as ``output_fn([x, learning_phase])``; element 0 of its
            result is taken as the network output.
        learning_phase: forwarded unchanged to ``output_fn``.
        use_lm: when True, beam-search hypotheses of every utterance are rescored with
            ``best_lm_alternative`` and both LM and no-LM error rates are reported.
        beam_width: beam width passed to ``beam_decode_u`` (only used when ``use_lm``).

    Returns:
        ``(mtp_net_out, pred_texts, all_wers, references)`` where the network output,
        predictions and references belong to the LAST batch and ``all_wers`` is the list of
        per-batch WER arrays (LM-rescored when ``use_lm``, greedy otherwise).

    Raises:
        ValueError: if ``batch_generator`` yields no batch.
    """
    all_nolm_wers, all_lm_wers = [], []
    # Alias of the list that is reported; appends made inside the loop are visible through it.
    all_wers = all_lm_wers if use_lm else all_nolm_wers
    mtp_net_out = pred_texts = batch = None

    for batch in batch_generator:
        net_out = output_fn([batch['x'], learning_phase])[0]
        # NOTE(review): for_tf_or_th presumably picks the layout matching the active
        # backend so that axis 0 indexes utterances — confirm in utils.
        mtp_net_out = for_tf_or_th(net_out, net_out.swapaxes(0, 1))
        pred_texts = [argmax_decode(o) for o in mtp_net_out]
        nolm_wers = word_error_rate(batch['y'], pred_texts, True)
        all_nolm_wers.append(nolm_wers)

        if use_lm:
            rescored = []
            for i in range(mtp_net_out.shape[0]):
                # Materialise the pairs: best_lm_alternative both iterates and indexes them.
                beams = list(zip(*beam_decode_u(mtp_net_out[i, :, :], beam_width, normalize=True)))
                rescored.append(best_lm_alternative(batch['y'][i], nolm_wers[i], beams))
            pred_texts, lm_wers = zip(*rescored)
            all_lm_wers.append(np.array(lm_wers))

        for i, y in enumerate(batch['y']):
            print('r:{}\np:{}\n{}: WER: {}, DURATION: {}, PATH: {}'.format(
                y, pred_texts[i], i, all_wers[-1][i], batch['duration'][i], batch['path'][i]))
        print('batch mean WER: {}'.format(all_wers[-1].mean()))

    if batch is None:
        # Previously this surfaced as a NameError (or an opaque concatenate error) below.
        raise ValueError('batch_generator yielded no batches; nothing to evaluate')

    if use_lm:
        print('LM WER: {} No LM WER: {}'.format(
            np.concatenate(all_lm_wers).mean(), np.concatenate(all_nolm_wers).mean()))
    else:
        # The original built this string but never printed it.
        print('whole mean WER: {}'.format(np.concatenate(all_wers).mean()))
    return mtp_net_out, pred_texts, all_wers, batch['y']

### Customize data generator

In [16]:
# Description file of the evaluation set; pick_sample_files reads it as one JSON object per line.
# NOTE(review): absolute, machine-specific path.
test_desc = '/home/reith/deepspeech/ba-dls-deepspeech/descs/test-clean.json'
# Alternative evaluation sets (uncomment one to switch):
#test_desc = '/home/reith/deepspeech/ba-dls-deepspeech/descs/test-other.json'
#test_desc = '/home/reith/deepspeech/ba-dls-deepspeech/descs/dev-clean.json'

In [6]:
from data_generator import DataGenerator
# Supplies featurize() and normalize() to test_generator.
datagen = DataGenerator()

In [18]:
# Randomly pick up to 1024 entries whose 'duration' is between 0 and 30 inclusive
# (presumably seconds — confirm against the description file).
test_samples = pick_sample_files(test_desc, 1024, 0, 30)

Fit the feature normalization on the training data

In [8]:
# NOTE(review): absolute, machine-specific path.
train_desc = '/home/reith/deepspeech/ba-dls-deepspeech/descs/train-clean-360.json'
# NOTE(review): argument meanings are not visible from this notebook — 15 is presumably
# a maximum duration and 100 the number of samples used for fitting; confirm in DataGenerator.
datagen.load_train_data(train_desc, 15)
datagen.fit_train(100)

Or load previously saved normalization parameters instead

In [17]:
# NOTE(review): presumably restores normalization parameters saved under the tag
# '860-1000' — confirm in DataGenerator.reload_norm.
datagen.reload_norm('860-1000')

### Load model

#### Theano mode

Load and test weights of a half-phoneme model

In [10]:
# Directory whose saved weights are loaded further down; earlier runs are kept
# commented out for reference. NOTE(review): absolute, machine-specific paths.
#model_dir = '/home/reith/deepspeech/ba-dls-deepspeech/models/22-cont-23-i9696-lr1e-4-train-360-dur15/'
#model_dir = '/home/reith/deepspeech/ba-dls-deepspeech/models/23-cont-i2494-joingrus-dur15-nobn-lr5e-5/'
model_dir = '/home/reith/deepspeech/ba-dls-deepspeech/models/24-cont-train-860'

A summary of training procedure:
- 7 Epochs of dual phoneme-text on train-100 (20)
- 3 Epochs on train-500 for phoneme fine-tuning (21)
- 3 Epochs on train-500 for text fine-tuning (22)
- 2 Epochs on train-360 (23)
- 2 Epochs on train-360, dropping the phoneme branch and batch normalization (24)

Make the half-phoneme model

In [10]:
from model_wrp import HalfPhonemeModelWrapper
model_wrp = HalfPhonemeModelWrapper()
# Build the network. `output_fn` is the callable evaluate() invokes as
# output_fn([x, learning_phase]).
model = model_wrp.compile(nodes=1000, conv_context=5, recur_layers=5)
output_fn = model_wrp.compile_output_fn()

  self.model = Model(input=acoustic_input, output=[phoneme_out, text_out])


Or the GRU model

In [11]:
from model_wrp import GruModel
# Alternative to the half-phoneme cell: rebinds model_wrp, model and output_fn,
# so run only the cell matching the weights being loaded.
model_wrp = GruModel()
model = model_wrp.compile(nodes=1000, conv_context=5, recur_layers=5, batch_norm=False)
output_fn = model_wrp.compile_output_fn()

  activation=for_tf_or_th('softmax', 'linear')
  self.model = Model(input=acoustic_input, output=[network_output])


In [14]:
# Load one specific checkpoint from model_dir; the commented line would load the
# 'best-val-weights.h5' file instead.
# model.load_weights(os.path.join(model_dir, 'best-val-weights.h5'))
model.load_weights(os.path.join(model_dir, 'model_19336_weights.h5'))

#### Tensorflow model

A summary of training procedure:
- 3 Epochs of dual phoneme-text on train-100 with a dropout of 0.3 and a leaky ReLU factor of 0.05 (40)
- 5 Epochs on train-100 for phoneme fine-tuning (41)
- 5 Epochs on train-100 for text fine-tuning (42)
- 5 Epochs on train-360 (43)
- 5 Epochs on train-860, dropping the phoneme branch and batch normalization and reducing dropout to 0.1 (44)
- 20 Epochs on train-860 with the learning rate reduced to 5e-5 and samples up to 20 seconds long (45)

In [9]:
# Rebinds model_dir for the TensorFlow-trained model. NOTE(review): absolute, machine-specific path.
model_dir = '/home/reith/deepspeech/ba-dls-deepspeech/models/44-cont-45-i14490-dur20-lr5e-5'

In [10]:
from model_wrp import GruModel
# Same wrapper as the Theano GRU cell, here built with dropout and a leaky-ReLU alpha;
# rebinds model_wrp, model and output_fn.
model_wrp = GruModel()
model = model_wrp.compile(nodes=1000, conv_context=5, recur_layers=5, dropout=.1, lirelu_alpha=.05, batch_norm=False)
output_fn = model_wrp.compile_output_fn()

  activation=for_tf_or_th('softmax', 'linear')
  self.model = Model(input=acoustic_input, output=[network_output])


In [11]:
# Load the 'best-val-weights.h5' file from the currently selected model_dir.
model.load_weights(os.path.join(model_dir, 'best-val-weights.h5'))

In [None]:
# Print the layer summary of whichever model was built last.
model.summary()

### Evaluate model

In [None]:
# Greedy (argmax) decoding only. `res` is (network output, predictions, per-batch WERs,
# references), the first, second and last taken from the final batch.
res = evaluate(test_generator(datagen, test_samples, normalize=True), output_fn, use_lm=False)

In [None]:
# Beam search with width 27, each utterance's hypotheses rescored by the language model.
# Overwrites `res` from the greedy run.
res = evaluate(test_generator(datagen, test_samples, normalize=True), output_fn, beam_width=27, use_lm=True)

### test...

In [374]:
# thus idleness is the mother
# thus i don't lissisthe mother
def edits(word):
    """Return the set of strings one simple edit away from ``word``.

    An edit is a single-character deletion, a transposition of two adjacent
    characters, a replacement of one character by a letter ``a``-``z``, or an insertion
    of a letter ``a``-``z`` at any position, including the very end of the word.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    # Every way of cutting the word in two, including the empty head and empty tail.
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [l + r[1:] for l, r in splits if r]
    transposes = [l + r[1] + r[0] + r[2:] for l, r in splits if len(r) > 1]
    replaces = [l + c + r[1:] for c in letters for l, r in splits if r]
    # No `if r` filter here: the split with an empty tail is what appends a letter.
    inserts = [l + c + r for c in letters for l, r in splits]
    return set(deletes + transposes + replaces + inserts)

def edits_n(word, n):
    """Return the strings reachable from ``word`` by applying ``edits`` ``n`` times in a row.

    With ``n == 0`` the result is ``{word}``. Each round replaces the current set by the
    union of ``edits(w)`` over all of its members.
    """
    es = set([word])
    for _ in range(n):
        # set().union(*...) also copes with an empty candidate set, where a fold
        # without an initial value raises TypeError.
        es = set().union(*(edits(w) for w in es))
    return es

def words(text):
    """Split ``text`` on runs of whitespace and return the resulting tokens as a list."""
    tokens = text.split()
    return tokens

def known_words(words):
    """Return, as a set, the members of ``words`` that appear in the global ``WORDS``.

    The parameter name shadows the module-level ``words`` function inside this body.
    """
    recognized = set()
    for candidate in words:
        if candidate in WORDS:
            recognized.add(candidate)
    return recognized

def candidate_words(word):
    """Return plausible corrections for ``word``.

    Preference order: the word itself if it is known, else the known words one edit
    round away, else the known words two edit rounds away, else ``[word]`` unchanged.
    Later tiers are only computed when the earlier ones are empty.
    """
    exact = known_words([word])
    if exact:
        return exact
    for distance in (1, 2):
        nearby = known_words(edits_n(word, distance))
        if nearby:
            return nearby
    return [word]

# Quick check. Requires WORDS, which is only defined in the cell below, so run that cell first.
list(candidate_words("swam"))

['swam']

In [336]:
# Vocabulary consulted by known_words(); must be loaded before candidate_words() is called.
with open('./data/lm/words.txt') as f:
    WORDS = set(words(f.read()))

In [None]:
r:a ring of amethyst i could not wear here plainer to my sight than that first kiss
p:a ring of amathyst i could not wear here plainer two my sight then that first kits

In [197]:
# NOTE(review): scratch call from an earlier session. It slices res[0][:, 3, :] (utterance
# on axis 1), whereas evaluate() slices mtp_net_out[i, :, :] (utterance on axis 0), and
# res[2] is a list of per-batch arrays — verify the indexing before re-running.
best_lm_alternative(res[3][3], res[2][3], zip(*beam_decode_u(res[0][:, 3, :], 12, normalize=True)))

she doesn't take up with anybody you know
she doesn't take up with anybody you know
langauge model changed prediction, WER changed from 0.0243902439024 to 0.0


"she doesn't take up with anybody you know"

In [187]:
# Same scratch check for utterance 46, this time with normalize=False in the beam search,
# followed by the prediction stored in `res` for comparison.
# NOTE(review): uses the same axis-1 indexing of res[0] as the cell above — verify before re-running.
print best_lm_alternative(res[3][46], res[2][46], zip(*beam_decode_u(res[0][:, 46, :], 12, normalize=False)))
print res[1][46]

sir i have it in command to inform your excellency that you have been appointed governor of the crown colony which is called britannula
sir i have in command to anform your excellency that you have been appointed governor of the crown colony which is called britain mula
langauge model changed prediction, WER changed from 0.0334572490706 to 0.0334572490706
sir i have in command to anform your excellency that you have been appointed governor of the crown colony which is called britain mula
sir i have in command to anform your excellency that you have been appointed governor of the crown colony which is called britaan mula


In [165]:
import edit_distance
# Reference transcript and two variants of a noisy prediction. The second `pre`
# assignment overrides the first, so only the "to mes" variant is scored.
ref = 'there is no danger of the modern commentators on the timaeus falling into the absurdities of the neo platonists'
pre = 'there is old danger of the madern commontychers un ther to meas falling into dubsurdities of the newo platinists'
pre = 'there is old danger of the madern commontychers un ther to mes falling into dubsurdities of the newo platinists'
#print edit_distance.SequenceMatcher(ref, pre).ratio()
# Error rate of the single (reference, prediction) pair.
word_error_rate([ref], [pre], decoded=True)[0]

0.16216216216216217

#### custom samples

In [None]:
# Hand-picked samples using the same keys test_generator reads: 'duration', 'text' and
# 'key' (the audio path). NOTE(review): absolute, machine-specific paths.
samples = [
    {"duration": 4.905, "text": "he began a confused complaint against the wizard who had vanished behind the curtain on the left", "key": "/mnt/ml-data/LibriSpeech/test-clean/61/70968/61-70968-0000.wav"},
    {"duration": 3.61, "text": "give not so earnest a mind to these mummeries child", "key": "/mnt/ml-data/LibriSpeech/test-clean/61/70968/61-70968-0001.wav"}    
]

In [None]:
# Greedy-decoding evaluation of the hand-picked samples (use_lm defaults to False).
evaluate(test_generator(datagen, samples, normalize=True), output_fn)
