# Speech Recognition

## Imports

In [91]:
import os
import pickle
import librosa
import pandas as pd
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

from IPython.display import Audio

import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras import backend as K

## Data Preparation

In [92]:
rows = []
parent_dir = "../data/SWH-05-20101106"
files = os.listdir(parent_dir)
for f in files:
    audio, fs = librosa.load(f"{parent_dir}/{f}")
    filename = f.split('.')[0]
    row = {'filename': filename, 'audio': audio}
    rows.append(row)
rows[:5]

[{'filename': 'SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part10',
  'audio': array([0.02953335, 0.03225018, 0.02603412, ..., 0.09593043, 0.09478676,
         0.05775513], dtype=float32)},
 {'filename': 'SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part100',
  'audio': array([ 0.00471402,  0.00630584,  0.00576152, ...,  0.01627303,
         -0.00729037, -0.01463527], dtype=float32)},
 {'filename': 'SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part101',
  'audio': array([0.00886934, 0.00965257, 0.0063316 , ..., 0.22327209, 0.280469  ,
         0.        ], dtype=float32)},
 {'filename': 'SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part102',
  'audio': array([-0.01096754, -0.01230842, -0.01015999, ..., -0.21667908,
         -0.20379573, -0.11009098], dtype=float32)},
 {'filename': 'SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part103',
  'audio': array([0.01063866, 0.01384298, 0.01281647, ..., 0.

In [93]:
sample_audios = []
for row in rows:
    audio = row['audio']
    sample_audios.append(audio)
sample_audios[:5]

[array([0.02953335, 0.03225018, 0.02603412, ..., 0.09593043, 0.09478676,
        0.05775513], dtype=float32),
 array([ 0.00471402,  0.00630584,  0.00576152, ...,  0.01627303,
        -0.00729037, -0.01463527], dtype=float32),
 array([0.00886934, 0.00965257, 0.0063316 , ..., 0.22327209, 0.280469  ,
        0.        ], dtype=float32),
 array([-0.01096754, -0.01230842, -0.01015999, ..., -0.21667908,
        -0.20379573, -0.11009098], dtype=float32),
 array([0.01063866, 0.01384298, 0.01281647, ..., 0.0591335 , 0.05393954,
        0.02577941], dtype=float32)]

In [94]:
meta_df = pd.read_csv('../metadata.csv')

In [95]:
meta_df.head()

Unnamed: 0,filename,transcription,filepath,sample_rate,duration
0,SWH-05-20101106_16k-emission_swahili_05h30_-_0...,rais wa tanzania jakaya mrisho kikwete,SWH-05-20101106/SWH-05-20101106_16k-emission_s...,16000,3.14
1,SWH-05-20101106_16k-emission_swahili_05h30_-_0...,yanayo andaliwa nami pendo pondo idhaa ya kisw...,SWH-05-20101106/SWH-05-20101106_16k-emission_s...,16000,3.1
2,SWH-05-20101106_16k-emission_swahili_05h30_-_0...,inayokutangazia moja kwa moja kutoka jijini da...,SWH-05-20101106/SWH-05-20101106_16k-emission_s...,16000,3.65
3,SWH-05-20101106_16k-emission_swahili_05h30_-_0...,juma hili bara la afrika limeshuhudia raia wa ...,SWH-05-20101106/SWH-05-20101106_16k-emission_s...,16000,3.9
4,SWH-05-20101106_16k-emission_swahili_05h30_-_0...,wakipiga kura ya maoni ilikufanya mabadiliko ya,SWH-05-20101106/SWH-05-20101106_16k-emission_s...,16000,2.94


In [96]:
meta_df['sample_rate'].value_counts()

16000    10180
Name: sample_rate, dtype: int64

In [97]:
meta_df.columns.to_list()

['filename', 'transcription', 'filepath', 'sample_rate', 'duration']

In [98]:
txts = []
for row in rows:
    filename = row['filename']
    filter = meta_df[meta_df['filename'] == filename]
    txt = filter[['transcription']].values
    txts.append(txt)

txts[:5]

[array([['rais wa tanzania jakaya mrisho kikwete']], dtype=object),
 array([['yanayo andaliwa nami pendo pondo idhaa ya kiswahili']],
       dtype=object),
 array([['inayokutangazia moja kwa moja kutoka jijini dar es salaam tanzania']],
       dtype=object),
 array([['juma hili bara la afrika limeshuhudia raia wa nchi za niger']],
       dtype=object),
 array([['wakipiga kura ya maoni ilikufanya mabadiliko ya']], dtype=object)]

In [99]:
txts = np.array(txts).reshape(-1)

In [100]:
txts[:5]

array(['rais wa tanzania jakaya mrisho kikwete',
       'yanayo andaliwa nami pendo pondo idhaa ya kiswahili',
       'inayokutangazia moja kwa moja kutoka jijini dar es salaam tanzania',
       'juma hili bara la afrika limeshuhudia raia wa nchi za niger',
       'wakipiga kura ya maoni ilikufanya mabadiliko ya'], dtype=object)

In [101]:
clean_txts = []
alphabets = 'a b c d e f g h i j k l m n o p q r s t u v w x y z'.split()
for txt in txts:
    clean_txt = []
    for c in txt:
        if c not in alphabets and c != ' ':
            continue
        clean_txt.append(c)
    clean_txt = ''.join(clean_txt)
    clean_txts.append(clean_txt)

In [102]:
clean_txts[:5]

['rais wa tanzania jakaya mrisho kikwete',
 'yanayo andaliwa nami pendo pondo idhaa ya kiswahili',
 'inayokutangazia moja kwa moja kutoka jijini dar es salaam tanzania',
 'juma hili bara la afrika limeshuhudia raia wa nchi za niger',
 'wakipiga kura ya maoni ilikufanya mabadiliko ya']

In [103]:
'' in clean_txts

True

In [104]:
df = pd.DataFrame(clean_txts)
df.columns = ['texts']
df.head()

Unnamed: 0,texts
0,rais wa tanzania jakaya mrisho kikwete
1,yanayo andaliwa nami pendo pondo idhaa ya kisw...
2,inayokutangazia moja kwa moja kutoka jijini da...
3,juma hili bara la afrika limeshuhudia raia wa ...
4,wakipiga kura ya maoni ilikufanya mabadiliko ya


In [105]:
idxs = df[df['texts'] == ''].index
idxs

Int64Index([19, 21, 56], dtype='int64')

In [106]:
del clean_txts[idxs[-1]]
del clean_txts[idxs[-2]]
del clean_txts[idxs[-3]]

In [107]:
'' in clean_txts

False

In [108]:
del sample_audios[idxs[-1]]
del sample_audios[idxs[-2]]
del sample_audios[idxs[-3]]

## Tokenizer

In [109]:
def character_dict():
    alphabet = 'a b c d e f g h i j k l m n o p q r s t u v w x y z'
    supported = alphabet.split()

    char_map = {}
    char_map[""] = 0
    char_map["<SPACE>"] = 1
    idx = 2
    for c in supported:
        char_map[c] = idx
        idx += 1
    index_map = {v: k for k, v in char_map.items()}
    return char_map, index_map

In [110]:
char_map, index_map = character_dict()

In [111]:
char_map

{'': 0,
 '<SPACE>': 1,
 'a': 2,
 'b': 3,
 'c': 4,
 'd': 5,
 'e': 6,
 'f': 7,
 'g': 8,
 'h': 9,
 'i': 10,
 'j': 11,
 'k': 12,
 'l': 13,
 'm': 14,
 'n': 15,
 'o': 16,
 'p': 17,
 'q': 18,
 'r': 19,
 's': 20,
 't': 21,
 'u': 22,
 'v': 23,
 'w': 24,
 'x': 25,
 'y': 26,
 'z': 27}

In [112]:
def text_to_int_sequence(text):
    """ Convert text to an integer sequence """
    int_sequence = []
    for c in text:
        if c == ' ':
            ch = char_map['<SPACE>']
        elif c in alphabets:
            ch = char_map[c]
        else:
            print(c)
            print('character not found')
            break
        int_sequence.append(ch)
    return np.array(int_sequence)

In [113]:
def int_sequence_to_text(int_sequence):
    """ Convert an integer sequence to text """
    textch = []
    for c in int_sequence:
        ch = index_map[c]
        textch.append(ch)
    text = ''.join(textch)
    text = text.replace('<SPACE>', ' ')
    return text

## Data Generator

In [114]:
class DataGenerator(tf.keras.utils.Sequence):
    def __init__(self, audios, texts, batch_size=32):
        self.audios = audios
        self.texts = texts
        self.batch_size = batch_size
        self.steps = int(len(self.audios) // self.batch_size)
        # self.index = 0
        self.on_epoch_end()

    # def shuffle(self):
    #     np.random.shuffle(self.indexes)

    def __len__(self):
        return self.steps

    def on_epoch_end(self):
        self.indexes = np.arange(self.steps*self.batch_size)
        # np.random.shuffle(self.indexes)

    def data_generation(self, batch_audios, batch_texts):

        longest_audio = max([len(i) for i in batch_audios])
        longest_txt = max([len(i) for i in batch_texts])

        audios          = np.zeros([int(self.batch_size), longest_audio], dtype="float32")
        txts            = np.zeros([int(self.batch_size), longest_txt], dtype="int64")
        audio_length    = np.zeros([int(self.batch_size)], dtype="int64")
        txt_length      = np.zeros([int(self.batch_size)], dtype="int64")

        i = 0
        for audio, txt in zip(batch_audios, batch_texts):

            txt_len = len(txt)

            txt = text_to_int_sequence(txt)
            # print(txts.shape)
            # print(np.array(txt).shape)
            txts[i,: txt_len] = txt

            audio_len = len(audio)

            audios[i, :audio_len] = audio

            audio_length[i] = audio_len
            txt_length[i] = txt_len

            i+=1          
            
        outputs = {'ctc': np.zeros([self.batch_size])}
        inputs = {
                    'the_input':    tf.convert_to_tensor(audios), 
                    'the_labels':   tf.convert_to_tensor(txts), 
                    'input_length': tf.convert_to_tensor(audio_length), 
                    'label_length': tf.convert_to_tensor(txt_length)
                }
        return (inputs, outputs)


    def __getitem__(self, index):
        indexes = self.indexes[int(index*self.batch_size):int((index+1)*self.batch_size)]
    
        batch_audios = [self.audios[int(i)] for i in indexes]
        batch_texts = [self.texts[int(i)] for i in indexes]
        
        return  self.data_generation(batch_audios, batch_texts)

In [115]:
dg = DataGenerator(sample_audios, clean_txts)

In [116]:
len(dg)

6

In [117]:
batch1 = dg[0][0]


In [118]:
batch1

{'the_input': <tf.Tensor: shape=(32, 128993), dtype=float32, numpy=
 array([[ 0.02953335,  0.03225018,  0.02603412, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.00471402,  0.00630584,  0.00576152, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.00886934,  0.00965257,  0.0063316 , ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [-0.01929947, -0.0214183 , -0.01492864, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.00464254,  0.00063416, -0.00608059, ...,  0.        ,
          0.        ,  0.        ],
        [-0.02031364, -0.02287264, -0.02081008, ...,  0.        ,
          0.        ,  0.        ]], dtype=float32)>,
 'the_labels': <tf.Tensor: shape=(32, 102), dtype=int64, numpy=
 array([[19,  2, 10, ...,  0,  0,  0],
        [26,  2, 15, ...,  0,  0,  0],
        [10, 15,  2, ...,  0,  0,  0],
        ...,
        [12, 22, 13, ...,  0,  0,  0],
        [15,  2,  1, ...,  0,  0,  0],
        [22, 13, 

## LogMelSpectrogram

In [119]:
class LogMelSpectrogram(tf.keras.layers.Layer):
    """Compute log-magnitude mel-scaled spectrograms."""

    def __init__(self, sample_rate, fft_size, hop_size, n_mels,
                 f_min=0.0, f_max=None, **kwargs):
        super(LogMelSpectrogram, self).__init__(**kwargs)
        self.sample_rate = sample_rate
        self.fft_size = fft_size
        self.hop_size = hop_size
        self.n_mels = n_mels
        self.f_min = f_min
        self.f_max = f_max if f_max else sample_rate / 2
        self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=self.n_mels,
            num_spectrogram_bins=fft_size // 2 + 1,
            sample_rate=self.sample_rate,
            lower_edge_hertz=self.f_min,
            upper_edge_hertz=self.f_max)

    def build(self, input_shape):
        self.non_trainable_weights.append(self.mel_filterbank)
        super(LogMelSpectrogram, self).build(input_shape)

    def call(self, waveforms):
        """Forward pass.
        Parameters
        ----------
        waveforms : tf.Tensor, shape = (None, n_samples)
            A Batch of mono waveforms.
        Returns
        -------
        log_mel_spectrograms : (tf.Tensor), shape = (None, time, freq, ch)
            The corresponding batch of log-mel-spectrograms
        """
        def _tf_log10(x):
            numerator = tf.math.log(x)
            denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype))
            return numerator / denominator

        def power_to_db(magnitude, amin=1e-16, top_db=80.0):
            """
            https://librosa.github.io/librosa/generated/librosa.core.power_to_db.html
            """
            ref_value = tf.reduce_max(magnitude)
            log_spec = 10.0 * _tf_log10(tf.maximum(amin, magnitude))
            log_spec -= 10.0 * _tf_log10(tf.maximum(amin, ref_value))
            log_spec = tf.maximum(log_spec, tf.reduce_max(log_spec) - top_db)

            return log_spec

        spectrograms = tf.signal.stft(waveforms,
                                      frame_length=self.fft_size,
                                      frame_step=self.hop_size,
                                      pad_end=False)

        magnitude_spectrograms = tf.abs(spectrograms)

        mel_spectrograms = tf.matmul(tf.square(magnitude_spectrograms),
                                     self.mel_filterbank)

        log_mel_spectrograms = power_to_db(mel_spectrograms)

        # add channel dimension
        log_mel_spectrograms = tf.expand_dims(log_mel_spectrograms, 3)

        return log_mel_spectrograms

    def get_config(self):
        config = {
            'fft_size': self.fft_size,
            'hop_size': self.hop_size,
            'n_mels': self.n_mels,
            'sample_rate': self.sample_rate,
            'f_min': self.f_min,
            'f_max': self.f_max,
        }
        config.update(super(LogMelSpectrogram, self).get_config())

        return config

## CTC


In [120]:
def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

In [121]:
def input_lengths_lambda_func(args):
    input_length = args
    return tf.cast(tf.math.floor(input_length/hop_size)-1, dtype="float32")

In [122]:
def add_ctc_loss(model_builder):
    the_labels      = Input(name='the_labels',      shape=(None,), dtype='float32')
    input_lengths   = Input(name='input_length',    shape=(1,), dtype='float32')
    label_lengths   = Input(name='label_length',    shape=(1,), dtype='float32')

    input_lengths2 = Lambda(input_lengths_lambda_func)(input_lengths)
    if model_builder.output_length:
         output_lengths  = Lambda(model_builder.output_length)(input_lengths2)
    else:
         output_lengths  = input_lengths2
    
    # CTC loss is implemented in a lambda layer
    loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([model_builder.output, the_labels, output_lengths, label_lengths])
    model = Model( inputs=[model_builder.input, the_labels, input_lengths, label_lengths],  outputs=loss_out)
    return model

## Models

In [123]:
def preprocessin_model(sample_rate, fft_size, frame_step, n_mels, mfcc=False):

    input_data = Input(name='input', shape=(None,), dtype="float32")
    featLayer = LogMelSpectrogram(
        fft_size=fft_size,
        hop_size=frame_step,
        n_mels=n_mels,
        
        sample_rate=sample_rate,
        f_min=0.0,
        
        f_max=int(sample_rate / 2)
    )(input_data)
    
    x = BatchNormalization()(featLayer)
    model = Model(inputs=input_data, outputs=x, name="preprocessin_model")

    return model

In [124]:
def simple_rnn_model(input_dim, output_dim=224):

    input_data = Input(name='the_input', shape=(None, input_dim))
    simp_rnn = GRU(output_dim, return_sequences=True,
                   implementation=2, name='rnn')(input_data)
    y_pred = Activation('softmax', name='softmax')(simp_rnn)
    model = Model(inputs=input_data, outputs=y_pred, name="simple_rnn_model")
    model.output_length = lambda x: x
    return model


In [125]:
def train(model_builder, 
          data_gen,
          epochs, 
          verbose=1,
          optimizer=SGD(learning_rate=0.002, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5),
          ):    
              
    model = add_ctc_loss(model_builder)

    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)
    print(model.summary())


    hist = model.fit_generator(generator=data_gen,
                               epochs=epochs,
                               verbose=verbose, 
                               use_multiprocessing=False)

## Model Trainig

In [126]:

sample_rate = 16000
fft_size = 1024
frame_step = 512
n_mels = 128

batch_size = 32
epochs = 10
data_len = len(clean_txts)
output_dim = len(char_map) + 2


In [127]:
dg = DataGenerator(sample_audios, clean_txts, batch_size)

In [128]:
preprocess_model = preprocessin_model(sample_rate, fft_size, frame_step, n_mels)
preprocess_model.summary()


Model: "preprocessin_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, None)]            0         
_________________________________________________________________
log_mel_spectrogram_2 (LogMe (None, None, 128, 1)      0         
_________________________________________________________________
batch_normalization_2 (Batch (None, None, 128, 1)      4         
Total params: 4
Trainable params: 2
Non-trainable params: 2
_________________________________________________________________


In [129]:
pip install numpy==1.19.5

Note: you may need to restart the kernel to use updated packages.


'C:\Users\Maelaf' is not recognized as an internal or external command,
operable program or batch file.


In [130]:
speech_model = simple_rnn_model(n_mels, output_dim)
speech_model.summary()


Model: "simple_rnn_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       [(None, None, 128)]       0         
_________________________________________________________________
rnn (GRU)                    (None, None, 30)          14400     
_________________________________________________________________
softmax (Activation)         (None, None, 30)          0         
Total params: 14,400
Trainable params: 14,400
Non-trainable params: 0
_________________________________________________________________


In [131]:
def build_model(output_dim, custom_model, preprocess_model, mfcc=False, calc=None):

    input_audios = Input(name='the_input', shape=(None,))
    pre = preprocess_model(input_audios)
    pre = tf.squeeze(pre, [3])

    y_pred = custom_model(pre)
    model = Model(inputs=input_audios, outputs=y_pred, name="model_builder")
    model.output_length = calc

    return model

In [132]:
model = build_model(output_dim, speech_model, preprocess_model)
model.summary()


Model: "model_builder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       [(None, None)]            0         
_________________________________________________________________
preprocessin_model (Function (None, None, 128, 1)      4         
_________________________________________________________________
tf.compat.v1.squeeze_2 (TFOp (None, None, 128)         0         
_________________________________________________________________
simple_rnn_model (Functional (None, None, 30)          14400     
Total params: 14,404
Trainable params: 14,402
Non-trainable params: 2
_________________________________________________________________


In [133]:
pip install --upgrade tensorflow-estimator==1.19.0

Note: you may need to restart the kernel to use updated packages.


'C:\Users\Maelaf' is not recognized as an internal or external command,
operable program or batch file.


In [134]:
import mlflow

In [135]:
mlflow.set_experiment('Speech model simple rnn')
mlflow.tensorflow.autolog()
hop_size = 512
train(model, dg, epochs=10)



INFO: 'Speech model simple rnn' does not exist. Creating a new experiment


2021/08/11 19:02:55 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'ed05f9cd599442fe9a0c9222184710b8', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current tensorflow workflow


Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
the_input (InputLayer)          [(None, None)]       0                                            
__________________________________________________________________________________________________
preprocessin_model (Functional) (None, None, 128, 1) 4           the_input[0][0]                  
__________________________________________________________________________________________________
tf.compat.v1.squeeze_2 (TFOpLam (None, None, 128)    0           preprocessin_model[0][0]         
__________________________________________________________________________________________________
input_length (InputLayer)       [(None, 1)]          0                                            
____________________________________________________________________________________________



Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: C:\Users\MAELAF~1\AppData\Local\Temp\tmpfzjw_guh\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\MAELAF~1\AppData\Local\Temp\tmpfzjw_guh\model\data\model\assets
