# Speech Recognition

## Imports

In [None]:
import os
import pickle
import librosa
import pandas as pd
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

from IPython.display import Audio

import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras import backend as K

## Data Preparation

In [None]:
# Load every file in the corpus directory into a list of
# {'filename': <name without extension>, 'audio': <waveform>} records.
rows = []
parent_dir = "../SWH-05-20101106"
# os.listdir returns entries in arbitrary order; sorting makes the row order
# (and every positional index derived from it) reproducible across runs.
files = sorted(os.listdir(parent_dir))
for f in files:
    # NOTE(review): librosa.load resamples to its own default rate unless `sr`
    # is passed (22050 Hz per the librosa docs) - confirm this is intended.
    audio, fs = librosa.load(os.path.join(parent_dir, f))
    # splitext removes only the final extension, so dots inside the stem survive.
    filename = os.path.splitext(f)[0]
    row = {'filename': filename, 'audio': audio}
    rows.append(row)
rows[:5]

[{'filename': 'SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part102',
  'audio': array([-0.01096754, -0.01230842, -0.01015999, ..., -0.21667908,
         -0.20379573, -0.11009098], dtype=float32)},
 {'filename': 'SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part107',
  'audio': array([-0.00262849, -0.00256155, -0.00178459, ..., -0.2567303 ,
         -0.21261317,  0.        ], dtype=float32)},
 {'filename': 'SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part12',
  'audio': array([-0.00823285, -0.00249539, -0.00311783, ..., -0.18402188,
         -0.19362031, -0.11912253], dtype=float32)},
 {'filename': 'SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part64',
  'audio': array([ 0.00796661,  0.0098575 ,  0.0089713 , ...,  0.08651416,
          0.04152827, -0.00214096], dtype=float32)},
 {'filename': 'SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part68',
  'audio': array([-0.0083479 , -0.0101786 , -0.0102

In [None]:
# Pull out just the waveform of every loaded record, preserving the order of `rows`.
sample_audios = [record['audio'] for record in rows]
sample_audios[:5]

[array([-0.01096754, -0.01230842, -0.01015999, ..., -0.21667908,
        -0.20379573, -0.11009098], dtype=float32),
 array([-0.00262849, -0.00256155, -0.00178459, ..., -0.2567303 ,
        -0.21261317,  0.        ], dtype=float32),
 array([-0.00823285, -0.00249539, -0.00311783, ..., -0.18402188,
        -0.19362031, -0.11912253], dtype=float32),
 array([ 0.00796661,  0.0098575 ,  0.0089713 , ...,  0.08651416,
         0.04152827, -0.00214096], dtype=float32),
 array([-0.0083479 , -0.0101786 , -0.01026751, ...,  0.0113914 ,
         0.02161873,  0.        ], dtype=float32)]

In [None]:
# Read the metadata CSV (one directory up) into a DataFrame.
meta_df = pd.read_csv('../metadata.csv')

In [None]:
# Preview the first rows of the metadata table.
meta_df.head()

Unnamed: 0,filename,transcription,filepath,sample_rate,duration
0,SWH-05-20101106_16k-emission_swahili_05h30_-_0...,rais wa tanzania jakaya mrisho kikwete,SWH-05-20101106/SWH-05-20101106_16k-emission_s...,16000,3.14
1,SWH-05-20101106_16k-emission_swahili_05h30_-_0...,yanayo andaliwa nami pendo pondo idhaa ya kisw...,SWH-05-20101106/SWH-05-20101106_16k-emission_s...,16000,3.1
2,SWH-05-20101106_16k-emission_swahili_05h30_-_0...,inayokutangazia moja kwa moja kutoka jijini da...,SWH-05-20101106/SWH-05-20101106_16k-emission_s...,16000,3.65
3,SWH-05-20101106_16k-emission_swahili_05h30_-_0...,juma hili bara la afrika limeshuhudia raia wa ...,SWH-05-20101106/SWH-05-20101106_16k-emission_s...,16000,3.9
4,SWH-05-20101106_16k-emission_swahili_05h30_-_0...,wakipiga kura ya maoni ilikufanya mabadiliko ya,SWH-05-20101106/SWH-05-20101106_16k-emission_s...,16000,2.94


In [None]:
# Count how many metadata rows carry each distinct sample_rate value.
meta_df['sample_rate'].value_counts()

16000    10180
Name: sample_rate, dtype: int64

In [None]:
# List the metadata column names.
meta_df.columns.to_list()

['filename', 'transcription', 'filepath', 'sample_rate', 'duration']

In [None]:
# Look up the transcription of every loaded clip, in the same order as `rows`,
# so that entry i of `txts` belongs to entry i of the audio list.
txts = []
for row in rows:
    filename = row['filename']
    # Named `matches` rather than `filter` so the builtin is not shadowed.
    matches = meta_df[meta_df['filename'] == filename]
    if len(matches) != 1:
        # Zero or several hits would give this entry a different shape from the
        # others and silently break the positional pairing with the audio.
        raise ValueError(
            f"expected exactly one metadata row for {filename!r}, "
            f"found {len(matches)}"
        )
    # Double brackets select a one-column frame, so this is a (1, 1) array.
    txt = matches[['transcription']].values
    txts.append(txt)

txts[:5]

[array([['juma hili bara la afrika limeshuhudia raia wa nchi za niger']],
       dtype=object),
 array([['na rais aliyetangulia henry konan berdi']], dtype=object),
 array([['baada ya kushinda katika uchaguzi mkuu wa taifa hilo']],
       dtype=object),
 array([['siku ya jumatano maharamia hao wa kisomali']], dtype=object),
 array([['pamoja na abiria ishirini wakiwemo raia wa madagascar']],
       dtype=object)]

In [None]:
# Collapse the collected transcription arrays into a single 1-D array.
txts = np.array(txts).ravel()

In [None]:
# Show the first five entries of txts.
txts[:5]

array(['juma hili bara la afrika limeshuhudia raia wa nchi za niger',
       'na rais aliyetangulia henry konan berdi',
       'baada ya kushinda katika uchaguzi mkuu wa taifa hilo',
       'siku ya jumatano maharamia hao wa kisomali',
       'pamoja na abiria ishirini wakiwemo raia wa madagascar'],
      dtype=object)

In [None]:
# Keep only the lowercase letters a-z and spaces of each transcription;
# every other character is dropped.
alphabets = 'a b c d e f g h i j k l m n o p q r s t u v w x y z'.split()
clean_txts = [
    ''.join(ch for ch in txt if ch == ' ' or ch in alphabets)
    for txt in txts
]

In [None]:
# Show the first five cleaned transcriptions.
clean_txts[:5]

['juma hili bara la afrika limeshuhudia raia wa nchi za niger',
 'na rais aliyetangulia henry konan berdi',
 'baada ya kushinda katika uchaguzi mkuu wa taifa hilo',
 'siku ya jumatano maharamia hao wa kisomali',
 'pamoja na abiria ishirini wakiwemo raia wa madagascar']

In [None]:
# Check whether any cleaned transcription is an empty string.
'' in clean_txts

True

In [None]:
# Put the cleaned transcriptions into a one-column frame named 'texts'.
df = pd.DataFrame(clean_txts, columns=['texts'])
df.head()

Unnamed: 0,texts
0,juma hili bara la afrika limeshuhudia raia wa ...
1,na rais aliyetangulia henry konan berdi
2,baada ya kushinda katika uchaguzi mkuu wa taif...
3,siku ya jumatano maharamia hao wa kisomali
4,pamoja na abiria ishirini wakiwemo raia wa mad...


In [None]:
# Row labels of the transcriptions that are empty strings.
is_empty = df['texts'] == ''
idxs = df.loc[is_empty].index
idxs

Int64Index([51, 161, 190], dtype='int64')

In [None]:
# Remove every position listed in `idxs` from clean_txts, however many there
# are. Deleting from the highest position down keeps the remaining positions
# valid while the list shrinks.
for empty_idx in sorted(idxs, reverse=True):
    # Fails loudly if this cell is re-run against an already pruned list,
    # instead of silently deleting a valid transcription.
    assert clean_txts[empty_idx] == '', (
        f"clean_txts[{empty_idx}] is not empty; idxs is stale - "
        "re-run from the cleaning cell"
    )
    del clean_txts[empty_idx]

In [None]:
# Check whether clean_txts still contains an empty string.
'' in clean_txts

False

In [None]:
# Remove the audio clips at the positions listed in `idxs`, however many there
# are. Deleting from the highest position down keeps the remaining positions
# valid while the list shrinks.
for empty_idx in sorted(idxs, reverse=True):
    del sample_audios[empty_idx]

# Audio and text are paired by position, so the two lists must stay the same
# length; a mismatch means a cell was re-run or the lists were pruned differently.
assert len(sample_audios) == len(clean_txts), (
    f"{len(sample_audios)} audio clips vs {len(clean_txts)} transcriptions"
)

## Tokenizer

In [None]:
def character_dict():
    """Build the lookup tables between symbols and integer ids.

    Id 0 is the empty string, id 1 is the ``"<SPACE>"`` token, and the
    letters a-z take ids 2 through 27 in alphabetical order.

    Returns:
        tuple: ``(char_map, index_map)`` where ``char_map`` maps each symbol
        to its id and ``index_map`` is the inverse mapping.
    """
    letters = 'a b c d e f g h i j k l m n o p q r s t u v w x y z'.split()

    char_map = {"": 0, "<SPACE>": 1}
    # Letters are numbered after the two reserved entries.
    char_map.update(
        (letter, position) for position, letter in enumerate(letters, start=2)
    )

    index_map = dict(zip(char_map.values(), char_map.keys()))
    return char_map, index_map

In [None]:
# Build the symbol -> id and id -> symbol tables used by the conversion helpers.
char_map, index_map = character_dict()

In [None]:
# Display the symbol -> id mapping.
char_map

{'': 0,
 '<SPACE>': 1,
 'a': 2,
 'b': 3,
 'c': 4,
 'd': 5,
 'e': 6,
 'f': 7,
 'g': 8,
 'h': 9,
 'i': 10,
 'j': 11,
 'k': 12,
 'l': 13,
 'm': 14,
 'n': 15,
 'o': 16,
 'p': 17,
 'q': 18,
 'r': 19,
 's': 20,
 't': 21,
 'u': 22,
 'v': 23,
 'w': 24,
 'x': 25,
 'y': 26,
 'z': 27}

In [None]:
def text_to_int_sequence(text, mapping=None):
    """ Convert text to an integer sequence

    Args:
        text: String to encode, one symbol per character.
        mapping: Optional dict from character to integer id. It must contain
            a ``'<SPACE>'`` key, which is used for the space character.
            Defaults to the notebook-level ``char_map``.

    Returns:
        1-D integer numpy array holding one id per supported character.
        A character that is not in the mapping is reported on stdout and
        skipped; the rest of the text is still encoded.
    """
    if mapping is None:
        mapping = char_map
    int_sequence = []
    for c in text:
        if c == ' ':
            ch = mapping['<SPACE>']
        elif c in mapping:
            ch = mapping[c]
        else:
            print(c)
            print('character not found')
            # Skip only this character; stopping here would silently cut off
            # the remainder of the transcription.
            continue
        int_sequence.append(ch)
    # Explicit dtype so that an empty result is still an integer array.
    return np.array(int_sequence, dtype=int)

In [None]:
def int_sequence_to_text(int_sequence):
    """ Convert an integer sequence to text

    Every id is looked up in the notebook-level ``index_map``, the resulting
    pieces are concatenated, and each ``'<SPACE>'`` token is turned into a
    single space character.
    """
    joined = ''.join(index_map[token] for token in int_sequence)
    return joined.replace('<SPACE>', ' ')