# Speech Recognition


## Imports

In [110]:
import cv2
import pickle
import librosa
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.io as pio
import plotly.express as px
import matplotlib.pyplot as plt
from IPython.display import Image
import plotly.graph_objects as go

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras.callbacks import Callback, EarlyStopping, ModelCheckpoint, TensorBoard


In [111]:
sns.set()
plt.style.use('ggplot')
%matplotlib inline
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option("expand_frame_repr", False)
pd.set_option('display.float_format', '{:.2f}'.format)
sys.path.append(os.path.abspath(os.path.join('../scripts')))

In [112]:
from clean_audio import CleanAudio
from file_handler import FileHandler
from audio_vis import AudioVis

In [113]:
# Instantiate the project helpers imported from the local clean_audio,
# file_handler and audio_vis modules.
clean_audio = CleanAudio()
file_handler = FileHandler()
audio_vis = AudioVis()

## Load Data


In [114]:
# Base directories of the train / test splits. get_paths() appends a sample
# key plus the ".npy" suffix to these prefixes.
PATH_TRAIN_WAV = "../data/AMHARIC_CLEAN/train/wav/"
PATH_TEST_WAV = "../data/AMHARIC_CLEAN/test/wav/"

In [115]:
# Load the metadata table; per the preview below each row carries a key, the
# transcription text, char_length, duration, speed and a Train/Test category.
data = pd.read_csv(r'../data/final_data.csv')
data.head(5)

Unnamed: 0.1,Unnamed: 0,key,text,char_length,duration,speed,category
0,0,tr_10000_tr097082,የተለያዩ የትግራይ አውራጃ ተወላጆች ገንዘባቸውን አዋጥተው የልማት ተቋማትን እንዲመሰርቱ ትልማ አይፈቅድ ም,67,7.42,9.02,Train
1,1,tr_10001_tr097083,የጠመንጃ ተኩስ ተከፈተና አራት የኤርትራ ወታደሮች ተገደሉ,36,4.67,7.71,Train
2,2,tr_10002_tr097084,ላነሷቸው ጥያቄዎች የሰጡትን መልስ አቅርበ ነዋል,30,4.67,6.42,Train
3,3,tr_10003_tr097085,እብዱ አስፋልቱ ላይየ ኰለኰ ለው ድንጋይ መኪና አላሳልፍ አለ,38,4.42,8.61,Train
4,4,tr_10004_tr097086,ጠጁን ኰ መኰ መ ኰ መኰ መና ሚስቱን ሲ ያሰቃያት አደረ,35,4.22,8.29,Train


In [116]:
def get_paths(df):
    """Return the ``.npy`` file path for every row of ``df``, in row order.

    Rows whose ``category`` equals ``"Train"`` are resolved under
    ``PATH_TRAIN_WAV``; every other row is resolved under ``PATH_TEST_WAV``.
    The file name is the row's ``key`` followed by ``.npy``.
    """
    return [
        (PATH_TRAIN_WAV if category == "Train" else PATH_TEST_WAV) + key + ".npy"
        for category, key in zip(df["category"], df["key"])
    ]

In [117]:
# Attach the .npy path of every utterance, then order the rows from the
# shortest to the longest clip and renumber the index accordingly.
data["path"] = get_paths(data)
data.sort_values(by=["duration"], inplace=True)
data.reset_index(drop=True, inplace=True)
# Keep only the columns read by the data generator further down.
data = data[["text", "char_length", "duration", "path"]]
# Display everything except the path column.
data[["text", "char_length", "duration"]]

Unnamed: 0,text,char_length,duration
0,ሼራተን አዲስ ተመረቀ,13,2.05
1,ፖሊሱ እስረኞቹን ቆጠረ,14,2.05
2,ይህ ቀስ በቀስ እያደገ ሄደ,17,2.05
3,ኢነጋማ ህጋዊ እውቅና አገኘ,17,2.05
4,በተጨባጭ ስና የው ግን ባዶ ነው,20,2.05
...,...,...,...
10669,ቦናፓርቲ ያዊ ያልሆኑ ብዛት ያላቸው ጸረ ህዝብ መንግስታት በጸረ ዴሞከራሲ ና በሙስና ውስጥ ሲዘፈቁ እንደሚታዩ ለማወቅ ትንሽ አስተውሎት ን ነው የሚ ጠይቀው,98,13.95
10670,ከዚህ እጅግዘመናዊ ና ውድ ሰአት ሽያጭ ትርፍ ሶስት በመቶ በቋሚነት ሳኦ ክሪስቶ ቮል ፋውንዴሽን ለተባለው የሮናልዶ የእርዳታ ድርጅት ይው ላል,89,13.95
10671,ይልቁንም በተለመደው አኳኋን ከሚኒስትሩ በታች መሆን ያለበት ኤታማዦር ሹም በቀጥታ ሪፖርት የሚያደርገውና ተጠሪነቱ ለጠቅላይ ሚኒስትሩ ነው,86,13.95
10672,ፕሮጀክቱን ለማዘጋጀትና ለማቀነባበር እንዲሁም ጥናቱና ዲዛይኑን ተግባራዊ ለማድረግ እንዲቻል ከሶስቱም ሀገሮች የተውጣጡ ባለሙያዎች ያሉት የፕሮጀክት ጽፈት ቤት እንደሚኖርም ሚኒስትሩ ገልጸዋል,119,13.95


## Tokenizer

In [75]:
class TokenizerWrap(Tokenizer):
    """Character-level Keras ``Tokenizer`` with padding and decoding helpers.

    On construction the tokenizer is fitted on ``texts`` and the whole corpus
    is converted both to integer sequences (``tokens``) and to a fixed-width
    matrix (``tokens_padded``). Token ``0`` is treated as padding by the
    decoding helpers.

    Args:
        texts: iterable of strings to fit on and encode.
        padding: side on which sequences are padded, ``'pre'`` or ``'post'``
            (forwarded to ``pad_sequences``).
        len_sent: fixed sequence length used for padding / truncation.
        filters: forwarded unchanged to ``Tokenizer(filters=...)``.
        reverse: when true, every sequence is reversed and truncation
            happens at the front (``'pre'``) instead of the back.
    """

    def __init__(self, texts, padding, len_sent, filters, reverse=False):
        super().__init__(filters=filters, char_level=True)

        self.len_sent = len_sent
        # Remember the padding side so text_to_tokens() pads single texts the
        # same way as the corpus-wide ``tokens_padded`` matrix.
        self.padding = padding
        self.fit_on_texts(texts)

        # Inverse of ``word_index`` for decoding.
        self.index_to_word = {index: char for char, index in self.word_index.items()}
        self.tokens = self.texts_to_sequences(texts)

        if reverse:
            self.tokens = [list(reversed(x)) for x in self.tokens]
            truncating = 'pre'
        else:
            truncating = 'post'

        self.tokens_padded = pad_sequences(self.tokens,
                                           maxlen=len_sent,
                                           padding=padding,
                                           truncating=truncating
                                           )

    def token_to_word(self, token):
        """Return the character for ``token``; the padding token 0 maps to a space."""
        return " " if token == 0 else self.index_to_word[token]

    def tokens_to_string(self, tokens):
        """Decode an iterable of tokens into text, skipping padding (0).

        Raises:
            KeyError: if a non-zero token is not part of the fitted vocabulary.
        """
        return "".join(self.index_to_word[token] for token in tokens if token != 0)

    def text_to_tokens(self, text, reverse=False, padding=False):
        """Encode a single string.

        Args:
            text: the string to encode.
            reverse: reverse the token order and truncate at the front.
            padding: when true, pad / truncate the result to ``len_sent``
                using the padding side given to the constructor.

        Returns:
            A 2-D array with one row; its width is ``len_sent`` when
            ``padding`` is true, otherwise the number of tokens in ``text``.
        """
        tokens = np.array(self.texts_to_sequences([text]))

        if reverse:
            tokens = np.flip(tokens, axis=1)
            truncating = 'pre'
        else:
            truncating = 'post'

        if padding:
            # Instances pickled before ``self.padding`` existed lack the
            # attribute; fall back to their previous behaviour (pad on the
            # truncation side) instead of failing.
            pad_side = getattr(self, "padding", truncating)
            tokens = pad_sequences(tokens,
                                   maxlen=self.len_sent,
                                   padding=pad_side,
                                   truncating=truncating
                                   )
        return tokens


In [8]:
# Fixed width every tokenized sentence is padded / truncated to.
# NOTE(review): the remark below mentions ~150 chars, which exceeds this
# limit — longer sentences would be truncated by pad_sequences; confirm.
MAX_SENTENCE_LENGTH = 125       # The longest sentence in the data is around 150 chars
# Passed as the `filters` argument of the tokenizer: ASCII punctuation,
# tab/newline, plus the extra characters added for the Amharic corpus.
# NOTE(review): Keras' Tokenizer may not apply `filters` when char_level=True
# — verify that these characters are actually removed.
filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n።”፤፦’፥'  # { ።”፤፦’፥' } unique for amharic

In [9]:
%%time
# Fit the character tokenizer on every transcription; sequences are padded at
# the end and cut to MAX_SENTENCE_LENGTH.
tokenizer = TokenizerWrap(texts=data.text,
                          padding='post',
                          reverse=False,
                          len_sent=MAX_SENTENCE_LENGTH,
                          filters=filters)

CPU times: user 494 ms, sys: 8.66 ms, total: 503 ms
Wall time: 524 ms


In [10]:
# Vocabulary size, followed by the character -> index mapping.
print(len(tokenizer.word_index))
print(tokenizer.word_index)

222
{' ': 1, 'ን': 2, 'ት': 3, 'ው': 4, 'ስ': 5, 'ያ': 6, 'የ': 7, 'ተ': 8, 'በ': 9, 'አ': 10, 'ል': 11, 'እ': 12, 'ለ': 13, 'ር': 14, 'መ': 15, 'ም': 16, 'ች': 17, 'ና': 18, 'ደ': 19, 'ነ': 20, 'ገ': 21, 'ማ': 22, 'ባ': 23, 'ይ': 24, 'ሚ': 25, 'ግ': 26, 'ራ': 27, 'ቸ': 28, 'ላ': 29, 'ብ': 30, 'ድ': 31, 'ረ': 32, 'ሰ': 33, 'ከ': 34, 'ወ': 35, 'ኢ': 36, 'ታ': 37, 'ዳ': 38, 'ክ': 39, 'ዮ': 40, 'ዋ': 41, 'ህ': 42, 'ጵ': 43, 'ጥ': 44, 'ቀ': 45, 'ሪ': 46, 'ጠ': 47, 'ቅ': 48, 'ዲ': 49, 'ሳ': 50, 'ዎ': 51, 'ሮ': 52, 'ሩ': 53, 'ሉ': 54, 'ሆ': 55, 'ሁ': 56, 'ጋ': 57, 'ሊ': 58, 'ቶ': 59, 'ካ': 60, 'ፈ': 61, 'ጣ': 62, 'ፍ': 63, 'ሀ': 64, 'ሞ': 65, 'ሽ': 66, 'ዊ': 67, 'ዘ': 68, 'ቱ': 69, 'ሬ': 70, 'ኤ': 71, 'ኮ': 72, 'ሎ': 73, 'ኛ': 74, 'ዛ': 75, 'ሲ': 76, 'ቃ': 77, 'ጉ': 78, 'ቡ': 79, 'ቻ': 80, 'ዝ': 81, 'ፕ': 82, 'ቢ': 83, 'ዚ': 84, 'ኑ': 85, 'ሙ': 86, 'ሶ': 87, 'ጀ': 88, 'ቁ': 89, 'ኖ': 90, 'ኩ': 91, 'ቋ': 92, 'ሌ': 93, 'ቤ': 94, 'ሱ': 95, 'ኒ': 96, 'ቹ': 97, 'ኝ': 98, 'ጸ': 99, 'ዱ': 100, 'ቲ': 101, 'ጅ': 102, 'ሸ': 103, 'ዜ': 104, 'ቴ': 105, 'ቆ': 106, 'ዙ': 107, 'ዴ': 108, 'ኔ': 109, 'ጡ': 110, 'ኙ'

In [11]:
# Peek at one raw transcription (index label 1).
data.text[1]

'የጠመንጃ ተኩስ ተከፈተና አራት የኤርትራ ወታደሮች ተገደሉ'

In [12]:
# Encode that transcription, padded to the tokenizer's fixed length.
sample = tokenizer.text_to_tokens(data.text[1], padding=True)
sample

array([[  7,  47,  15,   2, 118,   1,   8,  91,   5,   1,   8,  34,  61,
          8,  18,   1,  10,  27,   3,   1,   7,  71,  14,   3,  27,   1,
         35,  37,  19,  52,  17,   1,   8,  21,  19,  54,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0]], dtype=int32)

In [13]:
# Round-trip check: decode the padded tokens back into text.
print(tokenizer.tokens_to_string(sample[0]))

የጠመንጃ ተኩስ ተከፈተና አራት የኤርትራ ወታደሮች ተገደሉ


Save the tokenizer

In [14]:
# Serialize the fitted tokenizer to disk.
# NOTE(review): pickle stores the class by reference, so TokenizerWrap must be
# defined/importable wherever this file is loaded — confirm for consumers.
with open('../models/char_tokenizer_amharic.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle)

## Data Augmentation

In [27]:
class AudioAugment():
    """Random augmentations for a 1-D audio waveform.

    Every method draws its random parameters from NumPy's global random
    state, so results are reproducible via ``np.random.seed``.
    """

    def change_speed(self, data):
        """Stretch or compress ``data`` in time, keeping its length.

        The waveform is resampled with ``cv2.resize`` to ``len(data) * rate``
        samples, where ``rate`` is drawn uniformly from [0.8, 1.2). A shorter
        result is zero-padded on both sides (centred); a longer one is cut
        back to the first ``len(data)`` samples.

        Returns:
            An array with exactly ``len(data)`` samples.
        """
        speed_rate = np.random.uniform(0.8, 1.2)
        # dsize is (width, height): a single column with the new sample count.
        wav_speed_tune = cv2.resize(data, (1, int(len(data) * speed_rate))).squeeze()

        if len(wav_speed_tune) < len(data):
            padding = len(data) - len(wav_speed_tune)
            offset = padding // 2
            wav_speed_tune = np.pad(wav_speed_tune, (offset, padding - offset), "constant")
        else:
            wav_speed_tune = wav_speed_tune[:len(data)]

        return wav_speed_tune

    def add_noise(self, data, noise_levels=(0, 0.3)):
        """Add standard-normal noise scaled by a random level.

        Args:
            data: the waveform.
            noise_levels: ``(low, high)`` bounds of the uniform distribution
                the noise scale is drawn from.

        Returns:
            ``data + level * noise`` with the same length as ``data``.
        """
        noise_level = np.random.uniform(*noise_levels)
        noise = np.random.randn(len(data))
        return data + noise_level * noise

    def change_pitch(self, data, sr=8000):
        """Shift the pitch of ``data`` by -1, 0 or +1 steps, chosen at random.

        Args:
            data: the waveform.
            sr: sample rate handed to librosa; defaults to the 8000 that was
                previously hard-coded.
        """
        n_steps = np.random.randint(-1, 2)
        # `sr` and `n_steps` are keyword-only in recent librosa releases;
        # passing them by keyword also works on older versions.
        return librosa.effects.pitch_shift(data, sr=sr, n_steps=n_steps)


## DataGenerator


In [123]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that yields augmented audio batches.

    Every source row contributes four consecutive samples sharing one
    transcription: the original waveform, two independently noised copies and
    one speed-changed copy. ``batch_size`` counts samples *after*
    augmentation, so each batch reads ``batch_size // 4`` rows of ``data``.

    Transcriptions are encoded with the module-level ``tokenizer``.

    Args:
        data: DataFrame with ``text``, ``char_length``, ``duration`` and
            ``path`` columns; ``path`` must point to a ``.npy`` waveform.
        sr: factor converting ``duration`` into a number of samples.
        batch_size: number of samples per batch after augmentation
            (at least 4).
        shuffle: reshuffle at the end of every epoch.

    Raises:
        ValueError: if ``batch_size`` is smaller than 4.
    """

    # original + 2x noise + 1x speed change (see __data_generation).
    AUGMENTATIONS_PER_ROW = 4

    def __init__(self, data, sr, batch_size=32, shuffle=True):
        super().__init__()
        if batch_size < self.AUGMENTATIONS_PER_ROW:
            raise ValueError(
                "batch_size must be at least %d" % self.AUGMENTATIONS_PER_ROW
            )
        self.data = data
        self.sr = sr
        # Number of *source rows* per batch. Integer division keeps every
        # index an int; the former float division produced float index arrays
        # and broke for sizes that are not a multiple of four.
        self.batch_size = batch_size // self.AUGMENTATIONS_PER_ROW
        self.audio_augment = AudioAugment()
        # Trailing rows that do not fill a whole batch are dropped.
        self.len = data.shape[0] // self.batch_size
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch."""
        return self.len

    def __data_generation(self, batch_data):
        """Build the ``(inputs, outputs)`` dictionaries for one batch of rows."""
        n_samples = len(batch_data) * self.AUGMENTATIONS_PER_ROW
        longest_audio = int(batch_data["duration"].max() * self.sr)
        longest_trans = int(batch_data["char_length"].max())

        X_audio = np.zeros([n_samples, longest_audio], dtype="float32")
        y_trans = np.zeros([n_samples, longest_trans], dtype="int64")
        # Every sample reports the padded audio width as its input length.
        X_length = np.full([n_samples, 1], longest_audio, dtype="int64")
        y_length = np.zeros([n_samples, 1], dtype="int64")

        for row_number, (_, row) in enumerate(batch_data.iterrows()):
            first = row_number * self.AUGMENTATIONS_PER_ROW
            group = slice(first, first + self.AUGMENTATIONS_PER_ROW)

            # All augmented variants of a row share the same transcription.
            tokens = tokenizer.text_to_tokens(row["text"], padding=True)[0, :longest_trans]
            y_trans[group] = tokens
            y_length[group] = row["char_length"]

            wav = np.load(row["path"])
            # Take the length from the array itself (capped to the buffer
            # width) so a rounding difference between the stored duration and
            # the real sample count cannot cause a shape mismatch.
            audio_length = min(len(wav), longest_audio)

            # Order matters for the random stream: noise, noise, speed.
            # Pitch shifting (AudioAugment.change_pitch) is currently disabled.
            variants = (
                wav,
                self.audio_augment.add_noise(wav),
                self.audio_augment.add_noise(wav),
                self.audio_augment.change_speed(wav),
            )
            for offset, variant in enumerate(variants):
                X_audio[first + offset, :audio_length] = variant[:audio_length]

        outputs = {'ctc': tf.zeros([n_samples], dtype=tf.dtypes.float32)}
        inputs = {
            'the_input': tf.convert_to_tensor(X_audio),
            'the_labels': tf.convert_to_tensor(y_trans),
            'input_length': tf.convert_to_tensor(X_length, dtype="float32"),
            'label_length': tf.convert_to_tensor(y_length)
        }
        return (inputs, outputs)

    def on_epoch_end(self):
        """Rebuild the row order used by ``__getitem__``.

        With shuffling enabled the batch order is permuted and the rows are
        permuted inside each batch, so every batch still covers one
        contiguous block of rows of ``data``.
        """
        self.indexes = np.arange(self.len * self.batch_size)

        if self.shuffle:
            batches = self.indexes.reshape(self.len, self.batch_size)
            np.random.shuffle(batches)          # permute the batch order
            for batch in batches:
                np.random.shuffle(batch)        # permute rows inside a batch
            self.indexes = batches.reshape(-1)

    def __getitem__(self, index):
        """Return batch number ``index`` as an ``(inputs, outputs)`` tuple.

        Raises:
            IndexError: if ``index`` is outside ``[0, len(self))``.
        """
        if not 0 <= index < self.len:
            raise IndexError("batch index out of range")
        start = index * self.batch_size
        indexes = self.indexes[start:start + self.batch_size]
        batch_data = self.data.iloc[indexes]
        return self.__data_generation(batch_data)


In [124]:
sr = 8000  # sample rate handed to the generator (duration * sr = sample count)
batch_size = 128  # samples per batch after augmentation
# Last positional argument is `shuffle`; disabled here so batches are
# reproducible slices of the data.
sample_generator = DataGenerator(data, sr, batch_size, False)

In [100]:
# Number of batches the generator yields per epoch.
sample_generator.__len__()

333

In [125]:
%%time
# Build a single batch by index; returns an (inputs, outputs) tuple.
sample_data = sample_generator.__getitem__(261)

CPU times: user 277 ms, sys: 31.1 ms, total: 308 ms
Wall time: 413 ms


In [126]:
# Pull the individual model inputs out of the batch's input dictionary.
sample_audios = sample_data[0]["the_input"]
sample_labels = sample_data[0]["the_labels"]
sample_audios_length = sample_data[0]["input_length"]
sample_labels_length = sample_data[0]["label_length"]

In [127]:
# Shapes of the audio matrix, the label matrix and the two length columns.
print(sample_audios.shape)
print(sample_labels.shape)
print(sample_audios_length.shape)
print(sample_labels_length.shape)

(128, 67584)
(128, 98)
(128, 1)
(128, 1)


In [128]:
# Token ids of the first sample in the batch (zeros are padding).
sample_labels[0]


<tf.Tensor: shape=(98,), dtype=int64, numpy=
array([ 24,   9,  11,  44,   1,  19,  26,  65,   1,  64,   2, 101,   2,
        26,   8,   2,   1,  12,   2,  38,  35,  95,   3,   1,   7,  53,
        76,   6,   1, 139, 108,  70,  66,   2,   1,  33,   2,  19,  48,
         1,  10,  29,  22,   1,   8,  21,  11,  30, 143,   1,  15,  21,
       134,   3,   1,  30, 107,   1,  30, 107,   1,  24,  18,  21,  27,
        11,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0])>

In [131]:
# Decode the first label back to text and play the matching audio.
print(tokenizer.tokens_to_string(sample_labels[0].numpy()))
# NOTE(review): the recorded output reports a TypeError about an unexpected
# 'rate' keyword, which this positional call does not pass — the output looks
# stale or the error originates inside AudioVis.play_audio; confirm its
# signature. It may also expect a NumPy array rather than a tf.Tensor.
audio_vis.play_audio(sample_audios[0], sr)

ይበልጥ ደግሞ ሀንቲንግተን እንዳወሱት የሩሲያ ፌዴሬሽን ሰንደቅ አላማ ተገልብጦ መገኘት ብዙ ብዙ ይናገራል


TypeError: play_audio() got an unexpected keyword argument 'rate'