# Google Drive Mount

In [None]:
# Mount Google Drive at /content/drive; later cells read and write the dataset,
# configs and model files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Creating the Dataset Directory and Installing Packages

In [None]:
!mkdir -p /content/drive/MyDrive/SpeechDatasets  # -p: no error if the folder already exists, so the cell can be re-run

mkdir: cannot create directory ‘/content/drive/MyDrive/SpeechDatasets’: File exists


In [None]:
# !rm -rf /content/drive/MyDrive/SpeechDatasets  # if something goes wrong

In [None]:
# Versions pinned to the ones this notebook was last run with, so a fresh runtime
# resolves the same packages. %pip installs into the running kernel's environment.
%pip install mltu==1.0.15 onnx==1.14.0 tf2onnx==1.14.0

Collecting mltu
  Downloading mltu-1.0.15-py3-none-any.whl (36 kB)
Collecting onnx
  Downloading onnx-1.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tf2onnx
  Downloading tf2onnx-1.14.0-py3-none-any.whl (451 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m451.2/451.2 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
Collecting Pillow>=9.4.0 (from mltu)
  Downloading Pillow-10.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m92.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting onnxruntime>=1.15.0 (from mltu)
  Downloading onnxruntime-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.9/5.9 MB[0m [31m118.0 MB/s[0m eta [36m0:00:00[0m

In [None]:
# Model identifier; ModelConfigs uses it as the Drive folder name under which each run is stored.
model_name = "CRNN-01"

# Imports

In [None]:
import tarfile
from io import BytesIO
import requests
import os
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from zipfile import ZipFile
import numpy as np
import typing

import tensorflow as tf
from keras import layers
from keras.models import Model

from mltu.tensorflow.model_utils import activation_layer
from mltu.utils.text_utils import ctc_decoder
from mltu.inferenceModel import OnnxInferenceModel
from mltu.configs import BaseModelConfigs
from mltu.preprocessors import WavReader
from mltu.tensorflow.dataProvider import DataProvider
from mltu.transformers import LabelIndexer, LabelPadding, SpectrogramPadding
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from mltu.tensorflow.losses import CTCloss
from mltu.tensorflow.callbacks import Model2onnx, TrainLogger
from mltu.tensorflow.metrics import CERMetric, WERMetric

# Download Datasets

In [None]:
# Google Drive folder that receives the LJSpeech archive and its extracted contents.
LJdataset_download_path = "/content/drive/MyDrive/SpeechDatasets"
def download_and_unzip_LJ(url, extract_to):
    """Download the LJSpeech archive (unless already present) and extract it.

    Args:
        url: HTTP(S) address of the ``LJSpeech-1.1.tar.bz2`` archive.
        extract_to: Directory that receives both the archive file and the
            extracted contents.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If an archive member would be written outside ``extract_to``.
    """
    dataset_path = os.path.join(extract_to, "LJSpeech-1.1.tar.bz2")

    if not os.path.isfile(dataset_path):
        # Download under a temporary name and rename only on success, so an
        # interrupted download is never mistaken for a complete archive later.
        partial_path = dataset_path + ".part"
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            # Content-Length is optional; with total=None tqdm shows an open-ended bar.
            content_length = r.headers.get("Content-Length")
            total_length = int(content_length) if content_length is not None else None
            with tqdm(total=total_length, unit='iB', unit_scale=True, desc="Downloading") as bar:
                with open(partial_path, "wb") as fout:
                    for data in r.iter_content(1024 * 1024):
                        if data:
                            bar.update(len(data))
                            fout.write(data)
        os.replace(partial_path, dataset_path)

    # Expected number of archive entries; used only as the progress-bar total.
    total_members = 13104

    extract_root = os.path.realpath(extract_to)
    with tarfile.open(dataset_path, "r:bz2") as tar:
        # The context manager closes the bar even if extraction fails part-way.
        with tqdm(total=total_members, unit="file", desc="Extracting") as progress:
            # Iterating the TarFile yields one member at a time, like repeated tar.next().
            for member in tar:
                # Refuse members whose resolved path leaves the extraction folder.
                target = os.path.realpath(os.path.join(extract_root, member.name))
                if os.path.commonpath([extract_root, target]) != extract_root:
                    raise ValueError(f"Unsafe path in archive: {member.name}")

                tar.extract(member, path=extract_to)
                progress.update()

# Locations inside the extracted LJSpeech-1.1 folder.
LJdataset_path = os.path.join(LJdataset_download_path, "LJSpeech-1.1")
LJmetadata_path = f"{LJdataset_path}/metadata.csv"
LJwavs_path = f"{LJdataset_path}/wavs/"

# Download and extract only when the extracted folder is not there yet.
if not os.path.exists(LJdataset_path):
    download_and_unzip_LJ(
        "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2",
        extract_to=LJdataset_download_path,
    )


# Configs

In [None]:
class ModelConfigs(BaseModelConfigs):
    """Paths, spectrogram settings and training hyper-parameters for one run."""

    def __init__(self):
        super().__init__()

        # Every run gets its own folder, named after the creation time (YYYYmmddHHMM).
        run_stamp = datetime.now().strftime("%Y%m%d%H%M")
        self.model_path = os.path.join(f"/content/drive/MyDrive/{model_name}", run_stamp)

        # Spectrogram parameters, later passed to WavReader.
        self.frame_length, self.frame_step, self.fft_length = 256, 160, 384

        # Characters kept in the labels; anything else is filtered out downstream.
        self.vocab = "abcdefghijklmnopqrstuvwxyz'?! "

        # Unknown until the dataset has been scanned.
        self.input_shape = self.max_text_length = self.max_spectrogram_length = None

        # Training settings.
        self.batch_size, self.learning_rate = 8, 0.0005
        self.train_epochs, self.train_workers = 10, 20

# Data Provider

In [None]:
# Create the ModelConfigs object that stores this run's configuration.
# Its model_path is stamped with the current time at the moment of construction.
configs = ModelConfigs()

In [None]:
# Read the metadata file: pipe-separated, no header row.
# quoting=3 is csv.QUOTE_NONE, so quote characters inside a transcription are kept as plain text.
metadata_df = pd.read_csv(
    LJmetadata_path,
    sep="|",
    header=None,
    quoting=3,
    names=["file_name", "transcription", "normalized_transcription"],
)
metadata_df = metadata_df[["file_name", "normalized_transcription"]]

# Structure the dataset so each row is [wav_file_path, lower-cased transcription].
# os.path.join copes with LJwavs_path ending in "/" and avoids a doubled separator in every path.
dataset = [
    [os.path.join(LJwavs_path, f"{file}.wav"), label.lower()]
    for file, label in metadata_df.values.tolist()
]

In [None]:
# Do not run this unless the dataset changes or gets some additions:
# it computes a spectrogram for every wav file (the last full pass took about 1h46m).
max_text_length, max_spectrogram_length = 0, 0
spectrogram_bins = None  # size of the spectrogram's second axis, taken from the last file read
for file_path, label in tqdm(dataset):
    spectrogram = WavReader.get_spectrogram(file_path, frame_length=configs.frame_length, frame_step=configs.frame_step, fft_length=configs.fft_length)
    # Only characters that are part of the vocabulary count towards the label length.
    valid_label_length = sum(1 for c in label if c in configs.vocab)
    max_text_length = max(max_text_length, valid_label_length)
    max_spectrogram_length = max(max_spectrogram_length, spectrogram.shape[0])
    spectrogram_bins = spectrogram.shape[1]

# Written once after the scan instead of being reassigned on every iteration;
# left untouched when the dataset is empty.
if spectrogram_bins is not None:
    configs.input_shape = [max_spectrogram_length, spectrogram_bins]

configs.max_spectrogram_length = max_spectrogram_length
configs.max_text_length = max_text_length

100%|██████████| 13100/13100 [1:46:28<00:00,  2.05it/s]


In [None]:
# Persist the current configs object.
# NOTE(review): presumably writes configs.yaml under configs.model_path — confirm against mltu's BaseModelConfigs.save.
configs.save()

In [None]:
# Hand-written configuration for run 202306191330, stored as YAML text.
config = """
batch_size: 8
fft_length: 384
frame_length: 256
frame_step: 160
input_shape:
- 1392
- 193
learning_rate: 0.0005
max_spectrogram_length: 1392
max_text_length: 186
train_epochs: 3
train_workers: 20
vocab: 'abcdefghijklmnopqrstuvwxyz''?! '
"""

# Defined once so the write and the read-back below cannot drift apart.
config_path = "/content/drive/MyDrive/CRNN-01/202306191330/configs.yaml"

# open(..., "w") does not create missing parent folders, so make sure the run folder exists.
os.makedirs(os.path.dirname(config_path), exist_ok=True)

with open(config_path, "w") as file:
    file.write(config)

# Read the file back to show exactly what was stored.
with open(config_path, "r") as file:
    print(file.read())


batch_size: 8
fft_length: 384
frame_length: 256
frame_step: 160
input_shape:
- 1392
- 193
learning_rate: 0.0005
max_spectrogram_length: 1392
max_text_length: 186
train_epochs: 3
train_workers: 20
vocab: 'abcdefghijklmnopqrstuvwxyz''?! '



In [None]:
# Restore the configuration stored for run 202306191330.
configs = BaseModelConfigs.load("/content/drive/MyDrive/CRNN-01/202306191330/configs.yaml")

# Preprocessor built from the configured spectrogram parameters.
spectrogram_reader = WavReader(
    frame_length=configs.frame_length,
    frame_step=configs.frame_step,
    fft_length=configs.fft_length,
)

# Transformers that pad the spectrogram, index the label and pad the label.
batch_transformers = [
    SpectrogramPadding(max_spectrogram_length=configs.max_spectrogram_length, padding_value=0),
    LabelIndexer(configs.vocab),
    # Labels are padded with len(vocab), i.e. an index one past the last vocabulary character.
    LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)),
]

# Data provider over the [wav_path, transcription] rows.
data_provider = DataProvider(
    dataset=dataset,
    skip_validation=True,
    batch_size=configs.batch_size,
    data_preprocessors=[spectrogram_reader],
    transformers=batch_transformers,
)

# 90 % of the data for training, the remainder for validation.
train_data_provider, val_data_provider = data_provider.split(split=0.9)

INFO:DataProvider:Skipping Dataset validation...


# Model

In [None]:
def crnn_model_01(input_dim, output_dim, activation="leaky_relu", dropout=0.2):
    """Build the CRNN: two Conv2D blocks, five bidirectional LSTMs and a dense head.

    Args:
        input_dim: Shape of one input sample, without the batch axis.
        output_dim: Number of vocabulary characters; the softmax layer has
            ``output_dim + 1`` units.
        activation: Activation name handed to ``activation_layer`` after each
            convolution block and after the dense layer.
        dropout: Rate used by every Dropout layer.

    Returns:
        An uncompiled keras ``Model`` mapping the input to per-step softmax outputs.
    """
    inputs = layers.Input(shape=input_dim, name="input")

    # Expand dims to add the channel dimension expected by Conv2D.
    x = layers.Lambda(lambda x: tf.expand_dims(x, axis=-1))(inputs)

    # Convolution layer 1 (stride 2 on both axes)
    x = layers.Conv2D(filters=32, kernel_size=[11, 41], strides=[2, 2], padding="same", use_bias=False)(x)
    x = layers.BatchNormalization()(x)
    x = activation_layer(x, activation=activation)

    # Convolution layer 2 (stride 2 on the second axis only)
    x = layers.Conv2D(filters=32, kernel_size=[11, 21], strides=[1, 2], padding="same", use_bias=False)(x)
    x = layers.BatchNormalization()(x)
    x = activation_layer(x, activation=activation)

    # Merge the last two axes into one feature axis to feed the RNN layers.
    x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)

    # RNN layers: dropout follows every bidirectional LSTM except the last one.
    num_rnn_layers = 5
    for layer_idx in range(num_rnn_layers):
        x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)
        if layer_idx < num_rnn_layers - 1:
            x = layers.Dropout(dropout)(x)

    # Dense layer
    x = layers.Dense(256)(x)
    x = activation_layer(x, activation=activation)
    x = layers.Dropout(dropout)(x)

    # Classification layer: one unit per vocabulary character plus one extra.
    output = layers.Dense(output_dim + 1, activation="softmax")(x)

    model = Model(inputs=inputs, outputs=output)
    return model

In [None]:
class WavToTextModel(OnnxInferenceModel):
    """ONNX inference wrapper that decodes one spectrogram into text."""

    def __init__(self, char_list: typing.Union[str, list], *args, **kwargs):
        """Keep the vocabulary and forward all other arguments to OnnxInferenceModel."""
        super().__init__(*args, **kwargs)
        self.char_list = char_list

    def predict(self, data: np.ndarray):
        """Run the model on a single sample and return the decoded text."""
        # Add a leading batch axis: the sample becomes a batch of one.
        batch = np.expand_dims(data, axis=0)

        outputs = self.model.run(None, {self.input_name: batch})

        # First model output, decoded; first (only) item of the batch.
        return ctc_decoder(outputs[0], self.char_list)[0]

# Training the model

In [None]:
# Point the configs at run folder 202306191330; the weights file, checkpoints and logs below are addressed through it.
configs.model_path = "/content/drive/MyDrive/CRNN-01/202306191330"

In [None]:
# Sanity check: show the input shape the model is about to be built with.
print(configs.input_shape)

[1392, 193]


In [None]:
# Build the network for the configured input shape and vocabulary size.
model = crnn_model_01(
    input_dim=configs.input_shape,
    output_dim=len(configs.vocab),
    dropout=0.2,
)

# Compile the model and print summary
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate),
    loss=CTCloss(),
    metrics=[
        CERMetric(vocabulary=configs.vocab),
        WERMetric(vocabulary=configs.vocab),
    ],
    run_eagerly=False,
)
model.summary(line_length=110)

# Resume from previously saved weights when the run folder has them; on a fresh
# run folder there is nothing to load, so training starts from the initial weights.
weights_path = os.path.join(configs.model_path, "model.h5")
if os.path.exists(weights_path):
    model.load_weights(weights_path)
else:
    print(f"No saved weights at {weights_path}; training starts from scratch.")

Model: "model_3"
______________________________________________________________________________________________________________
 Layer (type)                                    Output Shape                                Param #          
 input (InputLayer)                              [(None, 1392, 193)]                         0                
                                                                                                              
 lambda_3 (Lambda)                               (None, 1392, 193, 1)                        0                
                                                                                                              
 conv2d_6 (Conv2D)                               (None, 696, 97, 32)                         14432            
                                                                                                              
 batch_normalization_6 (BatchNormalization)      (None, 696, 97, 32)                         12

In [None]:
# One path for the best-weights file, shared by the checkpoint writer and the ONNX exporter.
weights_path = f"{configs.model_path}/model.h5"

# Define callbacks; all val_CER monitors use mode="min".
earlystopper = EarlyStopping(monitor="val_CER", patience=20, verbose=1, mode="min")
checkpoint = ModelCheckpoint(weights_path, monitor="val_CER", verbose=1, save_best_only=True, mode="min")
trainLogger = TrainLogger(configs.model_path)
tb_callback = TensorBoard(f"{configs.model_path}/logs", update_freq=1)
# mode was "auto", which Keras resolves to "min" for this monitor name; stated explicitly to match the others.
reduceLROnPlat = ReduceLROnPlateau(monitor="val_CER", factor=0.8, min_delta=1e-10, patience=5, verbose=1, mode="min")
model2onnx = Model2onnx(weights_path)

# Train the model
model.fit(
    train_data_provider,
    validation_data=val_data_provider,
    epochs=configs.train_epochs,
    callbacks=[earlystopper, checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx],
    workers=configs.train_workers,
)

# Save training and validation datasets as csv files
train_data_provider.to_csv(os.path.join(configs.model_path, "train.csv"))
val_data_provider.to_csv(os.path.join(configs.model_path, "val.csv"))

Epoch 1/3
Epoch 1: val_CER improved from inf to 0.01837, saving model to /content/drive/MyDrive/CRNN-01/202306191330/model.h5
Epoch 2/3
Epoch 2: val_CER did not improve from 0.01837
Epoch 3/3
Epoch 3: val_CER did not improve from 0.01837


# Validation

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from mltu.configs import BaseModelConfigs
from mltu.preprocessors import WavReader
from mltu.utils.text_utils import get_cer, get_wer

# Run folder holding the exported ONNX model, its configs and the validation split.
eval_run_dir = "/content/drive/MyDrive/CRNN-01/202306180741"

# None evaluates the whole validation set; set a small integer for a quick smoke test.
# (A leftover `break` used to stop after the first sample, so the "average" covered one file.)
max_eval_samples = None

configs = BaseModelConfigs.load(f"{eval_run_dir}/configs.yaml")

model = WavToTextModel(model_path=f"{eval_run_dir}/model.onnx", char_list=configs.vocab, force_cpu=False)

# Each row is [wav_path, label].
val_rows = pd.read_csv(f"{eval_run_dir}/val.csv").values.tolist()
if max_eval_samples is not None:
    val_rows = val_rows[:max_eval_samples]

accum_cer, accum_wer = [], []
for wav_path, label in tqdm(val_rows):

    spectrogram = WavReader.get_spectrogram(wav_path, frame_length=configs.frame_length, frame_step=configs.frame_step, fft_length=configs.fft_length)
    # WavReader.plot_raw_audio(wav_path, label)

    # Zero-pad in front of the data along the first axis, up to the configured maximum length.
    padded_spectrogram = np.pad(spectrogram, ((configs.max_spectrogram_length - spectrogram.shape[0], 0), (0, 0)), mode="constant", constant_values=0)

    # WavReader.plot_spectrogram(spectrogram, label)

    text = model.predict(padded_spectrogram)

    # Compare against the label restricted to vocabulary characters.
    true_label = "".join([l for l in label.lower() if l in configs.vocab])

    accum_cer.append(get_cer(text, true_label))
    accum_wer.append(get_wer(text, true_label))

    print(f"Pred: {text}")
    print(f"Actual: {true_label}")

# np.average of an empty list would only produce nan and a warning.
if accum_cer:
    print(f"Average CER: {np.average(accum_cer)}, Average WER: {np.average(accum_wer)}")
else:
    print("No validation samples were evaluated.")

  0%|          | 0/1310 [00:00<?, ?it/s]

Pred: ned and set for ris ins first ansacin asi haboran ben anstrotet this fonte wol be fod spesil s l in main grambrt
Actual: knead and set for risings first and second as you have already been instructed this sponge will be found especially useful in making graham bread
Average CER: 0.3448275862068966, Average WER: 0.8



