### Write the TEDLIUM dataset to `tfrecord` file format.
***

In [1]:
import re
import os
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm
from etils import epath
import tensorflow as tf
import tensorflow_datasets as tfds

import torch
import torchaudio
import torch.utils.data as data
from torch.utils.data import Dataset, DataLoader

### 2. Data preparation.

In [2]:
train_dataset = torchaudio.datasets.TEDLIUM(root = './', download=False, subset='train')
test_dataset = torchaudio.datasets.TEDLIUM(root = './', download=False, subset='test')
dev_dataset = torchaudio.datasets.TEDLIUM(root = './', download=False, subset='dev')

In [3]:
train_dataloader = data.DataLoader(dataset=train_dataset)
test_dataloader = data.DataLoader(dataset = test_dataset)
dev_dataloader = data.DataLoader(dataset = dev_dataset)

In [4]:
def clean_text_label(example):
    chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"]'
    cleaned_text = re.sub(chars_to_ignore_regex, '', example).lower()
    return cleaned_text


vocab_dict = {' ': 21, "'": 13, 'a': 24, 'b': 17,'c': 25,'d': 2,'e': 9,'f': 14,'g': 22,'h': 8,
              'i': 4,'j': 18,'k': 5,'l': 16,'m': 6,'n': 7,'o': 10,'p': 19,'q': 3,'r': 20,'s': 11,
              't': 0,'u': 26,'v': 27,'w': 1,'x': 23,'y': 15,'z': 12}
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

vocab_dict["[PAD]"] = len(vocab_dict)


# Mapping characters to integers
char_to_num = tf.keras.layers.StringLookup(vocabulary=list(vocab_dict.keys()))
# Mapping integers back to original characters
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), invert=True
)

print(
    f"The vocabulary is: {char_to_num.get_vocabulary()} "
    f"(size ={char_to_num.vocabulary_size()})"
)

The vocabulary is: ['[UNK]', "'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', '[PAD]'] (size =30)


In [5]:
def extract_audio_segment(sph_path, channel, start_sec, end_sec):
    """Extracts segment of audio samples (as an ndarray) from the given path."""
    with tf.io.gfile.GFile(sph_path, "rb") as f:
        try:
            segment = tfds.core.lazy_imports.pydub.AudioSegment.from_file(f, format="nistsphere")
        except tfds.core.lazy_imports.pydub.exceptions.CouldntDecodeError:
            raise ValueError(f"Unable to decode audio file: {sph_path}")
        
    # The dataset only contains mono audio.
    assert segment.channels == 1
    assert channel == 1
    start_ms = int(start_sec * 1000)
    end_ms = int(end_sec * 1000)
    segment = segment[start_ms:end_ms]
    samples = np.array(segment.get_array_of_samples())
    return samples

In [6]:
def maybe_trim_suffix(transcript):
    # stm files for the TEDLIUM release 1 train split contain a key (enclosed in
    # parens) at the end.
    splits = transcript.rsplit(" ", 1)
    transcript = splits[0]
    if len(splits) > 1:
        suffix = splits[-1]
        if not suffix.startswith("("):
            transcript += " " + suffix
    return transcript

In [13]:
def generate_examples_from_stm_file(stm_path):
    for path in stm_path:
        """Generate examples from a TED-LIUM stm file."""
        stm_dir = os.path.dirname(path.decode())
        sph_dir = os.path.join(os.path.dirname(stm_dir), "sph")
        with epath.Path(path.decode()).open() as f:
            for line in f:
                line = line.strip()
                fn, channel, speaker, start, end, label, transcript = line.split(" ", 6)
                transcript_trimmed = maybe_trim_suffix(transcript)
                transcript_cleaned = clean_text_label(transcript_trimmed)
                transcript_int = tf.strings.unicode_split(transcript_cleaned, input_encoding="UTF-8")
                transcript_int = char_to_num(transcript_int)

                audio_file = "%s.sph" % fn
                samples = extract_audio_segment(
                  os.path.join(sph_dir, audio_file),
                  int(channel),
                  float(start),
                  float(end))

                key = "-".join([speaker, start, end, label])
                example = {"speech": samples, "text": transcript_int}

                # yield example
                yield tf.constant(example["speech"], dtype=tf.float32), tf.constant(example["text"], dtype = tf.int16)

In [14]:
# An integer scalar Tensor. The window length in samples.
frame_length = 256
# An integer scalar Tensor. The number of samples to step.
frame_step = 160
# An integer scalar Tensor. The size of the FFT to apply.
# If not provided, uses the smallest power of 2 enclosing frame_length.
fft_length = 384

def encode_single_file(speech, transcript):
  #Change type to float.
    audio = tf.cast(speech, tf.float32)
    #Get the spectrogram.
    spectrogram = tf.signal.stft(
        audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length)
    #We only need the magnitude, which can be derived by applying tf.abs
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)
    #normalisation
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)
    
    #Process the label.
    label = tf.cast(transcript, tf.int64)
    return spectrogram, label

In [15]:
train_path = glob.glob(r'TEDLIUM_release1\train\stm\*.stm')
test_path = glob.glob(r'TEDLIUM_release1\test\stm\*.stm')
dev_path = glob.glob(r'TEDLIUM_release1\dev\stm\*.stm')
BATCH_SIZE = 32

output_signature = (
    tf.TensorSpec(shape=(None, ), dtype=tf.float32),
    tf.TensorSpec(shape = (None,), dtype = tf.int64)
)

train_dataset = tf.data.Dataset.from_tensor_slices(train_path)
train_dataset = train_dataset.from_generator(generate_examples_from_stm_file, args = [train_path], output_signature = output_signature)
train_dataset = train_dataset.map(encode_single_file, num_parallel_calls=tf.data.AUTOTUNE).prefetch(buffer_size=tf.data.AUTOTUNE)

    

test_dataset = tf.data.Dataset.from_tensor_slices(test_path)
test_dataset = test_dataset.from_generator(generate_examples_from_stm_file, args = [test_path], output_signature = output_signature)
test_dataset = test_dataset.map(encode_single_file, num_parallel_calls=tf.data.AUTOTUNE).prefetch(buffer_size=tf.data.AUTOTUNE)



dev_dataset = tf.data.Dataset.from_tensor_slices(dev_path)
dev_dataset = dev_dataset.from_generator(generate_examples_from_stm_file, args = [dev_path], output_signature = output_signature)
dev_dataset = dev_dataset.map(encode_single_file, num_parallel_calls=tf.data.AUTOTUNE).prefetch(buffer_size=tf.data.AUTOTUNE)

In [17]:
%%timeit
for element in dev_dataset.take(1):
    continue

print(element)

(<tf.Tensor: shape=(1303, 193), dtype=float32, numpy=
array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.43976876,  0.4531574 ,  0.18518394, ..., -1.6504657 ,
        -1.6816988 , -1.6921215 ],
       [ 0.61534095,  0.4983179 ,  0.17270204, ..., -1.7313682 ,
        -1.698178  , -1.6994416 ],
       [-0.06142621, -0.27352673, -0.37003124, ..., -1.5455304 ,
        -1.5731868 , -1.5923746 ]], dtype=float32)>, <tf.Tensor: shape=(30,), dtype=int64, numpy=
array([10,  8, 15, 16, 19,  6,  0, 21, 10, 14,  6,  0, 20,  6,  8, 14,  6,
       15, 21,  0, 10, 15,  0, 20,  4, 16, 19, 10, 15,  8], dtype=int64)>)
(<tf.Tensor: shape=(1303, 193), dtype=float32, numpy=
array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
  

RuntimeError: pybind11::error_already_set: MISMATCH of original and normalized active exception types: ORIGINAL _NotOkStatusException REPLACED BY KeyboardInterrupt: <EMPTY MESSAGE>

At:
  c:\Users\debonair\anaconda3\lib\site-packages\tensorflow\python\eager\core.py(36): __init__
  c:\Users\debonair\anaconda3\lib\site-packages\tensorflow\python\ops\gen_dataset_ops.py(3038): iterator_get_next
  c:\Users\debonair\anaconda3\lib\site-packages\tensorflow\python\data\ops\iterator_ops.py(780): _next_internal
  c:\Users\debonair\anaconda3\lib\site-packages\tensorflow\python\data\ops\iterator_ops.py(797): __next__
  <magic-timeit>(1): inner
  c:\Users\debonair\anaconda3\lib\site-packages\IPython\core\magics\execution.py(169): timeit
  c:\Users\debonair\anaconda3\lib\timeit.py(205): repeat
  c:\Users\debonair\anaconda3\lib\site-packages\IPython\core\magics\execution.py(1184): timeit
  c:\Users\debonair\anaconda3\lib\site-packages\IPython\core\magic.py(187): <lambda>
  c:\Users\debonair\anaconda3\lib\site-packages\decorator.py(232): fun
  c:\Users\debonair\anaconda3\lib\site-packages\IPython\core\interactiveshell.py(2419): run_cell_magic
  C:\Users\debonair\AppData\Local\Temp\ipykernel_32248\99719596.py(1): <module>
  c:\Users\debonair\anaconda3\lib\site-packages\IPython\core\interactiveshell.py(3457): run_code
  c:\Users\debonair\anaconda3\lib\site-packages\IPython\core\interactiveshell.py(3377): run_ast_nodes
  c:\Users\debonair\anaconda3\lib\site-packages\IPython\core\interactiveshell.py(3185): run_cell_async
  c:\Users\debonair\anaconda3\lib\site-packages\IPython\core\async_helpers.py(78): _pseudo_sync_runner
  c:\Users\debonair\anaconda3\lib\site-packages\IPython\core\interactiveshell.py(2960): _run_cell
  c:\Users\debonair\anaconda3\lib\site-packages\IPython\core\interactiveshell.py(2914): run_cell
  c:\Users\debonair\anaconda3\lib\site-packages\ipykernel\zmqshell.py(528): run_cell
  c:\Users\debonair\anaconda3\lib\site-packages\ipykernel\ipkernel.py(390): do_execute
  c:\Users\debonair\anaconda3\lib\site-packages\ipykernel\kernelbase.py(730): execute_request
  c:\Users\debonair\anaconda3\lib\site-packages\ipykernel\kernelbase.py(406): dispatch_shell
  c:\Users\debonair\anaconda3\lib\site-packages\ipykernel\kernelbase.py(499): process_one
  c:\Users\debonair\anaconda3\lib\site-packages\ipykernel\kernelbase.py(510): dispatch_queue
  c:\Users\debonair\anaconda3\lib\asyncio\events.py(80): _run
  c:\Users\debonair\anaconda3\lib\asyncio\base_events.py(1905): _run_once
  c:\Users\debonair\anaconda3\lib\asyncio\base_events.py(601): run_forever
  c:\Users\debonair\anaconda3\lib\site-packages\tornado\platform\asyncio.py(199): start
  c:\Users\debonair\anaconda3\lib\site-packages\ipykernel\kernelapp.py(712): start
  c:\Users\debonair\anaconda3\lib\site-packages\traitlets\config\application.py(846): launch_instance
  c:\Users\debonair\anaconda3\lib\site-packages\ipykernel_launcher.py(17): <module>
  c:\Users\debonair\anaconda3\lib\runpy.py(87): _run_code
  c:\Users\debonair\anaconda3\lib\runpy.py(197): _run_module_as_main


In [18]:
def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy() 
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _float_feature(value):
    """Returns a float_list from a float / double."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))


def _int64_feature(value):
    return tf.train.Feature(int16_list=tf.train.Int16List(value=[value]))


def serialize_example(x_list, y_list):
    """
    Creates a tf.train.Example message ready to be written to a file.
    """
    feature = {
        'spectrogram': tf.train.Feature(float_list=tf.train.FloatList(value=[x_list])),
        'transcription': tf.train.Feature(int16_list=tf.train.Int16List(value=[y_list])),
    }
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

In [19]:
#Use the tf.data.Dataset.map method to apply a function to each element in the dataset.
def tf_serialize_example(feature, target):
    tf_string = tf.py_function(
        serialize_example, #Function
        (feature, target),  #Pass these args to the above function.
        tf.string #The return type is tf.string
    )
    return tf.reshape(tf_string, ()) #Result is a scalar

def write_tfrecord(dataset, file_path):
    """
    Writes the dataset to a tfrecord file.
    """
    with tf.io.TFRecordWriter(file_path) as writer:
        for x, y in dataset:
            # x = np.asarray(x, dtype=np.float32)
            # y = np.asarray(y, dtype=np.int64)
            serialized = serialize_example(x, y)
            writer.write(serialized)


#Apply the above function to each element in the dataset.
serialized_features_dataset = dev_dataset.map(tf_serialize_example)


In [21]:
#Write file to a TFRecord file.
filename = 'dev.tfrecord'

with tf.io.TFRecordWriter(filename) as file_writer:
    for feature, target in dev_dataset:
        feature = bytes(feature.numpy())
        target = bytes(target.numpy())
        record_bytes = tf.train.Example(features = tf.train.Features(feature = {
            'spectrogram': tf.train.Feature(bytes_list = tf.train.BytesList(value=[feature])),
            'transcription': tf.train.Feature(bytes_list =tf.train.BytesList(value=[target])),
        })).SerializeToString()
        file_writer.write(record_bytes)

In [22]:
#Write file to a TFRecord file.
filename = 'test.tfrecord'

with tf.io.TFRecordWriter(filename) as file_writer:
    for feature, target in test_dataset:
        feature = bytes(feature.numpy())
        target = bytes(target.numpy())
        record_bytes = tf.train.Example(features = tf.train.Features(feature = {
            'spectrogram': tf.train.Feature(bytes_list = tf.train.BytesList(value=[feature])),
            'transcription': tf.train.Feature(bytes_list =tf.train.BytesList(value=[target])),
        })).SerializeToString()
        file_writer.write(record_bytes)

In [23]:
#Write file to a TFRecord file.
filename = 'train.tfrecord'

with tf.io.TFRecordWriter(filename) as file_writer:
    for feature, target in train_dataset:
        feature = bytes(feature.numpy())
        target = bytes(target.numpy())
        record_bytes = tf.train.Example(features = tf.train.Features(feature = {
            'spectrogram': tf.train.Feature(bytes_list = tf.train.BytesList(value=[feature])),
            'transcription': tf.train.Feature(bytes_list =tf.train.BytesList(value=[target])),
        })).SerializeToString()
        file_writer.write(record_bytes)

KeyboardInterrupt: 

In [25]:
feature.

AttributeError: 'bytes' object has no attribute 'numpy'