In [1]:
import os
import random
from glob import glob
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
import pandas as pd
import os


# Assuming saveto is the directory where the CSV file is located
saveto = r"D:\Mtechs3\project\UASpeech_original_FM\UASpeech\audio\original\F04"
wavs = glob("{}/**/*.wav".format(saveto), recursive=True)
id_to_text = {}

# Load the CSV file into a pandas DataFrame, skipping the first row
csv_file_path = os.path.join(saveto, "dictionary_UASPEECH.csv")
df = pd.read_csv(csv_file_path, sep=',', header=None, names=['ID', 'Text'], skiprows=1)

# Create a dictionary with formatted keys and values
id_to_text = {row["Text"]: row["ID"] for _, row in df.iterrows()}




In [10]:
k=df.iloc[:,1] 

In [3]:
def get_data(wavs, id_to_text, maxlen=50):
    """ returns mapping of audio paths and transcription texts """
    data = []
    for w in wavs:
        id = w.split("\\")[-1]
        for item in k:
            if item+"_" in id and len(id_to_text[item]) < maxlen:
                data.append({"audio": w, "text": id_to_text[item]})
                #print("hi")
    return data

In [2]:
saveto = r"C:\Users\abdul\project_presentation\datasets\LJSpeech"
wavs = glob("{}/**/*.wav".format(saveto), recursive=True)

id_to_text = {}
with open(os.path.join(saveto, "metadata.csv"), encoding="utf-8") as f:
    for line in f:
        id = line.strip().split("|")[0]
        text = line.strip().split("|")[2]
        id_to_text[id] = text




In [3]:
id_to_text

{'LJ001-0001': 'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition',
 'LJ001-0002': 'in being comparatively modern.',
 'LJ001-0003': 'For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process',
 'LJ001-0004': 'produced the block books, which were the immediate predecessors of the true printed book,',
 'LJ001-0005': 'the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.',
 'LJ001-0006': 'And it is worth mention in passing that, as an example of fine typography,',
 'LJ001-0007': 'the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,',
 'LJ001-0008': 'has never been surpassed.',
 'LJ001-0009': 'Printing, then, for our purpose, may be considere

In [4]:
def get_data(wavs, id_to_text, maxlen=50):
    """ returns mapping of audio paths and transcription texts """
    data = []
    for w in wavs:
        id = w.split("\\")[-1].split(".")[0]
        if len(id_to_text[id]) < maxlen:
            data.append({"audio": w, "text": id_to_text[id]})
    return data


In [5]:
class VectorizeChar:
    def __init__(self, max_len=50):
        self.vocab = (
            ["-", "#", "<", ">"]
            + [chr(i + 96) for i in range(1, 27)]
            + [" ", ".", ",", "?"]
        )
        self.max_len = max_len
        self.char_to_idx = {}
        for i, ch in enumerate(self.vocab):
            self.char_to_idx[ch] = i

    def __call__(self, text):
        text = text.lower()
        text = text[: self.max_len - 2]
        text = "<" + text + ">"
        pad_len = self.max_len - len(text)
        return [self.char_to_idx.get(ch, 1) for ch in text] + [0] * pad_len

    def get_vocabulary(self):
        return self.vocab

In [6]:
max_target_len = 200  # all transcripts in out data are < 200 characters
data = get_data(wavs, id_to_text, max_target_len)
vectorizer = VectorizeChar(max_target_len)
#print("vocab size", len(vectorizer.get_vocabulary()))

In [7]:
data

[{'audio': 'C:\\Users\\abdul\\project_presentation\\datasets\\LJSpeech\\wavs\\LJ001-0001.wav',
  'text': 'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition'},
 {'audio': 'C:\\Users\\abdul\\project_presentation\\datasets\\LJSpeech\\wavs\\LJ001-0002.wav',
  'text': 'in being comparatively modern.'},
 {'audio': 'C:\\Users\\abdul\\project_presentation\\datasets\\LJSpeech\\wavs\\LJ001-0003.wav',
  'text': 'For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process'},
 {'audio': 'C:\\Users\\abdul\\project_presentation\\datasets\\LJSpeech\\wavs\\LJ001-0004.wav',
  'text': 'produced the block books, which were the immediate predecessors of the true printed book,'},
 {'audio': 'C:\\Users\\abdul\\project_presentation\\datasets\\LJSpeech\\wavs\\LJ001-0005.wav',
  'text': 'the invention of movable meta

In [15]:
len(data)

3570

In [16]:
import tensorflow as tf


def create_text_ds(data):
    texts = [_["text"] for _ in data]
    text_ds = [vectorizer(t) for t in texts]
    text_ds = tf.data.Dataset.from_tensor_slices(text_ds)
    return text_ds

def path_to_audio(path):
    # spectrogram using stft
    audio = tf.io.read_file(path)
    audio, _ = tf.audio.decode_wav(audio, 1)
    audio = tf.squeeze(audio, axis=-1)
    
    # Apply spectrogram transformation
    stfts = tf.signal.stft(audio, frame_length=200, frame_step=80, fft_length=256)
    x = tf.math.pow(tf.abs(stfts), 0.5)
    
    # Convert to logarithmic scale
    log_spectrogram = tf.math.log(tf.abs(x) + 1e-5)
    
    # Normalization
    means = tf.math.reduce_mean(log_spectrogram, axis=1, keepdims=True)
    stddevs = tf.math.reduce_std(log_spectrogram, axis=1, keepdims=True)
    log_spectrogram = (log_spectrogram - means) / stddevs
    
    # Padding to 10 seconds
    pad_len = 2754
    paddings = tf.constant([[0, pad_len], [0, 0]])
    log_spectrogram = tf.pad(log_spectrogram, paddings, "CONSTANT")[:pad_len, :]
    
    return log_spectrogram

def create_audio_ds(data):
    flist = [_["audio"] for _ in data]
    audio_ds = tf.data.Dataset.from_tensor_slices(flist)
    audio_ds = audio_ds.map(
        path_to_audio, num_parallel_calls=tf.data.AUTOTUNE
    )
    return audio_ds

def create_tf_dataset(data, bs=4):
    audio_ds = create_audio_ds(data)
    text_ds = create_text_ds(data)
    ##print(text_ds)
    ds = tf.data.Dataset.zip((audio_ds, text_ds))
    ds = ds.map(lambda x, y: {"source": x, "target": y})
    ds = ds.batch(bs)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

# Example usage
split = int(len(data) * 0.99)
train_data = data[:split]
test_data = data[split:]
ds = create_tf_dataset(train_data, bs=64)
val_ds = create_tf_dataset(test_data, bs=4)


In [8]:
def create_text_ds(data):
    texts = [_["text"] for _ in data]
    text_ds = [vectorizer(t) for t in texts]
    text_ds = tf.data.Dataset.from_tensor_slices(text_ds)
    return text_ds


def path_to_audio(path):
    # spectrogram using stft
    audio = tf.io.read_file(path)
    audio, _ = tf.audio.decode_wav(audio, 1)
    #print(audio)
    audio = tf.squeeze(audio, axis=-1)
    #print(audio)
    stfts = tf.signal.stft(audio, frame_length=200, frame_step=80, fft_length=256)
    x = tf.math.pow(tf.abs(stfts), 0.5)
    # normalisation
    means = tf.math.reduce_mean(x, 1, keepdims=True)
    stddevs = tf.math.reduce_std(x, 1, keepdims=True)
    x = (x - means) / stddevs
    audio_len = tf.shape(x)[0]
    # padding to 10 seconds
    pad_len = 2754
    paddings = tf.constant([[0, pad_len], [0, 0]])
    x = tf.pad(x, paddings, "CONSTANT")[:pad_len, :]
    return x


def create_audio_ds(data):
    flist = [_["audio"] for _ in data]
    audio_ds = tf.data.Dataset.from_tensor_slices(flist)
    audio_ds = audio_ds.map(
        path_to_audio, num_parallel_calls=tf.data.AUTOTUNE
    )
    return audio_ds


def create_tf_dataset(data, bs=4):
    audio_ds = create_audio_ds(data)
    text_ds = create_text_ds(data)
    ds = tf.data.Dataset.zip((audio_ds, text_ds))
    ds = ds.map(lambda x, y: {"source": x, "target": y})
    ds = ds.batch(bs)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds


split = int(len(data) * 0.99)
train_data = data[:split]
test_data = data[split:]
ds = create_tf_dataset(train_data, bs=64)
val_ds = create_tf_dataset(test_data, bs=4)

In [9]:
ds

<PrefetchDataset shapes: {source: (None, None, 129), target: (None, 200)}, types: {source: tf.float32, target: tf.int32}>

In [10]:
class TokenEmbedding(layers.Layer):
    def __init__(self, num_vocab=1000, maxlen=100, num_hid=64):
        super().__init__()
        self.emb = tf.keras.layers.Embedding(num_vocab, num_hid)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=num_hid)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        #print("tokens", x[:20])
        #print(x.shape, "token embeddnng1")
        x = self.emb(x)
        #print(x.shape, "token embeddnng2")
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        return x + positions


class SpeechFeatureEmbedding(layers.Layer):
    def __init__(self, num_hid=64, maxlen=100):
        super().__init__()
        self.conv1 = tf.keras.layers.Conv1D(
            num_hid, 11, strides=2, padding="same", activation="relu"
        )
        self.conv2 = tf.keras.layers.Conv1D(
            num_hid, 11, strides=2, padding="same", activation="relu"
        )
        self.conv3 = tf.keras.layers.Conv1D(
            num_hid, 11, strides=2, padding="same", activation="relu"
        )
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=num_hid)

    def call(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        return self.conv3(x)

In [11]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, num_heads, feed_forward_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(feed_forward_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [12]:
class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, num_heads, feed_forward_dim, dropout_rate=0.1):
        super().__init__()
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)
        self.self_att = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.enc_att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.self_dropout = layers.Dropout(0.5)
        self.enc_dropout = layers.Dropout(0.1)
        self.ffn_dropout = layers.Dropout(0.1)
        self.ffn = keras.Sequential(
            [
                layers.Dense(feed_forward_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )

    def causal_attention_mask(self, batch_size, n_dest, n_src, dtype):
        """Masks the upper half of the dot product matrix in self attention.

        This prevents flow of information from future tokens to current token.
        1's in the lower triangle, counting from the lower right corner.
        """
        i = tf.range(n_dest)[:, None]
        j = tf.range(n_src)
        m = i >= j - n_src + n_dest
        mask = tf.cast(m, dtype)
        mask = tf.reshape(mask, [1, n_dest, n_src])
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
        )
        return tf.tile(mask, mult)

    def call(self, enc_out, target):
        input_shape = tf.shape(target)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        causal_mask = self.causal_attention_mask(batch_size, seq_len, seq_len, tf.bool)
        target_att = self.self_att(target, target, attention_mask=causal_mask)
        target_norm = self.layernorm1(target + self.self_dropout(target_att))
        enc_out = self.enc_att(target_norm, enc_out)
        enc_out_norm = self.layernorm2(self.enc_dropout(enc_out) + target_norm)
        ffn_out = self.ffn(enc_out_norm)
        ffn_out_norm = self.layernorm3(enc_out_norm + self.ffn_dropout(ffn_out))
        return ffn_out_norm

In [13]:
#with accuracy
class Transformer(keras.Model):
    def __init__(
        self,
        num_hid=64,
        num_head=2,
        num_feed_forward=128,
        source_maxlen=100,
        target_maxlen=100,
        num_layers_enc=4,
        num_layers_dec=1,
        num_classes=10,
    ):
        super().__init__()
        self.loss_metric = keras.metrics.Mean(name="loss")
        self.accuracy_metric = keras.metrics.CategoricalAccuracy(name="accuracy")
        self.num_layers_enc = num_layers_enc
        self.num_layers_dec = num_layers_dec
        self.target_maxlen = target_maxlen
        self.num_classes = num_classes

        self.enc_input = SpeechFeatureEmbedding(num_hid=num_hid, maxlen=source_maxlen)
        self.dec_input = TokenEmbedding(
            num_vocab=num_classes, maxlen=target_maxlen, num_hid=num_hid
        )

        self.encoder = keras.Sequential(
            [self.enc_input]
            + [
                TransformerEncoder(num_hid, num_head, num_feed_forward)
                for _ in range(num_layers_enc)
            ]
        )

        for i in range(num_layers_dec):
            setattr(
                self,
                f"dec_layer_{i}",
                TransformerDecoder(num_hid, num_head, num_feed_forward),
            )

        self.classifier = layers.Dense(num_classes)

    def decode(self, enc_out, target):
        y = self.dec_input(target)
        for i in range(self.num_layers_dec):
            y = getattr(self, f"dec_layer_{i}")(enc_out, y)
        return y

    def call(self, inputs):
        source = inputs[0]
        target = inputs[1]
        x = self.encoder(source)
        y = self.decode(x, target)
        return self.classifier(y)

    @property
    def metrics(self):
        return [self.loss_metric, self.accuracy_metric]

    def train_step(self, batch):
        source = batch["source"]
        target = batch["target"]
        dec_input = target[:, :-1]
        dec_target = target[:, 1:]
        with tf.GradientTape() as tape:
            preds = self([source, dec_input])
            one_hot = tf.one_hot(dec_target, depth=self.num_classes)
            mask = tf.math.logical_not(tf.math.equal(dec_target, 0))
            loss = self.compiled_loss(one_hot, preds, sample_weight=mask)
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        self.loss_metric.update_state(loss)
        self.accuracy_metric.update_state(one_hot, preds, sample_weight=mask)
        return {"loss": self.loss_metric.result(), "accuracy": self.accuracy_metric.result()}

    def test_step(self, batch):
        source = batch["source"]
        target = batch["target"]
        dec_input = target[:, :-1]
        dec_target = target[:, 1:]
        preds = self([source, dec_input])
        one_hot = tf.one_hot(dec_target, depth=self.num_classes)
        mask = tf.math.logical_not(tf.math.equal(dec_target, 0))
        loss = self.compiled_loss(one_hot, preds, sample_weight=mask)
        self.loss_metric.update_state(loss)
        self.accuracy_metric.update_state(one_hot, preds, sample_weight=mask)
        return {"loss": self.loss_metric.result(), "accuracy": self.accuracy_metric.result()}

    def generate(self, source, target_start_token_idx):
        """Performs inference over one batch of inputs using greedy decoding."""
        bs = tf.shape(source)[0]
        enc = self.encoder(source)
        dec_input = tf.ones((bs, 1), dtype=tf.int32) * target_start_token_idx
        dec_logits = []
        for i in range(self.target_maxlen - 1):
            dec_out = self.decode(enc, dec_input)
            logits = self.classifier(dec_out)
            logits = tf.argmax(logits, axis=-1, output_type=tf.int32)
            last_logit = tf.expand_dims(logits[:, -1], axis=-1)
            dec_logits.append(last_logit)
            dec_input = tf.concat([dec_input, last_logit], axis=-1)
        return dec_input


In [14]:
class DisplayOutputs(keras.callbacks.Callback):
    def __init__(
        self, batch, idx_to_token, target_start_token_idx=27, target_end_token_idx=28
    ):
        """Displays a batch of outputs after every epoch

        Args:
            batch: A test batch containing the keys "source" and "target"
            idx_to_token: A List containing the vocabulary tokens corresponding to their indices
            target_start_token_idx: A start token index in the target vocabulary
            target_end_token_idx: An end token index in the target vocabulary
        """
        self.batch = batch
        self.target_start_token_idx = target_start_token_idx
        self.target_end_token_idx = target_end_token_idx
        self.idx_to_char = idx_to_token

    def on_epoch_end(self, epoch, logs=None):
        if epoch % 4 != 0:
            return
        source = self.batch["source"]
        target = self.batch["target"].numpy()
        bs = tf.shape(source)[0]
        preds = self.model.generate(source, self.target_start_token_idx)
        preds = preds.numpy()
        for i in range(bs):
            target_text = "".join([self.idx_to_char[_] for _ in target[i, :]])
            prediction = ""
            for idx in preds[i, :]:
                ##print(idx)
                prediction += self.idx_to_char[idx]
                if idx == self.target_end_token_idx:
                    break
            print(f"target:     {target_text.replace('-','')}")
            print(f"prediction: {prediction}\n")

In [15]:
class CustomSchedule(keras.optimizers.schedules.LearningRateSchedule):
    def __init__(
        self,
        init_lr=0.00001,
        lr_after_warmup=0.001,
        final_lr=0.00001,
        warmup_epochs=15,
        decay_epochs=85,
        steps_per_epoch=203,
    ):
        super().__init__()
        self.init_lr = init_lr
        self.lr_after_warmup = lr_after_warmup
        self.final_lr = final_lr
        self.warmup_epochs = warmup_epochs
        self.decay_epochs = decay_epochs
        self.steps_per_epoch = steps_per_epoch

    def calculate_lr(self, epoch):
        """ linear warm up - linear decay """
        warmup_lr = (
            self.init_lr
            + ((self.lr_after_warmup - self.init_lr) / (self.warmup_epochs - 1)) * epoch
        )
        decay_lr = tf.math.maximum(
            self.final_lr,
            self.lr_after_warmup
            - (epoch - self.warmup_epochs)
            * (self.lr_after_warmup - self.final_lr)
            / (self.decay_epochs),
        )
        return tf.math.minimum(warmup_lr, decay_lr)

    def __call__(self, step):
        epoch = step // self.steps_per_epoch
        return self.calculate_lr(epoch)

In [17]:
batch = next(iter(val_ds))

# The vocabulary to convert predicted indices into characters
idx_to_char = vectorizer.get_vocabulary()
display_cb = DisplayOutputs(
    batch, idx_to_char, target_start_token_idx=2, target_end_token_idx=3
)  # set the arguments as per vocabulary index for '<' and '>'

model = Transformer(
    num_hid=200,
    num_head=2,
    num_feed_forward=400,
    target_maxlen=max_target_len,
    num_layers_enc=4,
    num_layers_dec=1,
    num_classes=34,
)
loss_fn = tf.keras.losses.CategoricalCrossentropy(
    from_logits=True, label_smoothing=0.1,
)

learning_rate = CustomSchedule(
    init_lr=0.00001,
    lr_after_warmup=0.001,
    final_lr=0.00001,
    warmup_epochs=15,
    decay_epochs=85,
    steps_per_epoch=len(ds),
)
optimizer = keras.optimizers.Adam(learning_rate)
model.compile(optimizer=optimizer, loss=loss_fn)

history = model.fit(ds, validation_data=val_ds, callbacks=[display_cb], epochs=203)
model.save_weights("ljspeech.h5")

Epoch 1/203
target:     <the increased information supplied by other agencies will be wasted.>
prediction: <the e the s to ter e cte at t ther te s o t a the the t ae tre ie t ie te o t the t c e te e the tentenhe te te tor s rie on athe aienene re theriedit the ie ierete there te ie t stettento t te ene a

target:     <prs must develop the capacity to classify its subjects on a more sophisticated basis than the present geographic breakdown.>
prediction: <the e the s to te s t te at t ther te s o t a the the t ae tre ie t ie te o t the t c e te e the t enenhe t aie tor s rie on athe aienene re theriedit the ie ierete there te ie t stettento t te ene a

target:     <its present manual filing system is obsolete#>
prediction: <the e the s to ter e cte at t ther te s o t a the the t ae tre ie t ie te o t the t c e te e the tentenhe te te tor s rie on athe aienene re theriedit the ie ierete there te ie t stettento t te ene a

target:     <it makes no use of the recent developments in automa

KeyboardInterrupt: 

In [26]:
ds

<PrefetchDataset shapes: {source: (None, None, 129), target: (None, 200)}, types: {source: tf.float32, target: tf.int32}>

In [27]:
rmodel = Transformer(
    num_hid=200,
    num_head=2,
    num_feed_forward=400,
    target_maxlen=max_target_len,
    num_layers_enc=4,
    num_layers_dec=1,
    num_classes=34,
)

# Build the model by calling it with some input data (you can use a dummy batch)
dummy_batch = (tf.ones((1,1, 129)), tf.ones((1, 200)))
_ = rmodel(dummy_batch)

# Compile the model
rmodel.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"])


#history = model.fit(ds, validation_data=val_ds, callbacks=[display_cb], epochs=203)

In [28]:
rmodel.save_weights("control.h5")

In [29]:
rmodel.load_weights("control2.h5")

In [31]:
history = rmodel.fit(ds, validation_data=val_ds, callbacks=[display_cb], epochs=50)

Epoch 1/50
target:     <pencil>
prediction: <hongal>

target:     <bathtub>
prediction: <bathool>

target:     <bathtub>
prediction: <dequileath>

target:     <bathtub>
prediction: <footh>

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
target:     <pencil>
prediction: <pencil>

target:     <bathtub>
prediction: <battlefield>

target:     <bathtub>
prediction: <bathool>

target:     <bathtub>
prediction: <thu>

Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
target:     <pencil>
prediction: <pencil>

target:     <bathtub>
prediction: <battlefield>

target:     <bathtub>
prediction: <battlefield>

target:     <bathtub>
prediction: <some>

Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
target:     <pencil>
prediction: <pencil>

target:     <bathtub>
prediction: <battlefield>

target:     <bathtub>
prediction: <baths>

target:     <bathtub>
prediction: <so>

Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
target:     <pencil>
prediction: <pencil>

target:     <bathtub>
prediction: <backspace>



KeyboardInterrupt: 

In [36]:
rmodel.summary()

Model: "transformer_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
speech_feature_embedding_1 ( (1, 1, 200)               1164400   
_________________________________________________________________
token_embedding_1 (TokenEmbe multiple                  46800     
_________________________________________________________________
sequential_10 (Sequential)   (1, 1, 200)               3095600   
_________________________________________________________________
transformer_decoder_1 (Trans multiple                  804600    
_________________________________________________________________
dense_21 (Dense)             multiple                  6834      
Total params: 3,953,838
Trainable params: 3,953,834
Non-trainable params: 4
_________________________________________________________________


In [407]:
rmodel.save_weights("M04transfromer.h5")

In [408]:
rmodel.load_weights("M04transfromer.h5")

In [32]:
import pandas as pd
import os
from glob import glob

# Assuming saveto is the directory where the CSV file is located
saveto = r"C:\Users\abdul\Downloads\F04"
wavs = glob("{}/**/*.wav".format(saveto), recursive=True)
id_to_text = {}

# Load the CSV file into a pandas DataFrame, skipping the first row
csv_file_path = os.path.join(saveto, "dictionary_UASPEECH.csv")
df = pd.read_csv(csv_file_path, sep=',', header=None, names=['ID', 'Text'], skiprows=1)

# Create a dictionary with formatted keys and values
id_to_text = {row["Text"]: row["ID"] for _, row in df.iterrows()}

In [33]:
id_to_text 

{'D3': 'Three',
 'D9': 'Nine',
 'D0': 'Zero',
 'D6': 'Six',
 'D7': 'Seven',
 'D8': 'Eight',
 'D4': 'Four',
 'D5': 'Five',
 'D1': 'One',
 'D2': 'Two',
 'LE': 'Echo',
 'LD': 'Delta',
 'LW': 'Whiskey',
 'LK': 'Kilo',
 'LS': 'Sierra',
 'LT': 'Tango',
 'LU': 'Uniform',
 'LX': 'X-ray',
 'LJ': 'Juliet',
 'LC': 'Charlie',
 'LQ': 'Quebec',
 'LP': 'Papa',
 'LZ': 'Zulu',
 'LR': 'Romeo',
 'LF': 'Foxtrot',
 'LI': 'India',
 'LL': 'Lima',
 'LV': 'Victor',
 'LY': 'Yankee',
 'LG': 'Golf',
 'LH': 'Hotel',
 'LN': 'November',
 'LB': 'Bravo',
 'LO': 'Oscar',
 'LA': 'Alpha',
 'LM': 'Mike',
 'C1': 'Command',
 'C2': 'Backspace',
 'C3': 'Delete',
 'C4': 'Enter',
 'C5': 'Tab',
 'C6': 'Escape',
 'C7': 'Alt',
 'C8': 'Control',
 'C9': 'Shift',
 'C10': 'Line',
 'C11': 'Paragraph',
 'C12': 'Sentence',
 'C13': 'Paste',
 'C14': 'Cut',
 'C15': 'Copy',
 'C16': 'Upward',
 'C17': 'Downward',
 'C18': 'Left',
 'C19': 'Right',
 'CW1': 'the',
 'CW2': 'of',
 'CW3': 'and',
 'CW4': 'a',
 'CW5': 'to',
 'CW6': 'in',
 'CW7': 'is',


In [34]:
def get_data(wavs, id_to_text, maxlen=50):
    """ returns mapping of audio paths and transcription texts """
    data = []
    for w in wavs:
        id = w.split("\\")[-1]
        for item in k:
            if item+"_" in id and len(id_to_text[item]) < maxlen:
                data.append({"audio": w, "text": id_to_text[item]})
                #print("hi")
    return data

In [35]:
class VectorizeChar:
    def __init__(self, max_len=50):
        self.vocab = (
            ["-", "#", "<", ">"]
            + [chr(i + 96) for i in range(1, 27)]
            + [" ", ".", ",", "?"]
        )
        self.max_len = max_len
        self.char_to_idx = {}
        for i, ch in enumerate(self.vocab):
            self.char_to_idx[ch] = i

    def __call__(self, text):
        text = text.lower()
        text = text[: self.max_len - 2]
        text = "<" + text + ">"
        pad_len = self.max_len - len(text)
        return [self.char_to_idx.get(ch, 1) for ch in text] + [0] * pad_len

    def get_vocabulary(self):
        return self.vocab

In [36]:
max_target_len = 200  # all transcripts in out data are < 200 characters
data = get_data(wavs, id_to_text, max_target_len)
vectorizer = VectorizeChar(max_target_len)

In [37]:
data

[{'audio': 'C:\\Users\\abdul\\Downloads\\F04\\F04_B1_C10_M2.wav',
  'text': 'Line'},
 {'audio': 'C:\\Users\\abdul\\Downloads\\F04\\F04_B1_C10_M3.wav',
  'text': 'Line'},
 {'audio': 'C:\\Users\\abdul\\Downloads\\F04\\F04_B1_C10_M4.wav',
  'text': 'Line'},
 {'audio': 'C:\\Users\\abdul\\Downloads\\F04\\F04_B1_C10_M5.wav',
  'text': 'Line'},
 {'audio': 'C:\\Users\\abdul\\Downloads\\F04\\F04_B1_C10_M6.wav',
  'text': 'Line'},
 {'audio': 'C:\\Users\\abdul\\Downloads\\F04\\F04_B1_C10_M7.wav',
  'text': 'Line'},
 {'audio': 'C:\\Users\\abdul\\Downloads\\F04\\F04_B1_C10_M8.wav',
  'text': 'Line'},
 {'audio': 'C:\\Users\\abdul\\Downloads\\F04\\F04_B1_C11_M2.wav',
  'text': 'Paragraph'},
 {'audio': 'C:\\Users\\abdul\\Downloads\\F04\\F04_B1_C11_M3.wav',
  'text': 'Paragraph'},
 {'audio': 'C:\\Users\\abdul\\Downloads\\F04\\F04_B1_C11_M4.wav',
  'text': 'Paragraph'},
 {'audio': 'C:\\Users\\abdul\\Downloads\\F04\\F04_B1_C11_M5.wav',
  'text': 'Paragraph'},
 {'audio': 'C:\\Users\\abdul\\Downloads\\F04\

In [38]:
ds = create_tf_dataset(data, bs=64)

Tensor("DecodeWav:0", shape=(None, 1), dtype=float32)
Tensor("Squeeze:0", shape=(None,), dtype=float32)


In [39]:
evaluation_result = rmodel.evaluate(ds)

# Print the evaluation results
print("Test Loss:", evaluation_result)
print("Test Accuracy:", rmodel.metrics_names)  # Prints all metric names
print("Test Accuracy:", evaluation_result)

Test Loss: [0.021915392950177193, 0.9965347051620483]
Test Accuracy: ['loss', 'accuracy']
Test Accuracy: [0.021915392950177193, 0.9965347051620483]


In [40]:
import numpy as np

def predict_label(model, audio_path):
    # Load and process the audio file
    audio = path_to_audio(audio_path)
    audio = tf.expand_dims(audio, axis=0)  # Add batch dimensionprint

    # Generate predictions
    start_token_idx = 2  # Adjust this based on your vocabulary
    preds = model.generate(audio, start_token_idx).numpy()

    # Convert predicted indices to label
    idx_to_char = vectorizer.get_vocabulary()
    predicted_label = ""
    for idx in preds[0, :]:
        predicted_label += idxprint_to_char[idx]
        if idx == 3:  # End token index, adjust based on your vocabulary
            break

    return predicted_label.replace("-", "")  # Adjust based on your preprocessing

# Example usage:



In [41]:
def target(wav, id_to_text, maxlen=50):
    """ returns mapping of audio paths and transcription texts """
    
    id = wav.split("\\")[-1]
    for item in k:
        if item+"_" in id and len(id_to_text[item]) < maxlen:
            return id_to_text[item]

In [35]:
audio_path = r"C:\Users\abdul\project_presentation\output_file.wav"

predicted_label = predict_label(rmodel, audio_path)
print("Predicted Label:", predicted_label)

Predicted Label: <


In [42]:
def calculate_accuracy(predictions, targets):
    # Remove "<,<"
    predictions_cleaned = [p.replace("<", "") for p in predictions]
    predictions_cleaned = [p.replace(">", "") for p in predictions_cleaned]
    # Convert both predicted and target labels to lowercase for case-insensitive comparison
    predictions_lower = [p.lower() for p in predictions_cleaned]
    #print(predictions_lower)
    targets_lower = [t.lower() for t in targets]
    #print(targets_lower)
    correct_predictions = sum(p == t for p, t in zip(predictions_lower, targets_lower))
    accuracy = correct_predictions / len(targets) if len(targets) > 0 else 0.0
    return accuracy


In [49]:
audio_folder = r"C:\Users\abdul\Downloads\Test"
# Get a list of all audio files in the folder
audio_files = [file for file in os.listdir(audio_folder) if file.endswith(".wav")]

# Initialize lists to store predictions and targets
predicted_labels = []
target_labels = []

# Iterate through all audio files
for audio_file in audio_files:
    audio_path = os.path.join(audio_folder, audio_file)
    print(audio_path)
    # Get target label
    target_label = target(audio_file, id_to_text)

    # Load and process the audio file
    #audio = path_to_audio(audio_path)

    # Generate predictions
    predicted_label = predict_label(rmodel, audio_path)

    # Print target and predicted labels
    print(f"Target Label: {target_label}")
    print(f"Predicted Label: {predicted_label}\n")

    # Store predictions and targets for accuracy calculation
    predicted_labels.append(predicted_label)
    target_labels.append(target_label)

# Calculate and print accuracy
accuracy = calculate_accuracy(predicted_labels, target_labels)
print(f"Overall Accuracy: {accuracy * 100:.2f}%")

C:\Users\abdul\Downloads\Test\F04_B3_C10_M2.wav
Target Label: Line
Predicted Label: <line>

C:\Users\abdul\Downloads\Test\F04_B3_C10_M3.wav
Target Label: Line
Predicted Label: <find>

C:\Users\abdul\Downloads\Test\F04_B3_C10_M4.wav
Target Label: Line
Predicted Label: <line>

C:\Users\abdul\Downloads\Test\F04_B3_C10_M5.wav
Target Label: Line
Predicted Label: <line>

C:\Users\abdul\Downloads\Test\F04_B3_C10_M6.wav
Target Label: Line
Predicted Label: <five>

C:\Users\abdul\Downloads\Test\F04_B3_C10_M7.wav
Target Label: Line
Predicted Label: <line>

C:\Users\abdul\Downloads\Test\F04_B3_C10_M8.wav
Target Label: Line
Predicted Label: <line>

C:\Users\abdul\Downloads\Test\F04_B3_C11_M2.wav
Target Label: Paragraph
Predicted Label: <paragraph>

C:\Users\abdul\Downloads\Test\F04_B3_C11_M3.wav
Target Label: Paragraph
Predicted Label: <paragraph>

C:\Users\abdul\Downloads\Test\F04_B3_C11_M4.wav
Target Label: Paragraph
Predicted Label: <paragraph>

C:\Users\abdul\Downloads\Test\F04_B3_C11_M5.wav
Ta

Target Label: Delete
Predicted Label: <three>

C:\Users\abdul\Downloads\Test\F04_B3_C3_M5.wav
Target Label: Delete
Predicted Label: <ser>

C:\Users\abdul\Downloads\Test\F04_B3_C3_M6.wav
Target Label: Delete
Predicted Label: <thirty>

C:\Users\abdul\Downloads\Test\F04_B3_C3_M7.wav
Target Label: Delete
Predicted Label: <if>

C:\Users\abdul\Downloads\Test\F04_B3_C3_M8.wav
Target Label: Delete
Predicted Label: <we>

C:\Users\abdul\Downloads\Test\F04_B3_C4_M2.wav
Target Label: Enter
Predicted Label: <enter>

C:\Users\abdul\Downloads\Test\F04_B3_C4_M3.wav
Target Label: Enter
Predicted Label: <enter>

C:\Users\abdul\Downloads\Test\F04_B3_C4_M4.wav
Target Label: Enter
Predicted Label: <enterthel>

C:\Users\abdul\Downloads\Test\F04_B3_C4_M5.wav
Target Label: Enter
Predicted Label: <enter>

C:\Users\abdul\Downloads\Test\F04_B3_C4_M6.wav
Target Label: Enter
Predicted Label: <enter>

C:\Users\abdul\Downloads\Test\F04_B3_C4_M7.wav
Target Label: Enter
Predicted Label: <enter>

C:\Users\abdul\Downloa

Target Label: are
Predicted Label: <are>

C:\Users\abdul\Downloads\Test\F04_B3_CW15_M4.wav
Target Label: are
Predicted Label: <are>

C:\Users\abdul\Downloads\Test\F04_B3_CW15_M5.wav
Target Label: are
Predicted Label: <are>

C:\Users\abdul\Downloads\Test\F04_B3_CW15_M6.wav
Target Label: are
Predicted Label: <are>

C:\Users\abdul\Downloads\Test\F04_B3_CW15_M7.wav
Target Label: are
Predicted Label: <are>

C:\Users\abdul\Downloads\Test\F04_B3_CW15_M8.wav
Target Label: are
Predicted Label: <are>

C:\Users\abdul\Downloads\Test\F04_B3_CW16_M2.wav
Target Label: as
Predicted Label: <has>

C:\Users\abdul\Downloads\Test\F04_B3_CW16_M3.wav
Target Label: as
Predicted Label: <as>

C:\Users\abdul\Downloads\Test\F04_B3_CW16_M4.wav
Target Label: as
Predicted Label: <ad>

C:\Users\abdul\Downloads\Test\F04_B3_CW16_M5.wav
Target Label: as
Predicted Label: <os>

C:\Users\abdul\Downloads\Test\F04_B3_CW16_M6.wav
Target Label: as
Predicted Label: <as>

C:\Users\abdul\Downloads\Test\F04_B3_CW16_M7.wav
Target L

Target Label: had
Predicted Label: <had>

C:\Users\abdul\Downloads\Test\F04_B3_CW27_M3.wav
Target Label: had
Predicted Label: <had>

C:\Users\abdul\Downloads\Test\F04_B3_CW27_M4.wav
Target Label: had
Predicted Label: <had>

C:\Users\abdul\Downloads\Test\F04_B3_CW27_M5.wav
Target Label: had
Predicted Label: <had>

C:\Users\abdul\Downloads\Test\F04_B3_CW27_M6.wav
Target Label: had
Predicted Label: <hid>

C:\Users\abdul\Downloads\Test\F04_B3_CW27_M7.wav
Target Label: had
Predicted Label: <had>

C:\Users\abdul\Downloads\Test\F04_B3_CW27_M8.wav
Target Label: had
Predicted Label: <hid>

C:\Users\abdul\Downloads\Test\F04_B3_CW28_M2.wav
Target Label: by
Predicted Label: <by>

C:\Users\abdul\Downloads\Test\F04_B3_CW28_M3.wav
Target Label: by
Predicted Label: <by>

C:\Users\abdul\Downloads\Test\F04_B3_CW28_M4.wav
Target Label: by
Predicted Label: <five>

C:\Users\abdul\Downloads\Test\F04_B3_CW28_M5.wav
Target Label: by
Predicted Label: <five>

C:\Users\abdul\Downloads\Test\F04_B3_CW28_M6.wav
Tar

Target Label: can
Predicted Label: <can>

C:\Users\abdul\Downloads\Test\F04_B3_CW39_M2.wav
Target Label: said
Predicted Label: <their>

C:\Users\abdul\Downloads\Test\F04_B3_CW39_M3.wav
Target Label: said
Predicted Label: <theid>

C:\Users\abdul\Downloads\Test\F04_B3_CW39_M4.wav
Target Label: said
Predicted Label: <then>

C:\Users\abdul\Downloads\Test\F04_B3_CW39_M5.wav
Target Label: said
Predicted Label: <good>

C:\Users\abdul\Downloads\Test\F04_B3_CW39_M6.wav
Target Label: said
Predicted Label: <then>

C:\Users\abdul\Downloads\Test\F04_B3_CW39_M7.wav
Target Label: said
Predicted Label: <then>

C:\Users\abdul\Downloads\Test\F04_B3_CW39_M8.wav
Target Label: said
Predicted Label: <theid>

C:\Users\abdul\Downloads\Test\F04_B3_CW3_M2.wav
Target Label: and
Predicted Label: <hin>

C:\Users\abdul\Downloads\Test\F04_B3_CW3_M3.wav
Target Label: and
Predicted Label: <hin>

C:\Users\abdul\Downloads\Test\F04_B3_CW3_M4.wav
Target Label: and
Predicted Label: <and>

C:\Users\abdul\Downloads\Test\F04_

Target Label: a
Predicted Label: <a>

C:\Users\abdul\Downloads\Test\F04_B3_CW4_M8.wav
Target Label: a
Predicted Label: <a>

C:\Users\abdul\Downloads\Test\F04_B3_CW50_M2.wav
Target Label: will
Predicted Label: <will>

C:\Users\abdul\Downloads\Test\F04_B3_CW50_M3.wav
Target Label: will
Predicted Label: <will>

C:\Users\abdul\Downloads\Test\F04_B3_CW50_M4.wav
Target Label: will
Predicted Label: <will>

C:\Users\abdul\Downloads\Test\F04_B3_CW50_M5.wav
Target Label: will
Predicted Label: <will>

C:\Users\abdul\Downloads\Test\F04_B3_CW50_M6.wav
Target Label: will
Predicted Label: <will>

C:\Users\abdul\Downloads\Test\F04_B3_CW50_M7.wav
Target Label: will
Predicted Label: <will>

C:\Users\abdul\Downloads\Test\F04_B3_CW50_M8.wav
Target Label: will
Predicted Label: <will>

C:\Users\abdul\Downloads\Test\F04_B3_CW51_M2.wav
Target Label: up
Predicted Label: <up>

C:\Users\abdul\Downloads\Test\F04_B3_CW51_M3.wav
Target Label: up
Predicted Label: <up>

C:\Users\abdul\Downloads\Test\F04_B3_CW51_M4.wa

Target Label: her
Predicted Label: <her>

C:\Users\abdul\Downloads\Test\F04_B3_CW61_M6.wav
Target Label: her
Predicted Label: <her>

C:\Users\abdul\Downloads\Test\F04_B3_CW61_M7.wav
Target Label: her
Predicted Label: <are>

C:\Users\abdul\Downloads\Test\F04_B3_CW61_M8.wav
Target Label: her
Predicted Label: <har>

C:\Users\abdul\Downloads\Test\F04_B3_CW62_M2.wav
Target Label: would
Predicted Label: <would>

C:\Users\abdul\Downloads\Test\F04_B3_CW62_M3.wav
Target Label: would
Predicted Label: <would>

C:\Users\abdul\Downloads\Test\F04_B3_CW62_M4.wav
Target Label: would
Predicted Label: <would>

C:\Users\abdul\Downloads\Test\F04_B3_CW62_M5.wav
Target Label: would
Predicted Label: <would>

C:\Users\abdul\Downloads\Test\F04_B3_CW62_M6.wav
Target Label: would
Predicted Label: <would>

C:\Users\abdul\Downloads\Test\F04_B3_CW62_M7.wav
Target Label: would
Predicted Label: <would>

C:\Users\abdul\Downloads\Test\F04_B3_CW62_M8.wav
Target Label: would
Predicted Label: <would>

C:\Users\abdul\Downl

KeyboardInterrupt: 

In [50]:
accuracy = calculate_accuracy(predicted_labels, target_labels)
print(f"Overall Accuracy: {accuracy * 100:.2f}%")

Overall Accuracy: 51.24%
