In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

In [None]:
try:
    # Colab line magic that requests the TensorFlow 2.x runtime. The recorded
    # cell output reports that it no longer has any effect.
    %tensorflow_version 2.x
except Exception:
    # Best effort only: an exception raised by the magic is ignored.
    pass
import tensorflow as tf

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import unicodedata
import re
import os
import io
import time

In [None]:
# Corpus file: one example per line, tab-separated (English, Spanish, attribution).
# NOTE(review): absolute Colab path; the file has to exist at /content first.
path_to_file ='/content/spa.txt'

# Preprocessing the Data

In [None]:
def unicode_to_ascii(s):
    """Return `s` with its accents removed.

    The text is decomposed with NFD, which splits an accented letter into the
    base letter plus combining marks, and every character whose Unicode
    category is 'Mn' is then dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

In [None]:
def preprocess_sentence(w):
    """Normalise one sentence and wrap it in <start>/<end> markers.

    The text is lower-cased, trimmed and stripped of accents, each of the
    characters ? . ! , ¿ is padded with spaces so it becomes its own token,
    and runs of spaces are collapsed.

    Returns:
        '<start> ' + cleaned text + ' <end>'.
    """
    cleaned = unicode_to_ascii(w.lower().strip())

    # eg: "he is a boy." => "he is a boy ."
    cleaned = re.sub(r"([?.!,¿])", r" \1 ", cleaned)
    # The character class contains a double quote as well as a space, so
    # double quotes are collapsed into a single space here too.
    cleaned = re.sub(r'[" "]+', " ", cleaned)

    # The markers tell the model where to start and where to stop predicting.
    return '<start> ' + cleaned.strip() + ' <end>'

In [None]:
def create_dataset(path, num_examples):
    """Read a tab-separated corpus and return its columns, cleaned.

    Args:
        path: path of a UTF-8 text file with one example per line and the
            fields of an example separated by tabs.
        num_examples: number of leading lines to use; None uses every line.

    Returns:
        A zip iterator that yields one tuple per column. Every field has been
        passed through preprocess_sentence.
    """
    # Read inside a context manager so the file handle is closed right away
    # instead of being left for the garbage collector.
    with io.open(path, encoding='UTF-8') as corpus:
        lines = corpus.read().strip().split('\n')

    word_pairs = [[preprocess_sentence(field) for field in line.split('\t')]
                  for line in lines[:num_examples]]

    # Transpose rows of fields into per-column tuples.
    return zip(*word_pairs)


In [None]:
# Peek at the raw corpus; the last line shows the tab-separated layout.
# The file is read inside `with` so the handle is closed after reading.
with io.open(path_to_file, encoding='UTF-8') as corpus_file:
    lines = corpus_file.read().strip().split('\n')
lines[-1]

"One day, I woke up to find that God had put hair on my face. I shaved it off. The next day, I found that God had put it back on my face, so I shaved it off again. On the third day, when I found that God had put hair back on my face again, I decided to let God have his way. That's why I have a beard.\tUn día, me desperté y vi que Dios me había puesto pelo en la cara. Me lo afeité. Al día siguiente, vi que Dios me lo había vuelto a poner en la cara, así que me lo afeité otra vez. Al tercer día, cuando vi que Dios me había puesto pelo en la cara de nuevo, decidí que Dios se saliera con la suya. Por eso tengo barba.\tCC-BY 2.0 (France) Attribution: tatoeba.org #10104877 (CK) & #10106093 (manufrutos)"

In [None]:
# Unpack the three corpus columns after cleaning. Judging by the printed
# output, `en` is English, `hn` is Spanish and `cv` is the attribution text.
en, hn, cv = create_dataset(path_to_file, None)
# Show the second-to-last example of each column.
print(en[-2])
print(hn[-2])
print(cv[-2])

<start> it may be impossible to get a completely error-free corpus due to the nature of this kind of collaborative effort . however , if we encourage members to contribute sentences in their own languages rather than experiment in languages they are learning , we might be able to minimize errors . <end>
<start> puede que sea imposible obtener un corpus completamente libre de errores debido a la naturaleza de este tipo de esfuerzo de colaboracion . sin embargo , si animamos a los miembros a contribuir frases en sus propios idiomas en lugar de experimentar con los idiomas que estan aprendiendo , podriamos ser capaces de minimizar los errores . <end>
<start> cc-by 2 . 0 (france) attribution: tatoeba . org #2024159 (ck) & #4463195 (cueyayotl) <end>


In [None]:
def max_length(tensor):
    """Return the length of the longest sequence contained in `tensor`."""
    return max(map(len, tensor))

In [None]:
def tokenize(lang):
    """Fit a Keras tokenizer on `lang` and convert it to padded id sequences.

    Args:
        lang: the sentences of one language.

    Returns:
        (tensor, lang_tokenizer): the id sequences padded at the end to a
        common length, and the fitted tokenizer.
    """
    # An empty filter string means the tokenizer strips no characters.
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)

    sequences = lang_tokenizer.texts_to_sequences(lang)
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')

    return padded, lang_tokenizer

In [None]:
def load_dataset(path, num_examples=None):
    """Load the corpus and tokenize its first two columns.

    Args:
        path: corpus file handed to create_dataset.
        num_examples: number of leading lines to use; None uses all of them.

    Returns:
        (input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer).
    """
    # The third column of the corpus is read but not used.
    source_sentences, target_sentences, _ = create_dataset(path, num_examples)

    input_tensor, inp_lang_tokenizer = tokenize(source_sentences)
    target_tensor, targ_lang_tokenizer = tokenize(target_sentences)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [None]:
#  To train faster, only the first `num_examples` lines of the corpus are used
#  (of course, translation quality degrades with less data).
#  Try experimenting with the size of the dataset
num_examples = 70000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)

# Padded sequence length of the input tensor and of the target tensor
max_length_inp, max_length_targ = max_length(input_tensor), max_length(target_tensor)

In [None]:
# Display the padded sequence lengths as (input, target).
max_length_inp, max_length_targ

(12, 21)

In [None]:
# Hold out 20% of the sentence pairs for validation.
# NOTE(review): no random_state is passed, so the split differs between runs.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor,
                                                                                                target_tensor,
                                                                                                test_size=0.2)

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

56000 56000 14000 14000


In [None]:
def convert(lang, tensor):
    """Print 'id ----> word' for every non-zero id in `tensor`.

    Args:
        lang: object whose `index_word` mapping translates ids to words.
        tensor: sequence of token ids; zeros are skipped.
    """
    for token_id in tensor:
        if token_id == 0:
            continue
        print ("%d ----> %s" % (token_id, lang.index_word[token_id]))

In [None]:
# First training example as a padded row of token ids.
input_tensor_train[0]

array([  1,  11, 135, 263,   3,   2,   0,   0,   0,   0,   0,   0],
      dtype=int32)

In [None]:
# Look up the word stored under token id 10 in the input-language tokenizer.
print('10:' + inp_lang.index_word[10])

10:is


In [None]:
# Decode the first training pair back into words, one token per line.
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
33 ----> be
4306 ----> vigilant
3 ----> .
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
1941 ----> estate
6615 ----> vigilante
3 ----> .
2 ----> <end>


In [None]:
# Number of entries in each tokenizer's word_index, plus one, as (input, target).
len(inp_lang.word_index)+1, len(targ_lang.word_index)+1

(8433, 16283)

In [None]:
# Shuffle buffer as large as the training set.
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 32
# Number of full batches in one pass over the training set.
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
# +1 makes room for id 0, which appears in the padded tensors as filler.
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
# drop_remainder=True discards a final batch smaller than BATCH_SIZE, so every
# batch has the fixed size that the encoder's initial hidden state is built for.
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# First training example again (the same expression is displayed in an earlier cell).
input_tensor_train[0]

array([   1,   33, 4306,    3,    2,    0,    0,    0,    0,    0,    0,
          0], dtype=int32)

In [None]:
# Pull a single batch from the dataset to inspect its shapes; the same batch
# is reused by the layer smoke tests further down.
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([32, 12]), TensorShape([32, 21]))

In [None]:
# First two rows of the example input batch.
example_input_batch[0:2]

<tf.Tensor: shape=(2, 12), dtype=int32, numpy=
array([[   1,    7, 1412,    8, 3252,    3,    2,    0,    0,    0,    0,
           0],
       [   1,   44,  422,   23,   54,   47,  145,    3,    2,    0,    0,
           0]], dtype=int32)>

In [None]:
class Encoder(tf.keras.Model):
    """Embedding layer followed by a GRU that encodes the input sentence."""

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        """Create the layers.

        Args:
            vocab_size: number of rows of the embedding table.
            embedding_dim: size of each embedding vector.
            enc_units: number of GRU units.
            batch_sz: batch size used by initialize_hidden_state.
        """
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # The GRU returns the output of every time step and its final state.
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Embed the ids in `x` and run the GRU starting from `hidden`.

        Returns:
            (output, state) as produced by the GRU.
        """
        embedded = self.embedding(x)
        sequence_output, final_state = self.gru(embedded, initial_state=hidden)
        return sequence_output, final_state

    def initialize_hidden_state(self):
        """Return an all-zero hidden state of shape (batch_sz, enc_units)."""
        state_shape = (self.batch_sz, self.enc_units)
        return tf.zeros(state_shape)

In [None]:
# Build the encoder that the rest of the notebook trains and restores.
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
# Run the example batch through the encoder once and print the resulting shapes.
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (32, 12, 1024)
Encoder Hidden state shape: (batch size, units) (32, 1024)


In [None]:
class BahdanauAttention(tf.keras.Model):
    """Additive attention: score = V(tanh(W1(values) + W2(query)))."""

    def __init__(self, units):
        """Create the three dense layers; `units` is the width of W1 and W2."""
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Weight `values` by their relevance to `query`.

        Returns:
            (context_vector, attention_weights): the weighted sum of `values`
            over axis 1, and the softmax weights used for that sum.
        """
        # query shape == (batch_size, hidden size); a time axis is inserted so
        # that it broadcasts against values in the addition below:
        # (batch_size, 1, hidden size)
        query_with_time_axis = tf.expand_dims(query, 1)

        # shape before self.V == (batch_size, max_length, units)
        projected = self.W1(values) + self.W2(query_with_time_axis)

        # score shape == (batch_size, max_length, 1); self.V has a single unit
        score = self.V(tf.nn.tanh(projected))

        # normalise over the time axis: (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after the sum == (batch_size, hidden_size)
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights

In [None]:
# Throw-away attention layer with 10 units, used only to check output shapes
# against the encoder outputs computed for the example batch.
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (32, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (32, 12, 1)


In [None]:
class Decoder(tf.keras.Model):
    """One-step decoder: attention over the encoder output, GRU, dense projection."""

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        """Create the layers.

        Args:
            vocab_size: size of the embedding table and of the output layer.
            embedding_dim: size of each embedding vector.
            dec_units: number of GRU units, also used as the attention width.
            batch_sz: stored on the instance; not read inside this class.
        """
        super().__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            self.dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        """Run one decoding step.

        Returns:
            (x, state, attention_weights): the output of the dense layer, the
            GRU state after this step, and the attention weights.
        """
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # shape after embedding == (batch_size, 1, embedding_dim)
        embedded = self.embedding(x)

        # shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)

        # NOTE(review): `hidden` is only consumed by the attention layer; the
        # GRU is called without initial_state. Confirm that this is intended
        # before changing it, since saved checkpoints were trained this way.
        output, state = self.gru(gru_input)

        # output shape == (batch_size * 1, hidden_size)
        flat_output = tf.reshape(output, (-1, output.shape[2]))

        # shape == (batch_size, vocab)
        logits = self.fc(flat_output)

        return logits, state, attention_weights

In [None]:
# Build the decoder that the rest of the notebook trains and restores.
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

# Shape check only. The dummy input is sized with BATCH_SIZE rather than a
# literal 32 so that it keeps matching sample_hidden / sample_output, which
# come from a batch of BATCH_SIZE rows.
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)), sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (32, 16283)


# Training

In [None]:
optimizer = tf.keras.optimizers.Adam()
# from_logits=True: the loss receives the decoder's dense-layer output directly.
# reduction='none' keeps one loss value per example so padding can be masked
# out before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    """Cross-entropy between `real` ids and `pred` logits that ignores padding.

    Entries where `real` is 0 contribute a loss of zero. The mean is taken
    over every entry, the zeroed ones included.
    """
    per_example_loss = loss_object(real, pred)

    # 1.0 where there is a real token, 0.0 where `real` is padding (id 0)
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_example_loss.dtype)

    return tf.reduce_mean(per_example_loss * keep)

In [None]:
# Checkpoints are written as <checkpoint_dir>/ckpt-N and bundle the optimizer
# state together with the encoder and decoder.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

In [None]:
@tf.function
def train_step(inp, targ, enc_hidden):
    """Run one teacher-forced optimisation step on a single batch.

    Args:
        inp: batch of input-language id sequences.
        targ: batch of target-language id sequences.
        enc_hidden: initial hidden state handed to the encoder.

    Returns:
        The summed per-step loss divided by the target sequence length.
    """
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        # The decoder starts from the encoder's final state.
        dec_hidden = enc_hidden

        # Every sequence in the batch starts with the <start> token; the batch
        # is assumed to hold exactly BATCH_SIZE rows.
        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    # Reported value only. It divides by the full target length although the
    # loop above accumulates targ.shape[1] - 1 terms.
    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    # Gradients are taken of the summed `loss`, not of `batch_loss`.
    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [None]:
# NOTE(review): the recorded output below stops after epoch 15 although
# EPOCHS is 50, so the run appears to have been interrupted.
EPOCHS = 50

for epoch in range(EPOCHS):
    start = time.time()

    # The same all-zero initial encoder state is passed to every batch of the epoch.
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        #if batch % 100 == 0:
            #print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))

    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    # Mean of the per-batch losses over the epoch.
    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Loss 1.1965
Time taken for 1 epoch 244.43194723129272 sec

Epoch 2 Loss 0.6781
Time taken for 1 epoch 212.61538195610046 sec

Epoch 3 Loss 0.4221
Time taken for 1 epoch 211.3305060863495 sec

Epoch 4 Loss 0.2875
Time taken for 1 epoch 217.30458641052246 sec

Epoch 5 Loss 0.2109
Time taken for 1 epoch 211.4978895187378 sec

Epoch 6 Loss 0.1664
Time taken for 1 epoch 217.69576859474182 sec

Epoch 7 Loss 0.1384
Time taken for 1 epoch 211.42755889892578 sec

Epoch 8 Loss 0.1205
Time taken for 1 epoch 217.72517085075378 sec

Epoch 9 Loss 0.1075
Time taken for 1 epoch 211.438138961792 sec

Epoch 10 Loss 0.0981
Time taken for 1 epoch 217.84624600410461 sec

Epoch 11 Loss 0.0915
Time taken for 1 epoch 211.4702718257904 sec

Epoch 12 Loss 0.0853
Time taken for 1 epoch 217.60408401489258 sec

Epoch 13 Loss 0.0817
Time taken for 1 epoch 211.37652850151062 sec

Epoch 14 Loss 0.0783
Time taken for 1 epoch 217.5667209625244 sec

Epoch 15 Loss 0.0764
Time taken for 1 epoch 211.39322113990784 

In [None]:
def evaluate(sentence):
    """Greedily translate `sentence` with the trained encoder and decoder.

    Args:
        sentence: raw text in the input language.

    Returns:
        (result, sentence, attention_plot):
        result is the predicted words, each followed by a space, up to and
        including '<end>' when it is produced; sentence is the preprocessed
        input (unknown words are still present in this string);
        attention_plot is a (max_length_targ, max_length_inp) array holding
        the attention weights of every decoding step, one row per step.
    """
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    sentence = preprocess_sentence(sentence)

    # Words the tokenizer never saw have no id. They are skipped rather than
    # looked up, because the lookup would raise KeyError and abort the whole
    # translation.
    inputs = [inp_lang.word_index[token]
              for token in sentence.split(' ')
              if token in inp_lang.word_index]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    # Batch of one sentence, so the initial state has a single row.
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)

        # storing the attention weights to plot later on
        attention_plot[t] = tf.reshape(attention_weights, (-1, )).numpy()

        # Greedy decoding: take the highest-scoring word at every step.
        predicted_id = tf.argmax(predictions[0]).numpy()
        predicted_word = targ_lang.index_word[predicted_id]

        result += predicted_word + ' '

        if predicted_word == '<end>':
            break

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

In [None]:
def plot_attention(attention, sentence, predicted_sentence):
    """Show the attention weights as a heat map.

    Args:
        attention: 2-D array of weights handed to matshow.
        sentence: list of input tokens used as x tick labels.
        predicted_sentence: list of predicted tokens used as y tick labels.
    """
    fig, ax = plt.subplots(figsize=(10,10))
    heatmap = ax.matshow(attention, cmap='viridis')
    fig.colorbar(heatmap)

    label_style = {'fontsize': 14}

    # Each label list gets a leading empty entry; presumably it absorbs the
    # tick placed before the first cell once ticks sit at every integer.
    ax.set_xticklabels([''] + sentence, fontdict=label_style, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=label_style)

    # One major tick per matrix cell on both axes.
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [None]:
def translate(sentence):
    """Translate `sentence` with evaluate() and print the input and the prediction."""
    # The attention matrix returned by evaluate() is not used here.
    result, sentence, _ = evaluate(sentence)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

In [None]:
# Directory the restore cell reads the latest checkpoint from. The cell used
# to contain an unfinished assignment to the misspelled name `checkpoint_dire`,
# which is a SyntaxError and stops a Restart & Run All at this point.
# Point this at another folder to restore checkpoints saved elsewhere.
checkpoint_dir = './training_checkpoints'

In [None]:
# Load the most recent checkpoint found in checkpoint_dir into the optimizer,
# encoder and decoder tracked by `checkpoint`.
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x788eb956e590>

In [None]:
def filter_words_not_in_dict(sentence, inp_lang):
    """Drop every word of `sentence` that the tokenizer does not know.

    Args:
        sentence: text that is split on whitespace.
        inp_lang: object whose `word_index` holds the known words.

    Returns:
        The remaining words, in their original order and spelling, joined by
        single spaces. The comparison ignores case.
    """
    known_words = set(map(str.lower, inp_lang.word_index))
    return ' '.join(
        token for token in sentence.split() if token.lower() in known_words
    )
# Example: "cav" is not in the vocabulary, so it is removed before translating
# (the recorded translate output shows the sentence without it).
filtered_sentence = filter_words_not_in_dict("this is a test sentence cav not present", inp_lang)


In [None]:
# Translate the sentence after out-of-vocabulary words were filtered out.
translate(filtered_sentence)

Input: <start> this is a test sentence not present <end>
Predicted translation: esta es una prueba esta creas conveniente . <end> 


In [None]:
# Same sentence passed directly, without the filtering step.
translate("This is a test sentence not present")

Input: <start> this is a test sentence not present <end>
Predicted translation: esta es una prueba esta creas conveniente . <end> 


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.models import load_model
from gtts import gTTS
import os
import numpy as np
from sklearn.model_selection import train_test_split
import unicodedata
import re
import io
import time
from flask import Flask, request, render_template,url_for, redirect

# Flask application object; static files are served under the /static URL path.
app = Flask(__name__,static_url_path='/static')

@app.route('/')
def home():
    """Serve the landing page by rendering the audio-only.html template."""
    return render_template("audio-only.html")

@app.route('/predict', methods=['POST'])
def predict():
    if request.method == "POST":
        if 'file' not in request.files:
            return "No file part"
        audioFile = request.files['file']

        directory_path = "/home/dhanush/Documents/ML/MinorProject"
        if not os.path.exists(directory_path):
            os.makedirs(directory_path)

        file_path = os.path.join(directory_path, audioFile.filename)
        audioFile.save(file_path)
        frame_length = 256
        frame_step = 160
        fft_length = 384
        characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "]

        # Mapping characters to integers
        char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")

        # Mapping integers back to original characters
        num_to_char = keras.layers.StringLookup(
        vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
        )
        batch_size = 32
        def decode_batch_predictions(pred):
            input_len = np.ones(pred.shape[0]) * pred.shape[1]
        # Use greedy search. For complex tasks, you can use beam search
            results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
        # Iterate over the results and get back the text
            output_text = []
            for result in results:
                result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
                output_text.append(result)
            return output_text
        def encode_single_sample(wav_file):
        # Process the Audio
        # 1. Read wav file
            file = tf.io.read_file(wav_file)
        # 2. Decode the wav file
            audio, _ = tf.audio.decode_wav(file)
        # 3. Squeeze the tensor along the channel axis
            audio = tf.squeeze(audio, axis=-1)
        # 4. Change type to float
            # if len(audio.shape) > 1:
            # # If there's an extra dimension, select the first channel
            #     audio = audio[:, 0]
            audio = tf.cast(audio, tf.float32)
        # 5. Get the spectrogram
            spectrogram = tf.signal.stft(
                audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length
            )
        # 6. We only need the magnitude, which can be derived by applying tf.abs
            spectrogram = tf.abs(spectrogram)
            spectrogram = tf.math.pow(spectrogram, 0.5)
        # 7. Normalization
            means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
            stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
            spectrogram = (spectrogram - means) / (stddevs + 1e-10)
            return spectrogram
        def CTCLoss(y_true, y_pred):
        # Compute the training-time loss value
            batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
            input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
            label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

            input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
            label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

            loss = keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
            return loss
        def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128):
            """Model similar to DeepSpeech2."""
            # Model's input
            input_spectrogram = layers.Input((None, input_dim), name="input")
            # Expand the dimension to use 2D CNN.
            x = layers.Reshape((-1, input_dim, 1), input_shape=(None, input_dim), name="expand_dim")(input_spectrogram)
            # Convolution layer 1
            x = layers.Conv2D(
                filters=32,
                kernel_size=[11, 41],
                strides=[2, 2],
                padding="same",
                use_bias=False,
                name="conv_1",
            )(x)
            x = layers.BatchNormalization(name="conv_1_bn")(x)
            x = layers.ReLU(name="conv_1_relu")(x)
            # Convolution layer 2
            x = layers.Conv2D(
                filters=32,
                kernel_size=[11, 21],
                strides=[1, 2],
                padding="same",
                use_bias=False,
                name="conv_2",
            )(x)
            x = layers.BatchNormalization(name="conv_2_bn")(x)
            x = layers.ReLU(name="conv_2_relu")(x)
            # Reshape the resulted volume to feed the RNNs layers
            x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)
            # RNN layers
            for i in range(1, rnn_layers + 1):
                recurrent = layers.GRU(
                    units=rnn_units,
                    activation="tanh",
                    recurrent_activation="sigmoid",
                    use_bias=True,
                    return_sequences=True,
                    reset_after=True,
                    name=f"gru_{i}",
                )
                x = layers.Bidirectional(
                    recurrent, name=f"bidirectional_{i}", merge_mode="concat"
                )(x)
                if i < rnn_layers:
                    x = layers.Dropout(rate=0.5)(x)
            # Dense layer
            x = layers.Dense(units=rnn_units * 2, name="dense_1")(x)
            x = layers.ReLU(name="dense_1_relu")(x)
            x = layers.Dropout(rate=0.5)(x)
            # Classification layer
            output = layers.Dense(units=output_dim + 1, activation="softmax")(x)
            # Model
            model = keras.Model(input_spectrogram, output, name="DeepSpeech_2")
            # Optimizer
            opt = keras.optimizers.Adam(learning_rate=1e-4)
            # Compile the model and return
            model.compile(optimizer=opt, loss=CTCLoss)
            return model

        model1 = load_model('speech_recognition_model.h5', custom_objects={'CTCLoss': CTCLoss})

        wav_file=audioFile

        def decode_batch_predictions(pred, beam_width=5):
            input_len = np.ones(pred.shape[0]) * pred.shape[1]
            # Use beam search instead of greedy search
            results = keras.backend.ctc_decode(pred, input_length=input_len, beam_width=beam_width, top_paths=1)[0][0]
            # Iterate over the results and get back the text
            output_text = []
            for result in results:
                result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
                output_text.append(result)
            return output_text
        X =encode_single_sample(file_path)
        X = tf.expand_dims(X, axis=0)
        batch_predictions = model1.predict(X)
        batch_predictions = decode_batch_predictions(batch_predictions)
        # print(batch_predictions)
        path_to_file ='spa.txt'
        def unicode_to_ascii(s):
            return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
        def preprocess_sentence(w):
            w = unicode_to_ascii(w.lower().strip())
        #w = w.lower().strip()

        # creating a space between a word and the punctuation following it
        # eg: "he is a boy." => "he is a boy ."
            w = re.sub(r"([?.!,¿])", r" \1 ", w)
            w = re.sub(r'[" "]+', " ", w)

            # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
            #w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

            w = w.lstrip().strip()

            # adding a start and an end token to the sentence
            # so that the model know when to start and stop predicting.
            w = '<start> ' + w + ' <end>'
            return w
        def create_dataset(path, num_examples):
        #lines = io.open('hin.txt', encoding='UTF-8').read().split('\n')
        #lines = lines.strip().split('\n')
        #lines = io.open(path, encoding='UTF-8').readlines().strip().split('\n')
            lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
            word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]

            return zip(*word_pairs)
        def max_length(tensor):
            return max(len(t) for t in tensor)
        def tokenize(lang):
            lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
            lang_tokenizer.fit_on_texts(lang)

            tensor = lang_tokenizer.texts_to_sequences(lang)

            tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

            return tensor, lang_tokenizer
        def load_dataset(path, num_examples=None):
        # creating cleaned input, output pairs
            inp_lang, targ_lang, _ = create_dataset(path, num_examples)

            input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
            target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

            return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer
        num_examples = 3000
        input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)

        # Calculate max_length of the target tensors
        max_length_inp, max_length_targ = max_length(input_tensor), max_length(target_tensor)

        input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor,
                                                                                                    target_tensor,
                                                                                                    test_size=0.2)
        def convert(lang, tensor):
            for t in tensor:
                if t!=0:
                    print ("%d ----> %s" % (t, lang.index_word[t]))
        BUFFER_SIZE = len(input_tensor_train)
        BATCH_SIZE = 32
        steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
        embedding_dim = 256
        units = 1024
        vocab_inp_size = len(inp_lang.word_index)+1
        vocab_tar_size = len(targ_lang.word_index)+1

        dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
        dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
        example_input_batch, example_target_batch = next(iter(dataset))
        class Encoder(tf.keras.Model):
            def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
                super(Encoder, self).__init__()
                self.batch_sz = batch_sz
                self.enc_units = enc_units
                self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
                self.gru = tf.keras.layers.GRU(self.enc_units, return_sequences=True,
                                                return_state=True, recurrent_initializer='glorot_uniform')

            def call(self, x, hidden):
                x = self.embedding(x)
                output, state = self.gru(x, initial_state = hidden)
                return output, state

            def initialize_hidden_state(self):
                return tf.zeros((self.batch_sz, self.enc_units))
        encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
        class BahdanauAttention(tf.keras.Model):
            def __init__(self, units):
                super(BahdanauAttention, self).__init__()
                self.W1 = tf.keras.layers.Dense(units)
                self.W2 = tf.keras.layers.Dense(units)
                self.V = tf.keras.layers.Dense(1)

            def call(self, query, values):
                # hidden shape == (batch_size, hidden size)
                # hidden_with_time_axis shape == (batch_size, 1, hidden size)
                # we are doing this to perform addition to calculate the score
                hidden_with_time_axis = tf.expand_dims(query, 1)

                # score shape == (batch_size, max_length, 1)
                # we get 1 at the last axis because we are applying score to self.V
                # the shape of the tensor before applying self.V is (batch_size, max_length, units)
                score = self.V(tf.nn.tanh(
                    self.W1(values) + self.W2(hidden_with_time_axis)))

                # attention_weights shape == (batch_size, max_length, 1)
                attention_weights = tf.nn.softmax(score, axis=1)

                # context_vector shape after sum == (batch_size, hidden_size)
                context_vector = attention_weights * values
                context_vector = tf.reduce_sum(context_vector, axis=1)
                return context_vector, attention_weights
        class Decoder(tf.keras.Model):
            """Single-step attention decoder: embedding -> attention context -> GRU -> Dense.

            Each call consumes one batch of input token ids plus the previous hidden
            state and the encoder outputs, and returns vocabulary logits for that step.
            The attribute names (embedding, gru, fc, attention) are part of the object
            graph handed to tf.train.Checkpoint further down in this file.
            """

            def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
                """Create the layers.

                Args:
                    vocab_size: size of the embedding table and of the output Dense layer.
                    embedding_dim: width of the token embeddings.
                    dec_units: number of GRU units; also passed to BahdanauAttention.
                    batch_sz: stored on the instance; not read anywhere inside this class.
                """
                super(Decoder, self).__init__()
                self.batch_sz = batch_sz
                self.dec_units = dec_units
                self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
                self.gru = tf.keras.layers.GRU(self.dec_units, return_sequences=True,
                                                return_state=True, recurrent_initializer='glorot_uniform')
                # No activation argument is given, so this layer's output is treated as logits.
                self.fc = tf.keras.layers.Dense(vocab_size)

                # used for attention
                self.attention = BahdanauAttention(self.dec_units)

            def call(self, x, hidden, enc_output):
                """Run one decoding step.

                Args:
                    x: token ids for the current step (callers in this file pass a
                        tensor with a trailing length-1 time axis).
                    hidden: previous decoder state, used as the attention query.
                    enc_output: encoder outputs that the attention layer attends over.

                Returns:
                    (x, state, attention_weights): the Dense-layer output for this step,
                    the state returned by the GRU, and the attention weights.
                """
                # enc_output shape == (batch_size, max_length, hidden_size)
                context_vector, attention_weights = self.attention(hidden, enc_output)

                # x shape after passing through embedding == (batch_size, 1, embedding_dim)
                x = self.embedding(x)

                # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
                x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

                # passing the concatenated vector to the GRU
                # NOTE: the GRU is called without an initial_state argument, so `hidden`
                # reaches the GRU only through the attention context computed above.
                output, state = self.gru(x)

                # output shape == (batch_size * 1, hidden_size)
                output = tf.reshape(output, (-1, output.shape[2]))

                # output shape == (batch_size, vocab)
                x = self.fc(output)
                return x, state, attention_weights

        # Build the decoder from sizes defined earlier in this file.
        decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

        optimizer = tf.keras.optimizers.Adam()
        # from_logits=True: the decoder's final Dense layer is created without an activation.
        # reduction='none': the loss is not reduced here, so loss_function can zero out
        # padded positions before it averages.
        loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

        def loss_function(real, pred):
            """Cross-entropy for one decoding step with padded targets zeroed out.

            Args:
                real: target token ids for this step; id 0 is treated as padding.
                pred: the decoder's predictions for the same step.

            Returns:
                The mean of the masked losses. The mean runs over every entry,
                including the ones that were zeroed by the mask.
            """
            per_example = loss_object(real, pred)

            # 1 where the target is a real token, 0 where it is padding (id 0).
            keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_example.dtype)

            return tf.reduce_mean(per_example * keep)
        # Checkpoint location. checkpoint_dir is assigned again further down,
        # just before the restore call.
        checkpoint_dir = './training_checkpoints'
        # Path prefix for checkpoint files inside checkpoint_dir.
        checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
        # Track the optimizer and both models under these attribute names.
        checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)
        @tf.function
        def train_step(inp, targ, enc_hidden):
            """Run one teacher-forced training step and apply the gradients.

            Args:
                inp: batch of input sequences fed to the encoder.
                targ: batch of target sequences, indexed as targ[:, t] per time step.
                enc_hidden: initial hidden state passed to the encoder.

            Returns:
                The summed per-step loss divided by targ.shape[1].
            """
            loss  = 0

            with tf.GradientTape() as tape:
                enc_output, enc_hidden = encoder(inp, enc_hidden)

                # The decoder starts from the encoder's final state.
                dec_hidden = enc_hidden

                # One '<start>' id per example.
                # NOTE(review): this assumes every batch has exactly BATCH_SIZE rows - confirm.
                dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

                # Teacher forcing - feeding the target as the next input
                for t in range(1, targ.shape[1]):
                    # passing enc_output to the decoder
                    predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

                    loss += loss_function(targ[:, t], predictions)

                    # using teacher forcing
                    dec_input = tf.expand_dims(targ[:, t], 1)

            # Reported value only: the divisor is targ.shape[1], while the loop above
            # runs targ.shape[1] - 1 steps.
            batch_loss = (loss / int(targ.shape[1]))

            variables = encoder.trainable_variables + decoder.trainable_variables

            # Gradients are taken with respect to the summed loss, not batch_loss.
            gradients = tape.gradient(loss, variables)

            optimizer.apply_gradients(zip(gradients, variables))

            return batch_loss
        def evaluate(sentence):
            """Translate one sentence by greedy decoding.

            Args:
                sentence: raw input text; it is passed through preprocess_sentence first.

            Returns:
                (result, sentence, attention_plot), where result is the predicted
                words, each followed by a space (including '<end>' when it is
                produced), sentence is the preprocessed input, and attention_plot
                is a (max_length_targ, max_length_inp) array holding the attention
                weights of each decoding step.

            Raises:
                KeyError: if a token of the preprocessed sentence is missing from
                    inp_lang.word_index.
            """
            attention_plot = np.zeros((max_length_targ, max_length_inp))

            sentence = preprocess_sentence(sentence)

            # split() with no argument collapses runs of whitespace. The previous
            # split(' ') produced an empty-string token whenever the preprocessed
            # text contained two consecutive spaces (for example when the input is
            # empty and only the start/end markers remain), which then failed the
            # vocabulary lookup with KeyError('').
            inputs = [inp_lang.word_index[token] for token in sentence.split()]
            inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')
            inputs = tf.convert_to_tensor(inputs)

            result = ''

            # Batch of one, so the initial encoder state is a single row of zeros.
            hidden = [tf.zeros((1, units))]
            enc_out, enc_hidden = encoder(inputs, hidden)

            dec_hidden = enc_hidden
            dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

            for t in range(max_length_targ):
                predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)

                # storing the attention weights to plot later on
                attention_weights = tf.reshape(attention_weights, (-1, ))
                attention_plot[t] = attention_weights.numpy()

                # Greedy choice: take the highest-scoring id at this step.
                predicted_id = tf.argmax(predictions[0]).numpy()
                predicted_word = targ_lang.index_word[predicted_id]

                result += predicted_word + ' '

                if predicted_word == '<end>':
                    return result, sentence, attention_plot

                # the predicted ID is fed back into the model
                dec_input = tf.expand_dims([predicted_id], 0)

            return result, sentence, attention_plot
        def translate(sentence):
            """Return only the translated text that evaluate() produces for `sentence`."""
            translated_text, _preprocessed, _attention = evaluate(sentence)
            return translated_text
        # Directory that is searched for the most recent checkpoint.
        checkpoint_dir='training_checkpoints'

        # Load the newest checkpoint into the optimizer/encoder/decoder tracked by `checkpoint`.
        # NOTE(review): the return value of latest_checkpoint is not checked here. If no
        # checkpoint is found the models presumably keep their initial weights - confirm.
        checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
        def filter_words_not_in_dict(sentence, inp_lang):
            """Keep only the words of `sentence` that appear in the input vocabulary.

            The comparison is case-insensitive; the words that are kept retain their
            original casing and order and are re-joined with single spaces.

            Args:
                sentence: whitespace-separated text.
                inp_lang: object exposing a `word_index` mapping keyed by vocabulary word.

            Returns:
                The filtered sentence (an empty string when nothing matches).
            """
            known_words = set()
            for vocab_word in inp_lang.word_index:
                known_words.add(vocab_word.lower())

            kept = []
            for token in sentence.split():
                if token.lower() in known_words:
                    kept.append(token)

            return ' '.join(kept)
        # Drop words the input vocabulary does not contain, since evaluate() looks each
        # token up in inp_lang.word_index. batch_predictions is defined earlier in the
        # file (presumably the recognised text - confirm).
        filtered_sentence = filter_words_not_in_dict(batch_predictions[0], inp_lang)
        # Translated text; it is passed to text_to_sound below.
        output=translate(filtered_sentence)


        def text_to_sound(text, language='hi', filename='output.mp3'):
            """Synthesize `text` with gTTS and save the audio to `filename`.

            Args:
                text: the text to speak.
                language: language code handed to gTTS as `lang`.
                filename: destination path for the saved audio.
            """
            # Build the gTTS object and save the speech as an MP3 file in one go.
            gTTS(text=text, lang=language, slow=False).save(filename)
        # Uses the default arguments of text_to_sound: language 'hi', file 'output.mp3'.
        text_to_sound(output)
    # Message shown by the template through its `pred` variable.
    zrr= "Output is here:"
    return render_template("audio-only.html", pred=zrr)

if __name__ == "__main__":
    # Flask's debug mode turns on the interactive debugger, which lets anyone who
    # can reach the server execute code. This app accepts uploads, so debug mode
    # is now opt-in: start with FLASK_DEBUG=1 to get the previous behaviour.
    import os

    app.run(debug=os.environ.get("FLASK_DEBUG", "0") == "1")

In [None]:
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Audio-only Example - Record Plugin for Video.js</title>

    <!-- Existing CSS links -->
    <!-- <link href="../node_modules/video.js/dist/video-js.min.css" rel="stylesheet">
    <link href="../node_modules/videojs-wavesurfer/dist/css/videojs.wavesurfer.min.css" rel="stylesheet">
    <link href="../dist/css/videojs.record.css" rel="stylesheet">
    <link href="assets/css/examples.css" rel="stylesheet"> -->

    <link rel="stylesheet" href="{{ url_for('static', filename='css/video-js.min.css') }}">
    <link rel="stylesheet" href="{{ url_for('static', filename='css/videojs.wavesurfer.min.css') }}">

    <link rel="stylesheet" href="{{ url_for('static', filename='css/videojs.record.css') }}">
    <link rel="stylesheet" href="{{ url_for('static', filename='css/examples.css') }}">




    <!-- Additional CSS for the players and heading -->
    <style>
        /* Page shell: full-viewport flex column with centred content. */
        body {
            margin: 0;
            padding: 0;
            /* Resolved through url_for, like the other static assets in this
               template, so the image does not depend on the page's own URL. */
            background-image: url("{{ url_for('static', filename='Images/pexels-pixabay-355747.jpg') }}");
            background-size: cover;
            background-position: center;
            font-family: Arial, Helvetica, sans-serif;
            height: 100vh;
            display: flex;
            flex-direction: column;
            align-items: center;
            justify-content: center;
        }

        #heading {
            color: black;
            font-size: 4em; /* Increased size by 2 times */
            margin-bottom: 20px;
        }

        /* Shared geometry for the three audio players. */
        #myAudio,
        #voicePlayer,
        #audioPlayer {
            margin-top: 25px;
            margin-bottom: 65px;
            width: 50%;
            z-index: 2;
            background-color: rgba(255, 131, 0, 0);
        }

        /* Add the following styles to darken the play button and player bar */
        .vjs-button-control.vjs-play-control:before {
            color: #333; /* Darkened color (replace with your preferred color) */
        }

        #myAudio .vjs-play-progress,
        #myAudio .vjs-volume-level {
            background-color: #050; /* Darkened color (replace with your preferred color) */
        }

        #myAudio .vjs-control-bar {
            background-color: #333; /* Darkened color (replace with your preferred color) */
        }

        /* Upload controls. */
        #button-container {
            display: flex;
        }

        #chooseFileLabel {
            background-color: black;
            color: white;
            font-size: 1.3em;
            padding: 7px 8px;
            border: none;
            cursor: pointer;
            border-radius: 35px;
            margin-left: 0; /* Adjusted to shift the button to the left */
        }

        #fileDisplay {
            color: black;
            font-size: 1.5em;
            font-weight: bold; /* Added to make text bolder */
            padding: 10px;
            margin-left: 10px;
        }

        #upload {
            background-color: black;
            color: white;
            font-size: 1.3em;
            padding: 7px 8px;
            border: none;
            cursor: pointer;
            border-radius: 35px;
        }

        /* The native file input is hidden; #chooseFileLabel is its visible trigger. */
        input[type="file"] {
            display: none;
        }
    </style>

    <!-- Existing JavaScript links -->
    <!-- <script src="../node_modules/video.js/dist/video.min.js"></script>
    <script src="../node_modules/recordrtc/RecordRTC.js"></script>
    <script src="../node_modules/webrtc-adapter/out/adapter.js"></script>
    <script src="../node_modules/wavesurfer.js/dist/wavesurfer.min.js"></script>
    <script src="../node_modules/wavesurfer.js/dist/plugin/wavesurfer.microphone.min.js"></script>
    <script src="../node_modules/videojs-wavesurfer/dist/videojs.wavesurfer.min.js"></script>

    <script src="../dist/videojs.record.js"></script>

    <script src="browser-workarounds.js"></script> -->
    <script src="{{ url_for('static', filename='video.js/dist/video.min.js') }}"></script>
    <script src="{{ url_for('static', filename='recordrtc/RecordRTC.js') }}"></script>
    <script src="{{ url_for('static', filename='webrtc-adapter/out/adapter.js') }}"></script>
    <script src="{{ url_for('static', filename='wavesurfer.js/dist/wavesurfer.min.js') }}"></script>

    <script src="{{ url_for('static', filename='wavesurfer.js/dist/plugin/wavesurfer.microphone.min.js') }}"></script>
    <script src="{{ url_for('static', filename='videojs-wavesurfer/dist/videojs.wavesurfer.min.js') }}"></script>
    <script src="{{ url_for('static', filename='videojs.record.js') }}"></script>


    <script src="{{ url_for('static', filename='browser-workarounds.js') }}"></script>


    <!-- Existing Style -->
    <style>
        /* Change player background color */
        #myAudio {
            /* The rgba() function name was missing, which made the declaration
               invalid, so browsers ignored it. */
            background-color: rgba(0, 0, 0, 0.5);
        }
    </style>
</head>
<body>

<!-- Aesthetic Heading -->
<h1 id="heading">Voice Translation System</h1>

<audio id="myAudio" class="video-js vjs-default-skin"></audio>
<!-- <audio id="audioPlayer" controls class="video-js vjs-default-skin"></audio> -->

<audio id="voicePlayer" class="video-js vjs-default-skin"></audio>

<!-- Upload form: posts the chosen file as multipart/form-data (field name "file")
     to the `predict` endpoint. The native file input is hidden by the page CSS, so
     the label is its visible trigger; the inline script at the bottom of the page
     writes the selected file name into #fileDisplay. -->
<div id="button-container">
    <form method="POST" action="{{url_for('predict')}}" enctype="multipart/form-data">
        <label for="fileInput" id="chooseFileLabel">Choose file</label>
        <div class="file-display" id="fileDisplay"></div>
        <input id="fileInput" name="file" type="file">
        <button id="upload" type="submit" value = "predict">Upload</button>
        <!-- Status message supplied by the view through the `pred` template variable. -->
        <h4 style="color: antiquewhite; position:absolute; top: 30%; left: 50%; font-size: 1cm">{{ pred }}</h4>
    </form>
</div>
<audio id="audioPlayer" controls class="video-js vjs-default-skin"></audio>

<script>
    /* eslint-disable */

    // Location of the synthesized translation audio, used by both players below.
    // NOTE(review): this is an absolute filesystem path from a developer machine.
    // A browser will request it from the web server as a URL, so it only works if
    // the server actually serves that path - confirm and replace with a served URL.
    var OUTPUT_AUDIO_URL = "/home/dhanush/Documents/ML/MinorProject/output.mp3";

    // video.js player configuration: audio-only recording with a live waveform.
    var options = {
        controls: true,
        bigPlayButton: false,
        width: 600,
        height: 300,
        fluid: false,
        plugins: {
            wavesurfer: {
                backend: 'WebAudio',
                waveColor: '#36393b',
                progressColor: 'black',
                displayMilliseconds: true,
                debug: true,
                cursorWidth: 1,
                hideScrollbar: true,
                plugins: [
                    // enable microphone plugin
                    WaveSurfer.microphone.create({
                        bufferSize: 4096,
                        numberOfInputChannels: 1,
                        numberOfOutputChannels: 1,
                        constraints: {
                            video: false,
                            audio: true
                        }
                    })
                ]
            },
            record: {
                audio: true,
                video: false,
                maxLength: 6,
                displayMilliseconds: true,
                debug: true
            }
        }
    };

    // apply audio workarounds for certain browsers
    applyAudioWorkaround();

    // create player
    var player = videojs('myAudio', options, function() {
        // print version information at startup
        var msg = 'Using video.js ' + videojs.VERSION +
            ' with videojs-record ' + videojs.getPluginVersion('record') +
            ', videojs-wavesurfer ' + videojs.getPluginVersion('wavesurfer') +
            ', wavesurfer.js ' + WaveSurfer.VERSION + ' and recordrtc ' +
            RecordRTC.version;
        videojs.log(msg);
    });

    // Mirror the name of the chosen upload next to the "Choose file" label.
    var fileInput = document.getElementById('fileInput');
    var fileDisplay = document.getElementById('fileDisplay');

    fileInput.addEventListener('change', function () {
        // Show the first selected file name, or clear the display if none is selected
        if (fileInput.files.length > 0) {
            fileDisplay.textContent = fileInput.files[0].name;
        } else {
            fileDisplay.textContent = '';
        }
    });

    // error handling
    player.on('deviceError', function() {
        console.log('device error:', player.deviceErrorCode);
    });

    player.on('error', function(element, error) {
        console.error(error);
    });

    // user clicked the record button and started recording
    player.on('startRecord', function() {
        console.log('started recording!');
    });

    // user completed recording and stream is available
    // (the two former finishRecord handlers are merged; the order of effects is unchanged)
    player.on('finishRecord', function () {
        // the blob object contains the recorded data that
        // can be downloaded by the user, stored on the server, etc.
        console.log('finished recording: ', player.recordedData);

        // Only a Blob can be turned into a download
        if (!(player.recordedData instanceof Blob)) {
            console.log('No valid recorded data available.');
            return;
        }

        var url = URL.createObjectURL(player.recordedData);

        // Create a link element and trigger a download
        var a = document.createElement('a');
        a.href = url;
        a.download = 'recorded-audio.wav'; // You can change the filename and extension
        a.click();

        // Release the object URL after the download
        URL.revokeObjectURL(url);
    });

    // Create a new instance of AudioContext for the voice player
    var audioContext = new (window.AudioContext || window.webkitAudioContext)();

    // Audio element for the voice player
    var voicePlayer = document.getElementById('voicePlayer');

    // Fetch and decode the translation audio, then play it through the context
    fetch(OUTPUT_AUDIO_URL)
        .then(response => response.arrayBuffer())
        .then(buffer => audioContext.decodeAudioData(buffer))
        .then(decodedData => {
            var source = audioContext.createBufferSource();
            source.buffer = decodedData;
            source.connect(audioContext.destination);

            // One destination node feeds the element. Previously srcObject was
            // given the stream of one freshly created destination while the
            // source was connected to a second, unrelated one, so the element
            // was attached to a stream that never received any audio.
            var streamDestination = audioContext.createMediaStreamDestination();
            source.connect(streamDestination);
            voicePlayer.srcObject = streamDestination.stream;

            source.start();
        })
        .catch(error => console.error('Error loading WAV file:', error));

    // Point the plain audio player at the same file
    document.getElementById('audioPlayer').src = OUTPUT_AUDIO_URL;
</script>
</body>
</html>