# End to End Speech Translation

In [1]:
import os
import librosa
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers




# Listing the Parallel Audio Dataset

In [2]:
def folder_to_audio_files(folder_path):
    """Return the paths of the WAV files located directly inside a folder.

    Args:
        folder_path: Directory to scan. Sub-directories are not searched.

    Returns:
        A list of ``os.path.join(folder_path, name)`` paths, ordered by file
        name, for every regular file whose name ends in ``.wav``
        (case-insensitive).

    Raises:
        OSError: Propagated from ``os.listdir`` when the folder cannot be read.
    """
    audio_files = []
    # os.listdir yields entries in arbitrary order; sorting makes the result
    # reproducible, so two folders with matching file names produce lists
    # whose i-th entries correspond to each other.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith('.wav'):
            continue
        file_path = os.path.join(folder_path, filename)
        # Skip directories that merely happen to be named "*.wav".
        if os.path.isfile(file_path):
            audio_files.append(file_path)
    return audio_files

# Folders holding the two sides of the dataset; replace with your own paths.
source_folder_path = 'Source'
target_folder_path = 'target'

# Scan each folder in turn (source first, then target) and keep the lists of
# WAV file paths found in them.
source_audio_files, target_audio_files = map(
    folder_to_audio_files,
    (source_folder_path, target_folder_path),
)


# Data Preprocessing 
Preprocesses audio data from WAV files for the model.

**Args:**

- `wav_files`: A list of paths to WAV files (source and target).
- `sr`: Sampling rate (default: 16000 Hz).
- `n_mels`: Number of mel filterbanks (default: 128).
- `seq_len`: Fixed sequence length in spectrogram frames; longer clips are trimmed and shorter ones are zero-padded (default: 16).

**Returns:**

- `source_mels`: A NumPy array of preprocessed mel spectrograms for source audio (shape: `(num_samples, seq_len, n_mels)`).
- `target_mels`: A NumPy array of preprocessed mel spectrograms for target audio (shape: `(num_samples, seq_len, n_mels)`).

In [3]:
def preprocess_data(wav_files, sr=16000, n_mels=128, seq_len=16):
    """Convert WAV files into fixed-length, time-major log-mel spectrograms.

    Args:
        wav_files: Iterable of paths to WAV files.
        sr: Sampling rate passed to ``librosa.load`` and to the mel filterbank.
        n_mels: Number of mel filterbanks.
        seq_len: Number of spectrogram frames kept per file. Longer clips are
            trimmed; shorter ones are padded with zeros at the end.

    Returns:
        A ``(source_mels, target_mels)`` tuple. Both names refer to the SAME
        float32 array of shape ``(num_files, seq_len, n_mels)``, with rows in
        the order of ``wav_files``. To obtain distinct source and target
        arrays, call this function once per file list.
    """
    mels = []

    for filename in wav_files:
        # Load the audio and compute its mel power spectrogram.
        y, _ = librosa.load(filename, sr=sr)
        mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
        # Convert power to a decibel (log) scale; this is not a normalization.
        mel_spectrogram = librosa.power_to_db(mel_spectrogram).astype(np.float32)

        # The frame axis is axis 1 here: pad or trim it to exactly seq_len.
        n_frames = mel_spectrogram.shape[1]
        if n_frames > seq_len:
            mel_spectrogram = mel_spectrogram[:, :seq_len]
        else:
            pad_width = ((0, 0), (0, seq_len - n_frames))
            mel_spectrogram = np.pad(mel_spectrogram, pad_width, mode='constant')

        # Transpose to (seq_len, n_mels) so that the stacked result has the
        # documented (num_files, seq_len, n_mels) layout.
        mels.append(mel_spectrogram.T)

    if mels:
        source_mels = np.stack(mels)
    else:
        # Keep a well-formed 3-D array for empty input so that downstream
        # code indexing shape[1] / shape[2] does not fail.
        source_mels = np.zeros((0, seq_len, n_mels), dtype=np.float32)

    # Kept for interface compatibility: the second element aliases the first.
    target_mels = source_mels

    return source_mels, target_mels

In [4]:
from sklearn.model_selection import train_test_split

# Preprocess each side of the parallel corpus separately. Feeding both file
# lists through a single call makes inputs and targets the same array, so the
# model could only learn to reproduce its input instead of translating.
# Pairing is by sorted file path: this assumes the i-th source file and the
# i-th target file are the same utterance -- verify against your dataset.
source_mels = preprocess_data(sorted(source_audio_files))[0]
target_mels = preprocess_data(sorted(target_audio_files))[0]

if len(source_mels) != len(target_mels):
    raise ValueError(
        f"Parallel dataset mismatch: {len(source_mels)} source files vs "
        f"{len(target_mels)} target files"
    )

# Split into training and validation sets; source/target pairs stay aligned
# because both arrays are shuffled with the same indices.
train_mels, val_mels, train_targets, val_targets = train_test_split(
    source_mels, target_mels, test_size=0.2, random_state=42
)

# Append a trailing channel axis of size 1 (the batch axis is already axis 0).
train_mels = train_mels[..., np.newaxis]
val_mels = val_mels[..., np.newaxis]
train_targets = train_targets[..., np.newaxis]
val_targets = val_targets[..., np.newaxis]

In [5]:
class EncoderDecoder(keras.Model):
    """Convolutional-recurrent encoder-decoder for spectrogram-to-spectrogram mapping.

    Input is a 4-D tensor ``(batch, time_steps, features, channels)``. The
    output has shape ``(batch, time_steps, embedding_dim, 1)``, so training
    targets must use ``embedding_dim`` as their feature axis (axis 2) and a
    single channel.

    Args:
        embedding_dim: Length of the feature vector produced at every time step.
        latent_dim: Accepted for interface compatibility; no layer uses it.
        rnn_units: Number of units in both the encoder and the decoder LSTM.
    """

    def __init__(self, embedding_dim, latent_dim, rnn_units):
        super().__init__()

        # Encoder
        self.encoder = keras.Sequential([
            # Convolve and pool along the feature axis, independently per time step.
            layers.TimeDistributed(layers.Conv1D(filters=16, kernel_size=3, activation='relu', padding='same')),
            layers.TimeDistributed(layers.MaxPooling1D(pool_size=2)),
            # LSTM only accepts (batch, time, features): collapse each step's
            # (length, filters) map into a flat vector first.
            layers.TimeDistributed(layers.Flatten()),
            layers.LSTM(units=rnn_units, return_sequences=True),
        ])

        # Decoder
        self.decoder = keras.Sequential([
            layers.LSTM(units=rnn_units, return_sequences=True),
            layers.TimeDistributed(layers.Dense(embedding_dim, activation='relu')),
            # Conv1D needs a channel axis: (embedding_dim,) -> (embedding_dim, 1) per step.
            layers.TimeDistributed(layers.Reshape((embedding_dim, 1))),
            layers.TimeDistributed(layers.Conv1D(filters=1, kernel_size=3, activation='linear', padding='same')),
        ])

    def call(self, inputs, training=None):
        """Encode ``inputs`` and decode them back into a spectrogram-shaped tensor."""
        # Forward the training flag so any mode-dependent layer behaves correctly.
        encoded = self.encoder(inputs, training=training)
        return self.decoder(encoded, training=training)


In [None]:
# Instantiate the network, naming each hyper-parameter explicitly.
model = EncoderDecoder(embedding_dim=128, latent_dim=64, rnn_units=256)

# Regress spectrogram values: optimise MSE and report MAE alongside it.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mean_absolute_error'],
)

# Train, evaluating on the held-out validation split after every epoch.
history = model.fit(
    x=train_mels,
    y=train_targets,
    batch_size=32,
    epochs=100,
    validation_data=(val_mels, val_targets),
)