# Speech-to-Text Converter


<strong>Speech Recognition</strong> involves converting spoken language into text. This is typically done using a combination of signal processing and machine learning models.

<h4>Key Steps:</h4>

1. Feature Extraction: Extract features from the audio signal, such as Mel-Frequency Cepstral Coefficients (MFCCs).
2. Modeling: Train a neural network (e.g., RNN, LSTM) to map these features to text.
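3. Decoding: Convert the network's per-frame output probabilities into readable text, for example by taking the most likely symbol at each frame, collapsing repeated symbols, and removing blanks (greedy CTC decoding).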


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np
import librosa

# Load dataset (LibriSpeech, etc.)
# Assume data is loaded and preprocessed into (features, labels) format

# Output vocabulary: the 26 lowercase letters followed by the space character.
characters = list('abcdefghijklmnopqrstuvwxyz ')

# The blank token is assumed to sit at the first index past the vocabulary,
# i.e. it is the last class index.
blank_token = len(characters)

# Feature extraction (MFCCs)
def extract_features(audio_file):
    """Load an audio file and return its MFCC features, one row per frame.

    Args:
        audio_file: Path of the audio file to read.

    Returns:
        The transposed MFCC matrix, so time frames run along axis 0 and the
        13 coefficients along axis 1 (matching the model's ``(None, 13)``
        input).
    """
    # Resample to 16 kHz on load so every file shares one sample rate.
    audio, sr = librosa.load(audio_file, sr=16000)
    # Pass the signal by keyword: librosa >= 0.10 makes `y` keyword-only and
    # raises TypeError on a positional signal; older releases accept `y=` too.
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    return mfcc.T

# Function to encode text labels to integers
def encode_text(text):
    """Translate *text* into a list of vocabulary indices.

    Each character is replaced by its position in the module-level
    ``characters`` list.  A character that is not in the vocabulary makes
    ``list.index`` raise ``ValueError``.
    """
    return list(map(characters.index, text))

# For illustration, assuming the dataset is already loaded

# Number of synthetic utterances. Features and labels are both derived from
# this single value so the two lists can never drift apart in length.
num_samples = 100

# Generate random MFCC features for every sample, each with a varying number of time steps
x_train = [np.random.rand(np.random.randint(80, 120), 13) for _ in range(num_samples)]

# Generate one text label per sample: cycle through a small word list and
# finish with "end".  (The earlier `words * 14 + ["end"]` expression produced
# only 99 labels for 100 feature arrays.)
label_words = ["hello", "world", "tensorflow", "speech", "recognition", "model", "example"]
text_labels = [label_words[i % len(label_words)] for i in range(num_samples - 1)] + ["end"]
y_train = [encode_text(label) for label in text_labels]

# True (unpadded) length of every feature sequence and every label sequence
input_length = np.array([x.shape[0] for x in x_train])
label_length = np.array([len(y) for y in y_train])

# Padding y_train to match the longest label length for training.
# Padding positions hold 0; `label_length` records how many entries are real.
max_label_length = max(label_length)
padded_y_train = np.zeros((len(y_train), max_label_length))
for i, label in enumerate(y_train):
    padded_y_train[i, :len(label)] = label


# Model architecture for speech-to-text
# CTC scores a per-frame distribution over the vocabulary plus one extra
# "blank" class.  Every recurrent layer therefore keeps the time axis
# (return_sequences=True) and the output layer has len(characters) + 1 units,
# the last of which is the blank (index == len(characters)).
model = models.Sequential([
    layers.Input(shape=(None, 13)),  # 13 MFCC features
    layers.LSTM(128, return_sequences=True),
    layers.LSTM(128, return_sequences=True),
    layers.Dense(len(characters) + 1, activation='softmax'),
])


# CTC Loss for speech recognition
def ctc_loss(y_true, y_pred):
    """Compute the per-sample CTC loss for one batch.

    Keras only hands a loss function ``(y_true, y_pred)``, so the per-sample
    sequence lengths travel inside ``y_true`` and are split off here.  Using
    dataset-wide length arrays instead would not line up with the batch.

    Args:
        y_true: Tensor of shape (batch, max_label_length + 2).  The leading
            columns are the padded label indices; the second-to-last column
            is the true label length and the last column is the true number
            of input frames.
        y_pred: Softmax output of the model, shape (batch, time, classes).

    Returns:
        The tensor returned by ``ctc_batch_cost`` for this batch.
    """
    labels = tf.cast(y_true[:, :-2], tf.int32)
    # Slices (not indices) keep the (batch, 1) shape ctc_batch_cost expects.
    batch_label_length = tf.cast(y_true[:, -2:-1], tf.int32)
    batch_input_length = tf.cast(y_true[:, -1:], tf.int32)
    return tf.keras.backend.ctc_batch_cost(
        labels, y_pred, batch_input_length, batch_label_length
    )


# Compile model
model.compile(optimizer='adam', loss=ctc_loss)

# A ragged object array cannot be converted to a tensor, so zero-pad the
# variable-length feature sequences into one dense float32 array.
padded_x_train = np.zeros(
    (len(x_train), int(input_length.max()), 13), dtype=np.float32
)
for i, sample_features in enumerate(x_train):
    padded_x_train[i, :sample_features.shape[0]] = sample_features

# Append [label_length, input_length] to every padded label row so the loss
# sees the lengths belonging to exactly the samples in its batch.
ctc_targets = np.concatenate(
    [padded_y_train, label_length.reshape(-1, 1), input_length.reshape(-1, 1)],
    axis=1,
).astype(np.float32)

# Train model
model.fit(padded_x_train, ctc_targets, epochs=10, batch_size=32)

# Inference (Converting speech to text)
# Function to decode predictions into text
def decode_predictions(pred):
    """Greedily decode the first sample of a prediction batch into text.

    The most likely class is taken at every position of ``pred[0]``.  A
    position contributes a character only when its class differs from the
    class at the previous position and is not ``blank_token``; the position
    before the first one is treated as blank.

    Args:
        pred: Array of class scores whose last axis is the class axis and
            whose first axis is the batch axis.

    Returns:
        The decoded string for the first batch entry.
    """
    best_path = list(np.argmax(pred, axis=-1)[0])
    # Pair every frame with its predecessor; the first frame is preceded by a blank.
    previous = [blank_token] + best_path[:-1]
    return ''.join(
        characters[current]
        for current, before in zip(best_path, previous)
        if not (current == before or current == blank_token)
    )

# Transcribe a single recording: MFCC features -> model -> greedy decoding
audio_file = 'assest/output.wav'
mfcc = extract_features(audio_file)
# The model works on batches, so present the one utterance as a batch of one.
predictions = model.predict(mfcc[np.newaxis, ...])
text = decode_predictions(predictions)
print(text)
