In [1]:
import os
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, TimeDistributed, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
def list_wav_files(directory):
    wav_files = [file for file in os.listdir(directory) if file.endswith('.wav')]
    text_files = [file for file in os.listdir(directory) if file.endswith('.txt')]
    return wav_files, text_files


data_dir = 'dataset'
wav_files, text_files = list_wav_files(data_dir)
print(wav_files, text_files)
audio_files = wav_files
transcripts = text_files

['p1.wav', 'file2.wav', 'file1.wav', 'file4.wav', 'file3.wav'] ['file4.txt', 'file1.txt', 'p1.txt', 'file3.txt', 'file2.txt']


In [3]:
def extract_features(file_path, max_len=100):
    audio, sr = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
    if mfccs.shape[1] < max_len:
        pad_width = max_len - mfccs.shape[1]
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        mfccs = mfccs[:, :max_len]
    return mfccs.T

In [4]:
max_len = 100  # Max length of the MFCC feature sequence
features = np.array([extract_features(os.path.join(data_dir, f), max_len) for f in audio_files])

In [5]:
transcript_texts = [open(os.path.join(data_dir, t)).read().strip() for t in transcripts]

In [6]:
tokenizer = Tokenizer(char_level=True)  # Character-level tokenizer
tokenizer.fit_on_texts(transcript_texts)
sequences = tokenizer.texts_to_sequences(transcript_texts)
max_seq_length = max_len  # Ensure that max_seq_length matches max_len
padded_sequences = pad_sequences(sequences, maxlen=max_seq_length, padding='post')
num_classes = len(tokenizer.word_index) + 1
labels = np.array([to_categorical(seq, num_classes=num_classes) for seq in padded_sequences])

In [7]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

In [9]:
input_shape = (max_len, 40)
input_layer = Input(shape=input_shape)
x = Bidirectional(LSTM(128, return_sequences=True))(input_layer)
x = TimeDistributed(Dense(128, activation='relu'))(x)
x = Dropout(0.5)(x)
x = TimeDistributed(Dense(num_classes, activation='softmax'))(x)
model = Model(inputs=input_layer, outputs=x)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 100, 40)]         0         
                                                                 
 bidirectional_1 (Bidirecti  (None, 100, 256)          173056    
 onal)                                                           
                                                                 
 time_distributed_2 (TimeDi  (None, 100, 128)          32896     
 stributed)                                                      
                                                                 
 dropout_1 (Dropout)         (None, 100, 128)          0         
                                                                 
 time_distributed_3 (TimeDi  (None, 100, 18)           2322      
 stributed)                                                      
                                                           

In [10]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [11]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {accuracy*100:.2f}%")
model.save('speech_recognition_model.h5')

Test accuracy: 95.00%


  saving_api.save_model(


In [12]:
def predict_transcript(file_path, model, tokenizer, max_len=100):
    features = extract_features(file_path, max_len)
    features = features[np.newaxis, ...]  # Add batch dimension
    prediction = model.predict(features)
    predicted_sequence = np.argmax(prediction, axis=-1).flatten()
    predicted_text = tokenizer.sequences_to_texts([predicted_sequence])[0]
    return predicted_text

# Example usage
predicted_transcript = predict_transcript(os.path.join(data_dir, 'p1.wav'), model, tokenizer)
print("Predicted Transcript:", predicted_transcript)

Predicted Transcript:          
