# In [None]:
import numpy as np
import librosa
import soundfile as sf
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Prepare the speech dataset: normalise the recordings on disk, return transcripts
def generate_speech_data(sample_rate=16000):
    """Re-encode the sample recordings and return their transcripts and labels.

    The files ``speech_sample_0.wav``, ``speech_sample_1.wav``, ... (one per
    transcript) must already exist in the current working directory; this
    function does not synthesise audio. Each file is loaded, resampled to
    ``sample_rate`` and written back in place as 16-bit PCM.

    Args:
        sample_rate: Rate in Hz used both for loading (resampling) and for
            writing. Using one value for both keeps the WAV header consistent
            with the samples.

    Returns:
        A ``(speech_samples, labels)`` tuple of two lists of strings. The
        labels are the transcripts themselves.

    Errors raised by ``librosa.load`` or ``soundfile.write`` (for example when
    a file is missing) are propagated to the caller.
    """
    speech_samples = ["recognize speech", "transcribe spoken words", "LSTMs in speech analysis"]
    # Each utterance is labelled with its own transcript.
    labels = list(speech_samples)

    for i, _ in enumerate(speech_samples):
        file_path = f"speech_sample_{i}.wav"
        # Load at the rate we are about to write. librosa's default target
        # rate is 22050 Hz, so writing that data with a 16000 Hz header would
        # slow down / pitch-shift the recording a little more on every run.
        audio_data, _ = librosa.load(file_path, sr=sample_rate)
        sf.write(file_path, audio_data, sample_rate, subtype='PCM_16')

    return speech_samples, labels

# Load the transcripts, then turn them into padded id sequences and word-presence targets
speech_samples, labels = generate_speech_data()

tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(speech_samples)
# One extra slot on top of the vocabulary (the tokenizer's word ids start at 1).
total_words = 1 + len(tokenizer.word_index)

# Integer-encode every transcript and left-pad to the longest one.
input_sequences = tokenizer.texts_to_sequences(speech_samples)
max_sequence_length = max(map(len, input_sequences))
X_train_speech = tf.keras.preprocessing.sequence.pad_sequences(
    input_sequences,
    maxlen=max_sequence_length,
    padding='pre',
)

# One row per label; mode='binary' marks which vocabulary words occur in it.
y_train_speech = tokenizer.texts_to_matrix(labels, mode='binary')

# Build and train the LSTM model
# An LSTM layer consumes 3-D input (batch, timesteps, features), while
# X_train_speech is a 2-D matrix of integer word ids. Declaring
# input_shape=(max_sequence_length,) directly on the LSTM therefore fails with
# an "expected ndim=3, found ndim=2" error. The Embedding layer maps every id
# to a dense vector and so produces the 3-D tensor the LSTM needs.
embedding_dim = 16  # size of the learned per-word vector

model_speech = Sequential()
model_speech.add(tf.keras.Input(shape=(max_sequence_length,)))
model_speech.add(tf.keras.layers.Embedding(total_words, embedding_dim))
model_speech.add(LSTM(32, activation="relu"))
model_speech.add(Dense(total_words, activation="softmax"))
# NOTE(review): y_train_speech comes from texts_to_matrix(mode='binary'), so a
# row can contain several ones and does not sum to 1 as softmax +
# categorical_crossentropy assume. Consider sigmoid + binary_crossentropy if
# the task is really multi-label; left unchanged here.
model_speech.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
history_speech = model_speech.fit(X_train_speech, y_train_speech, epochs=20, batch_size=1, verbose=1)

# Generate sample output: predict one word for a seed phrase and print both
seed_speech = "LSTMs in"
seed_sequence = tokenizer.texts_to_sequences([seed_speech])[0]
seed_sequence = tf.keras.preprocessing.sequence.pad_sequences([seed_sequence], maxlen=max_sequence_length, padding='pre')
predicted_probs_speech = model_speech.predict(seed_sequence)[0]

# Index 0 is the padding slot and has no entry in the tokenizer's vocabulary,
# so it is excluded from the argmax; picking it used to end in an IndexError
# on the word lookup. The +1 maps the sliced position back to a word id.
predicted_index_speech = int(np.argmax(predicted_probs_speech[1:])) + 1

# index_word is the tokenizer's id -> word mapping: a direct lookup instead of
# scanning word_index for a matching value.
output_speech = tokenizer.index_word.get(predicted_index_speech, "")
print("Generated Output:", seed_speech + " " + output_speech)