In [25]:
import os
import numpy as np
import tensorflow as tf
import librosa
import pydub
import pyaudio
import sounddevice as sd
import soundfile as sf
from tensorflow.keras import layers

In [26]:
WAKE_WORD_DIRECTORY = "Audios/HolaTecBot"
BACKGROUND_NOISE_DIRECTORY = "Audios/HolaCasoNegativo"

In [27]:
def load_audio_data(directory, sample_rate=16000, duration=1):
    audio_data = []
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if filepath.endswith((".mp3", ".ogg", ".aif", ".aifc")):
            y, sr = librosa.load(filepath, sr=sample_rate, duration=duration, mono=True)
            audio_data.append(y)
    return np.array(audio_data)


In [28]:
def create_model(input_shape):
    print("Creating Model")
    model = tf.keras.Sequential([
        layers.Input(shape=input_shape),
        layers.Conv1D(64, 3, activation='relu'),
        layers.MaxPooling1D(2),
        layers.Conv1D(128, 3, activation='relu'),
        layers.MaxPooling1D(2),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [29]:
def train_model():
    print("Training")
    X_positive = load_audio_data(WAKE_WORD_DIRECTORY)
    X_negative = load_audio_data(BACKGROUND_NOISE_DIRECTORY)
    Y_positive = np.ones(len(X_positive))  # Etiquetas para datos positivos
    Y_negative = np.zeros(len(X_negative)) # Etiquetas para datos negativos

    X = np.concatenate([X_positive, X_negative])
    X = np.expand_dims(X, axis=-1)
    Y = np.concatenate([Y_positive, Y_negative])

    indices = np.arange(len(X))
    np.random.shuffle(indices)
    X = X[indices]
    Y = Y[indices]

    input_shape = X.shape[1:]
    model = create_model(input_shape)
    model.fit(X, Y, epochs=30, batch_size=32)

    model.save("wake_word_detection_model.keras")


In [30]:
def test_model_with_microphone():
    model = tf.keras.models.load_model('wake_word_detection_model.keras')
    audio_stream = None
    sample_rate = 44100
    duration = 10000
    try:
        pa = pyaudio.PyAudio()
        audio_stream = pa.open(
            rate=sample_rate,
            channels=1,
            format=pyaudio.paInt16,
            input=True,
            frames_per_buffer=sample_rate *  duration)

        print("Listening...")

        while True:
            print("Hola")
            pcm = audio_stream.read(sample_rate * duration)
            pcm = np.frombuffer(pcm, dtype=np.int16)

            if len(pcm) != sample_rate * duration:
                pcm = librosa.resample(pcm.astype(np.float32), len(pcm), sample_rate * duration).astype(np.int16)

            pcm = pcm / 32768.0
            pcm = np.expand_dims(pcm, axis=0)

            prediction = model.predict(pcm)

            if prediction > 0.5:
                print("Wake word detected!")
            else:
                print("No wake word detected.")
    except KeyboardInterrupt:
        print("Stopped listening!")
    except Exception as e:
        print(f"Error: {e}")
    finally:
        if audio_stream:
            audio_stream.stop_stream()
            audio_stream.close()
        if pa:
            pa.terminate()

In [31]:
train_model()


Training
Creating Model
Epoch 1/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.8800 - loss: 0.6926
Epoch 2/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 569ms/step - accuracy: 0.8400 - loss: 0.4004
Epoch 3/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 538ms/step - accuracy: 0.8400 - loss: 0.7210
Epoch 4/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 551ms/step - accuracy: 0.8400 - loss: 0.4067
Epoch 5/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 516ms/step - accuracy: 0.8400 - loss: 0.4344
Epoch 6/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 534ms/step - accuracy: 0.8400 - loss: 0.4576
Epoch 7/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 544ms/step - accuracy: 0.8400 - loss: 0.4062
Epoch 8/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 547ms/step - accuracy: 0.8400 - loss: 0.3758
Epoch 9/30
[1m1/1[0m [32m━━━━━━━

In [32]:
#test_model_with_microphone()

In [33]:
def test_model_with_audio_files(test_directory):
    model = tf.keras.models.load_model('wake_word_detection_model.keras')
    preprocessed_audio_data  = load_audio_data(test_directory)

    for i, audio_sample in enumerate(preprocessed_audio_data):
        audio_sample = audio_sample.reshape(1, -1)
        prediction = model.predict(audio_sample)

        if prediction > 0.5:
            print(f"Audio file {i+1}: Wake word detected!")
        else:
            print(f"Audio file {i+1}: No wake word detected.")

In [34]:
test_model_with_audio_files('Audios/Test')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 407ms/step
Audio file 1: Wake word detected!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
Audio file 2: Wake word detected!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Audio file 3: Wake word detected!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Audio file 4: Wake word detected!
