## Importamos librerias

In [1]:
import numpy as np
import os
import librosa
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from sklearn.calibration import LabelEncoder
from keras.layers import LSTM, Dense, Dropout, Conv1D

## Preparativos antes de la RNN

In [2]:
def preprocess_audio(file_path,max_len):
    # Cargar el audio
    signal, sr = librosa.load(file_path,sr=96000)
    # Realizar preénfasis
    #filter_audio = librosa.effects.preemphasis(signal)
    # Extraer MFCCs
    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13)

    # Realizar padding o recorte
    if mfcc.shape[1] < max_len:
        num_zeros = max_len - mfcc.shape[1]
        padded_mfcc = np.pad(mfcc, ((0, 0), (0, num_zeros)), mode='constant', constant_values=0)
        return padded_mfcc
    else:
        mfcc = mfcc[:, :max_len]
        return mfcc

In [3]:
DATA_DIR = 'Data'
LABELS = ['happy','cat', 'bed']

mfccs = []
labels = []

padding = 500

for label in LABELS:
    path_file = DATA_DIR + f'/{label}'
    for file in os.listdir(path_file):
        file_path = path_file + f'/{file}'
        
        mfcc = preprocess_audio(file_path,padding)
        
        mfccs.append(mfcc)
        labels.append(label)

In [4]:
#Convertimos las listas a numpy array
mfccs_array = np.array(mfccs) #Matriz 3d (5187, 13, 500)

In [5]:
# Convertir etiquetas a números
le = LabelEncoder()
labels_encoded = le.fit_transform(labels)

print("Codificación realizada:")
for idx, label in enumerate(le.classes_):
    print(f"{label} -> {idx}")

Codificación realizada:
bed -> 0
cat -> 1
happy -> 2


In [6]:
# Dividir datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(mfccs_array, labels_encoded, test_size=0.2, random_state=312)

# Crear modelo LSTM (Long short-term memory)
model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])),
    LSTM(128, return_sequences=True),
    Dropout(0.3),
    LSTM(128),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(3, activation='softmax')  # 3 clases: 'happy', 'cat', 'bed'
])

# Compilar el modelo
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

## Entrenamiento

In [7]:
# Entrenar el modelo
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=15, batch_size=32)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x2d5bd57ec90>

## Evaluación

In [8]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Loss: {loss}")
print(f"Accuracy: {accuracy}")

Loss: 0.3596942126750946
Accuracy: 0.8583815097808838


## Prueba

In [9]:
file_paths = ['audio_bed.wav','audio_cat.wav','audio_happy.wav']

for file_path in file_paths:
    print(file_path)
    preprocessed_audio = preprocess_audio(file_path,padding)
    preprocessed_audio = preprocessed_audio.reshape(1, preprocessed_audio.shape[0], preprocessed_audio.shape[1])  # Convertir a formato (1, features, tiempo)
    prediction = model.predict(preprocessed_audio)

    print(f"{le.inverse_transform([0])[0]} : {prediction[0][0] * 100}%")
    print(f"{le.inverse_transform([1])[0]} : {prediction[0][1] * 100}%")
    print(f"{le.inverse_transform([2])[0]} : {prediction[0][2] * 100}%")

    predicted_label_encoded = np.argmax(prediction, axis=1)[0]
    predicted_label = le.inverse_transform([predicted_label_encoded])[0]
    print(f"\nLa palabra predicha es: {predicted_label}\n")

audio_bed.wav
bed : 50.9693443775177%
cat : 43.843233585357666%
happy : 5.187417566776276%

La palabra predicha es: bed

audio_cat.wav
bed : 45.105159282684326%
cat : 45.65918743610382%
happy : 9.235657006502151%

La palabra predicha es: cat

audio_happy.wav
bed : 42.20638573169708%
cat : 50.805866718292236%
happy : 6.987746059894562%

La palabra predicha es: cat



## Guardamos el modelo

In [10]:
# Entrenar el modelo
model.save('GUI/modelo.h5')