# Transformación de Audios a Espectrogramas y Clasificación de Emociones

Este script tiene como objetivo procesar audios de voz y transformarlos en imágenes de espectrogramas para su posterior clasificación en diferentes emociones. Se utilizan dos enfoques principales:

1. **Extracción de Características y Clasificación con SVM**  
   - Se extraen diversas características de los audios como MFCCs, cromagrama, espectrograma mel y otros.
   - Se entrena un modelo SVM para clasificar las emociones detectadas en los audios.

2. **Conversión de Audio a Espectrograma y Clasificación con CNN**  
   - Se convierten los audios en imágenes de espectrogramas.
   - Se entrena una red neuronal convolucional (CNN) para reconocer patrones en estas imágenes y clasificar las emociones.

El modelo es capaz de distinguir entre emociones como happy, neutral y angry a partir de grabaciones de voz.  
Se emplean bibliotecas como `librosa` para el procesamiento de audio, `scikit-learn` para el modelo SVM y `TensorFlow/Keras` para la CNN.


In [6]:
import librosa
import soundfile
import os, glob
import pickle
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import joblib
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

def extract_feature_improved(file_name, min_length=2048):
    try:
        with soundfile.SoundFile(file_name) as sound_file:
            X = sound_file.read(dtype="float32")
            sample_rate = sound_file.samplerate
            
            if len(X) < min_length:
                print(f"⚠️ File too short: {file_name}. Padding with zeros.")
                X = np.pad(X, (0, min_length - len(X)), mode='constant')
            
            stft = np.abs(librosa.stft(X))
            result = np.array([])
            
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))
            
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, chroma))
            
            mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
            result = np.hstack((result, mel))
            
            zcr = np.mean(librosa.feature.zero_crossing_rate(X))
            result = np.hstack((result, zcr))
            
            contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, contrast))
            
            rolloff = np.mean(librosa.feature.spectral_rolloff(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, rolloff))
            
        return result
    except Exception as e:
        print(f"❌ Error processing {file_name}: {e}")
        return None


# Emotion in RAVDESS Dataset
emotions = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}

# Define the observed emotions
observed_emotions = ['happy', 'neutral', 'angry']

# Function to load the data
def load_data_expanded(test_size=0.2):
    x, y = [], []
    label_encoder = LabelEncoder()

    # Process the original dataset (RAVDESS)
    for file in glob.glob("Actor_*/*.wav"):
        file_name = os.path.basename(file)
        emotion = emotions.get(file_name.split("-")[2])
        
        if emotion in observed_emotions:
            feature = extract_feature_improved(file)
            if feature is not None:
                x.append(feature)
                y.append(emotion)
    
    emotion_mapping = {
    "happy": "happy",
    "neutral": "neutral",
    "angry": "angry"
}


    # Process the new dataset (Emotions)
    for folder in emotion_mapping:
        emotion_path = f"Emotions/{folder}/"
        for file in glob.glob(emotion_path + "*.wav"):
            feature = extract_feature_improved(file)
            if feature is not None:
                x.append(feature)
                y.append(emotion_mapping[folder])
   
    # Encode the labels
    y_encoded = label_encoder.fit_transform(y)
    return train_test_split(np.array(x), y_encoded, test_size=test_size, random_state=9), label_encoder

# Load data
(x_train, x_test, y_train, y_test), label_encoder_audio = load_data_expanded(test_size=0.25)

# Normalize the features
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Show the shape of training and test sets
print(f"Training set: {x_train.shape[0]} samples.")
print(f"Test set: {x_test.shape[0]} samples.")
print(f"Number of extracted features: {x_train.shape[1]}")


svm_audio = SVC()
grid_search_audio = GridSearchCV(svm_audio, param_grid_audio, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_audio.fit(x_train, y_train)

best_svm_audio = grid_search_audio.best_estimator_
y_pred_audio = best_svm_audio.predict(x_test)

accuracy_audio = accuracy_score(y_test, y_pred_audio)
print(f"SVM Audios - Accuracy: {accuracy_audio}")

# Save the best SVM model for audio
joblib.dump(best_svm_audio, 'svm_model_audio_expanded.pkl')


# Load the SVM model for audio and the label encoder
try:
    model_audio_svm = joblib.load('svm_model_audio_expanded.pkl')
    print("✅ SVM model for audio loaded correctly.")
except Exception as e:
    print(f"❌ Error loading SVM model for audio: {e}")
    model_audio_svm = None

# CNN MODEL: Start with the function to convert audios to spectograms
def audio_to_spectrogram(audio, sample_rate, min_shape=(128, 128)):
    try:
        spectrogram = librosa.feature.melspectrogram(y=audio, sr=sample_rate)
        spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
        
        if spectrogram.shape[1] < min_shape[1]:
            print("⚠️ Spectrogram too small, padding with zeros.")
            spectrogram = np.pad(spectrogram, ((0, 0), (0, min_shape[1] - spectrogram.shape[1])), mode='constant')
        
        return spectrogram[:min_shape[0], :min_shape[1]]
    except Exception as e:
        print(f"❌ Error converting to spectrogram: {e}")
        return np.zeros(min_shape)

# Create the CNN model
def create_cnn_model(input_shape, num_classes):
    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Define the function for preparing the data for the CNN model (this model only works with images, so we need this conversion of audio to spectrogam (=image of an audio))
def prepare_cnn_data(x, y, sample_rate=22050):
    spectrograms = []
    for audio in x:
        spectrogram = audio_to_spectrogram(audio, sample_rate)
        spectrograms.append(spectrogram)
    spectrograms = np.array(spectrograms)
    spectrograms = np.expand_dims(spectrograms, axis=-1)  # Add channel dimension
    return spectrograms, y

# Prepare the data for the CNN
x_train_cnn, y_train_cnn = prepare_cnn_data(x_train, y_train)
x_test_cnn, y_test_cnn = prepare_cnn_data(x_test, y_test)

# Create and train de CNN
input_shape = x_train_cnn.shape[1:]
num_classes = len(observed_emotions)
cnn_model = create_cnn_model(input_shape, num_classes)
cnn_model.fit(x_train_cnn, y_train_cnn, epochs=20, validation_data=(x_test_cnn, y_test_cnn))

# Evaluate the CNN
y_pred_cnn = cnn_model.predict(x_test_cnn)
y_pred_cnn = np.argmax(y_pred_cnn, axis=1)
accuracy_cnn = accuracy_score(y_test_cnn, y_pred_cnn)
print(f"CNN Audios - Accuracy: {accuracy_cnn}")

# save the CNN model
cnn_model.save('cnn_model_audio.h5')

❌ Error processing Emotions/happy/03-01-03-01-02-01-20.wav: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)
❌ Error processing Emotions/neutral/03-02-01-01-01-01-24.wav: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)
Training set: 4955 samples.
Test set: 1652 samples.
Number of extracted features: 189
SVM Audios - Accuracy: 0.8184019370460048
✅ SVM model for audio loaded correctly.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ S



⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding with zeros.
⚠️ Spectrogram too small, padding 

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 95ms/step - accuracy: 0.4492 - loss: 1.0525 - val_accuracy: 0.6053 - val_loss: 0.8610
Epoch 2/20
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 92ms/step - accuracy: 0.5997 - loss: 0.8428 - val_accuracy: 0.6205 - val_loss: 0.8032
Epoch 3/20
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 92ms/step - accuracy: 0.6236 - loss: 0.7895 - val_accuracy: 0.6374 - val_loss: 0.7486
Epoch 4/20
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 93ms/step - accuracy: 0.6473 - loss: 0.7500 - val_accuracy: 0.6725 - val_loss: 0.7226
Epoch 5/20
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 93ms/step - accuracy: 0.6888 - loss: 0.6839 - val_accuracy: 0.6646 - val_loss: 0.7112
Epoch 6/20
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 93ms/step - accuracy: 0.6895 - loss: 0.6634 - val_accuracy: 0.6525 - val_loss: 0.7217
Epoch 7/20
[1m1



CNN Audios - Accuracy: 0.7058111380145279


In [7]:
#With the following code, we are going to try to optimize the mlp model with grid search
(x_train, x_test, y_train, y_test), label_encoder_audio = load_data_expanded(test_size=0.25)
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Hyperparameters optimized with GridSearch
param_grid = {
    'hidden_layer_sizes': [(300,), (300, 100), (256, 128, 64)],
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.001, 0.01],
    'batch_size': [128, 256],
    'learning_rate': ['adaptive'],
    'max_iter': [500]
}

grid_search = GridSearchCV(MLPClassifier(), param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(x_train, y_train)
mlp_model = grid_search.best_estimator_

y_pred_mlp = mlp_model.predict(x_test)
accuracy_mlp = accuracy_score(y_test, y_pred_mlp)
print(f"Optimized MLP Accuracy: {accuracy_mlp}")

joblib.dump(mlp_model, 'model_emotions_audio_mlp.pkl')
joblib.dump(label_encoder_audio, 'label_encoder_audio_mlp.pkl')



❌ Error processing Emotions/happy/03-01-03-01-02-01-20.wav: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)
❌ Error processing Emotions/neutral/03-02-01-01-01-01-24.wav: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)
Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END activation=relu, alpha=0.0001, batch_size=128, hidden_layer_sizes=(256, 128, 64), learning_rate=adaptive, max_iter=500, solver=adam; total time=  13.8s
[CV] END activation=relu, alpha=0.0001, batch_size=128, hidden_layer_sizes=(256, 128, 64), learning_rate=adaptive, max_iter=500, solver=adam; total time=  14.8s
[CV] END activation=relu, alpha=0.0001, batch_size=128, hidden_layer_sizes=(300, 100), learning_rate=adaptive, max_iter=500, solver=adam; total time=  17.8s
[CV] END activation=relu, alpha=0.0001, batch_

['label_encoder_audio_mlp.pkl']