# PIA 
## Extraccion de caracteristicas de los nombres de cada archivo.

In [45]:
import os
import pandas as pd

In [46]:
# Directorio raíz del dataset
dataset_path = './'  # Asumimos que estás corriendo en el mismo nivel que las carpetas Actor_*

In [68]:
# Diccionario de emociones
emotion_dict = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}

# Lista para guardar registros
data = []



In [89]:
# Funciones de aumento
def change_pitch(audio, sr, pitch_factor=2.0):
    return librosa.effects.pitch_shift(audio, sr=sr, n_steps=pitch_factor)

def time_stretch(audio, rate=0.9):
    return librosa.effects.time_stretch(y=audio, rate=rate)


def add_noise(audio, noise_factor=0.005):
    noise = np.random.randn(len(audio))
    return audio + noise_factor * noise

- Modality (01 = full-AV, 02 = video-only, 03 = audio-only).
- Vocal channel (01 = speech, 02 = song).
- Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).
- Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion.
- Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").
- Repetition (01 = 1st repetition, 02 = 2nd repetition).
- Actor (01 to 24. Odd numbered actors are male, even numbered actors are female).

In [90]:
# Recorrer carpetas y procesar audios
for folder in os.listdir(dataset_path):
    if folder.startswith("Actor_"):
        folder_path = os.path.join(dataset_path, folder)
        for file in os.listdir(folder_path):
            if file.endswith(".wav"):
                parts = file.split(".")[0].split("-")
                emotion_code = parts[2]
                intensity_code = parts[3]
                actor_id = int(parts[-1])

                emotion = emotion_dict[emotion_code]
                intensity = "normal" if intensity_code == "01" else "strong"
                gender = "male" if actor_id % 2 != 0 else "female"
                file_path = os.path.join(folder_path, file)

                # Agregar original (con audio en disco)
                data.append({
                    "file_path": file_path,
                    "audio": None,
                    "sr": None,
                    "emotion": emotion,
                    "intensity": intensity,
                    "gender": gender,
                    "actor_id": actor_id,
                    "augmentation": "original"
                })

                try:
                    audio, sr = librosa.load(file_path)

                    # Aumento 1: pitch
                    pitch_audio = change_pitch(audio, sr)
                    data.append({
                        "file_path": None,
                        "audio": pitch_audio,
                        "sr": sr,
                        "emotion": emotion,
                        "intensity": intensity,
                        "gender": gender,
                        "actor_id": actor_id,
                        "augmentation": "pitch"
                    })

                    # Aumento 2: stretch
                    stretched_audio = time_stretch(audio)
                    data.append({
                        "file_path": None,
                        "audio": stretched_audio,
                        "sr": sr,
                        "emotion": emotion,
                        "intensity": intensity,
                        "gender": gender,
                        "actor_id": actor_id,
                        "augmentation": "stretch"
                    })

                    # Aumento 3: ruido
                    noisy_audio = add_noise(audio)
                    data.append({
                        "file_path": None,
                        "audio": noisy_audio,
                        "sr": sr,
                        "emotion": emotion,
                        "intensity": intensity,
                        "gender": gender,
                        "actor_id": actor_id,
                        "augmentation": "noise"
                    })
                except Exception as e:
                    print(f"Error con archivo {file_path}: {e}")

In [91]:
# Crear y guardar el DataFrame
df = pd.DataFrame(data)
df.to_csv("ravdess_metadata.csv", index=False)




In [92]:
# Mostrar resumen
df.head()

Unnamed: 0,file_path,emotion,intensity,gender,actor_id,augmentation,audio,sr
0,./Actor_01\03-01-01-01-01-01-01.wav,neutral,normal,male,1,original,,
1,,neutral,normal,male,1,pitch,"[3.1423087e-07, -3.197933e-07, 3.260547e-07, -...",22050.0
2,,neutral,normal,male,1,stretch,"[6.719997e-08, -2.8142479e-08, 1.5973922e-09, ...",22050.0
3,,neutral,normal,male,1,noise,"[8.595426389673514e-05, 0.0005436920150429574,...",22050.0
4,./Actor_01\03-01-01-01-01-02-01.wav,neutral,normal,male,1,original,,


## Extracción de caracteristicas de cada Audio.

In [93]:
import librosa
import numpy as np
from tqdm import tqdm

In [96]:
def extract_features_unificado(row):
    try:
        if row["augmentation"] == "original":
            y, sr = librosa.load(row["file_path"], sr=None)
        else:
            y = row["audio"]
            sr = row["sr"]

        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        mfcc_mean = np.mean(mfcc, axis=1)
        mfcc_std = np.std(mfcc, axis=1)
        features = np.concatenate([mfcc_mean, mfcc_std])
        return features

    except Exception as e:
        print(f"Error con archivo {row.get('file_path', 'AUMENTADO')} / tipo {row['augmentation']}: {e}")
        return np.zeros(26)



In [98]:
from tqdm import tqdm
tqdm.pandas()

features = df.progress_apply(extract_features_unificado, axis=1)



100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11520/11520 [02:35<00:00, 73.93it/s]


In [100]:
features_df = pd.DataFrame(features.tolist())
features_df.columns = [f"mfcc_{i}" for i in range(1, 27)]

In [101]:
# Unir con etiquetas
full_df = pd.concat([features_df, df[["emotion"]].reset_index(drop=True)], axis=1)



In [102]:
# Guardar
full_df.to_csv("ravdess_features.csv", index=False)
full_df.head()

Unnamed: 0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,...,mfcc_18,mfcc_19,mfcc_20,mfcc_21,mfcc_22,mfcc_23,mfcc_24,mfcc_25,mfcc_26,emotion
0,-726.217224,68.54142,3.293398,12.2053,5.510278,13.66741,-2.983828,3.098029,-3.310813,-1.564384,...,13.634708,19.163044,14.063682,8.619501,10.998984,8.34863,12.845314,7.708874,8.651435,neutral
1,-742.924805,54.153629,-1.021684,14.067798,2.208796,-2.99783,-7.191713,-6.332038,-11.387864,3.942475,...,14.310036,13.388592,13.856584,13.237471,16.083012,9.576642,9.777632,9.87855,5.751151,neutral
2,-733.57019,54.273224,0.259147,12.671712,7.179569,0.662815,-3.471071,-3.802527,-12.115388,-3.410399,...,16.956284,14.008063,11.031326,12.343883,17.463345,9.055738,8.345324,9.922368,10.208949,neutral
3,-307.91739,6.168475,4.837421,3.978016,2.255884,-0.090857,-1.050032,-2.436418,-3.433082,-2.553736,...,6.087761,4.197568,4.140401,5.066715,5.162981,4.339493,4.795025,5.18018,4.499706,neutral
4,-719.128296,70.201569,1.168397,13.122543,7.83695,14.41129,-4.11136,4.468973,-3.539367,-3.658607,...,12.449026,19.218317,14.516836,8.05733,11.509418,8.151206,12.590632,7.271216,8.202284,neutral


In [103]:
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical




In [104]:
# Configuración
n_mfcc = 13
max_pad_len = 173  # Se usará para hacer padding



In [105]:
# Función para extraer MFCC secuencial y aplicar padding
def extract_mfcc_sequence(file_path, max_pad_len=173):
    y, sr = librosa.load(file_path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    if mfcc.shape[1] < max_pad_len:
        pad_width = max_pad_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        mfcc = mfcc[:, :max_pad_len]
    return mfcc.T  # Transpuesta para que sea [frames, n_mfcc]



In [108]:
def extract_mfcc_sequence_unificado(row, max_pad_len=173):
    try:
        if row["augmentation"] == "original":
            y, sr = librosa.load(row["file_path"], sr=None)
        else:
            y = row["audio"]
            sr = row["sr"]

        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)

        # Padding para igualar longitud
        if mfcc.shape[1] < max_pad_len:
            pad_width = max_pad_len - mfcc.shape[1]
            mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            mfcc = mfcc[:, :max_pad_len]

        return mfcc
    except Exception as e:
        print(f"Error con archivo {row.get('file_path', 'AUMENTADO')} / tipo {row['augmentation']}: {e}")
        return np.zeros((13, max_pad_len))


In [109]:
X = []
y = []




In [110]:
for _, row in tqdm(df.iterrows(), total=len(df)):
    mfcc_seq = extract_mfcc_sequence_unificado(row, max_pad_len=173)
    X.append(mfcc_seq)
    y.append(row["emotion"])



100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11520/11520 [02:03<00:00, 93.45it/s]


In [111]:
X = np.array(X)
print("Forma de X:", X.shape)  # [n_samples, frames, n_mfcc]



Forma de X: (11520, 13, 173)


In [112]:
# Codificamos etiquetas
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_categorical = to_categorical(y_encoded)

print("Clases:", le.classes_)

Clases: ['angry' 'calm' 'disgust' 'fearful' 'happy' 'neutral' 'sad' 'surprised']


In [113]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Masking, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split


In [114]:

X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)


In [115]:
model = Sequential()
model.add(Input(shape=(X.shape[1], X.shape[2])))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(64))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dense(y_categorical.shape[1], activation='softmax'))

In [116]:


X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)


In [None]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


# Entrenamiento
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=32, callbacks=[early_stop])

Epoch 1/50
[1m288/288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 20ms/step - accuracy: 0.2137 - loss: 1.9747 - val_accuracy: 0.3394 - val_loss: 1.7509
Epoch 2/50
[1m288/288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.3479 - loss: 1.7080 - val_accuracy: 0.3542 - val_loss: 1.7228
Epoch 3/50
[1m288/288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.4040 - loss: 1.5633 - val_accuracy: 0.3911 - val_loss: 1.5797
Epoch 4/50
[1m288/288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.4624 - loss: 1.4514 - val_accuracy: 0.4149 - val_loss: 1.5446
Epoch 5/50
[1m288/288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.4916 - loss: 1.3719 - val_accuracy: 0.4479 - val_loss: 1.4807
Epoch 6/50
[1m288/288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.5184 - loss: 1.2748 - val_accuracy: 0.4709 - val_loss: 1.4455
Epoch 7/50
[1m288/288

In [None]:
model.save("modelo_emociones_rnn.keras")