In [2]:
import os
import pandas as pd
import numpy as np
import librosa
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score

metadata_path = "../data/metadata.csv"
df = pd.read_csv(metadata_path)

df.head()


Unnamed: 0,filepath,emotion,gender
0,../data/RADVESS/Actor_16/03-01-05-01-02-01-16.wav,angry,female
1,../data/RADVESS/Actor_16/03-01-05-02-01-01-16.wav,angry,female
2,../data/RADVESS/Actor_16/03-01-04-01-01-02-16.wav,sad,female
3,../data/RADVESS/Actor_16/03-01-04-02-02-02-16.wav,sad,female
4,../data/RADVESS/Actor_16/03-01-03-02-02-02-16.wav,happy,female


In [3]:
# encode labels
label_encoder = LabelEncoder()
df["emotion_label"] = label_encoder.fit_transform(df["emotion"])

num_classes = df["emotion_label"].nunique()
num_classes


4

In [8]:
# mfcc extraction
class RAVDESSDataset(Dataset):
    def __init__(self, df, max_len=200):
        self.df = df
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def pad(self, x):
        if x.shape[1] < self.max_len:
            pad_width = self.max_len - x.shape[1]
            x = np.pad(x, ((0,0),(0,pad_width)), mode='constant')
        else:
            x = x[:, :self.max_len]
        return x

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        path = row["filepath"]
        y = row["emotion_label"]

        audio, sr = librosa.load(path, sr=16000)
        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
        mfcc = self.pad(mfcc)

        return torch.tensor(mfcc, dtype=torch.float32).T, torch.tensor(y)


In [10]:
#train test split and loaders
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["emotion_label"])

train_ds = RAVDESSDataset(train_df)
test_ds = RAVDESSDataset(test_df)

train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=16)


In [11]:
# LSTM model
class LSTMBaseline(nn.Module):
    def __init__(self, input_dim=40, hidden_dim=64, num_classes=4):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = out[:, -1, :]  # last timestep
        return self.fc(out)

model = LSTMBaseline(num_classes=num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [13]:
# model training
epochs = 10

for epoch in range(epochs):
    model.train()
    running_loss = 0

    for X, y in train_loader:
        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}")


Epoch 1/10, Loss: 1.3528
Epoch 2/10, Loss: 1.3548
Epoch 3/10, Loss: 1.3555
Epoch 4/10, Loss: 1.3529
Epoch 5/10, Loss: 1.3561
Epoch 6/10, Loss: 1.3544
Epoch 7/10, Loss: 1.3533
Epoch 8/10, Loss: 1.3575
Epoch 9/10, Loss: 1.3530
Epoch 10/10, Loss: 1.3542


In [16]:
# evakuate and save predictions of the baseline lstm
model.eval()
all_preds = []
all_labels = []
all_paths = []

with torch.no_grad():
    for batch, (X, y) in enumerate(test_loader):
        outputs = model(X)
        preds = torch.argmax(outputs, dim=1)

        all_preds.extend(preds.numpy())
        all_labels.extend(y.numpy())

accuracy = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds, average="weighted")

print("Baseline Model Accuracy:", accuracy)
print("Baseline Model F1 Score:", f1)


Baseline Model Accuracy: 0.28888888888888886
Baseline Model F1 Score: 0.12950191570881225


In [17]:
# save model
model_path = "../models/lstm_baseline.pth"
torch.save(model.state_dict(), model_path)
model_path


'../models/lstm_baseline.pth'

In [19]:
# save predictions
pred_df = test_df.copy()
pred_df["predicted"] = [label_encoder.inverse_transform([p])[0] for p in all_preds]

output_path = "../results/lstm_predictions.csv"
pred_df.to_csv(output_path, index=False)
output_path


'../results/lstm_predictions.csv'