In [None]:
%pip install librosa soundfile numpy scikit-learn pyaudio torch

In [1]:
import os, glob
import numpy as np
import soundfile
import librosa

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader


In [2]:
def extract_feature(file_name, mfcc=True, chroma=True, mel=True):
    """Build a 1-D feature vector describing one audio file.

    Each enabled feature family is computed per frame with librosa and then
    averaged over time, and the averaged vectors are concatenated in the
    fixed order MFCC -> chroma -> mel.

    Args:
        file_name: Path of an audio file readable by ``soundfile``.
        mfcc: Include the time-averaged MFCCs (40 coefficients).
        chroma: Include the time-averaged chromagram computed from the
            magnitude STFT.
        mel: Include the time-averaged mel spectrogram.

    Returns:
        A 1-D float64 ``numpy.ndarray``; empty when every flag is False.
    """
    # Only the raw samples and the sample rate are needed, so the file is
    # closed before the (comparatively slow) feature computation starts.
    with soundfile.SoundFile(file_name) as sound_file:
        signal = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

    # soundfile returns multi-channel audio as (frames, channels). The
    # features below assume a mono 1-D signal, so average the channels.
    if signal.ndim > 1:
        signal = signal.mean(axis=1)

    # Seeding with an empty float64 array keeps the result dtype (float64)
    # and the "all flags off" return value identical to the original code.
    parts = [np.array([])]
    if mfcc:
        mfcc_frames = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40)
        parts.append(np.mean(mfcc_frames.T, axis=0))
    if chroma:
        magnitude = np.abs(librosa.stft(signal))
        chroma_frames = librosa.feature.chroma_stft(S=magnitude, sr=sample_rate)
        parts.append(np.mean(chroma_frames.T, axis=0))
    if mel:
        mel_frames = librosa.feature.melspectrogram(y=signal, sr=sample_rate)
        parts.append(np.mean(mel_frames.T, axis=0))
    return np.hstack(parts)


In [3]:
# Maps the two-digit emotion code used in the dataset's file names to a label.
# Codes are consecutive, so they are generated from the label order ("01".."08").
emotions = {
    f"{code:02d}": label
    for code, label in enumerate(
        (
            "neutral",
            "calm",
            "happy",
            "sad",
            "angry",
            "fearful",
            "disgust",
            "surprised",
        ),
        start=1,
    )
}

# Only recordings carrying one of these labels are kept for training/testing.
observed_emotions = [
    "calm",
    "happy",
    "fearful",
    "disgust",
]

In [4]:
def load_data(
    test_size=0.25,
    data_glob="D:\\Projects\\Machine Learning\\speech emotion detector\\data\\Actor_*\\*.wav",
):
    """Extract features for the observed emotions and split into train/test.

    The emotion of a recording is taken from the third dash-separated field
    of its file name, looked up in ``emotions``. Files whose emotion is not
    in ``observed_emotions`` are skipped before any audio is decoded.

    Args:
        test_size: Fraction (or count) of samples held out for testing,
            forwarded to ``train_test_split``.
        data_glob: Glob pattern selecting the audio files. Defaults to the
            original machine-specific location; pass another pattern to run
            the notebook elsewhere.

    Returns:
        ``(x_train, x_test, y_train, y_test)`` as produced by
        ``train_test_split`` with ``random_state=9``; the x parts are 2-D
        feature arrays and the y parts are lists of emotion labels.

    Raises:
        ValueError: If no matching file with an observed emotion is found.
    """
    x, y = [], []
    # glob returns matches in a filesystem-dependent order; sorting keeps the
    # seeded split identical across machines and runs.
    for file in sorted(glob.glob(data_glob)):
        file_name = os.path.basename(file)
        emotion = emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        x.append(extract_feature(file, mfcc=True, chroma=True, mel=True))
        y.append(emotion)

    if not x:
        # Fail with an actionable message instead of an obscure splitter error.
        raise ValueError(
            f"No audio files with an observed emotion matched {data_glob!r}"
        )
    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)

In [5]:
# Extract features for every matching recording and split them into train and
# test sets using load_data's defaults (test_size=0.25). This decodes every
# selected audio file, so it is the slow step of the notebook.
x_train, x_test, y_train, y_test = load_data()

In [6]:
# Display the (n_samples, n_features) shape of each split as a quick sanity check.
x_train.shape, x_test.shape

((576, 180), (192, 180))

In [7]:
# Standardise the features using statistics learned from the training split
# only, then apply that same transform to both splits.
# Note: x_train/x_test are rebound here, so re-running this cell would refit
# the scaler on already-scaled data.
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

In [8]:
# Turn the string emotion labels into integer class ids. The encoder is
# fitted on the training labels and reused for the test labels (and later for
# mapping predictions back to label names).
# Note: y_train/y_test are rebound here, so re-running this cell would refit
# the encoder on the integer ids instead of the label strings.
le = LabelEncoder().fit(y_train)
y_train, y_test = le.transform(y_train), le.transform(y_test)

In [9]:
# Copy the NumPy splits into torch tensors: float32 features for the linear
# layers and int64 ("long") class ids, as required by CrossEntropyLoss targets.
x_train_tensor, x_test_tensor = (
    torch.tensor(split, dtype=torch.float32) for split in (x_train, x_test)
)
y_train_tensor, y_test_tensor = (
    torch.tensor(split, dtype=torch.long) for split in (y_train, y_test)
)

In [10]:
# Mini-batch iterators over the two splits. Only the training batches are
# reshuffled each epoch; the test order is left fixed.
train_loader = DataLoader(
    dataset=TensorDataset(x_train_tensor, y_train_tensor),
    batch_size=32,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=TensorDataset(x_test_tensor, y_test_tensor),
    batch_size=32,
    shuffle=False,
)

In [11]:
class EmotionNet(nn.Module):
    """Fully connected classifier: input -> 256 -> 128 -> num_classes.

    Both hidden layers use ReLU followed by dropout (p=0.3). The final layer
    returns raw, un-normalised class scores (no softmax is applied here).
    """

    def __init__(self, input_size, num_classes):
        """Create the layers.

        Args:
            input_size: Number of features per sample.
            num_classes: Number of output classes.
        """
        super().__init__()
        # First hidden stage.
        self.fc1 = nn.Linear(input_size, 256)
        self.dropout1 = nn.Dropout(0.3)
        # Second hidden stage.
        self.fc2 = nn.Linear(256, 128)
        self.dropout2 = nn.Dropout(0.3)
        # Output projection to one score per class.
        self.out = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return the class scores for a batch ``x`` of feature vectors."""
        hidden = self.dropout1(torch.relu(self.fc1(x)))
        hidden = self.dropout2(torch.relu(self.fc2(hidden)))
        return self.out(hidden)

In [12]:
# Network dimensions are derived from the data rather than hard-coded:
# one input unit per feature column and one output unit per encoded label.
input_size = x_train.shape[1]
num_classes = len(le.classes_)

In [13]:
# Seed torch's global RNG before anything random happens so that weight
# initialisation, dropout masks and the training loader's shuffling are
# repeatable on a fresh "Restart & Run All" (9 mirrors the split's random_state).
torch.manual_seed(9)

# Model, loss and optimiser. CrossEntropyLoss expects raw class scores and
# integer class ids; Adam updates all model parameters with lr=0.001.
model = EmotionNet(input_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [14]:
# Number of full passes over the training set; used by both the loop and the
# progress message so the two cannot drift apart.
num_epochs = 50

for epoch in range(num_epochs):
    model.train()  # enable dropout for training
    running_loss = 0.0
    samples_seen = 0
    for features, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # criterion returns the mean loss of the batch; weight it by the batch
        # size so a smaller final batch does not skew the epoch average.
        batch_count = labels.size(0)
        running_loss += loss.item() * batch_count
        samples_seen += batch_count
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {running_loss/samples_seen:.4f}")


Epoch 1/50 - Loss: 1.3020
Epoch 2/50 - Loss: 1.1379
Epoch 3/50 - Loss: 0.9890
Epoch 4/50 - Loss: 0.8709
Epoch 5/50 - Loss: 0.7711
Epoch 6/50 - Loss: 0.7272
Epoch 7/50 - Loss: 0.6166
Epoch 8/50 - Loss: 0.5478
Epoch 9/50 - Loss: 0.4926
Epoch 10/50 - Loss: 0.4377
Epoch 11/50 - Loss: 0.4060
Epoch 12/50 - Loss: 0.3802
Epoch 13/50 - Loss: 0.3723
Epoch 14/50 - Loss: 0.3096
Epoch 15/50 - Loss: 0.2924
Epoch 16/50 - Loss: 0.2647
Epoch 17/50 - Loss: 0.2225
Epoch 18/50 - Loss: 0.2077
Epoch 19/50 - Loss: 0.2264
Epoch 20/50 - Loss: 0.2074
Epoch 21/50 - Loss: 0.2047
Epoch 22/50 - Loss: 0.1946
Epoch 23/50 - Loss: 0.1981
Epoch 24/50 - Loss: 0.1572
Epoch 25/50 - Loss: 0.1390
Epoch 26/50 - Loss: 0.1626
Epoch 27/50 - Loss: 0.1222
Epoch 28/50 - Loss: 0.1266
Epoch 29/50 - Loss: 0.1429
Epoch 30/50 - Loss: 0.1383
Epoch 31/50 - Loss: 0.1836
Epoch 32/50 - Loss: 0.1425
Epoch 33/50 - Loss: 0.1211
Epoch 34/50 - Loss: 0.1260
Epoch 35/50 - Loss: 0.1039
Epoch 36/50 - Loss: 0.0892
Epoch 37/50 - Loss: 0.0784
Epoch 38/5

In [15]:
# Measure accuracy on the held-out split. eval() switches dropout off and
# no_grad() skips gradient tracking, since nothing is being trained here.
model.eval()
correct, total = 0, 0

with torch.no_grad():
    for features, labels in test_loader:
        outputs = model(features)
        # The predicted class is the index of the highest score in each row.
        predicted = outputs.argmax(dim=1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

print(f"Test Accuracy: {100 * correct / total:.2f}%")


Test Accuracy: 75.52%


In [19]:
def predict_emotion(file_path):
    """Predict the emotion label for a single audio file.

    The file goes through the same pipeline as the training data: feature
    extraction, scaling with the fitted ``scaler``, a forward pass through
    ``model``, and decoding of the winning class id with ``le``.

    Args:
        file_path: Path of the audio file to classify.

    Returns:
        The predicted emotion label.

    Note:
        Leaves ``model`` in evaluation mode.
    """
    feature_row = extract_feature(file_path, mfcc=True, chroma=True, mel=True)
    # The scaler expects a 2-D input, so wrap the single sample in a list.
    scaled_row = scaler.transform([feature_row])
    batch = torch.tensor(scaled_row, dtype=torch.float32)

    model.eval()
    with torch.no_grad():
        scores = model(batch)
    class_id = torch.argmax(scores).item()
    return le.inverse_transform([class_id])[0]

# Example usage: classify one recording with the trained model.
# NOTE(review): the path is an absolute, machine-specific location — adjust it
# before running this notebook elsewhere.
print(predict_emotion("D:\\Projects\\Machine Learning\\speech emotion detector\\test\\hap.wav"))


fearful
