In [41]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive.
# `drive` is imported from google.colab, so this cell only runs inside Colab.
drive.mount('/content/drive')

# Set dataset folder path (update if needed).
# This is the directory prepare_dataset() lists when looking for .wav files.
AUDIO_FOLDER = '/content/drive/My Drive/audio/audio'


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [42]:
import numpy as np
import librosa
import cv2
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import classification_report


In [43]:
# --- Audio / model-input configuration ---
SAMPLE_RATE = 16000  # sampling rate handed to librosa.load
SEGMENT_LENGTH = 3  # length of one analysis segment, in seconds
ALEXNET_INPUT_SIZE = (227, 227)  # size each spectrogram image is resized to

# EMO-DB emotion letter -> emotion name.
# Insertion order matters: it defines the class indices and report row order.
EMOTION_MAP = dict(
    W='anger',
    L='boredom',
    A='anxiety',
    F='happiness',
    T='sadness',
    E='disgust',
    N='neutral',
)


In [44]:
# Load Pretrained AlexNet (Truncated)
# torchvision >= 0.13 deprecates `pretrained=True` in favour of the `weights`
# enum. Prefer the new API when it exists and fall back on older installs so
# the cell keeps working on both.
_alexnet_weights_enum = getattr(models, 'AlexNet_Weights', None)
if _alexnet_weights_enum is not None:
    alexnet = models.alexnet(weights=_alexnet_weights_enum.IMAGENET1K_V1)
else:
    alexnet = models.alexnet(pretrained=True)
alexnet.classifier = alexnet.classifier[:6]  # Remove final classification layer
alexnet.eval()  # inference mode for feature extraction

# Image transformation for AlexNet:
# ToTensor converts an H x W x C array to a C x H x W tensor, then the
# per-channel ImageNet mean/std normalisation is applied.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])




In [45]:
def extract_log_mel_spectrogram(audio_file, segment_length=3, pad_short=False):
    """Cut an audio file into 50%-overlapping segments and build one image per segment.

    Each segment is converted to a 64-band log-mel spectrogram; its first and
    second order deltas are stacked with it as three channels, and the result
    is resized to ``ALEXNET_INPUT_SIZE``.

    Args:
        audio_file: Path of the audio file, loaded at ``SAMPLE_RATE``.
        segment_length: Segment duration in seconds (may be fractional).
        pad_short: When True, a non-empty clip shorter than one segment is
            zero-padded to a full segment instead of yielding no segments.
            Defaults to False, which keeps the original "skip short clips"
            behaviour.

    Returns:
        List of arrays, one per segment; empty when the clip is shorter than
        one segment and ``pad_short`` is False.

    Raises:
        ValueError: If ``segment_length`` does not cover at least one sample.
    """
    y, sr = librosa.load(audio_file, sr=SAMPLE_RATE)
    # int() keeps range() valid when segment_length is a float such as 1.5.
    segment_samples = int(segment_length * sr)
    if segment_samples <= 0:
        raise ValueError("segment_length must cover at least one sample")
    hop = max(1, segment_samples // 2)  # 50% overlap between windows

    if pad_short and 0 < len(y) < segment_samples:
        y = np.pad(y, (0, segment_samples - len(y)))

    segments = []
    # "+ 1" makes the stop bound inclusive of the last full window. Without it
    # a clip exactly one segment long (or ending on a hop boundary) loses its
    # final segment.
    for start in range(0, len(y) - segment_samples + 1, hop):
        segment = y[start:start + segment_samples]
        mel_spec = librosa.feature.melspectrogram(y=segment, sr=sr, n_mels=64)
        log_mel = librosa.power_to_db(mel_spec)

        delta = librosa.feature.delta(log_mel)
        delta_delta = librosa.feature.delta(log_mel, order=2)

        # Channels-last stack so the result looks like an H x W x 3 image.
        combined = np.stack([log_mel, delta, delta_delta], axis=-1)
        resized = cv2.resize(combined, ALEXNET_INPUT_SIZE)

        segments.append(resized)

    return segments


In [46]:
def extract_features_from_segments(segments):
    """Embed each spectrogram image with the truncated AlexNet.

    Every segment goes through the module-level ``transform`` and is fed to
    ``alexnet`` as a batch of one, with gradient tracking disabled.

    Args:
        segments: Iterable of image arrays accepted by ``transform``.

    Returns:
        ``np.ndarray`` holding one squeezed network output per segment, in
        input order.
    """
    def _embed(image):
        # unsqueeze(0) adds the batch axis the network expects.
        batch = transform(image).unsqueeze(0)
        with torch.no_grad():
            return alexnet(batch).squeeze().numpy()

    return np.array([_embed(image) for image in segments])


In [47]:
def dtpm_pooling(segment_features, levels=(1, 2, 4)):
    """Mean-pool per-segment features over a temporal pyramid.

    For every pyramid level ``k`` the segment axis is cut into ``k``
    consecutive parts of ``n_segments // k`` rows each (the last part also
    takes any remainder). Each part is averaged and all pooled vectors are
    concatenated.

    Args:
        segment_features: 2-D array-like of shape (n_segments, feature_dim).
            Plain lists are accepted and converted with ``np.asarray``.
        levels: Number of parts for each pyramid level. Defaults to (1, 2, 4).

    Returns:
        1-D array of length ``sum(levels) * feature_dim``. A part that holds no
        segments (fewer segments than parts) contributes a zero vector.

    Raises:
        ValueError: If ``segment_features`` is not 2-D.
    """
    # Convert up front: the original read `.shape` only on the empty-part
    # path, so list input crashed as soon as a part came up empty.
    feats = np.asarray(segment_features)
    if feats.ndim != 2:
        raise ValueError(
            "segment_features must be 2-D (n_segments, feature_dim); "
            f"got an array with shape {feats.shape}"
        )
    n_segments, feature_dim = feats.shape

    pooled_features = []
    for level in levels:
        split_size = n_segments // level
        for i in range(level):
            start = i * split_size
            # The last part runs to the end so remainder rows are not dropped.
            end = (i + 1) * split_size if i != level - 1 else n_segments
            segment_part = feats[start:end]

            if len(segment_part) > 0:
                pooled_part = np.mean(segment_part, axis=0)
            else:
                pooled_part = np.zeros(feature_dim)
            pooled_features.append(pooled_part)

    return np.concatenate(pooled_features)


In [48]:
def parse_filename(filename):
    """Extract the speaker id and emotion name from a file name.

    Args:
        filename: File name whose first two characters are the speaker id and
            whose sixth character is an ``EMOTION_MAP`` key.

    Returns:
        ``(speaker, emotion)``. ``emotion`` is None when the sixth character is
        missing or is not a key of ``EMOTION_MAP``.
    """
    speaker = filename[:2]
    # Slice rather than index: filename[5:6] is '' for names shorter than six
    # characters, so the lookup returns None instead of raising IndexError.
    emotion = EMOTION_MAP.get(filename[5:6], None)
    return speaker, emotion


In [49]:
# Emotion name -> integer class index, numbered in EMOTION_MAP's insertion order
EMOTION_TO_INDEX = dict(zip(EMOTION_MAP.values(), range(len(EMOTION_MAP))))

def prepare_dataset():
    """Build the utterance-level dataset from the .wav files in ``AUDIO_FOLDER``.

    Each file is segmented, embedded segment by segment, and pooled into one
    fixed-length vector. Files whose name does not map to a known emotion, or
    that yield no segments, are skipped.

    Returns:
        Tuple ``(features, labels, speakers)`` of NumPy arrays with one entry
        per retained file. ``labels`` holds int64 indices from
        ``EMOTION_TO_INDEX``.
    """
    data, labels, speakers = [], [], []

    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order, and everything derived from it, reproducible across runs.
    for file in sorted(os.listdir(AUDIO_FOLDER)):
        # Case-insensitive so ".WAV" files are not silently ignored.
        if not file.lower().endswith('.wav'):
            continue

        speaker, emotion = parse_filename(file)
        if not emotion:
            continue

        # Pass the configured segment length explicitly so the SEGMENT_LENGTH
        # setting actually controls segmentation.
        segments = extract_log_mel_spectrogram(
            os.path.join(AUDIO_FOLDER, file), segment_length=SEGMENT_LENGTH
        )
        if not segments:
            continue

        features = extract_features_from_segments(segments)
        pooled_features = dtpm_pooling(features)

        data.append(pooled_features)
        labels.append(EMOTION_TO_INDEX[emotion])  # Convert emotion to index
        speakers.append(speaker)

    return np.array(data), np.array(labels, dtype=np.int64), np.array(speakers)


In [50]:
def loso_cross_validation(features, labels, speakers):
    """Run leave-one-speaker-out cross-validation with a linear SVM.

    For each distinct speaker, a fresh ``SVC(kernel='linear')`` is fitted on
    all other speakers' samples and evaluated on the held-out speaker. The
    per-speaker accuracy is printed, followed by a classification report over
    the pooled predictions of all folds.

    Args:
        features: 2-D array, one row per sample.
        labels: 1-D integer array of class indices (positions in
            ``EMOTION_MAP``).
        speakers: 1-D array of speaker ids aligned with ``features``.
    """
    speakers = np.asarray(speakers)
    class_names = list(EMOTION_MAP.values())
    class_indices = list(range(len(class_names)))
    all_predictions, all_true_labels = [], []

    # Every id comes from np.unique(speakers), so each fold has at least one
    # test sample; no empty-fold check is needed.
    for test_speaker in np.unique(speakers):
        train_indices = np.where(speakers != test_speaker)[0]
        test_indices = np.where(speakers == test_speaker)[0]

        X_train, y_train = features[train_indices], labels[train_indices]
        X_test, y_test = features[test_indices], labels[test_indices]

        clf = SVC(kernel='linear')
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)

        all_predictions.extend(predictions)
        all_true_labels.extend(y_test)

        print(f'LOSO - Leaving out Speaker {test_speaker}: Accuracy = {np.mean(predictions == y_test):.2f}')

    if not all_true_labels:
        print("No samples available for LOSO cross-validation.")
        return

    print("\nFinal Classification Report (LOSO Cross-Validation):")
    # Passing `labels` pins the report to all classes. With `target_names`
    # alone, scikit-learn raises when a class never appears in the pooled
    # labels or predictions. zero_division=0 matches the BiLSTM reports.
    print(classification_report(
        all_true_labels,
        all_predictions,
        labels=class_indices,
        target_names=class_names,
        zero_division=0,
    ))


In [51]:
def main():
    """Build the dataset, then run the SVM leave-one-speaker-out evaluation."""
    print("Preparing dataset...")
    dataset = prepare_dataset()  # (features, labels, speakers)

    print("Running LOSO Cross-Validation...")
    loso_cross_validation(*dataset)


if __name__ == "__main__":
    main()


Preparing dataset...
Running LOSO Cross-Validation...
LOSO - Leaving out Speaker 03: Accuracy = 0.44
LOSO - Leaving out Speaker 08: Accuracy = 0.69
LOSO - Leaving out Speaker 09: Accuracy = 0.41
LOSO - Leaving out Speaker 10: Accuracy = 0.57
LOSO - Leaving out Speaker 11: Accuracy = 0.45
LOSO - Leaving out Speaker 12: Accuracy = 0.69
LOSO - Leaving out Speaker 13: Accuracy = 0.33
LOSO - Leaving out Speaker 14: Accuracy = 0.60
LOSO - Leaving out Speaker 15: Accuracy = 0.67
LOSO - Leaving out Speaker 16: Accuracy = 0.50

Final Classification Report (LOSO Cross-Validation):
              precision    recall  f1-score   support

       anger       0.49      0.60      0.54        35
     boredom       0.49      0.55      0.51        33
     anxiety       0.00      0.00      0.00         9
   happiness       0.21      0.15      0.18        20
     sadness       0.80      0.83      0.81        47
     disgust       0.47      0.36      0.41        25
     neutral       0.50      0.56      0.53

In [52]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [53]:
# Define BiLSTM Model
class BiLSTM(nn.Module):
    """Bidirectional LSTM encoder followed by a linear classification head.

    Args:
        input_dim: Size of each time step's feature vector.
        hidden_dim: Hidden size of each LSTM direction.
        num_layers: Number of stacked LSTM layers.
        output_dim: Number of output classes (logits).
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # BiLSTM outputs hidden_dim * 2

    def forward(self, x):
        """Return class logits for a batch shaped (batch, seq_len, input_dim).

        The sequence summary is the final hidden state of both directions of
        the last layer. Reading ``lstm_out[:, -1, :]`` instead would give the
        backward direction's state after only one step, because it starts at
        the end of the sequence. For ``seq_len == 1`` both forms are identical.
        """
        _, (h_n, _) = self.lstm(x)
        # h_n is ordered (layer, direction): the last two entries are the top
        # layer's forward and backward final states.
        summary = torch.cat((h_n[-2], h_n[-1]), dim=-1)
        return self.fc(summary)

In [54]:
def prepare_dataloader(features, labels, batch_size=16):
    """Wrap feature/label arrays in a shuffling ``DataLoader``.

    Args:
        features: Array-like of per-sample feature vectors.
        labels: Array-like of integer class indices.
        batch_size: Number of samples per batch.

    Returns:
        ``DataLoader`` over a ``TensorDataset`` of float32 inputs and int64
        targets, with ``shuffle=True``.
    """
    # unsqueeze(1) inserts a length-1 sequence axis: (batch, feat) -> (batch, 1, feat)
    inputs = torch.tensor(features, dtype=torch.float32).unsqueeze(1)
    targets = torch.tensor(labels, dtype=torch.long)
    return DataLoader(
        TensorDataset(inputs, targets),
        batch_size=batch_size,
        shuffle=True,
    )


In [55]:
# Training Function
def train_bilstm(model, dataloader, criterion, optimizer, num_epochs=10):
    """Train ``model`` in place and print the mean batch loss after each epoch.

    Args:
        model: Module called on each input batch.
        dataloader: Iterable of ``(inputs, targets)`` batches that supports ``len``.
        criterion: Loss callable taking ``(outputs, targets)``.
        optimizer: Optimizer bound to the model's parameters.
        num_epochs: Number of passes over ``dataloader``.
    """
    model.train()
    for epoch_number in range(1, num_epochs + 1):
        running_loss = 0
        for batch_inputs, batch_targets in dataloader:
            optimizer.zero_grad()
            batch_loss = criterion(model(batch_inputs), batch_targets)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()
        print(f"Epoch {epoch_number}, Loss: {running_loss/len(dataloader):.4f}")

In [56]:
# Main Workflow
# Rebuild the dataset at module level (main() keeps its copy local) and wrap
# ALL samples in a single training loader. No samples are held out here.
features, labels, speakers = prepare_dataset()
dataloader = prepare_dataloader(features, labels)

# Model hyperparameters.
input_dim = features.shape[1]  # length of one pooled feature vector
hidden_dim = 128
num_layers = 2
output_dim = len(EMOTION_MAP)  # one logit per emotion class

bilstm_model = BiLSTM(input_dim, hidden_dim, num_layers, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(bilstm_model.parameters(), lr=0.001)

# Trains for train_bilstm's default number of epochs.
# NOTE(review): no random seed is set, so the printed losses will differ between runs.
train_bilstm(bilstm_model, dataloader, criterion, optimizer)

Epoch 1, Loss: 1.9061
Epoch 2, Loss: 1.8096
Epoch 3, Loss: 1.6988
Epoch 4, Loss: 1.6321
Epoch 5, Loss: 1.5526
Epoch 6, Loss: 1.5104
Epoch 7, Loss: 1.4957
Epoch 8, Loss: 1.4746
Epoch 9, Loss: 1.4479
Epoch 10, Loss: 1.4886


In [57]:
def evaluate_bilstm(model, features, labels):
    """Evaluate ``model`` on the given samples and print accuracy plus a report.

    Args:
        model: Trained module taking inputs shaped (batch, 1, feature_dim).
        features: 2-D array-like, one feature vector per sample.
        labels: Integer class indices (positions in ``EMOTION_MAP``).

    Returns:
        Tuple ``(acc, report)``: the accuracy and the text classification report.
    """
    class_names = list(EMOTION_MAP.values())

    model.eval()
    with torch.no_grad():
        # Ensure features are shaped correctly (batch_size, seq_len=1, feature_dim)
        inputs = torch.tensor(features, dtype=torch.float32).unsqueeze(1)
        targets = torch.tensor(labels, dtype=torch.long)

        outputs = model(inputs)
        predictions = torch.argmax(outputs, dim=1).numpy()
        true_labels = targets.numpy()

    acc = accuracy_score(true_labels, predictions)
    # Pass `labels` explicitly: with `target_names` alone, scikit-learn raises
    # when a class is absent from both the true labels and the predictions.
    report = classification_report(
        true_labels,
        predictions,
        labels=list(range(len(class_names))),
        target_names=class_names,
        zero_division=0,
    )
    print(f"Accuracy: {acc:.4f}")
    print("Classification Report:")
    print(report)
    return acc, report


In [39]:
def loso_bilstm(features, labels, speakers):
    """Run leave-one-speaker-out cross-validation with a freshly trained BiLSTM.

    For each distinct speaker a new ``BiLSTM`` is trained on all other
    speakers' samples and evaluated on the held-out speaker. The per-speaker
    accuracy is printed, followed by a classification report over the pooled
    predictions of all folds.

    Args:
        features: 2-D array, one feature vector per sample.
        labels: 1-D integer array of class indices (positions in
            ``EMOTION_MAP``).
        speakers: 1-D array of speaker ids aligned with ``features``.
    """
    speakers = np.asarray(speakers)
    class_names = list(EMOTION_MAP.values())
    all_predictions, all_true_labels = [], []

    # Every id comes from np.unique(speakers), so each fold has at least one
    # test sample; no empty-fold check is needed.
    for test_speaker in np.unique(speakers):
        train_indices = np.where(speakers != test_speaker)[0]
        test_indices = np.where(speakers == test_speaker)[0]

        X_train, y_train = features[train_indices], labels[train_indices]
        X_test, y_test = features[test_indices], labels[test_indices]

        train_loader = prepare_dataloader(X_train, y_train)

        bilstm_model = BiLSTM(X_train.shape[1], 128, 2, len(EMOTION_MAP))
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(bilstm_model.parameters(), lr=0.001)

        train_bilstm(bilstm_model, train_loader, criterion, optimizer)

        # train_bilstm leaves the model in training mode. Switch to eval and
        # disable autograd for inference, as evaluate_bilstm does.
        bilstm_model.eval()
        with torch.no_grad():
            # Add the length-1 sequence axis the model expects.
            X_test_tensor = torch.tensor(X_test, dtype=torch.float32).unsqueeze(1)
            predictions = torch.argmax(bilstm_model(X_test_tensor), dim=1).numpy()

        all_predictions.extend(predictions)
        all_true_labels.extend(y_test)

        acc = accuracy_score(y_test, predictions)
        print(f'LOSO - Leaving out Speaker {test_speaker}: Accuracy = {acc:.2f}')

    if not all_true_labels:
        print("No samples available for LOSO cross-validation.")
        return

    print("\nFinal Classification Report (LOSO Cross-Validation):")
    # Pass `labels` explicitly: with `target_names` alone, scikit-learn raises
    # when a class never appears in the pooled labels or predictions.
    print(classification_report(
        all_true_labels,
        all_predictions,
        labels=list(range(len(class_names))),
        target_names=class_names,
        zero_division=0,
    ))


In [40]:
# Score the module-level `bilstm_model` on the module-level `features`/`labels`.
# NOTE(review): if this is the model fitted on these same samples in the main
# workflow cell, the number is training-set accuracy, not a held-out estimate — confirm.
evaluate_bilstm(bilstm_model, features, labels)
# Speaker-independent estimate: a fresh BiLSTM is trained for every held-out speaker.
loso_bilstm(features, labels, speakers)

Accuracy: 0.5027
Classification Report:
              precision    recall  f1-score   support

       anger       0.60      0.60      0.60        35
     boredom       0.35      0.82      0.49        33
     anxiety       0.00      0.00      0.00         9
   happiness       0.00      0.00      0.00        20
     sadness       0.63      0.94      0.75        47
     disgust       0.50      0.04      0.07        25
     neutral       0.00      0.00      0.00        16

    accuracy                           0.50       185
   macro avg       0.30      0.34      0.27       185
weighted avg       0.40      0.50      0.40       185

Epoch 1, Loss: 1.9078
Epoch 2, Loss: 1.7767
Epoch 3, Loss: 1.6978
Epoch 4, Loss: 1.5856
Epoch 5, Loss: 1.5027
Epoch 6, Loss: 1.5442
Epoch 7, Loss: 1.4963
Epoch 8, Loss: 1.5267
Epoch 9, Loss: 1.4876
Epoch 10, Loss: 1.3915
LOSO - Leaving out Speaker 03: Accuracy = 0.38
Epoch 1, Loss: 1.9088
Epoch 2, Loss: 1.8241
Epoch 3, Loss: 1.7827
Epoch 4, Loss: 1.7229
Epoch 5