In [1]:
from google.colab import drive
import os
import json

# Mount Google Drive into the Colab filesystem at /content/drive so that the
# project files stored under MyDrive can be reached with ordinary paths.
drive.mount('/content/drive')
# Project data folder on the mounted Drive. It is not referenced again in the
# cells visible here.
# NOTE(review): the name suggests it holds the raw audio files — confirm.
audio_folder = "/content/drive/MyDrive/multimodal_emotion_recognition/data"

Mounted at /content/drive


In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
import os
import glob
from PIL import Image
import numpy as np
from torchvision import transforms

In [40]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultimodalEmotionCNN(nn.Module):
    """Two-branch CNN that fuses an MFCC map with a spectrogram image.

    Each branch applies two ``3x3 conv -> ReLU -> 2x2 max-pool`` stages and a
    dropout layer. The flattened outputs of both branches are concatenated and
    passed through two linear layers that produce one logit per class.

    The size of the first linear layer is derived from ``mfcc_shape`` and
    ``spec_shape`` at construction time, so inputs given to :meth:`forward`
    must have exactly those per-sample shapes.

    Args:
        num_classes: Number of output logits. Defaults to 8 emotion classes.
        mfcc_shape: ``(channels, height, width)`` of a single MFCC input.
            Defaults to ``(1, 40, 200)``.
        spec_shape: ``(channels, height, width)`` of a single spectrogram
            input. Defaults to ``(3, 128, 128)``.
        mfcc_dropout: Dropout probability applied to the MFCC branch output.
            Defaults to 0.0 (dropout disabled).
        spec_dropout: Dropout probability applied to the spectrogram branch
            output. Defaults to 0.3.
    """

    def __init__(self, num_classes=8, mfcc_shape=(1, 40, 200),
                 spec_shape=(3, 128, 128), mfcc_dropout=0.0, spec_dropout=0.3):
        super().__init__()

        # Layers are created in the same order and under the same attribute
        # names as before, so existing state_dicts and seeded initialisation
        # remain compatible.

        # MFCC branch
        self.mfcc_conv1 = nn.Conv2d(mfcc_shape[0], 16, kernel_size=3, padding=1)
        self.mfcc_conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.mfcc_pool = nn.MaxPool2d(2, 2)
        self.mfcc_dropout = nn.Dropout(mfcc_dropout)

        # Spectrogram branch
        self.spec_conv1 = nn.Conv2d(spec_shape[0], 16, kernel_size=3, padding=1)
        self.spec_conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.spec_pool = nn.MaxPool2d(2, 2)
        self.spec_dropout = nn.Dropout(spec_dropout)

        # Dummy forward pass through the conv stacks to measure how many
        # features each branch produces once flattened. Dropout is deliberately
        # not part of these helpers so this pass never touches the RNG.
        with torch.no_grad():
            dummy_mfcc = torch.zeros(1, *mfcc_shape)
            self.mfcc_flatten_dim = self._mfcc_features(dummy_mfcc).flatten(1).shape[1]

            dummy_spec = torch.zeros(1, *spec_shape)
            self.spec_flatten_dim = self._spec_features(dummy_spec).flatten(1).shape[1]

        # Fully connected layers after concatenation
        self.fc1 = nn.Linear(self.mfcc_flatten_dim + self.spec_flatten_dim, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def _mfcc_features(self, x):
        """Run the two conv/ReLU/pool stages of the MFCC branch (no dropout)."""
        x = self.mfcc_pool(F.relu(self.mfcc_conv1(x)))
        return self.mfcc_pool(F.relu(self.mfcc_conv2(x)))

    def _spec_features(self, x):
        """Run the two conv/ReLU/pool stages of the spectrogram branch (no dropout)."""
        x = self.spec_pool(F.relu(self.spec_conv1(x)))
        return self.spec_pool(F.relu(self.spec_conv2(x)))

    def forward(self, mfcc, spec):
        """Compute class logits for a batch of paired inputs.

        Args:
            mfcc: Tensor of shape ``(batch, *mfcc_shape)``.
            spec: Tensor of shape ``(batch, *spec_shape)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` holding unnormalised
            logits (no softmax is applied).
        """
        # MFCC branch
        x_m = self.mfcc_dropout(self._mfcc_features(mfcc)).flatten(1)

        # Spectrogram branch
        x_s = self.spec_dropout(self._spec_features(spec)).flatten(1)

        # Concatenate both feature vectors and classify
        x = torch.cat((x_m, x_s), dim=1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)


In [41]:
# Emotion name -> integer class index. The position of each name in the tuple
# below is its class index (neutral=0 ... surprised=7).
emotion_map = {
    name: index
    for index, name in enumerate((
        "neutral", "calm", "happy", "sad",
        "angry", "fearful", "disgust", "surprised",
    ))
}

class MultimodalEmotionDataset(Dataset):
    """Dataset of paired MFCC arrays and spectrogram images.

    Every ``*.npy`` file found in ``mfcc_dir`` is one sample. Its base name
    (file name without extension) is reused to locate ``<base>.png`` inside
    ``spec_dir``, and the text before the first underscore of the base name is
    looked up in the module-level ``emotion_map`` to obtain the class index.

    Args:
        mfcc_dir: Directory containing the ``.npy`` MFCC files.
        spec_dir: Directory containing the matching ``.png`` spectrograms.
        transform: Optional callable applied to the RGB spectrogram image.
            When omitted, the PIL image is returned unchanged.
    """

    def __init__(self, mfcc_dir, spec_dir, transform=None):
        self.mfcc_dir = mfcc_dir
        self.spec_dir = spec_dir
        # os.listdir gives no ordering guarantee; sorting keeps the
        # index -> file mapping stable between runs.
        self.files = sorted(name for name in os.listdir(mfcc_dir) if name.endswith('.npy'))
        self.transform = transform

    def __len__(self):
        """Return the number of MFCC files discovered at construction time."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load the sample at position ``idx``.

        Returns:
            A ``(mfcc, spec_img, label)`` tuple: ``mfcc`` is a float32 tensor
            with a leading channel dimension added, ``spec_img`` is the
            (optionally transformed) RGB spectrogram and ``label`` is the
            integer class index taken from ``emotion_map``.

        Raises:
            KeyError: If the file-name prefix is not a key of ``emotion_map``.
        """
        mfcc_file = self.files[idx]
        base_name = os.path.splitext(mfcc_file)[0]
        label_str = base_name.split('_')[0]
        try:
            label = emotion_map[label_str]
        except KeyError:
            # Same exception type as a plain lookup, but name the offending
            # file so a bad sample can be found without a debugger.
            raise KeyError(
                f"Unrecognised emotion prefix {label_str!r} in file {mfcc_file!r}; "
                f"expected one of {sorted(emotion_map)}"
            ) from None

        # Load MFCC
        # NOTE(review): batching with the default collate assumes every MFCC
        # array has the same shape — confirm against the feature extraction.
        mfcc = np.load(os.path.join(self.mfcc_dir, mfcc_file))
        mfcc = torch.tensor(mfcc, dtype=torch.float32).unsqueeze(0)  # Add channel dim

        # Load Spectrogram image. convert() returns an independent image, so
        # the underlying file can be closed as soon as the block exits.
        spec_path = os.path.join(self.spec_dir, f"{base_name}.png")
        with Image.open(spec_path) as img:
            spec_img = img.convert('RGB')
        if self.transform:
            spec_img = self.transform(spec_img)

        return mfcc, spec_img, label

In [42]:
# Paths
mfcc_dir = "/content/drive/MyDrive/multimodal_emotion_recognition/mfccs"
spec_dir = "/content/drive/MyDrive/multimodal_emotion_recognition/spectrograms"

# Spectrogram preprocessing: resize to 128x128, convert to a float tensor and
# normalise each RGB channel.
# NOTE(review): these mean/std values are the usual ImageNet statistics;
# spectrogram images probably have different statistics — confirm this is intended.
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Dataset and split
full_dataset = MultimodalEmotionDataset(mfcc_dir, spec_dir, transform=transform)
if len(full_dataset) < 2:
    # With fewer than two samples one of the splits is empty, which later
    # surfaces as an obscure sampler error or a division by zero.
    raise RuntimeError(
        f"Found {len(full_dataset)} MFCC file(s) in {mfcc_dir!r}; "
        "at least 2 are needed for an 80/20 train/validation split"
    )
train_len = int(0.8 * len(full_dataset))
val_len = len(full_dataset) - train_len
train_dataset, val_dataset = random_split(
    full_dataset,
    [train_len, val_len],
    generator=torch.Generator().manual_seed(42),
)

# DataLoaders
batch_size = 32
# A dedicated seeded generator makes the shuffling order reproducible when the
# notebook is re-run from a fresh kernel (the split above was already seeded).
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [53]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build a freshly initialised network and place its parameters on that device.
model = MultimodalEmotionCNN()
model = model.to(device)

In [54]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one full pass over ``loader`` and return its metrics.

    When ``optimizer`` is given the model is put in training mode and its
    weights are updated after every batch. Otherwise the model is put in
    evaluation mode and gradients are disabled.

    Args:
        model: Network called as ``model(mfccs, specs)``.
        loader: Iterable yielding ``(mfccs, specs, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the batch tensors are moved to.
        optimizer: Optimizer to step, or ``None`` for an evaluation pass.

    Returns:
        ``(mean_loss, accuracy_percent)``, both averaged over every sample seen.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    training = optimizer is not None
    model.train(training)
    loss_sum, correct, seen = 0.0, 0, 0

    with torch.set_grad_enabled(training):
        for mfccs, specs, labels in loader:
            mfccs, specs, labels = mfccs.to(device), specs.to(device), labels.to(device)

            if training:
                optimizer.zero_grad()
            outputs = model(mfccs, specs)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()

            # Weight the batch loss by batch size so a smaller final batch
            # does not skew the epoch average.
            loss_sum += loss.item() * labels.size(0)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            seen += labels.size(0)

    if seen == 0:
        raise ValueError("loader yielded no samples; cannot compute epoch metrics")
    return loss_sum / seen, 100 * correct / seen


# Move model to device before the optimizer is built over its parameters
model = model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.001)

# Training and validation
num_epochs = 35
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs} ----------------------------")
    train_loss, train_acc = _run_epoch(model, train_loader, criterion, device, optimizer)

    # Validation
    val_loss, val_acc = _run_epoch(model, val_loader, criterion, device)

    print(f"Epoch {epoch+1} Summary -> "
          f"Train Loss: {train_loss:.4f}, Acc: {train_acc:.2f}% | "
          f"Val Loss: {val_loss:.4f}, Acc: {val_acc:.2f}%")



Epoch 1/35 ----------------------------
  Training batch 1/36
  Training batch 2/36
  Training batch 3/36
  Training batch 4/36
  Training batch 5/36
  Training batch 6/36
  Training batch 7/36
  Training batch 8/36
  Training batch 9/36
  Training batch 10/36
  Training batch 11/36
  Training batch 12/36
  Training batch 13/36
  Training batch 14/36
  Training batch 15/36
  Training batch 16/36
  Training batch 17/36
  Training batch 18/36
  Training batch 19/36
  Training batch 20/36
  Training batch 21/36
  Training batch 22/36
  Training batch 23/36
  Training batch 24/36
  Training batch 25/36
  Training batch 26/36
  Training batch 27/36
  Training batch 28/36
  Training batch 29/36
  Training batch 30/36
  Training batch 31/36
  Training batch 32/36
  Training batch 33/36
  Training batch 34/36
  Training batch 35/36
  Training batch 36/36
  Validating batch 1/9
  Validating batch 2/9
  Validating batch 3/9
  Validating batch 4/9
  Validating batch 5/9
  Validating batch 6/9
  