In [1]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [2]:
import kagglehub
path = kagglehub.dataset_download("adrivg/ravdess-emotional-speech-video")
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/adrivg/ravdess-emotional-speech-video?dataset_version_number=1...


100%|██████████| 12.4G/12.4G [09:32<00:00, 23.3MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/adrivg/ravdess-emotional-speech-video/versions/1


In [3]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import cv2
import numpy as np
from moviepy.editor import VideoFileClip
from scipy.io import wavfile
from scipy.signal import spectrogram
import matplotlib.pyplot as plt
import moviepy.editor as mp
from torchsummary import summary
from torch.nn.functional import one_hot
from torchvision.models import vgg16
from torchvision.models.video import r3d_18

  if event.key is 'enter':



In [4]:
if torch.cuda.is_available():
    print("GPU is available")
    print("Device:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

GPU is available
Device: Tesla T4


In [5]:
emotion_map = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}
severity_map = {
    1: "Low Intensity",
    2: "High Intensity"
}

In [49]:
emotion_lst = [(int(x)-1) for x in emotion_map.keys()]
severit_lst = [(int(x)-1) for x in severity_map.keys()]
emotion_map_oh = one_hot(torch.tensor(emotion_lst), num_classes=8)
severity_map_oh = one_hot(torch.tensor(severit_lst), num_classes=2)

In [55]:
class Mydata(Dataset):
  def __init__(self, rootdir, num_frame = 16, size = (112, 112), sr = 22050, transform = None):
    super().__init__()
    self.rootdir = rootdir
    self.num_frame = num_frame
    self.size = size
    self.sr = sr
    self.transform = transform
    self.filepaths = []

    for (root,dirs,files) in os.walk(self.rootdir):
      for file_name in files:
        if file_name.endswith(".mp4"):
          splits = file_name.split("-")
          if int(splits[0]) == 1: # adding that have both audio and video
            self.filepaths.append(os.path.join(root, file_name))

    print(self.filepaths)

  def __len__(self):
    return len(self.filepaths)

  def extract_frames(self, video_path):

    cap = cv2.VideoCapture(video_path)
    length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - 1
    arr = np.linspace(0, length, self.num_frame)
    frames = []

    for i in arr:
      cap.set(cv2.CAP_PROP_POS_FRAMES, i)
      ret, frame = cap.read()

      if ret:
        frame = cv2.resize(frame, (112, 112))
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frames.append(frame)
      else: # if frame is not read successfully, append a blank frame
        frames.append(np.zeros((112, 112, 3), dtype=np.uint8)) # Append blank frame


    cap.release()
    frames = np.array(frames)
    frames = frames.transpose(3, 0, 1, 2)
    # To (num_Frame, C, H, W)

    return torch.tensor(frames, dtype=torch.float32) / 255.

  def generate_spectrogram(self, video_path):
    temp_audio_path = "temp_audio.wav"

    video = VideoFileClip(video_path)
    video.audio.write_audiofile(temp_audio_path)
    sample_rate, samples = wavfile.read(temp_audio_path)

    # Convert stereo to mono
    if samples.ndim == 2:
        samples = samples.mean(axis=1)

    max_val = np.max(np.abs(samples))

    # Normalize and amplify
    if max_val == 0:
        print("Audio appears to be completely silent.")
        normalized = samples
    else:
        normalized = samples / max_val
    amplified = normalized * 50
    amplified = np.clip(amplified, -1.0, 1.0)

    frequencies, times, Sxx = spectrogram(amplified, self.sr)

    spectrogram_array = 10 * np.log10(Sxx + 1e-10)

    if spectrogram_array.ndim > 2:
        spectrogram_array = np.mean(spectrogram_array, axis=0)

    # Pad or trim to target length
    current_length = spectrogram_array.shape[1]
    target_length = 650
    if current_length < target_length:
        # Pad with zeros
        pad_width = target_length - current_length
        spectrogram_array = np.pad(spectrogram_array, pad_width=((0, 0), (0, pad_width)), mode='constant')
    elif current_length > target_length:
        # Trim to target length
        spectrogram_array = spectrogram_array[:, :target_length]

    return torch.tensor(spectrogram_array, dtype=torch.float32).unsqueeze(0)

  def get_lebel(self, video_path):
    video_path = video_path.split("/")[-1]
    emotion = video_path.split("-")[2]
    severity = video_path.split("-")[3]
    emotion_oh = one_hot(torch.tensor([int(emotion)-1]), num_classes=8).float()
    severity_oh = one_hot(torch.tensor([int(severity)-1]), num_classes=2).float()
    return emotion_oh, severity_oh

  def __getitem__(self, idx):
    video_path = self.filepaths[idx]
    frames = self.extract_frames(video_path)
    spectrogram = self.generate_spectrogram(video_path)
    emotion, severity = self.get_lebel(video_path)

    if self.transform:
      frames = self.transform(frames)
      spectrogram = self.transform(spectrogram)

    return frames, spectrogram, emotion, severity

In [56]:
class MyModel(nn.Module):
  def __init__(self, num_classes_emotion, num_classes_sev):
    super().__init__()
    self.num_classes = num_classes_emotion
    self.num_classes_sev = num_classes_sev

### Fine tuning r3d

    # Tranning only last 2 conv layer of r3d
    vid_c = r3d_18(pretrained=True)
    for name, para in vid_c.named_parameters():
      if "layer4.0" in name:
        para.requires_grad = True
      else:
        para.requires_grad = False
    # Removing Last and only FC layer in r3d
    vid_c.fc = nn.Identity()

    self.vid_c = vid_c
    # Passing Through one FC layer that will be train
    self.vid_d = nn.Sequential(
        nn.Flatten(),
        nn.Linear(512, 256),
        nn.ReLU(),
    )

###

### Fine tuning vgg16
    aud_c = vgg16(pretrained=True)
    aud_c.features[0] = nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1))
    for name, params in aud_c.named_parameters():
      if  "features.26" in name or "features.28" in name or "features.30" in name : # fine-tuning only last 3 conv layers in features
        params.requires_grad = True
      else:
        params.requires_grad = False
    aud_c.features[0].requires_grad = True
    aud_c.classifier = nn.Identity()

    self.aud_c = aud_c

    # Passing Through one FC layer that will be train
    self.aud_d = nn.Sequential(
        nn.Flatten(),
        nn.Linear(25088, 256),
        nn.ReLU(),
    )
###

    self.fc = nn.Sequential(
        nn.Linear(512, 256),
        nn.ReLU()
        )

    # OutPutting 2 output
    self.output_emo = nn.Linear(256, num_classes_emotion)
    self.output_sev = nn.Linear(256, num_classes_sev)

  def forward(self, x_vid, x_aud):
    #videos
    x_vid = self.vid_c(x_vid)
    x_vid = self.vid_d(x_vid)

    #audio
    x_aud = self.aud_c(x_aud)
    x_aud = self.aud_d(x_aud)

    #concat
    x = torch.cat([x_vid, x_aud], dim=1)

    # after
    x = self.fc(x)

    # output
    output_emo = self.output_emo(x)
    output_emo = nn.Softmax(dim=1)(output_emo)
    output_sev = self.output_sev(x)
    output_sev = nn.Softmax(dim=1)(output_sev)

    return output_emo, output_sev

In [57]:
def train(model, epochs, lr, dataset, loss_fn_emotion, loss_fn_serverity):
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
      for batch in dataset:
        frames, spectrogram, emotion, severity = batch
        frames = frames.to(device)
        spectrogram = spectrogram.to(device)
        emotion = emotion.to(device)
        severity = severity.to(device)

        optimizer.zero_grad()

        # Forward pass
        output_emo, output_sev = model(frames, spectrogram)

        # Calculate loss
        loss_emo = loss_fn_emotion(output_emo, emotion.argmax(dim=1))
        loss_sev = loss_fn_serverity(output_sev, severity.argmax(dim=1))

        loss = loss_emo + loss_sev
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch+1}/{epochs}, Loss: {loss.item()}") #Print loss for monitoring
        del frames, spectrogram, emotion, severity, output_emo, output_sev, loss_emo, loss_sev, loss
        torch.cuda.empty_cache()


In [53]:
dataset = Mydata(rootdir=path)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

['/root/.cache/kagglehub/datasets/adrivg/ravdess-emotional-speech-video/versions/1/RAVDESS dataset/Video_Speech_Actor_13/Actor_13/01-01-04-01-01-01-13.mp4', '/root/.cache/kagglehub/datasets/adrivg/ravdess-emotional-speech-video/versions/1/RAVDESS dataset/Video_Speech_Actor_13/Actor_13/01-01-02-02-01-02-13.mp4', '/root/.cache/kagglehub/datasets/adrivg/ravdess-emotional-speech-video/versions/1/RAVDESS dataset/Video_Speech_Actor_13/Actor_13/01-01-03-02-01-02-13.mp4', '/root/.cache/kagglehub/datasets/adrivg/ravdess-emotional-speech-video/versions/1/RAVDESS dataset/Video_Speech_Actor_13/Actor_13/01-01-05-02-02-01-13.mp4', '/root/.cache/kagglehub/datasets/adrivg/ravdess-emotional-speech-video/versions/1/RAVDESS dataset/Video_Speech_Actor_13/Actor_13/01-01-04-02-02-02-13.mp4', '/root/.cache/kagglehub/datasets/adrivg/ravdess-emotional-speech-video/versions/1/RAVDESS dataset/Video_Speech_Actor_13/Actor_13/01-01-05-02-01-01-13.mp4', '/root/.cache/kagglehub/datasets/adrivg/ravdess-emotional-speec

In [58]:
model = MyModel(num_classes_emotion=3, num_classes_sev=2)
model.to(device)
loss_fn_emotion = nn.CrossEntropyLoss()
loss_fn_serverity = nn.CrossEntropyLoss()
train(model, epochs=10, lr=0.001, dataset=dataloader, loss_fn_emotion=loss_fn_emotion, loss_fn_serverity=loss_fn_serverity)


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 2.12 MiB is free. Process 10614 has 14.74 GiB memory in use. Of the allocated memory 14.60 GiB is allocated by PyTorch, and 13.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)