## Install Required Packages

In [None]:
!pip install librosa torchaudio openai-whisper transformers \
    matplotlib seaborn scikit-learn accelerate
!apt-get install ffmpeg

Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cudn

## Setup Project Structure

In [None]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive; the project lives under MyDrive.
drive.mount('/content/drive')

# Create project folder
project_dir = "/content/drive/MyDrive/multimodal_emotion_recognition"
# os.makedirs instead of `!mkdir -p`: a failure raises here rather than being
# printed and ignored, and the paths are never re-parsed by a shell.
for subdir in ("data", "spectrograms", "transcripts", "mfccs"):
    os.makedirs(os.path.join(project_dir, subdir), exist_ok=True)
data_dir = os.path.join(project_dir, "data")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Unzip and Check Extracted Files

In [None]:
!unzip -q "{data_dir}/archive.zip" -d "{data_dir}"

In [None]:
import glob

# Collect every .wav below data_dir (at any depth), then order the paths
audio_files = glob.glob(os.path.join(data_dir, '**', '*.wav'), recursive=True)
audio_files.sort()

# Quick sanity check: total count and the first few paths
print(f"Found {len(audio_files)} WAV files.")
print(audio_files[:5])  # preview a few

Found 2880 WAV files.
['/content/drive/MyDrive/multimodal_emotion_recognition/data/Actor_01/03-01-01-01-01-01-01.wav', '/content/drive/MyDrive/multimodal_emotion_recognition/data/Actor_01/03-01-01-01-01-02-01.wav', '/content/drive/MyDrive/multimodal_emotion_recognition/data/Actor_01/03-01-01-01-02-01-01.wav', '/content/drive/MyDrive/multimodal_emotion_recognition/data/Actor_01/03-01-01-01-02-02-01.wav', '/content/drive/MyDrive/multimodal_emotion_recognition/data/Actor_01/03-01-02-01-01-01-01.wav']


## Generate and Save Spectrograms for CNN Input

In [None]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import glob

# Project locations: raw audio is read from data_dir, images go to spec_dir
project_dir = "/content/drive/MyDrive/multimodal_emotion_recognition"
data_dir, spec_dir = (
    os.path.join(project_dir, name) for name in ("data", "spectrograms")
)
os.makedirs(spec_dir, exist_ok=True)

# Every .wav below data_dir (at any depth), in sorted order
audio_files = glob.glob(os.path.join(data_dir, '**', '*.wav'), recursive=True)
audio_files.sort()

# Emotion label map: two-digit code (third dash-separated field of a file
# name) -> emotion name
emotion_map = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}

def parse_emotion(filename):
    """Return the emotion name encoded in a dash-separated file name.

    The third dash-separated field of ``filename`` is looked up in
    ``emotion_map``.

    Args:
        filename: File name such as ``"03-01-05-01-01-01-01.wav"``.

    Returns:
        The mapped emotion name, or ``"unknown"`` when the name has fewer
        than three fields or the code is not in ``emotion_map``.
    """
    parts = filename.split("-")
    if len(parts) < 3:
        # No third field to look up: fall back to the same "unknown" label
        # used for unrecognised codes instead of raising IndexError.
        return "unknown"
    return emotion_map.get(parts[2], "unknown")

# Generate and save spectrograms
# The output name depends only on the input's basename, so inputs that share
# a basename (e.g. the same clip found under two folders) target the same
# PNG. Each target is rendered once per run; the first occurrence wins
# (previously a later one silently overwrote it).
# NOTE(review): assumes same-named files are copies of the same clip — confirm.
written = set()
for file in tqdm(audio_files, desc="Generating spectrograms"):
    try:
        # Resolve the output path first so repeats are skipped before the
        # audio is decoded.
        emotion = parse_emotion(os.path.basename(file))
        base = os.path.splitext(os.path.basename(file))[0]
        out_path = os.path.join(spec_dir, f"{emotion}_{base}.png")
        if out_path in written:
            continue

        # Mel spectrogram in dB relative to the clip's own maximum
        y, sr = librosa.load(file, sr=22050)
        spec = librosa.feature.melspectrogram(y=y, sr=sr)
        spec_db = librosa.power_to_db(spec, ref=np.max)

        # Save as image (no axes, no padding)
        fig = plt.figure(figsize=(3, 3))
        try:
            librosa.display.specshow(spec_db, sr=sr, cmap='magma')
            plt.axis('off')
            plt.tight_layout(pad=0)
            plt.savefig(out_path, bbox_inches='tight', pad_inches=0)
        finally:
            # Close even when plotting/saving fails, otherwise failed
            # iterations leave open figures accumulating in memory.
            plt.close(fig)
        written.add(out_path)
    except Exception as e:
        # Best effort: report the file and carry on with the rest.
        print(f"⚠️ Error processing {file}: {e}")


Generating spectrograms: 100%|██████████| 2880/2880 [07:15<00:00,  6.61it/s]


## Generate and Save Transcripts for the RNN

In [None]:
!pip install git+https://github.com/openai/whisper.git
!sudo apt update && sudo apt install ffmpeg

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-kemy_58g
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-kemy_58g
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper==20240930)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-

In [None]:
import whisper
# Load the "base" Whisper checkpoint once; the transcription loop reuses it.
model = whisper.load_model(name="base")

In [None]:
# Two-digit emotion code -> label name. Codes run "01".."08" in the order
# the names are listed here.
emotion_map = {
    f"{code:02d}": name
    for code, name in enumerate(
        (
            "neutral",
            "calm",
            "happy",
            "sad",
            "angry",
            "fearful",
            "disgust",
            "surprised",
        ),
        start=1,
    )
}

In [None]:
from google.colab import drive
import os
import json

# Folder that is walked for .wav files by the transcription loop
audio_folder = "/content/drive/MyDrive/multimodal_emotion_recognition/data"
# Mount Drive so that folder is reachable
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Transcribe every .wav under audio_folder and pair each transcript with the
# emotion label encoded in the third dash-separated field of its file name.
output = []
seen_names = set()

for root, dirs, files in os.walk(audio_folder):
    dirs.sort()  # in-place sort makes os.walk visit sub-folders in a stable order
    for fname in sorted(files):
        if not fname.endswith(".wav"):
            continue
        if fname in seen_names:
            # Records are keyed by file name only, so a same-named file from
            # another folder would just add an indistinguishable duplicate.
            # NOTE(review): assumes same-named files are copies of the same
            # clip — confirm.
            continue

        # Resolve the label before the expensive transcription so that one
        # unexpected file name cannot abort the run and discard every
        # transcript collected so far.
        parts = fname.split("-")
        label = emotion_map.get(parts[2]) if len(parts) > 2 else None
        if label is None:
            print(f"Skipping {fname}: no known emotion code in file name")
            continue

        seen_names.add(fname)
        path = os.path.join(root, fname)
        result = model.transcribe(path)

        output.append({
            "filename": fname,
            "transcript": result["text"],
            "label": label
        })

# Save transcripts as JSON
transcripts_path = "/content/drive/MyDrive/multimodal_emotion_recognition/transcripts/transcripts.json"
# Make sure the target folder exists even if the setup cell was not run.
os.makedirs(os.path.dirname(transcripts_path), exist_ok=True)
with open(transcripts_path, "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
