In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O
import os  # to use operating system dependent functionality
import librosa  # to extract speech features
import wave  # read and write WAV files
import matplotlib.pyplot as plt  # generate visualizations

In [2]:
from google.colab import files

# Upload kaggle.json from the local machine. The returned dict maps each file
# name to its raw bytes, so it is bound to a name instead of being left as the
# cell's last expression: a bare `files.upload()` would echo the API key into
# the saved notebook output.
uploaded_files = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [3]:
import os
import zipfile

# Copy the uploaded kaggle.json into ~/.kaggle/ (presumably where the Kaggle
# CLI reads its credentials -- confirm against the CLI docs).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Make the token readable/writable by the owner only.
!chmod 600 ~/.kaggle/kaggle.json



# Download the dataset archive (audio-emotions.zip) with the Kaggle CLI.
!kaggle datasets download -d uldisvalainis/audio-emotions


Dataset URL: https://www.kaggle.com/datasets/uldisvalainis/audio-emotions
License(s): unknown
Downloading audio-emotions.zip to /content
100% 1.12G/1.12G [00:05<00:00, 230MB/s]
100% 1.12G/1.12G [00:05<00:00, 207MB/s]


In [4]:
!unzip audio-emotions.zip -d audio_emotions


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: audio_emotions/Emotions/Happy/OAF_keg_happy.wav  
  inflating: audio_emotions/Emotions/Happy/OAF_kick_happy.wav  
  inflating: audio_emotions/Emotions/Happy/OAF_kill_happy.wav  
  inflating: audio_emotions/Emotions/Happy/OAF_king_happy.wav  
  inflating: audio_emotions/Emotions/Happy/OAF_kite_happy.wav  
  inflating: audio_emotions/Emotions/Happy/OAF_knock_happy.wav  
  inflating: audio_emotions/Emotions/Happy/OAF_late_happy.wav  
  inflating: audio_emotions/Emotions/Happy/OAF_laud_happy.wav  
  inflating: audio_emotions/Emotions/Happy/OAF_lean_happy.wav  
  inflating: audio_emotions/Emotions/Happy/OAF_learn_happy.wav  
  inflating: audio_emotions/Emotions/Happy/OAF_lease_happy.wav  
  inflating: audio_emotions/Emotions/Happy/OAF_lid_happy.wav  
  inflating: audio_emotions/Emotions/Happy/OAF_life_happy.wav  
  inflating: audio_emotions/Emotions/Happy/OAF_limb_happy.wav  
  inflating: audio_emotions/Emotions/H

In [5]:
# Root folder of the extracted dataset; the cells below expect one sub-folder
# per emotion (Angry, Fearful, Happy, Neutral, Sad) directly inside it.
data_path = "audio_emotions/Emotions"

In [6]:
import glob
import os

# Walk the dataset root recursively and collect every .wav path, then report the total.
wav_files = [
    *glob.iglob(os.path.join(data_path, '**', '*.wav'), recursive=True)
]
print("Number of audio files in the dataset:", len(wav_files))

Number of audio files in the dataset: 12798


In [7]:
import os
import glob


# Emotion categories; each one is a sub-folder of the dataset root.
emotion_categories = ['Angry', 'Fearful', 'Happy', 'Neutral', 'Sad']

# Number of .wav files directly inside each emotion folder.
# Built with a comprehension so that no loop temporary overwrites the global
# `wav_files` list (the full-dataset listing) with a single folder's contents.
emotion_counts = {
    emotion: len(glob.glob(os.path.join(data_path, emotion, '*.wav')))
    for emotion in emotion_categories
}

# Report the per-class totals.
for emotion, count in emotion_counts.items():
    print(f"{emotion}: {count} files")


Angry: 2167 files
Fearful: 2047 files
Happy: 2167 files
Neutral: 1795 files
Sad: 2167 files


In [8]:
import os
import shutil
import random
import zipfile
# Base paths
original_dataset_path = data_path
output_base_path = '/content/output'  # where train/test folders will be created

# Train/test split ratio
split_ratio = 0.2  # 20% test

# Dedicated, seeded RNG so that re-running the cell reproduces the same split
# without touching the global `random` state.
split_seed = 42
split_rng = random.Random(split_seed)

# Emotions (folder names)
emotion_folders = ['Angry', 'Fearful', 'Happy', 'Neutral', 'Sad']

# Start from an empty output tree. Without this, re-running the cell copies a
# fresh shuffle on top of the previous one and the same clip can end up in
# both train/ and test/ (train/test leakage).
shutil.rmtree(output_base_path, ignore_errors=True)

# Split each emotion folder independently so every class keeps the same ratio.
for emotion in emotion_folders:
    emotion_path = os.path.join(original_dataset_path, emotion)

    # Sort before shuffling: os.listdir order is arbitrary, so the seed alone
    # would not pin down the split.
    wav_names = sorted(
        name for name in os.listdir(emotion_path) if name.endswith('.wav')
    )

    split_rng.shuffle(wav_names)
    split_index = int(len(wav_names) * split_ratio)
    test_files = wav_names[:split_index]
    train_files = wav_names[split_index:]

    # Copy (not move) the clips into <output>/<split>/<emotion>/.
    for split_name, split_files in (('train', train_files), ('test', test_files)):
        split_dir = os.path.join(output_base_path, split_name, emotion)
        os.makedirs(split_dir, exist_ok=True)
        for wav_name in split_files:
            shutil.copy(
                os.path.join(emotion_path, wav_name),
                os.path.join(split_dir, wav_name),
            )

    print(f"{emotion}: {len(train_files)} training, {len(test_files)} testing files")

# Create a zip file of the output folder
shutil.make_archive('/content/audio_emotion_split', 'zip', output_base_path)


Angry: 1734 training, 433 testing files
Fearful: 1638 training, 409 testing files
Happy: 1734 training, 433 testing files
Neutral: 1436 training, 359 testing files
Sad: 1734 training, 433 testing files


'/content/audio_emotion_split.zip'

In [9]:
import os
import librosa
import pandas as pd

# Emotion classes. Each name is used both as a sub-folder of a split directory
# and as the value written to the "label" column.
emotions = [
    "Angry",
    "Fearful",
    "Happy",
    "Neutral",
    "Sad",
]

def extract_features(file_path, sample_rate=16000):
    """Compute three prosodic descriptors for one audio file.

    Args:
        file_path: Path of the audio file to analyse.
        sample_rate: Rate the audio is resampled to on load (default 16000).

    Returns:
        Tuple ``(pitch, energy, duration)``:
        pitch    -- mean of the per-frame YIN f0 estimates (search range 50-500),
        energy   -- mean of the squared samples,
        duration -- value returned by ``librosa.get_duration`` for the loaded signal.
    """
    y, sr = librosa.load(file_path, sr=sample_rate)
    # librosa.yin assumes sr=22050 unless told otherwise. The signal was loaded
    # at `sample_rate`, so the true rate must be passed or every f0 estimate
    # (and the fmin/fmax search window) is scaled by 22050 / sample_rate.
    pitch = librosa.yin(y, fmin=50, fmax=500, sr=sr).mean()
    energy = (y ** 2).mean()
    duration = librosa.get_duration(y=y, sr=sr)
    return pitch, energy, duration

def process_split(base_path):
    """Build the feature table for one split directory.

    For every name in the global ``emotions`` list, scans
    ``<base_path>/<emotion>/`` for ``.wav`` files, runs ``extract_features``
    on each one and records the result.

    Args:
        base_path: Full path of the split directory.

    Returns:
        pandas.DataFrame with one row per clip and the columns
        ``path``, ``pitch``, ``energy``, ``duration`` and ``label``.
    """
    records = []
    for label in emotions:
        label_dir = os.path.join(base_path, label)
        wav_names = (name for name in os.listdir(label_dir) if name.endswith('.wav'))
        for name in wav_names:
            wav_path = os.path.join(label_dir, name)
            pitch, energy, duration = extract_features(wav_path)
            records.append(
                dict(
                    path=wav_path,
                    pitch=pitch,
                    energy=energy,
                    duration=duration,
                    label=label,
                )
            )
    return pd.DataFrame(records)

# Build the prosody feature table for the train split, then for the test split.
train_df, test_df = (
    process_split(split_dir)
    for split_dir in ('/content/output/train', '/content/output/test')
)

# Persist both tables as CSV, leaving out the DataFrame index.
train_df.to_csv('/content/train.csv', index=False)
test_df.to_csv('/content/test.csv', index=False)


**Create Dataset Class**

In [None]:
import os
import pandas as pd
import torchaudio

def resample_and_save(csv_path, output_dir, target_sr=16000):
    """Write a mono, ``target_sr`` copy of every clip listed in a CSV.

    Args:
        csv_path: CSV file with a ``path`` column pointing at the source audio.
        output_dir: Directory that receives the converted files; created if missing.
        target_sr: Sample rate of the written files (default 16000).

    Returns:
        The DataFrame read from ``csv_path`` with its ``path`` column replaced
        by the paths of the converted files (``<output_dir>/<row index>.wav``).
    """
    df = pd.read_csv(csv_path)
    os.makedirs(output_dir, exist_ok=True)

    # One Resample transform per source rate, built on first use instead of
    # once per file.
    resamplers = {}

    new_paths = []
    for idx, row in df.iterrows():
        audio, sr = torchaudio.load(row['path'])

        # Downmix multi-channel audio to a single channel, keeping the
        # (channels, samples) layout. The prosody features were computed with
        # librosa.load, which also downmixes to mono by default.
        if audio.shape[0] > 1:
            audio = audio.mean(dim=0, keepdim=True)

        # Resample if needed
        if sr != target_sr:
            if sr not in resamplers:
                resamplers[sr] = torchaudio.transforms.Resample(
                    orig_freq=sr, new_freq=target_sr
                )
            audio = resamplers[sr](audio)

        new_path = os.path.join(output_dir, f"{idx}.wav")
        torchaudio.save(new_path, audio, target_sr)
        new_paths.append(new_path)

    df['path'] = new_paths
    return df

# One-off preprocessing: convert the clips listed in each split CSV and get
# back the tables with their 'path' column pointing at the converted files.
train_df = resample_and_save(
    csv_path='/content/train.csv',
    output_dir='/content/audio_16k/train',
)
test_df = resample_and_save(
    csv_path='/content/test.csv',
    output_dir='/content/audio_16k/test',
)

# Store the updated tables next to the originals, without the index column.
train_df.to_csv('/content/train_16k.csv', index=False)
test_df.to_csv('/content/test_16k.csv', index=False)


In [None]:
from torch.utils.data import Dataset
import torch
import torchaudio
from transformers import Wav2Vec2Processor

class EmotionDataset(Dataset):
    """Dataset yielding processed audio, prosody features and a label id per clip.

    Each row of the CSV must provide the columns ``path``, ``pitch``,
    ``energy``, ``duration`` and ``label``.

    Args:
        csv_path: CSV file describing the clips.
        processor: Callable applied to the waveform (a Wav2Vec2Processor in
            this notebook); must return a mapping with ``input_values`` and,
            optionally, ``attention_mask``.
        label2id: Mapping from the ``label`` column values to integer ids.
        sampling_rate: Rate the processor is told the audio has (default 16000).
            Clips stored at another rate are resampled to it on load.
    """

    def __init__(self, csv_path, processor, label2id, sampling_rate=16000):
        import pandas as pd
        self.data = pd.read_csv(csv_path)
        self.processor = processor
        self.label2id = label2id
        self.sampling_rate = sampling_rate

    def __len__(self):
        """Return the number of clips listed in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load clip ``idx`` and return a dict of tensors.

        Keys: ``input_values`` (1-D), ``attention_mask`` (1-D), ``prosody``
        (float32, 3 values: pitch, energy, duration) and ``labels`` (long scalar).
        """
        row = self.data.iloc[idx]

        waveform, sr = torchaudio.load(row['path'])

        # Collapse (channels, samples) to (samples,). For mono audio this
        # equals squeeze(); for multi-channel audio squeeze() would leave a
        # 2-D tensor and hand the processor a malformed input.
        waveform = waveform.mean(dim=0)

        # Do not trust the file to already be at the expected rate.
        if sr != self.sampling_rate:
            waveform = torchaudio.transforms.Resample(
                orig_freq=sr, new_freq=self.sampling_rate
            )(waveform)

        # Process with Wav2Vec2Processor (NumPy array input).
        inputs = self.processor(
            waveform.numpy(),
            sampling_rate=self.sampling_rate,
            return_tensors='pt',
            padding=True
        )

        input_values = inputs['input_values'].squeeze(0)

        # Not every processor returns a mask; fall back to "all samples valid".
        attention_mask = inputs.get('attention_mask')
        if attention_mask is None:
            attention_mask = torch.ones_like(input_values, dtype=torch.long)
        else:
            attention_mask = attention_mask.squeeze(0)

        prosody = torch.tensor([row['pitch'], row['energy'], row['duration']], dtype=torch.float32)
        label = torch.tensor(self.label2id[row['label']], dtype=torch.long)

        return {
            'input_values': input_values,
            'attention_mask': attention_mask,
            'prosody': prosody,
            'labels': label
        }


In [11]:
!unzip /content/wav2vec2_model.zip

Archive:  /content/wav2vec2_model.zip
   creating: wav2vec2_model/
  inflating: wav2vec2_model/config.json  
  inflating: wav2vec2_model/preprocessor_config.json  
  inflating: wav2vec2_model/pytorch_model.bin  
  inflating: wav2vec2_model/special_tokens_map.json  
  inflating: wav2vec2_model/tokenizer_config.json  
  inflating: wav2vec2_model/vocab.json  


**Define the Multimodal Model**

In [15]:
from transformers import Wav2Vec2Model
import torch.nn as nn

class EmotionClassifier(nn.Module):
    """Emotion classifier combining a Wav2Vec2 encoder with prosody features.

    The encoder output is pooled over time into one vector per clip,
    concatenated with the prosody vector and fed to a small MLP head.

    Args:
        num_labels: Number of output classes.
        prosody_dim: Length of the prosody vector per clip (default 3).
    """

    def __init__(self, num_labels, prosody_dim=3):
        super().__init__()
        self.wav2vec = Wav2Vec2Model.from_pretrained("/content/wav2vec2_model")
        self.dropout = nn.Dropout(0.3)
        # Take the encoder width from its config instead of hard-coding 768.
        hidden_size = self.wav2vec.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size + prosody_dim, 256),
            nn.ReLU(),
            nn.Linear(256, num_labels)
        )

    def forward(self, input_values, attention_mask, prosody):
        """Return class logits of shape (batch, num_labels).

        Args:
            input_values: Batch of padded waveforms, (batch, samples).
            attention_mask: Sample-level padding mask matching ``input_values``,
                or None when nothing is padded.
            prosody: Prosody features, (batch, prosody_dim).
        """
        hidden = self.wav2vec(input_values, attention_mask=attention_mask).last_hidden_state

        # Wav2Vec2 has no [CLS]-style summary token: position 0 is just the
        # first short frame of audio. Average over all real frames instead.
        if attention_mask is None:
            pooled = hidden.mean(dim=1)
        else:
            # Convert the sample-level mask to the encoder's frame resolution
            # so padded frames do not dilute the average.
            frame_mask = self.wav2vec._get_feature_vector_attention_mask(
                hidden.shape[1], attention_mask
            )
            weights = frame_mask.unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1.0)

        combined = torch.cat([pooled, prosody], dim=1)
        logits = self.classifier(self.dropout(combined))
        return logits


In [17]:
def collate_fn(batch):
    """Merge a list of dataset items into one batch dict.

    ``input_values`` and ``attention_mask`` are right-padded with zeros to the
    longest item in the batch; ``prosody`` and ``labels`` are stacked along a
    new leading dimension.

    Args:
        batch: List of dicts with the keys ``input_values``,
            ``attention_mask``, ``prosody`` and ``labels``.

    Returns:
        Dict with the same four keys holding the batched tensors.
    """
    def column(key):
        # Gather one field from every item, preserving batch order.
        return [sample[key] for sample in batch]

    stacked_prosody = torch.stack(column('prosody'))
    stacked_labels = torch.stack(column('labels'))

    pad_to_longest = torch.nn.utils.rnn.pad_sequence
    return {
        'input_values': pad_to_longest(column('input_values'), batch_first=True),
        'attention_mask': pad_to_longest(column('attention_mask'), batch_first=True),
        'prosody': stacked_prosody,
        'labels': stacked_labels,
    }


**Training**

In [20]:
from torch.utils.data import DataLoader
from transformers import Wav2Vec2Processor
import torch, torch.nn.functional as F
from sklearn.metrics import accuracy_score

# Processor and label mapping shared by both splits.
processor = Wav2Vec2Processor.from_pretrained("/content/wav2vec2_model")
label2id = {label: idx for idx, label in enumerate(emotions)}

train_dataset = EmotionDataset('/content/train_16k.csv', processor, label2id)
test_dataset = EmotionDataset('/content/test_16k.csv', processor, label2id)

# Only the training loader is shuffled; collate_fn pads each batch to its longest clip.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=4, collate_fn=collate_fn)

# Use the GPU when one is attached, otherwise fall back to CPU instead of
# crashing on a hard-coded "cuda".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = EmotionClassifier(num_labels=len(emotions)).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

num_epochs = 1

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        # Move every tensor of the batch to the model's device.
        batch = {key: value.to(device) for key, value in batch.items()}

        optimizer.zero_grad()
        logits = model(batch['input_values'], batch['attention_mask'], batch['prosody'])
        loss = F.cross_entropy(logits, batch['labels'])
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean per-batch training loss for the epoch.
    print(f"Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}")





KeyboardInterrupt: 