### Mount Drive

In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


### Import Libraries

In [2]:
import os
import torch
import librosa
import numpy as np
from torch import nn
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from sklearn.model_selection import train_test_split

In [17]:
# Use GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Define Emotion & Intensity Maps

In [4]:
emotion_map = {
    "ANG": 0,
    "HAP": 1,
    "SAD": 2,
    "FEA": 3,
    "DIS": 4,
    "NEU": 5
}

intensity_map = {
    "LO": 0.3,
    "MD": 0.6,
    "HI": 0.9
}

In [19]:
emotion_map

{'ANG': 0, 'HAP': 1, 'SAD': 2, 'FEA': 3, 'DIS': 4, 'NEU': 5}

In [20]:
intensity_map

{'LO': 0.3, 'MD': 0.6, 'HI': 0.9}

### Load CREMA-D Metadata

In [5]:
DATA_PATH = "/content/drive/MyDrive/CREMA-D/AudioWAV"

files = os.listdir(DATA_PATH)

data = []

for file in files:
    parts = file.split("_")
    emotion = emotion_map[parts[2]]
    intensity_code = parts[3].split(".")[0]

    intensity = intensity_map.get(intensity_code, None)

    data.append({
        "path": os.path.join(DATA_PATH, file),
        "emotion": emotion,
        "intensity": intensity
    })

In [25]:
files[:1],len(files)
# File name contain actor id,sentence code,emotion,intensity.

(['1079_WSI_DIS_XX.wav'], 7442)

In [28]:
data[:1]

[{'path': '/content/drive/MyDrive/CREMA-D/AudioWAV/1079_WSI_DIS_XX.wav',
  'emotion': 4,
  'intensity': None}]

### Load Pretrained Wav2Vec2

In [6]:
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
wav2vec = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")
wav2vec.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/211 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/380M [00:00<?, ?B/s]

Wav2Vec2Model LOAD REPORT from: facebook/wav2vec2-base
Key                          | Status     |  | 
-----------------------------+------------+--+-
quantizer.weight_proj.bias   | UNEXPECTED |  | 
project_hid.weight           | UNEXPECTED |  | 
quantizer.weight_proj.weight | UNEXPECTED |  | 
project_q.weight             | UNEXPECTED |  | 
quantizer.codevectors        | UNEXPECTED |  | 
project_hid.bias             | UNEXPECTED |  | 
project_q.bias               | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2GroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Wav2Vec2Encoder(
    (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
  

### Create Multi-Task Model

In [7]:
class EmotionIntensityModel(nn.Module):
    def __init__(self, wav2vec_model, num_emotions=6):
        super().__init__()
        self.wav2vec = wav2vec_model

        self.dropout = nn.Dropout(0.3)

        self.emotion_head = nn.Linear(768, num_emotions)
        self.intensity_head = nn.Linear(768, 1)

    def forward(self, input_values):
        outputs = self.wav2vec(input_values)

        hidden_states = outputs.last_hidden_state

        pooled = torch.mean(hidden_states, dim=1)  # Mean pooling

        pooled = self.dropout(pooled)

        emotion_logits = self.emotion_head(pooled)
        intensity = self.intensity_head(pooled)

        return emotion_logits, intensity

In [8]:
model = EmotionIntensityModel(wav2vec).to(device)

### Prepare Dataset Class

In [9]:
class CremaDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]

        audio, sr = librosa.load(item["path"], sr=16000)

        return {
            "audio": audio,
            "emotion": item["emotion"],
            "intensity": item["intensity"]
        }

In [10]:
def collate_fn(batch):
    audios = [item["audio"] for item in batch]

    # Pad dynamically
    inputs = processor(
        audios,
        sampling_rate=16000,
        return_tensors="pt",
        padding=True
    )

    emotion = torch.tensor([item["emotion"] for item in batch])

    intensity = torch.tensor([
        item["intensity"] if item["intensity"] is not None else -1.0
        for item in batch
    ])

    return inputs.input_values, emotion, intensity

### Train-Test Split

In [11]:
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

train_dataset = CremaDataset(train_data)
val_dataset = CremaDataset(val_data)

train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_fn   # IMPORTANT
)

val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=4,
    collate_fn=collate_fn
)

### Define Loss & Optimizer

In [12]:
emotion_loss_fn = nn.CrossEntropyLoss()
intensity_loss_fn = nn.MSELoss()

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

### Training Loop

In [13]:
epochs = 5

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for input_values, emotion, intensity in train_loader:

        input_values = input_values.to(device)
        emotion = emotion.to(device)
        intensity = intensity.to(device).float()

        optimizer.zero_grad()

        emotion_logits, intensity_pred = model(input_values)

        loss_emotion = emotion_loss_fn(emotion_logits, emotion)

        # Flatten safely
        intensity_pred = intensity_pred.view(-1)
        intensity = intensity.view(-1)

        mask = intensity != -1.0

        if mask.sum() > 0:
            loss_intensity = intensity_loss_fn(
                intensity_pred[mask],
                intensity[mask]
            )
        else:
            loss_intensity = torch.tensor(0.0, device=device)

        loss = loss_emotion + loss_intensity


        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss}")

Epoch 1, Loss: 1974.0307490378618
Epoch 2, Loss: 1311.2966364696622
Epoch 3, Loss: 1051.6258554719388
Epoch 4, Loss: 858.1788521148264
Epoch 5, Loss: 697.1542137861252


### Evaluation
**Emotion Accuracy**

In [14]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for input_values, emotion, intensity in val_loader:
        input_values = input_values.to(device)
        emotion = emotion.to(device)

        emotion_logits, _ = model(input_values)

        preds = torch.argmax(emotion_logits, dim=1)

        correct += (preds == emotion).sum().item()
        total += emotion.size(0)

print("Validation Accuracy:", correct / total)

Validation Accuracy: 0.7515110812625924


**Intensity MAE**

In [15]:
from sklearn.metrics import mean_absolute_error

all_preds = []
all_true = []

with torch.no_grad():
    for input_values, emotion, intensity in val_loader:
        input_values = input_values.to(device)

        _, intensity_pred = model(input_values)

        mask = intensity != -1.0
        if mask.sum() > 0:
            all_preds.extend(intensity_pred.squeeze()[mask].cpu().numpy())
            all_true.extend(intensity[mask].numpy())

print("Intensity MAE:", mean_absolute_error(all_true, all_preds))

Intensity MAE: 0.17336306398399032


### Save Model

In [16]:
torch.save(model.state_dict(), "/content/drive/MyDrive/CREMA-D/wav2vec_multitask.pt")