In [1]:
from google.colab import drive
drive.mount('/content/drive')


data_dir = "/content/drive/MyDrive/FYP/ser-selective-enhancement/data"

Mounted at /content/drive


In [3]:
!pip install torchaudio transformers librosa pandas


Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchaudio)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch==2.6.0->torchaudio)
  Using cached nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)
Using cached nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl (127.9 MB)
Installing collected packages: nvidia-cudnn-cu12, nvidia-cusolver-cu12
  Attempting uninstall: nvidia-cudnn-cu12
    Found existing installation: nvidia-cudnn-cu12 9.3.0.75
    Uninstalling nvidia-cudnn-cu12-9.3.0.75:
      Successfully uninstalled nvidia-cudnn-cu12-9.3.0.75
  Attempting uninstall: nvidia-cusolver-cu12
    Found existing installation: nvidia-cusolver-cu12 11.6.3.83
    Uninstalling nvidia-cusolver-cu12-11.6.3.83:
      Successfully uninstalled nvidia-cusolver-cu12-11.6.

In [4]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
import torch

# Load pretrained model from audEERING
model_name = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
model.eval()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/661M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=

In [5]:
import torchaudio
import os
import pandas as pd

# Predict AVD using pretrained model
def predict_avd(file_path):
    waveform, sample_rate = torchaudio.load(file_path)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
    inputs = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.sigmoid(outputs.logits).squeeze().tolist()
    return {
        "arousal": round(probs[0], 4),
        "valence": round(probs[1], 4),
        "dominance": round(probs[2], 4)
    }

# Parse filename for emotion and intensity
def parse_crema_filename(filename):
    # Example: 1001_DFA_ANG_HI.wav
    parts = filename.split('_')
    if len(parts) < 4:
        return None
    emotion_code = parts[2]
    intensity_code = parts[3].split('.')[0]

    emotion_map = {
        "ANG": "anger",
        "DIS": "disgust",
        "FEA": "fear",
        "HAP": "happy",
        "NEU": "neutral",
        "SAD": "sad"
    }
    intensity_map = {
        "LO": "low",
        "MD": "medium",
        "HI": "high",
        "XX": "unspecified"
    }

    emotion = emotion_map.get(emotion_code, "unknown")
    intensity = intensity_map.get(intensity_code, "unspecified")

    return {
        "filename": filename,
        "emotion": emotion,
        "intensity": intensity
    }


In [6]:
def generate_labels_csv(audio_dir, output_csv):
    files = [f for f in os.listdir(audio_dir) if f.endswith(".wav")]
    rows = []

    for fname in files:
        parsed = parse_crema_filename(fname)
        if not parsed:
            continue
        try:
            full_path = os.path.join(audio_dir, fname)
            avd = predict_avd(full_path)
            row = {
                "filename": fname,
                "emotion": parsed["emotion"],
                "intensity": parsed["intensity"],
                "arousal": avd["arousal"],
                "valence": avd["valence"],
                "dominance": avd["dominance"]
            }
            rows.append(row)
        except Exception as e:
            print(f"Error processing {fname}: {e}")

    df = pd.DataFrame(rows)
    df.to_csv(output_csv, index=False)
    print(f"✅ Saved: {output_csv} with {len(df)} entries")


In [8]:
# Set these to your actual paths
clean_dir = "/content/drive/MyDrive/FYP/ser-selective-enhancement/data/clean"
noisy_dir = "/content/drive/MyDrive/FYP/ser-selective-enhancement/data/noisy/CREMA-D-noisy-10db"

generate_labels_csv(clean_dir, "labels_clean_avd_predicted.csv")
generate_labels_csv(noisy_dir, "labels_noisy_avd_predicted.csv")


✅ Saved: labels_clean_avd_predicted.csv with 7442 entries
✅ Saved: labels_noisy_avd_predicted.csv with 7315 entries


In [27]:
import pandas as pd

clean_df = pd.read_csv('/content/drive/MyDrive/FYP/ser-selective-enhancement/data/labels_clean_avd_predicted.csv')
noisy_df = pd.read_csv('/content/drive/MyDrive/FYP/ser-selective-enhancement/data/labels_noisy_avd_predicted.csv')


In [28]:
import re

# Create a "base_filename" column
clean_df['base_filename'] = clean_df['filename']

# Use regex to strip suffixes like _10db.wav → .wav
noisy_df['base_filename'] = noisy_df['filename'].apply(lambda x: re.sub(r'(_\d+db)?\.wav$', '.wav', x))


In [29]:
common_files = set(clean_df['base_filename']).intersection(set(noisy_df['base_filename']))

# Filter both DataFrames
clean_df = clean_df[clean_df['base_filename'].isin(common_files)].sort_values('base_filename').reset_index(drop=True)
noisy_df = noisy_df[noisy_df['base_filename'].isin(common_files)].sort_values('base_filename').reset_index(drop=True)

print("Clean samples:", len(clean_df))
print("Noisy samples:", len(noisy_df))


Clean samples: 7315
Noisy samples: 7315


In [31]:
def avd_to_label(value):
    if value < 0.34:
        return 'low'
    elif value < 0.67:
        return 'medium'
    else:
        return 'high'

clean_arousal_labels = clean_df['arousal'].apply(avd_to_label)
noisy_arousal_labels = noisy_df['arousal'].apply(avd_to_label)

clean_valence_labels = clean_df['valence'].apply(avd_to_label)
noisy_valence_labels = noisy_df['valence'].apply(avd_to_label)

clean_dominance_labels = clean_df['dominance'].apply(avd_to_label)
noisy_dominance_labels = noisy_df['dominance'].apply(avd_to_label)


In [32]:
from sklearn.metrics import accuracy_score

arousal_acc = accuracy_score(clean_arousal_labels, noisy_arousal_labels)
valence_acc = accuracy_score(clean_valence_labels, noisy_valence_labels)
dominance_acc = accuracy_score(clean_dominance_labels, noisy_dominance_labels)

print(f"Arousal Accuracy:   {arousal_acc * 100:.2f}%")
print(f"Valence Accuracy:   {valence_acc * 100:.2f}%")
print(f"Dominance Accuracy: {dominance_acc * 100:.2f}%")


Arousal Accuracy:   100.00%
Valence Accuracy:   100.00%
Dominance Accuracy: 100.00%
