In [24]:
!uv pip install evaluate -q

import tensorflow_datasets as tfds
import torch
import numpy as np
from datasets import Dataset, Audio
from transformers import (
    Wav2Vec2Processor, 
    Wav2Vec2ForSequenceClassification, 
    TrainingArguments, 
    Trainer
)
import evaluate
import IPython.display as ipd



import torch
import numpy as np
import evaluate
from datasets import load_from_disk
from transformers import (
    TrainingArguments,
    Trainer,
    Wav2Vec2Processor, 
    Wav2Vec2ForSequenceClassification,
    Wav2Vec2FeatureExtractor
)

import os
from transformers.trainer_utils import get_last_checkpoint


import warnings
warnings.filterwarnings("ignore")

In [13]:
ds_data, ds_info = tfds.load(
    "crema_d",
    with_info=True,
    as_supervised=False,
    split=['train', 'validation', 'test']
)



# Extract label mappings (0=Neutral, 1=Happy, etc.) from TFDS metadata
label_names = ds_info.features['label'].names
label2id = {label: i for i, label in enumerate(label_names)}
id2label = {i: label for i, label in enumerate(label_names)}
print(f"Labels found: {label_names}")

# --- 2. Bridge: Convert TFDS to Hugging Face Dataset ---
# Wav2Vec2 Trainer works best with Hugging Face Datasets. 
# Since CREMA-D is small (~2GB), we can convert it in memory.

def tfds_to_hf_dataset(tf_dataset):
    data_dict = {"audio": [], "label": []}
    for sample in tf_dataset:
        audio = sample['audio'].numpy()
        label = sample['label'].numpy()
        
        # Normalize audio if it's integer PCM (Wav2Vec2 expects float inputs)
        # CREMA-D in TFDS is often int64; we convert to float32
        audio = audio.astype(np.float32)
        if np.abs(audio).max() > 1.0:
            audio = audio / 32768.0  # Normalize 16-bit PCM to [-1, 1]
            
        data_dict["audio"].append(audio)
        data_dict["label"].append(label)
    
    return Dataset.from_dict(data_dict)

print("Converting TFDS to Hugging Face format... (this may take a minute)")
train_dataset = tfds_to_hf_dataset(ds_data[0])
eval_dataset = tfds_to_hf_dataset(ds_data[1])
test_dataset = tfds_to_hf_dataset(ds_data[2])


print("Saving converted dataset to disk...")
train_dataset.save_to_disk("./crema_hf/train")
eval_dataset.save_to_disk("./crema_hf/eval")
test_dataset.save_to_disk("./crema_hf/test")

print("Saved! You can now load it later using from_disk()")

Labels found: ['NEU', 'HAP', 'SAD', 'ANG', 'FEA', 'DIS']
Converting TFDS to Hugging Face format... (this may take a minute)
Saving converted dataset to disk...


Saving the dataset (0/2 shards):   0%|          | 0/5144 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/738 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1556 [00:00<?, ? examples/s]

Saved! You can now load it later using from_disk()


In [25]:
print("Loading dataset from disk...")
train_dataset = load_from_disk("./crema_hf/train")
eval_dataset  = load_from_disk("./crema_hf/eval")
test_dataset  = load_from_disk("./crema_hf/test")


label_names = ["NEU", "HAP", "SAD", "ANG", "FEA", "DIS"]
label2id = {label: i for i, label in enumerate(label_names)}
id2label = {i: label for i, label in enumerate(label_names)}

print(f"Labels configured: {label_names}")


model_id = "facebook/hubert-base-ls960"
# model_id = "facebook/wav2vec2-base"
# model_id = "microsoft/wavlm-base"

processor = Wav2Vec2FeatureExtractor.from_pretrained(model_id)
target_sampling_rate = processor.sampling_rate


def preprocess_function(examples):
    audio_arrays = examples["audio"]
    return processor(
        audio_arrays,
        sampling_rate=target_sampling_rate,
        max_length=target_sampling_rate * 5,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

print("Preprocessing datasets...")
encoded_train = train_dataset.map(preprocess_function, batched=True)
encoded_eval  = eval_dataset.map(preprocess_function, batched=True)
encoded_test  = test_dataset.map(preprocess_function, batched=True)


encoded_train = encoded_train.rename_column("label", "labels")
encoded_eval  = encoded_eval.rename_column("label", "labels")
encoded_test  = encoded_test.rename_column("label", "labels")

columns = ["input_values", "labels"]
encoded_train.set_format("torch", columns=columns)
encoded_eval.set_format("torch", columns=columns)
encoded_test.set_format("torch", columns=columns)


model = Wav2Vec2ForSequenceClassification.from_pretrained(
    model_id,
    num_labels=len(label_names),
    label2id=label2id,
    id2label=id2label
)

model.freeze_feature_extractor()


accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    preds = np.argmax(eval_pred.predictions, axis=1)
    return accuracy.compute(predictions=preds, references=eval_pred.label_ids)



training_args = TrainingArguments(
    output_dir="./hubert-base-emotion-model",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=20,
    learning_rate=3e-5,
    fp16=True,
    logging_steps=10,
    load_best_model_at_end=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train,
    eval_dataset=encoded_eval,
    tokenizer=processor,  
    compute_metrics=compute_metrics,
)


if torch.cuda.is_available():
    print(f"Training on GPU: {torch.cuda.get_device_name(0)}")
else:
    print("WARNING: GPU not detected.")


output_dir = "./hubert-base-emotion-model"

if os.path.isdir(output_dir) and get_last_checkpoint(output_dir) is not None:
    last_checkpoint = get_last_checkpoint(output_dir)
    print(f"Resuming training from {last_checkpoint}")
    trainer.train(resume_from_checkpoint=last_checkpoint)

else:
    print("No checkpoint found, training from scratch")
    trainer.train()


print("Evaluating on test set...")
trainer.save_model("./hubert-base-emotion-model/final_model")
processor.save_pretrained("./hubert-base-emotion-model/final_processor")


test_results = trainer.predict(encoded_test)
print(f"Test Accuracy: {test_results.metrics['test_accuracy']:.4f}")

Loading dataset from disk...
Labels configured: ['NEU', 'HAP', 'SAD', 'ANG', 'FEA', 'DIS']
Preprocessing datasets...


You are using a model of type hubert to instantiate a model of type wav2vec2. This is not supported for all configurations of models and can yield errors.
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training on GPU: Tesla T4
No checkpoint found, training from scratch


Epoch,Training Loss,Validation Loss,Accuracy
1,1.4669,1.308947,0.501355
2,1.1048,1.017069,0.655827
3,0.9143,0.956272,0.670732
4,0.8148,0.937967,0.681572
5,0.7212,0.829177,0.739837
6,0.6791,0.965837,0.704607
7,0.6029,0.816032,0.750678
8,0.4591,1.019663,0.722222
9,0.3522,0.945358,0.754743
10,0.4508,0.927353,0.757453


RuntimeError: [enforce fail at inline_container.cc:626] . unexpected pos 332730048 vs 332729936

In [22]:
!ls 

crema_hf  hubert-base-emotion-model
