In [None]:
from huggingface_hub import interpreter_login

In [None]:
# Log in to the Hugging Face Hub. A token with write access is required because
# this notebook pushes the trained model to the Hub (push_to_hub) further down.
interpreter_login()

# Load dataset

In [ ]:
!pip install datasets transformers evaluate transformers[torch]

In [1]:
from datasets import load_dataset, Audio

In [2]:
# Load the AESDD "train" split, then carve out a 20% held-out test set.
# A fixed seed keeps the split (and therefore every reported metric) reproducible
# across kernel restarts.
minds = load_dataset("EdwardLin2023/AESDD", name="AESDD", split="train", trust_remote_code=True)
minds = minds.train_test_split(test_size=0.2, seed=42)
# Keep only the columns used below: the audio itself and its label.
minds = minds.remove_columns(["path", "utterance", "speaker"])

In [3]:
# Display the DatasetDict to check the split names, remaining columns and row counts.
minds

DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 483
    })
    test: Dataset({
        features: ['audio', 'label'],
        num_rows: 121
    })
})

In [4]:
# Build lookup tables in both directions between class names and their ids.
# Ids are stored as strings (str(index)) on both sides.
labels = minds["train"].features["label"].names
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

In [5]:
# Spot-check the mapping: which class name is stored under id "2"?
id2label["2"]

'fear'

# Preprocess

In [6]:
from transformers import AutoFeatureExtractor

In [7]:
# Load the feature extractor published with the "facebook/wav2vec2-base" checkpoint
# (the same checkpoint name is used when the model is loaded later in this notebook).
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")



In [8]:
# Re-declare the "audio" column with a 16 kHz sampling rate.
# NOTE(review): presumably this matches feature_extractor.sampling_rate — confirm.
minds = minds.cast_column("audio", Audio(sampling_rate=16_000))

In [9]:
def preprocess_function(examples):
    """Convert a batch of audio examples into feature-extractor outputs.

    Args:
        examples: A batch mapping whose ``"audio"`` entry is a list of dicts,
            each holding the waveform under the ``"array"`` key (the shape
            produced when this function is used with ``map(..., batched=True)``).

    Returns:
        Whatever ``feature_extractor`` returns for the batch, called with
        ``truncation=True`` and ``max_length=16_000`` so that no clip exceeds
        16,000 samples.
    """
    # Deliberately no print(examples): it would dump every raw waveform of every
    # batch into the cell output while the dataset is being mapped.
    audio_arrays = [sample["array"] for sample in examples["audio"]]

    return feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=16_000,  # truncation cap, in samples
        truncation=True,
    )

In [None]:
# Run preprocess_function batch-wise over every split and drop the raw "audio" column.
encoded_minds = minds.map(
    preprocess_function,
    batched=True,
    remove_columns="audio",
)

Map:   0%|          | 0/483 [00:00<?, ? examples/s]

# Evaluate

In [None]:
import evaluate

In [None]:
# Load the "accuracy" metric from the evaluate library; compute_metrics below calls it.
accuracy = evaluate.load("accuracy")

In [None]:
import numpy as np

In [None]:
def compute_metrics(eval_pred):
    """Score a set of predictions with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, argmax-ed
            along axis 1) and ``label_ids`` (the reference class ids).

    Returns:
        The result of ``accuracy.compute`` for the predicted vs. reference ids.
    """
    # The highest-scoring class along axis 1 is the predicted label.
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    reference_ids = eval_pred.label_ids
    return accuracy.compute(references=reference_ids, predictions=predicted_ids)

# Train

In [None]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

In [None]:
# Load the pretrained checkpoint for audio classification, passing the number of
# classes and the label <-> id lookup tables built earlier.
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    id2label=id2label,
    label2id=label2id,
    num_labels=num_labels,
)

In [None]:
# Training configuration, grouped by concern.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    # Where checkpoints are written and whether they are pushed to the Hub.
    output_dir="mood_box",
    push_to_hub=True,
    # Evaluate and checkpoint once per epoch; reload the most accurate checkpoint at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    # Batching: 32 examples x 4 accumulation steps = 128 examples per optimizer step per device.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    # Optimisation schedule.
    num_train_epochs=10,
    learning_rate=3e-5,
    warmup_ratio=0.1,
    # Logging cadence.
    logging_steps=10,
)

In [None]:
# Assemble the Trainer from the model, the configuration, both encoded splits
# (formatted as torch tensors) and the metric callback.
# NOTE(review): the feature extractor is passed via `tokenizer=`; newer transformers
# releases may expect a different argument name — confirm against the installed version.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
    eval_dataset=encoded_minds["test"].with_format("torch"),
    train_dataset=encoded_minds["train"].with_format("torch"),
)

In [None]:
# Run the training loop configured by training_args.
trainer.train()

In [None]:
# Upload the result to the Hugging Face Hub (uses the write token from the login cell).
trainer.push_to_hub()