In [35]:
import os
import torch
import torch.nn as nn
import evaluate
import numpy as np
from datasets import load_dataset, Audio
from transformers import (
    AutoFeatureExtractor,
    AutoModel,
    TrainingArguments,
    Trainer,
    set_seed
)

In [36]:
# Configuration
MODEL_NAME = "ntu-spml/distilhubert"
SAMPLE_RATE = 16000
NUM_LABELS = 2
LABEL2ID = {"NO_INTERRUPT": 0, "INTERRUPT": 1}
ID2LABEL = {v: k for k, v in LABEL2ID.items()}

OUTPUT_MODEL = "./SemanticVAD(128*2_2000)"
# Random Seeds
set_seed(2025)
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
print(device)

mps


Load Data

In [37]:
# Load data
dataset = load_dataset("csv", data_files="dataset/binary_classification/train/inputs.csv")
# dataset = load_dataset("csv", data_files="./data/training_dataset.csv")
dataset = dataset.cast_column("filepath", Audio(sampling_rate=SAMPLE_RATE))
dataset["train"] = dataset["train"].shuffle(seed=2025)
dataset = dataset["train"].train_test_split(test_size=0.2, seed=2025)

Generating train split: 0 examples [00:00, ? examples/s]

In [38]:
import pandas as pd

for i in range(5):
    filepath = dataset["train"][i]["filepath"]
    array = filepath["array"]
    label = dataset["train"][i]["labels"]
    print(f"Sample {i} - Length: {len(array)}")
    print(f"First few values: {array[-20000:]}")
    print(f"Sampel {i} label: {label}")
    print("-" * 40)

Sample 0 - Length: 8000
First few values: [-0.00930786 -0.00939941 -0.00906372 ...  0.          0.
  0.        ]
Sampel 0 label: 0
----------------------------------------
Sample 1 - Length: 8000
First few values: [0.01208496 0.01599121 0.02020264 ... 0.00598145 0.00585938 0.00549316]
Sampel 1 label: 1
----------------------------------------
Sample 2 - Length: 8000
First few values: [ 0.01086426  0.0140686   0.0098877  ... -0.02706909 -0.02087402
 -0.01763916]
Sampel 2 label: 1
----------------------------------------
Sample 3 - Length: 8000
First few values: [-0.0682373  -0.09295654 -0.0838623  ...  0.          0.
  0.        ]
Sampel 3 label: 0
----------------------------------------
Sample 4 - Length: 8000
First few values: [-0.03540039 -0.03601074 -0.02313232 ...  0.          0.
  0.        ]
Sampel 4 label: 0
----------------------------------------


Load Prechained Model

In [39]:
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
base_model = AutoModel.from_pretrained(MODEL_NAME).to(device)

Construct Classifier

In [None]:
from transformers.modeling_outputs import SequenceClassifierOutput

class DistilHuBERTClassifier(nn.Module):
    def __init__(self, base_model, num_labels):
        super().__init__()
        self.encoder = base_model
        self.classifier = nn.Sequential(
            nn.Linear(base_model.config.hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, num_labels)
        )

    def forward(self, input_values, attention_mask=None, labels=None):
        outputs = self.encoder(input_values=input_values, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state.mean(dim=1)
        logits = self.classifier(pooled)
        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits
        )

In [41]:
# Initialize model
model = DistilHuBERTClassifier(base_model, num_labels=NUM_LABELS).to(device)

In [42]:
# preprocess data
def preprocess(example):
    audio_array = example["filepath"]

    inputs = feature_extractor(
        audio_array["array"],
        sampling_rate=SAMPLE_RATE,
        padding="max_length",
        max_length=8000,
        truncation=True,
        return_tensors="np"
    )
    input_values = inputs["input_values"][0]

    return {
        "input_values": input_values
    }

encoded_dataset = dataset.map(preprocess)
encoded_dataset = encoded_dataset.shuffle(seed=2025) 

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [43]:
print(encoded_dataset["train"])

Dataset({
    features: ['filepath', 'labels', 'input_values'],
    num_rows: 800
})


In [44]:
# set evaluation function
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1": f1.compute(predictions=preds, references=labels, average="weighted")["f1"],
    }

In [45]:
# set training arguments
training_args = TrainingArguments(
    num_train_epochs=5,
    eval_strategy="epoch",
    output_dir="./checkpoints",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_dir="./logs",
    logging_steps=20,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=2,
    report_to="none",
)

In [46]:
# construct trainer
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        inputs = {k: v.to(model.encoder.device) for k, v in inputs.items()}
        labels = labels.to(model.encoder.device)
        outputs = model(**inputs, labels=labels)
        loss = outputs["loss"]
        return (loss, outputs) if return_outputs else loss

In [47]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(feature_extractor)

Creat Trainer and Fintune

In [48]:
encoded_dataset["train"].set_format(type="torch", columns=["input_values", "labels"])
encoded_dataset["test"].set_format(type="torch", columns=["input_values", "labels"])

sample = encoded_dataset["train"][0]
print(sample)
print(type(encoded_dataset["train"]["input_values"]))
print(type(encoded_dataset["train"]["labels"]))

{'labels': tensor(1), 'input_values': tensor([-0.0282, -0.0621, -0.1190,  ..., -0.0063, -0.0050, -0.0031])}
<class 'torch.Tensor'>
<class 'torch.Tensor'>


In [49]:
# training
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

In [50]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0982,0.224601,0.945,0.945067
2,0.1429,0.197416,0.95,0.95006
3,0.1132,0.177496,0.955,0.954892
4,0.0114,0.204969,0.96,0.959879
5,0.0219,0.183908,0.96,0.959927




TrainOutput(global_step=1000, training_loss=0.1078292000060901, metrics={'train_runtime': 30.3626, 'train_samples_per_second': 131.741, 'train_steps_per_second': 32.935, 'total_flos': 0.0, 'train_loss': 0.1078292000060901, 'epoch': 5.0})

In [51]:
import time
new_model = OUTPUT_MODEL+str(int(time.time()))+".pt"
torch.save(model.state_dict(),new_model)
print(f"Model saved: {new_model}.")


Model saved: ./SemanticVAD(32*2_2000)1746150418.pt.
