In [1]:
import os
import torch
import torch.nn as nn
import evaluate
import numpy as np
import time
from datasets import load_dataset, Audio
from transformers import (
    AutoFeatureExtractor,
    AutoModel,
    TrainingArguments,
    Trainer,
    set_seed
)

In [2]:
# Configuration
MODEL_NAME = "ntu-spml/distilhubert"
NUM_LABELS = 2

# A single timestamp per run: it tags the output file AND seeds the RNGs below,
# so the seed used for any saved model can be recovered from its file name.
RUN_TIMESTAMP = time.time()
Output_Model_Name = f"./distilhubert_classifier_TwoClassification_2*512*256_{RUN_TIMESTAMP}.pt"
SAMPLE_RATE = 16000
Max_Input_Length = 32000  # passed as max_length to the feature extractor in preprocess()

EPOCHS = 3

# Label vocabulary and training manifest for each supported value of NUM_LABELS.
# NOTE(review): the 2-label variant spells "INTERRUPT" in upper case while the
# 4- and 7-label variants use "Interrupt" -- confirm which spelling is intended.
if NUM_LABELS == 1:
    LABEL2ID = {"Backchannel": 0, "NotBackchannel": 1}
    Dataset_Path = "./dataset/binary_classification/train/inputs.csv"
elif NUM_LABELS == 2:
    LABEL2ID = {"Backchannel": 0, "INTERRUPT": 1}
    Dataset_Path = "./dataset/binary_classification/train/inputs.csv"
elif NUM_LABELS == 4:
    LABEL2ID = {"Backchannel": 0, "Interrupt": 1, "Statement": 2, "TurnToEnd": 3}
    Dataset_Path = "./dataset/four_classification/train/inputs.csv"
elif NUM_LABELS == 7:
    LABEL2ID = {"Interjection": 0, "Backchannel": 1, "Interrupt": 2, "Question": 3, "MidSentence": 4, "Statement":5, "TurnToEnd":6}
    Dataset_Path = "./dataset/seven_classification/train/inputs.csv"
else:
    # Fail here with a clear message instead of a NameError on LABEL2ID below.
    raise ValueError(f"Unsupported NUM_LABELS={NUM_LABELS}; expected one of 1, 2, 4, 7")
ID2LABEL = {v: k for k, v in LABEL2ID.items()}

# Random Seeds
# Still time-based (a different seed on every run), but kept in SEED so the
# run can be repeated by hard-coding the value.
SEED = int(RUN_TIMESTAMP)
set_seed(SEED)

# Device preference order: Apple MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

mps


Load Pretrained Model

In [3]:
# Feature extractor matching the checkpoint named by MODEL_NAME.
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)

# Pretrained backbone, loaded first and then moved to the selected device.
base_model = AutoModel.from_pretrained(MODEL_NAME)
base_model = base_model.to(device)

Construct Classifier

In [4]:
from train import DistilHuBERTClassifier,DistilHuBERTClassifierBinary

# NUM_LABELS == 1 selects the dedicated binary classifier; every other value
# uses the general classifier with NUM_LABELS outputs.
if NUM_LABELS != 1:
    model = DistilHuBERTClassifier(base_model, num_labels=NUM_LABELS)
else:
    model = DistilHuBERTClassifierBinary(base_model)
model = model.to(device)

Preprocess Data

In [5]:
# Load data: read the CSV manifest, decode the "filepath" column as audio at
# SAMPLE_RATE, shuffle the "train" split, then hold out 20% of it as "test".
dataset = (
    load_dataset("csv", data_files=Dataset_Path)
    .cast_column("filepath", Audio(sampling_rate=SAMPLE_RATE))["train"]
    .shuffle(seed=2025)
    .train_test_split(test_size=0.2, seed=2025)
)

# preprocess data
def preprocess(example):
    """Turn one dataset row into model-ready features.

    The waveform stored under example["filepath"]["array"] is run through the
    module-level feature_extractor, padded/truncated to Max_Input_Length.

    Returns a dict with:
        "input_values": the first (only) entry of the extractor's batch output
        "labels": the row's "labels" value, passed through unchanged
    """
    waveform = example["filepath"]["array"]
    extractor_kwargs = {
        "sampling_rate": SAMPLE_RATE,
        "padding": "max_length",
        "max_length": Max_Input_Length,
        "truncation": True,
        "return_tensors": "np",
    }
    features = feature_extractor(waveform, **extractor_kwargs)

    # The extractor returns a batch; a single example was passed, so take item 0.
    return {
        "input_values": features["input_values"][0],
        "labels": example["labels"],
    }

# Apply preprocess() to every row of both splits, then shuffle with a fixed seed.
encoded_dataset = dataset.map(preprocess).shuffle(seed=2025)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [6]:
# Sanity check: print the leading samples and label of the first 20 training rows.
train_split = encoded_dataset["train"]
for idx in range(20):
    sample = train_split[idx]
    print(f"Example {idx}: first_few_values={sample['input_values'][:5]}..., label = {sample['labels']}")

Example 0: first_few_values=[-0.0535888671875, -0.0467529296875, -0.0477294921875, -0.04833984375, -0.0360107421875]..., label = 1
Example 1: first_few_values=[0.0, 0.0, 0.0, 0.0, 0.0]..., label = 0
Example 2: first_few_values=[0.0, 0.0, 0.0, 0.0, 0.0]..., label = 0
Example 3: first_few_values=[-0.000732421875, -0.00048828125, -0.00048828125, -0.00079345703125, -0.0009765625]..., label = 1
Example 4: first_few_values=[0.0, 0.0, 0.0, 0.0, 0.0]..., label = 0
Example 5: first_few_values=[-0.0054931640625, -0.00634765625, -0.0069580078125, -0.007110595703125, -0.0059814453125]..., label = 1
Example 6: first_few_values=[0.0, 0.0, 0.0, 0.0, 0.0]..., label = 0
Example 7: first_few_values=[0.0, 0.0, 0.0, 0.0, 0.0]..., label = 0
Example 8: first_few_values=[0.0009765625, 0.0010986328125, 0.001220703125, 0.001434326171875, 0.001708984375]..., label = 1
Example 9: first_few_values=[-0.00048828125, -0.000762939453125, -0.000732421875, -0.000457763671875, 0.0]..., label = 1
Example 10: first_few_va

Training Setting

In [7]:
# set evaluation function
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

if NUM_LABELS == 1:
    def compute_metrics(pred):
        """Accuracy and weighted F1 for the single-logit (sigmoid) classifier.

        Args:
            pred: evaluation output exposing `.predictions` (raw logits) and
                `.label_ids` (reference labels).

        Returns:
            dict with "accuracy" and "f1".
        """
        # Flatten both sides so logits shaped (N, 1) line up with (N,) labels,
        # and hand the metrics plain integer class ids.
        logits = np.asarray(pred.predictions).reshape(-1)
        labels = np.asarray(pred.label_ids).reshape(-1).astype(int)

        probs = torch.sigmoid(torch.from_numpy(logits)).numpy()
        preds = (probs > 0.5).astype(int)  # threshold 0.5

        return {
            # Same `accuracy` metric object as the multi-class branch.
            "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
            "f1": f1.compute(predictions=preds, references=labels, average="weighted")["f1"],
        }
else:
    def compute_metrics(eval_pred):
        """Accuracy and weighted F1 for the multi-class classifier.

        Args:
            eval_pred: a (logits, labels) pair; the predicted class of each row
                is the argmax of its logits along axis 1.

        Returns:
            dict with "accuracy" and "f1".
        """
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=1)
        return {
            "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
            "f1": f1.compute(predictions=preds, references=labels, average="weighted")["f1"],
        }


In [8]:
# set training arguments, grouped by concern and merged into one call

# How long to train and how large each batch is.
schedule_options = dict(
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
)

# Evaluate and checkpoint once per epoch; keep at most two checkpoints and
# reload the one with the highest F1 when training ends.
checkpoint_options = dict(
    output_dir="./checkpoints",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

# Log every 20 steps to ./logs; no external reporting integration.
logging_options = dict(
    logging_dir="./logs",
    logging_steps=20,
    report_to="none",
)

training_args = TrainingArguments(
    **schedule_options,
    **checkpoint_options,
    **logging_options,
)

Create Trainer

In [15]:
# Trainer subclass with a custom loss computation
class CustomTrainer(Trainer):
    """Trainer that moves each batch to the device of `model.encoder` and
    reads the loss from the "loss" entry of the model's output."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run one forward pass and return its loss.

        "labels" is removed from `inputs` (the dict is modified in place) and
        passed to the model as a keyword together with the remaining tensors.
        `num_items_in_batch` is accepted but not used.

        Returns:
            `loss`, or `(loss, outputs)` when `return_outputs` is true.
        """
        target_device = model.encoder.device
        labels = inputs.pop("labels").to(target_device)
        batch = {name: tensor.to(target_device) for name, tensor in inputs.items()}

        outputs = model(**batch, labels=labels)
        loss = outputs["loss"]
        if return_outputs:
            return loss, outputs
        return loss

In [10]:
from transformers import DataCollatorWithPadding

# Batch collation is delegated to the feature extractor, passed in the
# collator's `tokenizer` slot.
data_collator = DataCollatorWithPadding(tokenizer=feature_extractor)

In [11]:
# Expose only the model inputs and labels, as torch tensors, on both splits.
for split_name in ("train", "test"):
    encoded_dataset[split_name].set_format(type="torch", columns=["input_values", "labels"])

In [12]:
# Wire the model, data splits, collator and metrics into the custom trainer.
trainer = CustomTrainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    compute_metrics=compute_metrics,
)

Fine-tune

In [13]:
# Run fine-tuning; the returned TrainOutput is the cell's displayed value.
train_output = trainer.train()
train_output



tensor([[-0.0299,  0.0151],
        [-0.0161,  0.0107],
        [-0.0069, -0.0116],
        [ 0.0152,  0.0283]], device='mps:0', grad_fn=<LinearBackward0>)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.002,0.066879,0.985,0.98499
2,0.0136,0.009463,0.995,0.995003
3,0.0255,0.047055,0.985,0.98499


tensor([[ 0.1219, -0.0569],
        [ 0.0695,  0.0027],
        [ 0.0704, -0.0402],
        [ 0.1109, -0.0281]], device='mps:0', grad_fn=<LinearBackward0>)
tensor([[ 0.2219, -0.1083],
        [ 0.0795, -0.0624],
        [ 0.1602, -0.1750],
        [ 0.1103, -0.0591]], device='mps:0', grad_fn=<LinearBackward0>)
tensor([[ 0.2869, -0.2235],
        [ 0.0729, -0.0160],
        [ 0.2597, -0.1637],
        [ 0.2234, -0.1356]], device='mps:0', grad_fn=<LinearBackward0>)
tensor([[ 0.0839, -0.0636],
        [ 0.3773, -0.2077],
        [ 0.1974, -0.1156],
        [ 0.1627, -0.0479]], device='mps:0', grad_fn=<LinearBackward0>)
tensor([[ 0.2788, -0.1178],
        [ 0.3497, -0.2036],
        [ 0.1052, -0.0650],
        [ 0.2158, -0.0839]], device='mps:0', grad_fn=<LinearBackward0>)
tensor([[ 0.1435, -0.0645],
        [ 0.1695, -0.0515],
        [ 0.3225, -0.1645],
        [ 0.1631, -0.0453]], device='mps:0', grad_fn=<LinearBackward0>)
tensor([[ 0.0498,  0.0010],
        [ 0.3081, -0.1465],
        



tensor([[ 3.5811, -2.6717],
        [-3.4317,  2.8339],
        [-3.2573,  3.0282],
        [-3.1426,  2.7779]], device='mps:0', grad_fn=<LinearBackward0>)
tensor([[ 3.4001, -3.0095],
        [-3.3393,  2.9825],
        [ 3.6819, -2.9753],
        [-3.6755,  3.0445]], device='mps:0', grad_fn=<LinearBackward0>)
tensor([[ 3.6801, -2.9215],
        [-2.8635,  2.8704],
        [-3.4646,  3.2631],
        [ 3.8228, -2.8538]], device='mps:0', grad_fn=<LinearBackward0>)
tensor([[ 3.6747, -3.1753],
        [-3.2462,  3.0818],
        [ 3.6332, -3.0847],
        [ 3.6909, -3.0323]], device='mps:0', grad_fn=<LinearBackward0>)
tensor([[ 4.0414, -3.1592],
        [-3.6661,  3.2970],
        [ 3.6580, -3.1852],
        [-3.3382,  2.9171]], device='mps:0', grad_fn=<LinearBackward0>)
tensor([[-3.2761,  3.2844],
        [ 3.7481, -3.0811],
        [-3.5689,  2.9358],
        [-3.1759,  2.9045]], device='mps:0', grad_fn=<LinearBackward0>)
tensor([[-3.4197,  3.0666],
        [ 3.2190, -2.9113],
        



tensor([[ 3.8413, -3.0709],
        [-4.0222,  3.8060],
        [ 3.6192, -3.0286],
        [-3.8766,  3.6401]], device='mps:0', grad_fn=<LinearBackward0>)
tensor([[ 3.7562, -3.0467],
        [ 3.8397, -3.2195],
        [ 3.9286, -3.1919],
        [ 3.8308, -3.2400]], device='mps:0', grad_fn=<LinearBackward0>)
tensor([[-3.9094,  3.8761],
        [ 0.0767, -0.1374],
        [ 3.7428, -2.9583],
        [ 4.2592, -3.3155]], device='mps:0', grad_fn=<LinearBackward0>)
tensor([[-3.9265,  3.9348],
        [ 4.1129, -3.3471],
        [ 3.8441, -3.2138],
        [ 3.7511, -3.1423]], device='mps:0', grad_fn=<LinearBackward0>)
tensor([[-4.0953,  3.8576],
        [ 3.2459, -3.1290],
        [-4.0213,  3.9105],
        [-3.8543,  3.7783]], device='mps:0', grad_fn=<LinearBackward0>)
tensor([[ 3.8260, -3.3001],
        [-3.9595,  3.7775],
        [-3.9728,  3.8464],
        [-3.5248,  3.0381]], device='mps:0', grad_fn=<LinearBackward0>)
tensor([[-3.5093,  3.3953],
        [-3.8764,  3.7733],
        

TrainOutput(global_step=600, training_loss=0.06242011436726898, metrics={'train_runtime': 35.2999, 'train_samples_per_second': 67.989, 'train_steps_per_second': 16.997, 'total_flos': 0.0, 'train_loss': 0.06242011436726898, 'epoch': 3.0})

In [14]:
# Persist only the weights (state dict) to the run-specific output path.
trained_weights = model.state_dict()
torch.save(trained_weights, Output_Model_Name)