In [1]:
import numpy as np
import pandas as pd
from transformers import AutoModelForAudioClassification, EvalPrediction

In [2]:
import warnings
# Install a blanket "ignore" filter so Python warnings are not shown from here on.
# NOTE(review): this also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

In [3]:
import os

# Location of the pretrained checkpoint. Set the HUBERT_MODEL_PATH environment
# variable to point at a different copy; the original local path is kept as the
# fallback, so existing runs behave exactly as before.
model_name_or_path = os.environ.get(
    "HUBERT_MODEL_PATH",
    "/home/xiaoyujie/hf-models/TencentGameMate/chinese-hubert-large",
)

In [4]:
from datasets import load_dataset, Audio

In [5]:
# Split name -> CSV file holding that split.
dataset_files = dict(
    train="train.csv",
    dev="dev.csv",
    test="test.csv",
)

In [6]:
# Load the splits declared in `dataset_files` with the `csv` loader.
dataset = load_dataset('csv', data_files=dataset_files, name="kespeech")

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['audio_path', 'label'],
        num_rows: 267142
    })
    dev: Dataset({
        features: ['audio_path', 'label'],
        num_rows: 1080
    })
    test: Dataset({
        features: ['audio_path', 'label'],
        num_rows: 9559
    })
})

In [8]:
# Cast `audio_path` to an `Audio` feature with a 16 kHz sampling rate;
# preprocess_function later reads x["array"] from this column.
dataset = dataset.cast_column("audio_path", Audio(sampling_rate=16_000))

In [9]:
# Encode the `label` column as a class-label feature; its class names are read
# back through `.features["label"].names` in the next cell.
dataset = dataset.class_encode_column(column='label')

In [10]:
# Class names of the encoded `label` feature, taken from the train split.
labels = dataset["train"].features["label"].names

In [11]:
labels

['Beijing',
 'Ji-Lu',
 'Jiang-Huai',
 'Jiao-Liao',
 'Lan-Yin',
 'Mandarin',
 'Northeastern',
 'Southwestern',
 'Zhongyuan']

In [12]:
# Lookup tables handed to `from_pretrained` further down: class name -> id and
# id -> class name. Ids are stored as strings (str of the position in `labels`).
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

In [13]:
from transformers import AutoFeatureExtractor

# Load the feature extractor from the same path the model is later loaded from.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path)

In [14]:
def preprocess_function(examples):
    """Turn a batch of decoded audio into feature-extractor inputs.

    Args:
        examples: A batch whose ``"audio_path"`` entry is a sequence of items
            exposing the decoded samples under the ``"array"`` key.

    Returns:
        Whatever ``feature_extractor`` returns for the batch's waveforms when
        called with its own ``sampling_rate``.
    """
    waveforms = []
    for clip in examples["audio_path"]:
        waveforms.append(clip["array"])
    return feature_extractor(waveforms, sampling_rate=feature_extractor.sampling_rate)

In [15]:
# Apply preprocess_function to the dataset in batches of 100 using 4 worker
# processes, removing the `audio_path` column from the result.
encoded_dataset = dataset.map(preprocess_function, remove_columns="audio_path", batched=True, num_proc=4,
                              batch_size=100)

In [16]:
len(encoded_dataset["train"][0]['input_values'])

50864

In [17]:
# NOTE(review): looks like a leftover scratch cell — `np` is already imported in
# the first cell and this list is not referenced by any other cell shown here;
# consider deleting it together with its 100-line output.
import numpy as np

np.arange(100).tolist()

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99]

In [18]:
from sklearn.metrics import f1_score, classification_report, precision_score, accuracy_score


def compute_metrics(eval_pred):
    """Score an evaluation pass with accuracy, macro precision and macro F1.

    Args:
        eval_pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"`` and ``"f1"``.
    """
    references = eval_pred.label_ids
    # Reduce the scores to class ids once and reuse them for every metric.
    predicted = eval_pred.predictions.argmax(axis=-1)

    scores = {}
    scores["accuracy"] = accuracy_score(references, predicted)
    scores["precision"] = precision_score(references, predicted, average="macro")
    scores["f1"] = f1_score(references, predicted, average="macro")
    return scores

In [40]:
from torch import nn
from transformers import HubertForSequenceClassification


class TransformerProjector(nn.Module):
    """Projection head: a Transformer encoder pass followed by a linear map.

    Used as the ``projector`` of a HuBERT sequence classifier, which feeds it
    hidden states laid out as ``(batch, time, d_model)``.

    Args:
        d_model: Width of the incoming features (default 1024).
        nhead: Number of attention heads (default 16).
        proj_size: Width of the projected output (default 256).
        num_layers: Number of stacked encoder layers (default 1).
    """

    def __init__(self, d_model=1024, nhead=16, proj_size=256, num_layers=1):
        super().__init__()
        # batch_first=True is essential here: PyTorch's encoder layer defaults
        # to (time, batch, dim) inputs, which would make self-attention mix
        # the different utterances of a batch instead of attending over time.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_layers,
        )
        self.trans = nn.Linear(d_model, proj_size)

    def forward(self, x):
        """Encode ``x`` of shape ``(batch, time, d_model)`` and project it.

        Returns:
            Tensor of shape ``(batch, time, proj_size)``.
        """
        x = self.transformer_encoder(x)
        x = self.trans(x)
        return x


class HubertTransformerForAudioClassification(HubertForSequenceClassification):
    """HubertForSequenceClassification variant whose ``projector`` is a TransformerProjector."""

    def __init__(self, config):
        super().__init__(config)
        # Assign the custom projector after the parent constructor has run
        # (presumably replacing the projector the parent created — confirm
        # against the installed transformers version).
        self.projector = TransformerProjector()
        # Called again now that the new module is attached — presumably so it
        # goes through the library's weight initialisation; TODO confirm.
        self.post_init()

In [41]:


# One output class per entry in id2label.
num_labels = len(id2label)
# Instantiate the custom classifier from the checkpoint at `model_name_or_path`,
# passing the label maps along.
model = HubertTransformerForAudioClassification.from_pretrained(
    model_name_or_path, num_labels=num_labels, label2id=label2id, id2label=id2label
)

Some weights of HubertTransformerForAudioClassification were not initialized from the model checkpoint at /home/xiaoyujie/hf-models/TencentGameMate/chinese-hubert-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.trans.bias', 'projector.trans.weight', 'projector.transformer_encoder.layers.0.linear1.bias', 'projector.transformer_encoder.layers.0.linear1.weight', 'projector.transformer_encoder.layers.0.linear2.bias', 'projector.transformer_encoder.layers.0.linear2.weight', 'projector.transformer_encoder.layers.0.norm1.bias', 'projector.transformer_encoder.layers.0.norm1.weight', 'projector.transformer_encoder.layers.0.norm2.bias', 'projector.transformer_encoder.layers.0.norm2.weight', 'projector.transformer_encoder.layers.0.self_attn.in_proj_bias', 'projector.transformer_encoder.layers.0.self_attn.in_proj_weight', 'projector.

In [42]:
import torch

In [43]:
torch.cuda.is_available()

True

In [44]:
# Disable gradient computation for the base model. The parameter count printed
# further down reports 8,664,585 trainable parameters out of 324,099,721 total.
model.freeze_base_model()

In [45]:
model

HubertTransformerForAudioClassification(
  (hubert): HubertModel(
    (feature_extractor): HubertFeatureEncoder(
      (conv_layers): ModuleList(
        (0): HubertLayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x HubertLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x HubertLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): HubertFeatureProjection(
      (layer_norm): LayerNorm((512,), ep

In [46]:
def get_model_training_params(model):
    """Print the model's total and trainable parameter counts.

    Args:
        model: Object whose ``parameters()`` yields tensors exposing
            ``numel()`` and ``requires_grad``.

    Prints two lines (counts formatted with thousands separators) and
    returns ``None``.
    """
    overall = 0
    trainable = 0
    for param in model.parameters():
        count = param.numel()
        overall += count
        if param.requires_grad:
            trainable += count
    print(f'{overall:,} total parameters.')
    print(f'{trainable:,} training parameters.')

In [47]:
get_model_training_params(model)

324,099,721 total parameters.
8,664,585 training parameters.


In [48]:
encoded_dataset["dev"]

Dataset({
    features: ['label', 'input_values', 'attention_mask'],
    num_rows: 1080
})

In [61]:
from transformers import TrainingArguments, Trainer

# Training configuration: evaluate, checkpoint and log every 10,000 steps;
# 2 epochs, per-device train batch size 2 with no gradient accumulation,
# per-device eval batch size 1, fp16 disabled.
training_args = TrainingArguments(
    output_dir="models",
    logging_dir="logs",
    eval_strategy="steps",
    eval_steps=10000,
    save_strategy="steps",
    save_steps=10000,
    learning_rate=1e-3,
    fp16=False,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    per_device_eval_batch_size=1,
    logging_steps=10000,
    num_train_epochs=2,
)

In [62]:
# Train on the encoded train split and evaluate on the dev split, scoring with
# compute_metrics.
# NOTE(review): newer transformers releases deprecate `tokenizer=` in favour of
# `processing_class=` — confirm against the installed version before changing.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["dev"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Step,Training Loss,Validation Loss
