In [1]:
# Sampling rate requested when the audio column is decoded and when features are extracted.
SAMPLE_RATE = 16000

# Root folder that holds the audio clips and their CSV manifests.
DATA_PATH = 'data'

# Training split: clip directory and the CSV that lists clips with their labels.
TRAIN_DATA_PATH = f'{DATA_PATH}/train'
TRAIN_FILENAME = f'{DATA_PATH}/train_gt.csv'

# Task split: clip directory and the CSV that lists the clips to predict.
TASK_DATA_PATH = f'{DATA_PATH}/test'
TASK_FILENAME = f'{DATA_PATH}/test.csv'

In [2]:
import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, load_dataset, Audio
from transformers import AutoFeatureExtractor, ASTForAudioClassification, Trainer, ASTConfig, ASTModel, TrainingArguments, ASTFeatureExtractor
from transformers import AutoModelForAudioClassification




In [3]:
# Load the "accuracy" metric from the `evaluate` library; compute_metrics calls
# its .compute() method on every evaluation pass.
accuracy = evaluate.load("accuracy")

In [11]:
# Read the training manifest (the CSV has no header row, so column names are
# supplied here) and prefix each file name with the training clip directory.
df = pd.read_csv(TRAIN_FILENAME, header=None, names=['audio', 'label']).assign(
    audio=lambda frame: TRAIN_DATA_PATH + "/" + frame['audio']
)

# Wrap the frame in a Dataset and mark "audio" as an Audio feature decoded at SAMPLE_RATE.
dataset = Dataset.from_pandas(df)
dataset = dataset.cast_column("audio", Audio(sampling_rate=SAMPLE_RATE))

In [5]:
# Feature extractor built with the library defaults; it is applied to raw
# waveforms both for training and for inference.
feature_extractor = ASTFeatureExtractor()

# A small AST configuration. The model below is constructed from this config
# alone, so no pretrained weights are loaded.
model_config = ASTConfig(
    # Input geometry and patching.
    num_mel_bins=128,
    max_length=1024,
    patch_size=16,
    frequency_stride=10,
    time_stride=10,
    # Encoder size.
    hidden_size=200,
    num_hidden_layers=4,
    num_attention_heads=4,
    intermediate_size=900,
    hidden_act='gelu',
    qkv_bias=True,
    # Dropout, initialisation and normalisation.
    hidden_dropout_prob=0.0,
    attention_probs_dropout_prob=0.0,
    initializer_range=0.02,
    layer_norm_eps=1e-12,
)

model = ASTForAudioClassification(model_config)

In [3]:
# Preprocess the dataset
def preprocess_function(examples):
    """Convert one dataset row's decoded audio into model input features.

    Args:
        examples: A single (non-batched) row whose "audio" entry carries the
            waveform under the "array" key.

    Returns:
        The feature extractor's output, with "input_values" stripped of its
        leading batch axis so each stored example is a 2-D feature array.
    """
    audio = examples["audio"]
    # NOTE(review): feature_size=-1 is forwarded as an extra keyword argument;
    # confirm the extractor actually uses it.
    inputs = feature_extractor(audio["array"], return_tensors='np', feature_size=-1, sampling_rate=SAMPLE_RATE)
    # One waveform goes in, so the leading axis has length 1. Index it away
    # instead of reshaping to a hard-coded (1024, 128), so the result follows
    # whatever frame count / bin count the extractor is configured with.
    inputs['input_values'] = inputs['input_values'][0]
    return inputs

# Metrics computing while fine tuning
def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Args:
        eval_pred: Object exposing `predictions` (per-class scores, one row
            per example) and `label_ids` (the reference labels).

    Returns:
        Whatever `accuracy.compute` returns for the arg-max predictions.
    """
    class_scores = eval_pred.predictions
    reference_labels = eval_pred.label_ids
    # The predicted class is the column holding the largest score in each row.
    predicted_classes = np.argmax(class_scores, axis=1)
    return accuracy.compute(references=reference_labels, predictions=predicted_classes)

In [14]:
# Extract features for every clip, then hold out 30% of the rows for evaluation.
# The fixed seed keeps the train/eval partition identical across runs, so
# accuracy numbers from different runs are comparable.
# (map(batched=True) could speed this up, but preprocess_function as written
# handles a single row per call and would need adapting first.)
encoded_dataset = dataset.map(preprocess_function, remove_columns=["audio"]).train_test_split(test_size=0.3, seed=42)

Map:   0%|          | 0/8803 [00:00<?, ? examples/s]

In [21]:
# Training hyper-parameters. Checkpoints go to the relative "ast_save" folder,
# the same location this notebook uses for trainer.save_model(...) and
# from_pretrained(...), rather than an absolute path at the filesystem root
# that may not be writable.
training_arguments = TrainingArguments(
    output_dir="ast_save",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    # gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    # warmup_ratio=0.1,
    # logging_steps=5,
    weight_decay=0.01,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy`; confirm against the installed version.
    evaluation_strategy="epoch",
    # Save at the same cadence as evaluation so the best checkpoint can be restored.
    save_strategy="epoch",
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
)

# The Trainer evaluates on the held-out "test" split produced by train_test_split.
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    compute_metrics=compute_metrics,
)

In [22]:
# Run training; per training_arguments, evaluation and checkpoint saving
# happen once per epoch.
trainer.train()

CPU times: total: 0 ns
Wall time: 0 ns


  0%|          | 0/772 [00:00<?, ?it/s]

  0%|          | 0/166 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 1024}


{'eval_loss': 0.6552125215530396, 'eval_accuracy': 0.6376372586141613, 'eval_runtime': 664.3126, 'eval_samples_per_second': 3.976, 'eval_steps_per_second': 0.25, 'epoch': 1.0}
{'loss': 0.666, 'grad_norm': 5.292749881744385, 'learning_rate': 1.0569948186528496e-05, 'epoch': 1.3}


Non-default generation parameters: {'max_length': 1024}


  0%|          | 0/166 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 1024}


{'eval_loss': 0.6537614464759827, 'eval_accuracy': 0.6376372586141613, 'eval_runtime': 664.5851, 'eval_samples_per_second': 3.974, 'eval_steps_per_second': 0.25, 'epoch': 2.0}
{'train_runtime': 8402.3936, 'train_samples_per_second': 1.467, 'train_steps_per_second': 0.092, 'train_loss': 0.6645753297163415, 'epoch': 2.0}


TrainOutput(global_step=772, training_loss=0.6645753297163415, metrics={'train_runtime': 8402.3936, 'train_samples_per_second': 1.467, 'train_steps_per_second': 0.092, 'total_flos': 2.3130917868404736e+16, 'train_loss': 0.6645753297163415, 'epoch': 2.0})

In [27]:
# Read the task manifest (no header row) and prefix each file name with the
# task clip directory.
test = pd.read_csv(TASK_FILENAME, header=None, names=['audio', 'label']).assign(
    audio=lambda frame: TASK_DATA_PATH + "/" + frame['audio']
)

# Rebinds `dataset` (previously the training clips) to the task clips,
# decoded at SAMPLE_RATE.
dataset = Dataset.from_pandas(test, split="train")
dataset = dataset.cast_column("audio", Audio(sampling_rate=SAMPLE_RATE))

In [7]:
# Reload the classifier from the relative "ast_save" directory, which must
# already contain a model written by trainer.save_model('ast_save').
# AutoModelForAudioClassification is imported in the notebook's import cell.
model = AutoModelForAudioClassification.from_pretrained("ast_save")

In [29]:
# Collect the raw logits for each clip in `dataset`, one forward pass per clip.
resi = []
# Gradients are not needed for inference, so tracking is disabled for the whole loop.
with torch.no_grad():
    for clip in dataset:
        features = feature_extractor(
            clip["audio"]["array"],
            sampling_rate=SAMPLE_RATE,
            return_tensors="pt",
            feature_size=-1,
        )
        outputs = model(**features)
        resi.append(outputs.logits)

In [31]:
# Turn each clip's logits into a predicted class index. Each entry of `resi`
# comes from a single-clip forward pass, so taking the arg-max along the last
# (class) axis yields exactly one value per entry.
# `torch` is already imported in the notebook's import cell.
resich = [logits.argmax(dim=-1).item() for logits in resi]

In [32]:
# Display the predicted class index for every task clip.
# NOTE(review): the recorded output is all zeros and the logged eval accuracy
# is identical after both epochs, which suggests the model is predicting a
# single class for everything; confirm before relying on these predictions.
resich

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [59]:
# Write the trainer's model to the relative "ast_save" directory; this is the
# path the from_pretrained("ast_save") reload cell reads, so this must have
# been run before that cell.
trainer.save_model('ast_save')

Non-default generation parameters: {'max_length': 1024}
