In [24]:
from datasets import Dataset

data = Dataset.from_json('data-open-voice/annotations/dataset1.json')
data

Dataset({
    features: ['path', 'text', 'array', 'sampling_rate'],
    num_rows: 1250
})

In [25]:
print(data.features)

{'path': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'array': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), 'sampling_rate': Value(dtype='int64', id=None)}


In [26]:
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")

def preprocess_batch(batch):
    # Extract arrays and sample rates
    audio_arrays = batch["array"]
    sampling_rates = batch["sampling_rate"]

    # Process audio features
    audio_inputs = processor(
        audio_arrays,
        sampling_rate=sampling_rates[0],  # assuming all have same rate
        return_tensors="pt",
        padding=True
    )

    # Process text labels
    labels = processor.tokenizer(
        batch["text"],
        return_tensors="pt",
        padding=True,
        truncation=True
    )
    
    # Replace padding token ID with -100
    labels["input_ids"][labels["input_ids"] == processor.tokenizer.pad_token_id] = -100
    

    return {
        "input_features": audio_inputs["input_features"],
        "labels": labels["input_ids"]
    }


In [27]:
dataset = data.map(preprocess_batch, batched=True)

Map:   0%|          | 0/1250 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [28]:
dataset = dataset.train_test_split(test_size=0.3)
dataset

DatasetDict({
    train: Dataset({
        features: ['path', 'text', 'array', 'sampling_rate', 'input_features', 'labels'],
        num_rows: 875
    })
    test: Dataset({
        features: ['path', 'text', 'array', 'sampling_rate', 'input_features', 'labels'],
        num_rows: 375
    })
})

In [29]:
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

In [30]:
from transformers import Speech2TextForConditionalGeneration, AutoProcessor

model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
processor = AutoProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")


Some weights of Speech2TextForConditionalGeneration were not initialized from the model checkpoint at facebook/s2t-small-librispeech-asr and are newly initialized: ['model.decoder.embed_positions.weights', 'model.encoder.embed_positions.weights']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./s2t_finetuned",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    learning_rate=5e-5,
    eval_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    save_strategy="epoch",
    num_train_epochs=5,
    generation_max_length=128,
    fp16=False,  # If using a GPU that supports it
    save_total_limit=2
)


In [32]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import torch

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    model: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [f["input_features"] for f in features]
        label_features = [f["labels"] for f in features]

        batch = self.processor.feature_extractor.pad(
            {"input_features": input_features},
            return_tensors="pt"
        )

        # Pad labels manually
        max_length = max(len(l) for l in label_features)
        labels_batch = torch.full((len(label_features), max_length), self.model.config.pad_token_id)
        for i, labels in enumerate(label_features):
            labels_batch[i, :len(labels)] = torch.tensor(labels)

        batch["labels"] = labels_batch
        return batch


In [33]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor, model=model)


In [38]:
import evaluate
import numpy as np
import torch

wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # Replace -100 in labels as padding token
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

In [39]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor.feature_extractor,  # Needed for logging input shapes
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


  trainer = Seq2SeqTrainer(


In [40]:
trainer.train()


Epoch,Training Loss,Validation Loss,Wer
1,1.0397,1.225211,0.290656
2,0.8754,1.200621,0.298141
3,0.9666,1.193967,0.300981
4,0.959,1.196837,0.303046
5,0.776,1.195418,0.304853


TrainOutput(global_step=1095, training_loss=0.9164759318033854, metrics={'train_runtime': 573.6956, 'train_samples_per_second': 7.626, 'train_steps_per_second': 1.909, 'total_flos': 6.360308394246144e+16, 'train_loss': 0.9164759318033854, 'epoch': 5.0})