In [20]:
from datasets import Dataset, concatenate_datasets

data1 = Dataset.from_json('data-open-voice/annotations/dataset1.json')
data2 = Dataset.from_json('data-open-voice/annotations/dataset2.json')
data3 = Dataset.from_json('data-open-voice/annotations/dataset3.json')
data = concatenate_datasets([data1, data2, data3])

In [21]:
data.shape

(3750, 4)

In [22]:
print(data.features)

{'path': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'array': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), 'sampling_rate': Value(dtype='int64', id=None)}


In [23]:
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")

def preprocess_batch(batch):
    # Extract arrays and sample rates
    audio_arrays = batch["array"]
    sampling_rates = batch["sampling_rate"]

    # Process audio features
    audio_inputs = processor(
        audio_arrays,
        sampling_rate=sampling_rates[0],  # assuming all have same rate
        return_tensors="pt",
        padding=True
    )

    # Process text labels
    labels = processor.tokenizer(
        batch["text"],
        return_tensors="pt",
        padding=True,
        truncation=True
    )
    
    # Replace padding token ID with -100
    labels["input_ids"][labels["input_ids"] == processor.tokenizer.pad_token_id] = -100
    

    return {
        "input_features": audio_inputs["input_features"],
        "labels": labels["input_ids"]
    }


In [24]:
dataset = data.map(preprocess_batch, batched=True)

In [25]:
dataset = dataset.train_test_split(test_size=0.3)
dataset

DatasetDict({
    train: Dataset({
        features: ['path', 'text', 'array', 'sampling_rate', 'input_features', 'labels'],
        num_rows: 2625
    })
    test: Dataset({
        features: ['path', 'text', 'array', 'sampling_rate', 'input_features', 'labels'],
        num_rows: 1125
    })
})

In [26]:
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

In [27]:
from transformers import Speech2TextForConditionalGeneration, AutoProcessor

model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
processor = AutoProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")


Some weights of Speech2TextForConditionalGeneration were not initialized from the model checkpoint at facebook/s2t-small-librispeech-asr and are newly initialized: ['model.decoder.embed_positions.weights', 'model.encoder.embed_positions.weights']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./s2t_finetuned",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    learning_rate=5e-5,
    eval_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    save_strategy="epoch",
    num_train_epochs=5,
    generation_max_length=128,
    fp16=False,  # If using a GPU that supports it
    save_total_limit=2
)


In [29]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import torch

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    model: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [f["input_features"] for f in features]
        label_features = [f["labels"] for f in features]

        batch = self.processor.feature_extractor.pad(
            {"input_features": input_features},
            return_tensors="pt"
        )

        # Pad labels manually
        max_length = max(len(l) for l in label_features)
        labels_batch = torch.full((len(label_features), max_length), self.model.config.pad_token_id)
        for i, labels in enumerate(label_features):
            labels_batch[i, :len(labels)] = torch.tensor(labels)

        batch["labels"] = labels_batch
        return batch


In [30]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor, model=model)


In [31]:
import evaluate
import numpy as np
import torch

wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # Replace -100 in labels as padding token
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"Word Error Rate": wer}

In [48]:
from transformers import Seq2SeqTrainer, EarlyStoppingCallback

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor.feature_extractor,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    compute_metrics=compute_metrics
)


  trainer = Seq2SeqTrainer(


In [None]:
trainer.train()


Epoch,Training Loss,Validation Loss,Word error rate
1,2.0409,1.52215,0.403803
2,1.9294,1.328592,0.371679
3,1.6166,1.280242,0.349453
4,1.5939,1.201919,0.337645
5,1.543,1.195847,0.337211


TrainOutput(global_step=3285, training_loss=1.8112195681219232, metrics={'train_runtime': 1543.0217, 'train_samples_per_second': 8.506, 'train_steps_per_second': 2.129, 'total_flos': 1.9138238921244672e+17, 'train_loss': 1.8112195681219232, 'epoch': 5.0})

In [37]:
results = trainer.evaluate()
print(results)

{'eval_loss': 1.1958465576171875, 'eval_Word Error Rate': 0.337211321409967, 'eval_runtime': 121.8296, 'eval_samples_per_second': 9.234, 'eval_steps_per_second': 2.315, 'epoch': 5.0}


In [45]:
sample = eval_dataset.select(range(10))
output = trainer.predict(sample)

wer_metric = evaluate.load("wer")
pred_str = processor.batch_decode(output.predictions, skip_special_tokens=True)
label_str = processor.batch_decode(output.label_ids, skip_special_tokens=True)

for ref, pred in zip(label_str, pred_str):
    wer = wer_metric.compute(predictions=[pred], references=[ref])
    print(f"\n * Reference: {ref}\n * Prediction: {pred} \n * WER: {wer:.3f}")


 * Reference: two initiatives were approved for the general election ballot
 * Prediction: two natives were approved for the general election ballets 
 * WER: 0.222

 * Reference: the test missile is launched successfully
 * Prediction: the test thistle is lodged successfully 
 * WER: 0.333

 * Reference: larson is married and has two children sarah and luke
 * Prediction: loosont is married and has two zimbre 
 * WER: 0.500

 * Reference: law and economics has developed in a variety of directions
 * Prediction: the north end echoed 
 * WER: 1.000

 * Reference: it has been found that shynessinhibition is associated with a variety of maladaptive behaviors
 * Prediction: it has been found that chynus inhibation is associated with the variety of maled behavio 
 * WER: 0.357

 * Reference: rey also provided guitar on janet jackson's single just a little while
 * Prediction: ray also provided guitar on janet jackson's symboled just a little while while 
 * WER: 0.250

 * Reference: the cl

In [49]:
trainer.save_model("cupal-model-3750-5")
processor.save_pretrained("cupal-model-3750-5")


[]

In [50]:
# DABLOAT TESTING GROUNDS