In [1]:
from datasets import Dataset, concatenate_datasets

data1 = Dataset.from_json('data-open-voice/annotations/dataset1.json')
data2 = Dataset.from_json('data-open-voice/annotations/dataset2.json')
data3 = Dataset.from_json('data-open-voice/annotations/dataset3.json')
data4 = Dataset.from_json('data-open-voice/annotations/dataset4.json')
data5 = Dataset.from_json('data-open-voice/annotations/dataset5.json')
data6 = Dataset.from_json('data-open-voice/annotations/dataset6.json')
data7 = Dataset.from_json('data-open-voice/annotations/dataset7.json')
data8 = Dataset.from_json('data-open-voice/annotations/dataset8.json')
data9 = Dataset.from_json('data-open-voice/annotations/dataset9.json')
data10 = Dataset.from_json('data-open-voice/annotations/dataset10.json')
data11 = Dataset.from_json('data-open-voice/annotations/dataset11.json')
data12 = Dataset.from_json('data-open-voice/annotations/dataset12.json')
data13 = Dataset.from_json('data-open-voice/annotations/dataset13.json')
data14 = Dataset.from_json('data-open-voice/annotations/dataset14.json')
data15 = Dataset.from_json('data-open-voice/annotations/dataset15.json')
data = concatenate_datasets([data1, data2, data3, data4, data5, data6, data7, data8, data9, data10, data11, data12, data13, data14, data15])

In [2]:
data.shape

(17871, 4)

In [3]:
print(data.features)

{'path': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'array': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), 'sampling_rate': Value(dtype='int64', id=None)}


In [4]:
from transformers import Speech2TextProcessor
import numpy as np

# Load processor and dataset
processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")

# Function to compute length of tokenized text
def get_token_lengths(example):
    with processor.as_target_processor():
        tokens = processor(example["text"], return_tensors=None)
        example["token_length"] = len(tokens["input_ids"])
    return example

# Map over dataset
token_lens = data.map(get_token_lengths, remove_columns=data.column_names)

# Extract and analyze
lengths = token_lens["token_length"]
print("Max length:", max(lengths))
print("Mean length:", np.mean(lengths))
print("95th percentile:", np.percentile(lengths, 95))

Max length: 50
Mean length: 17.532538749930055
95th percentile: 27.0


In [5]:
from transformers import Speech2TextProcessor

processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")

def preprocess_batch(batch):
    # Extract arrays and sample rates
    audio_arrays = batch["array"]
    sampling_rates = batch["sampling_rate"]

    # Process audio features
    audio_inputs = processor(
        audio_arrays,
        sampling_rate=sampling_rates[0],
        return_tensors="pt",
        padding=True
    )

    # Process text labels
    with processor.as_target_processor():
        labels = processor(
            batch["text"],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=32
        )

    # Replace pad tokens with -100 for loss masking
    labels["input_ids"][labels["input_ids"] == processor.tokenizer.pad_token_id] = -100

    return {
        "input_features": audio_inputs["input_features"],
        "labels": labels["input_ids"]
    }


In [6]:
dataset = data.map(preprocess_batch, batched=True)

Map:   0%|          | 0/17871 [00:00<?, ? examples/s]

2025-05-01 23:13:08.574580: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746112388.645589  305702 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746112388.665930  305702 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746112388.850508  305702 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746112388.850530  305702 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746112388.850531  305702 computation_placer.cc:177] computation placer alr

In [7]:
dataset = dataset.train_test_split(test_size=0.3)
dataset

DatasetDict({
    train: Dataset({
        features: ['path', 'text', 'array', 'sampling_rate', 'input_features', 'labels'],
        num_rows: 12509
    })
    test: Dataset({
        features: ['path', 'text', 'array', 'sampling_rate', 'input_features', 'labels'],
        num_rows: 5362
    })
})

In [8]:
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

In [9]:
from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor

model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")


Some weights of Speech2TextForConditionalGeneration were not initialized from the model checkpoint at facebook/s2t-small-librispeech-asr and are newly initialized: ['model.decoder.embed_positions.weights', 'model.encoder.embed_positions.weights']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./s2t_finetuned",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    learning_rate=3e-5,
    eval_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    save_strategy="epoch",
    num_train_epochs=10,
    generation_max_length=128,
    fp16=True,  # If using a GPU that supports it
    save_total_limit=2,
    load_best_model_at_end=True,                # ✅ Required for early stopping
    metric_for_best_model="eval_Word Error Rate",                # ✅ Choose your metric
    greater_is_better=False,
)


In [11]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import torch

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    model: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [f["input_features"] for f in features]
        label_features = [f["labels"] for f in features]

        batch = self.processor.feature_extractor.pad(
            {"input_features": input_features},
            return_tensors="pt"
        )

        # Pad labels manually
        max_length = max(len(l) for l in label_features)
        labels_batch = torch.full((len(label_features), max_length), self.model.config.pad_token_id)
        for i, labels in enumerate(label_features):
            labels_batch[i, :len(labels)] = torch.tensor(labels)

        batch["labels"] = labels_batch
        return batch


In [12]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor, model=model)


In [13]:
import evaluate
import numpy as np
import torch

wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # Replace -100 in labels as padding token
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"Word Error Rate": wer}

In [14]:
from transformers import Seq2SeqTrainer, EarlyStoppingCallback

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor.feature_extractor,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    compute_metrics=compute_metrics
)


  trainer = Seq2SeqTrainer(


In [15]:
trainer.train()


Epoch,Training Loss,Validation Loss,Word error rate
1,1.7949,1.20687,0.314554
2,1.3237,0.966823,0.265956
3,1.1181,0.895766,0.248469
4,0.8993,0.861204,0.238577
5,0.992,0.838219,0.232803
6,0.8109,0.823213,0.228059
7,0.8212,0.815503,0.225889
8,0.6947,0.809167,0.222819
9,0.733,0.804756,0.222102
10,0.7345,0.803357,0.221771


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=31280, training_loss=1.1801874401014478, metrics={'train_runtime': 14923.883, 'train_samples_per_second': 8.382, 'train_steps_per_second': 2.096, 'total_flos': 1.832735332009083e+18, 'train_loss': 1.1801874401014478, 'epoch': 10.0})

In [16]:
results = trainer.evaluate()
print(results)

{'eval_loss': 0.8033570051193237, 'eval_Word Error Rate': 0.22177070883515676, 'eval_runtime': 555.7157, 'eval_samples_per_second': 9.649, 'eval_steps_per_second': 2.413, 'epoch': 10.0}


In [17]:
sample = eval_dataset.select(range(10))
output = trainer.predict(sample)

wer_metric = evaluate.load("wer")
pred_str = processor.batch_decode(output.predictions, skip_special_tokens=True)
label_str = processor.batch_decode(output.label_ids, skip_special_tokens=True)

for ref, pred in zip(label_str, pred_str):
    wer = wer_metric.compute(predictions=[pred], references=[ref])
    print(f"\n * Reference: {ref}\n * Prediction: {pred} \n * WER: {wer:.3f}")


 * Reference: a limited release of the record featured clear blue vinyl
 * Prediction: limited release of the record featured clear blue final 
 * WER: 0.200

 * Reference: the plane crashed near ayelet hashahar
 * Prediction: the blame crashed near alet hashhahar 
 * WER: 0.500

 * Reference: di bona met his first wife gina with whom he has a daughter cara
 * Prediction: the owner met his first wife genoa with whom he has the daughter cara 
 * WER: 0.286

 * Reference: the subsequent habsburg emperors concentrated on their territories mainly in austria bohemia and hungary
 * Prediction: his subsequent airspring emperors concentrated on their territory namely in austria bohemia and hungary 
 * WER: 0.286

 * Reference: the pair returned and king edward quickly reinstated despenser as royal favourite
 * Prediction: the pair returned and king edward quickly reinstated to spencer his royal favorite 
 * WER: 0.333

 * Reference: progress on the saturn design seemed to go smoothly
 * Predi

In [24]:
from transformers import Seq2SeqTrainer
from kenlm import Model
import torch

# Load WER metric and KenLM ARPA model
wer_metric = evaluate.load("wer")
lm = Model("lib4gram.binary")

# Helper function to score sentences with KenLM
def score_sentence(sentence):
    return lm.score(sentence.strip(), bos=True, eos=True)

# Rescore beam outputs
def rescore_beam_list(beam_outputs):
    best_hypotheses = []
    for beams in beam_outputs:
        if not beams:
            best_hypotheses.append("")
            continue
        scores = [(hyp, score_sentence(hyp)) for hyp in beams]
        best = max(scores, key=lambda x: x[1])
        best_hypotheses.append(best[0])
    return best_hypotheses

# Beam search decoding
def generate_beam_outputs(model, processor, dataset, num_beams=5):
    model.eval()
    beam_outputs = []
    
    for batch in dataset:
        inputs = processor(batch["array"], sampling_rate=batch["sampling_rate"], return_tensors="pt", padding=True)
        input_values = inputs.input_values.to(model.device)

        with torch.no_grad():
            gen_output = model.generate(
                input_values,
                num_beams=num_beams,
                num_return_sequences=num_beams,
                return_dict_in_generate=True,
                output_scores=True,
                early_stopping=True,
            ).sequences

        decoded = processor.batch_decode(gen_output, skip_special_tokens=True)
        beam_outputs.append(decoded)

    return beam_outputs

# Evaluate rescored outputs
def evaluate_with_lm(model, processor, eval_dataset, num_beams=5):
    # Get references
    references = [sample["text"].lower().strip() for sample in eval_dataset]

    # Generate hypotheses and rescore
    beam_outputs = generate_beam_outputs(model, processor, eval_dataset, num_beams)
    rescored_preds = rescore_beam_list(beam_outputs)

    # Compute WER
    wer_score = wer_metric.compute(predictions=rescored_preds, references=references)
    print(f"WER after LM rescoring: {wer_score:.4f}")
    return wer_score


In [25]:
evaluate_with_lm(model, processor, eval_dataset)

KeyboardInterrupt: 

In [23]:
trainer.save_model("cupal-model-17871-10")
processor.save_pretrained("cupal-model-17871-10")


[]

In [50]:
# DABLOAT TESTING GROUNDS