In [1]:
from datasets import Dataset, load_dataset

In [2]:
data = Dataset.from_json('data-open-voice/annotations/dataset1.json')

In [3]:
data

Dataset({
    features: ['path', 'text', 'array', 'sampling_rate'],
    num_rows: 1250
})

In [4]:
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration

model_name = "facebook/s2t-small-librispeech-asr"

processor = Speech2TextProcessor.from_pretrained(model_name)
model = Speech2TextForConditionalGeneration.from_pretrained(model_name)

2025-04-23 00:46:29.606502: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745340389.687339    3453 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745340389.708676    3453 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745340389.877696    3453 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745340389.877723    3453 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745340389.877727    3453 computation_placer.cc:177] computation placer alr

In [5]:
def preprocess(example):
    max_audio_length = 1000  # Customize based on your data or model input length
    max_text_length = 200    # Customize based on your data or model output length

    # Process audio input with truncation and padding
    inputs = processor(
        example["array"],
        sampling_rate=example["sampling_rate"],
        return_tensors="pt",
        padding="max_length",  # Pad to fixed length
        max_length=max_audio_length,  # Define max length for audio
        truncation=True  # Truncate if it's longer than max_length
    )
    
    # Process text transcription (labels) with padding and truncation
    with processor.as_target_processor():
        labels = processor(
            example["text"],
            return_tensors="pt",
            padding="max_length",  # Pad to fixed length for text
            max_length=max_text_length,  # Define max length for text labels
            truncation=True  # Truncate if text is too long
        )

    return {
        "input_features": inputs.input_features.squeeze(0),  # Remove batch dim
        "labels": labels.input_ids.squeeze(0)  # Remove batch dim
    }



# Apply transformation
dataset = data.map(preprocess, batched=False)

In [10]:
print(dataset[0])

{'path': 'data-open-voice/voices/common_voice_en_35244283.mp3', 'text': 'He was commonly referred to as the "Blacksmith of Ballinalee".', 'array': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.

In [6]:
import evaluate
import numpy as np

wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # Decode predictions
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)

    # Replace -100 with pad token
    label_ids_clean = [
        [token if token != -100 else processor.tokenizer.pad_token_id for token in label_seq]
        for label_seq in label_ids
    ]

    # Decode labels
    label_str = processor.batch_decode(label_ids_clean, skip_special_tokens=True)

    # 🐛 Debug print
    print("pred_str (sample):", pred_str[:2])
    print("label_str (sample):", label_str[:2])
    print("Type of pred_str[0]:", type(pred_str[0]))
    print("Type of label_str[0]:", type(label_str[0]))

    # Compute WER
    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [13]:
from transformers import TrainingArguments, Trainer

split_dataset = dataset.train_test_split(test_size=0.1)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

training_args = TrainingArguments(
    output_dir="./s2t-finetuned",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_steps=500,
    logging_steps=50,
    fp16=True,
    eval_strategy="epoch",  # Needed for eval
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor.feature_extractor,
    compute_metrics = compute_metrics
)

  trainer = Trainer(


In [None]:
output = trainer.train()

Step,Training Loss
50,8.8966
100,5.1199
150,1.6555
200,0.6023
250,0.4859
300,0.4068
350,0.4301
400,0.385
450,0.384
500,0.3804




In [None]:
# Evaluate the model
eval_results = trainer.evaluate()

# Output results
print(f"Evaluation results: {eval_results}")

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'list'