In [None]:
from datasets import load_dataset, DatasetDict
from transformers import (
    WhisperTokenizer,
    WhisperProcessor,
    WhisperFeatureExtractor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from datasets import Audio, Dataset
from dataclasses import dataclass
from typing import Any, Dict, List, Union
 
import torch
import torchaudio
import evaluate

from pathlib import Path

ImportError: cannot import name 'DataCollatorSpeechSeq2SeqWithPadding' from 'transformers' (c:\Users\josep\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\__init__.py)

In [3]:
model_name = "openai/whisper-small"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

In [None]:
def preprocess_function(batch):
    audio, sampling_rate = torchaudio.load(batch["audio_filepath"])

    if sampling_rate != 16000:
        audio = torchaudio.transforms.Resample(sampling_rate, 16000)(audio)
    
    inputs = processor(audio.squeeze().numpy(), text=batch['transcription'], return_tensors="pt", sampling_rate=16000, padding=True)
    
    input_features = inputs.input_features[0]
    labels = inputs.labels
    return {"input_features": input_features, "labels": labels}

In [31]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-finetuned",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=5000,
    # fp16=True,
    save_total_limit=2,
    logging_dir="./logs",
)

In [39]:
train_path = Path("train-clean-100/LibriSpeech/train-clean-100")

folders = [item.name for item in train_path.iterdir() if item.is_dir()]

def get_data(a, b):
    data = {'audio_filepath': [], 'transcription': []}
    for i in range(a, b):
        cur_path = train_path / folders[i]

        for num_folder in cur_path.iterdir():
            sub_folder_path = cur_path / num_folder.name
            txt_file = next(sub_folder_path.glob("*.txt"))

            with txt_file.open('r') as file:
                for line in file:
                    data['audio_filepath'].append(str(sub_folder_path /(line.split(" ", 1)[0]+'.flac')))
                    data['transcription'].append(line.split(" ", 1)[1])
    return data

dataset = Dataset.from_dict(get_data(0, 5))
test_dataset = Dataset.from_dict(get_data(5, 10))

In [33]:
dataset

Dataset({
    features: ['audio_filepath', 'transcription'],
    num_rows: 517
})

In [34]:
dataset = dataset.map(preprocess_function, remove_columns=["audio_filepath", "transcription"])

Map:   0%|          | 0/517 [00:00<?, ? examples/s]

In [38]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=processor,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


In [44]:
def preprocess_function2(batch):
    audio, sampling_rate = torchaudio.load(batch["audio_filepath"])

    if sampling_rate != 16000:
        audio = torchaudio.transforms.Resample(sampling_rate, 16000)(audio)
    
    inputs = processor(audio.squeeze().numpy(), text=batch['transcription'], return_tensors="pt", sampling_rate=16000, padding=True)

    input_features = inputs.input_features[0]
    labels = inputs.labels
    labels[labels == processor.tokenizer.pad_token_id] = -100

    return {"input_features": input_features[0], "labels": labels[0]}

In [45]:
processed_test_dataset = test_dataset.map(preprocess_function2, remove_columns=["audio_filepath", "transcription"])
predictions = trainer.predict(processed_test_dataset)

Map:   0%|          | 0/533 [00:00<?, ? examples/s]

AttributeError: 'WhisperProcessor' object has no attribute 'pad'

In [None]:
predictions