In [18]:
from transformers import WhisperProcessor
from datasets import DatasetDict
from pathlib import Path
import torchaudio
from datasets import Dataset

In [7]:
train_path = Path("train-clean-100/LibriSpeech/train-clean-100")

folders = [item.name for item in train_path.iterdir() if item.is_dir()]

def get_data(a, b):
    data = {'audio_filepath': [], 'transcription': []}
    for i in range(a, b):
        cur_path = train_path / folders[i]

        for num_folder in cur_path.iterdir():
            sub_folder_path = cur_path / num_folder.name
            txt_file = next(sub_folder_path.glob("*.txt"))

            with txt_file.open('r') as file:
                for line in file:
                    data['audio_filepath'].append(str(sub_folder_path /(line.split(" ", 1)[0]+'.flac')))
                    data['transcription'].append(line.split(" ", 1)[1])
    return data

In [24]:
train_dataset = Dataset.from_dict(get_data(0, 5))
eval_dataset = Dataset.from_dict(get_data(5, 10))

In [9]:
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

In [35]:
def preprocess_data(batch):
    audio_path = batch["audio_filepath"]
    try:
        waveform, sample_rate = torchaudio.load(audio_path)
        
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = resampler(waveform)
        
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0)
        
        inputs = processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt", padding=True)
        labels = processor.tokenizer(batch["transcription"]).input_ids
        
        batch["input_ids"] = inputs.input_features.squeeze(0)
        batch["labels"] = labels
    except Exception as e:
        print(f"Error processing file {audio_path}: {e}")
        batch["input_ids"] = None
        batch["labels"] = None

    return batch

In [36]:
processed_train_dataset = train_dataset.map(preprocess_data, remove_columns=["audio_filepath", "transcription"])
processed_eval_dataset = eval_dataset.map(preprocess_data, remove_columns=["audio_filepath", "transcription"])

processed_train_dataset.set_format(type="torch", columns=["input_ids", "labels"])
processed_eval_dataset.set_format(type="torch", columns=["input_ids", "labels"])

Map:   0%|          | 0/517 [00:00<?, ? examples/s]

Map:   0%|          | 0/533 [00:00<?, ? examples/s]

In [37]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(processor.tokenizer, return_tensors="pt")

In [38]:
from transformers import WhisperForConditionalGeneration, TrainingArguments, Trainer

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

In [39]:
training_args = TrainingArguments(
    output_dir="./whisper-small-finetuned",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_eval_dataset,
    tokenizer=processor.tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

  0%|          | 0/390 [00:00<?, ?it/s]

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['labels']