In [1]:
# Mount Google Drive inside the Colab VM; later cells read the dataset from it
# and write the fine-tuned model back to it.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
# Step 1: Install necessary libraries
!pip install transformers datasets accelerate audiomentations librosa==0.10.1

Collecting audiomentations
  Downloading audiomentations-0.43.1-py3-none-any.whl.metadata (11 kB)
Collecting librosa==0.10.1
  Downloading librosa-0.10.1-py3-none-any.whl.metadata (8.3 kB)
Collecting numpy-minmax<1,>=0.3.0 (from audiomentations)
  Downloading numpy_minmax-0.5.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)
Collecting numpy-rms<1,>=0.4.2 (from audiomentations)
  Downloading numpy_rms-0.6.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.5 kB)
Collecting python-stretch<1,>=0.3.1 (from audiomentations)
  Downloading python_stretch-0.3.1-cp312-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting soxr>=0.3.2 (from librosa==0.10.1)
  Downloading soxr-0.5.0.post1-cp312-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading librosa-0.10.1-py3-none-any.whl (253 kB)
[2K   [90m━━━━━━━━━━━━━━━━━

In [None]:
# Step 2: Import all required modules
import os
import random
import librosa
import numpy as np
from datasets import Dataset
from transformers import AutoProcessor, AutoModelForCTC, TrainingArguments, Trainer
from dataclasses import dataclass
from typing import Dict, List, Union
from scipy.signal import butter, lfilter
from audiomentations import AddGaussianNoise
import torch

# Step 3: Your custom Augmenter class
class Augmenter:
    """Randomly distort a 1-D waveform for data augmentation.

    Each distortion is switched on independently with its own probability. The
    ones that are switched on are always applied in the same order: Gaussian
    noise, a single delayed echo ("reverb"), time stretching, then band-stop
    filtering ("frequency masking").

    Args:
        sr: Sample rate in Hz used for the noise transform, the echo delay and
            the filter design.
        noise_prob: Probability of adding Gaussian noise.
        noise_max_amp: ``max_amplitude`` forwarded to ``AddGaussianNoise``.
        reverb_prob: Probability of adding the delayed echo.
        reverb_delay: Echo delay in seconds.
        reverb_decay: Gain applied to the echo before it is mixed back in.
        time_stretch_prob: Probability of time stretching.
        time_stretch_range: ``(low, high)`` bounds for the random stretch rate.
        freq_mask_prob: Probability of applying band-stop filtering.
        freq_mask_n: Number of band-stop filters attempted per call.
    """

    def __init__(self, sr=16000,
                 noise_prob=0.2, noise_max_amp=0.015,
                 reverb_prob=0.1, reverb_delay=0.03, reverb_decay=0.25,
                 time_stretch_prob=0.2, time_stretch_range=(0.9, 1.1),
                 freq_mask_prob=0.2, freq_mask_n=2):
        self.sr = sr
        self.noise_prob = noise_prob
        self.reverb_prob = reverb_prob
        self.time_stretch_prob = time_stretch_prob
        self.freq_mask_prob = freq_mask_prob
        # p=1.0: whether noise is applied at all is decided in augment().
        self.noise_aug = AddGaussianNoise(p=1.0, max_amplitude=noise_max_amp)
        self.reverb_delay = reverb_delay
        self.reverb_decay = reverb_decay
        self.time_stretch_range = time_stretch_range
        self.freq_mask_n = freq_mask_n

    def augment(self, audio):
        """Return a randomly distorted float32 copy of ``audio``.

        Args:
            audio: 1-D sequence or array of samples. It is copied, so the
                caller's data is never modified.

        Returns:
            np.ndarray: The augmented signal as float32. Its length differs
            from the input only when time stretching was applied.
        """
        audio = np.array(audio, dtype=np.float32)

        # Draw all on/off decisions first, in a fixed order, so the sequence of
        # random numbers consumed does not depend on which distortions fire.
        distortions = []
        if random.random() < self.noise_prob:
            distortions.append('noise')
        if random.random() < self.reverb_prob:
            distortions.append('reverb')
        if random.random() < self.time_stretch_prob:
            distortions.append('time_stretch')
        if random.random() < self.freq_mask_prob:
            distortions.append('frequency_masking')

        for d in distortions:
            if d == 'noise':
                audio = self.noise_aug(samples=audio, sample_rate=self.sr)
            elif d == 'reverb':
                # One echo: the scaled signal shifted right by `delay` samples,
                # truncated to the original length and added back.
                delay = int(self.reverb_delay * self.sr)
                echo = np.pad(audio * self.reverb_decay, (delay, 0), 'constant')
                audio = audio + echo[:len(audio)]
            elif d == 'time_stretch':
                audio = librosa.effects.time_stretch(
                    y=audio, rate=random.uniform(*self.time_stretch_range))
            elif d == 'frequency_masking':
                nyquist = self.sr / 2
                for _ in range(self.freq_mask_n):
                    # Second draw is the width of the stop band, not an absolute frequency.
                    l_freq, bandwidth = random.uniform(200, 4000), random.uniform(500, 2500)
                    # Skip bands whose upper edge would reach the Nyquist frequency.
                    if l_freq + bandwidth < nyquist:
                        b, a = butter(N=4, Wn=[l_freq, l_freq + bandwidth],
                                      btype="bandstop", fs=self.sr)
                        audio = lfilter(b, a, audio)

        # lfilter promotes to float64; cast back so the output dtype is the
        # same regardless of which distortions were applied.
        return np.asarray(audio, dtype=np.float32)

# Step 4: Your custom load_data function
Training_dirs = "/content/drive/MyDrive/MLDatasets/Milan2025/LibriSpeech/train-clean-100/"

def load_data(root_dir=None):
    """Collect matching (audio path, transcription) pairs from a dataset tree.

    The tree is expected to be laid out as
    ``<root>/<speaker_id>/<chapter_id>/<speaker_id>-<chapter_id>.trans.txt``
    where every transcript line is ``<file_id> <text>`` and the audio for that
    line is ``<file_id>.flac`` in the same chapter directory.

    Args:
        root_dir: Directory to scan. Defaults to the module-level
            ``Training_dirs`` when omitted.

    Returns:
        tuple[list[str], list[str]]: Parallel lists of audio file paths and
        their transcriptions. Entries whose ``.flac`` file is missing and
        transcript lines without any text are skipped.
    """
    if root_dir is None:
        root_dir = Training_dirs

    file_paths, transcriptions = [], []
    print("Scanning directories for .flac files...")
    # Sorted traversal: os.listdir order is arbitrary, and a stable order makes
    # the resulting dataset identical from run to run.
    for speaker_id in sorted(os.listdir(root_dir)):
        speaker_path = os.path.join(root_dir, speaker_id)
        if not os.path.isdir(speaker_path):
            continue
        for chapter_id in sorted(os.listdir(speaker_path)):
            chapter_path = os.path.join(speaker_path, chapter_id)
            if not os.path.isdir(chapter_path):
                continue
            trans_path = os.path.join(chapter_path, f"{speaker_id}-{chapter_id}.trans.txt")
            if not os.path.exists(trans_path):
                continue
            with open(trans_path, 'r', encoding='utf-8') as f:
                for line in f:
                    parts = line.strip().split(maxsplit=1)
                    # Blank lines or lines holding only an id carry no transcription.
                    if len(parts) < 2:
                        continue
                    file_id, text = parts
                    audio_path = os.path.join(chapter_path, f"{file_id}.flac")
                    if os.path.exists(audio_path):
                        file_paths.append(audio_path)
                        transcriptions.append(text)
    return file_paths, transcriptions

# --- Main Fine-Tuning Workflow using the Trainer ---

# Seed the random number generators before any augmentation runs so that the
# augmented dataset built below is the same on every Restart & Run All.
# Augmenter draws from the `random` module directly; NumPy and torch are seeded
# too because library code used during preprocessing/model init may draw from them.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# 1. Load data and create Dataset object
file_paths, transcriptions = load_data()
data_dict = {"file_path": file_paths, "transcription": transcriptions}
hf_dataset = Dataset.from_dict(data_dict)
print(f"Created a dataset with {len(hf_dataset)} samples.")

# 2. Instantiate dependencies
augmenter = Augmenter()
model_id = "facebook/wav2vec2-base-960h"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForCTC.from_pretrained(model_id) # PyTorch model for the Trainer

# 3. Create the preprocessing function
def prepare_dataset(batch):
    """Turn one dataset row into model-ready features.

    Loads the audio at ``batch["file_path"]`` at 16 kHz, runs it through the
    module-level ``augmenter``, and stores the processor's ``input_values`` and
    the tokenised transcription (``labels``) on ``batch``.

    Note: this runs once inside ``Dataset.map``, so each example is augmented a
    single time rather than afresh every epoch.
    """
    waveform, _ = librosa.load(batch["file_path"], sr=16000)
    features = processor(audio=augmenter.augment(waveform), sampling_rate=16000)
    token_ids = processor(text=batch["transcription"]).input_ids
    batch["input_values"] = features.input_values[0]
    batch["labels"] = token_ids
    return batch

# Preprocess every row in a single process and drop the original columns so
# only the fields written by prepare_dataset remain.
processed_ds = hf_dataset.map(
    prepare_dataset,
    num_proc=1,
    remove_columns=hf_dataset.column_names,
)

# 4. Define the Data Collator
@dataclass
class DataCollatorCTCWithPadding:
    """Collate preprocessed CTC examples into a single padded PyTorch batch.

    Inputs and labels are padded separately through the processor's ``pad``
    method because they have different lengths and padding values.
    """
    # Processor whose ``pad`` method pads both the audio inputs and the labels.
    # The annotation is a forward reference: it documents intent and is never
    # evaluated when the class is created.
    processor: "AutoProcessor"
    # Padding strategy forwarded unchanged to ``processor.pad``.
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], np.ndarray]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples and return them as one batch.

        Args:
            features: Examples that each provide ``"input_values"`` and ``"labels"``.

        Returns:
            The padded input batch with an added ``"labels"`` tensor in which
            padded positions are set to -100.
        """
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Use the processor this collator was constructed with rather than a
        # module-level global, so the collator works with whatever it is given.
        batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt")
        labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt")

        # Replace padding with -100 to be ignored by loss function
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels
        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

# 5. Define Training Arguments
training_args = TrainingArguments(
    output_dir="/content/wav2vec2-finetuned-augmented",
    num_train_epochs=5,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # 8 x 2 = 16 examples per optimizer step per device
    fp16=True,  # mixed precision
    save_total_limit=2,
    # Evaluation and logging steps can be added here once a validation split exists
)

# 6. Instantiate the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_ds,
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,
)

print("\n--- Starting Model Fine-Tuning with Wav2Vec2 and Trainer ---")
trainer.train()
print("\n--- Fine-Tuning Complete ---")

# Write the final model to Google Drive so it outlives the Colab session.
trainer.save_model("/content/drive/MyDrive/")

Scanning directories for .flac files...
Created a dataset with 28539 samples.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/28539 [00:00<?, ? examples/s]