In [13]:
pip install torch torchvision torchaudio pandas numpy transformers datasets librosa soundfile

Note: you may need to restart the kernel to use updated packages.


In [14]:
# Import required libraries
import csv
import json
import os
from pathlib import Path

import librosa
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, Audio, Dataset
from transformers import (
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    TrainingArguments,
    Trainer
)

In [15]:
# Set paths (relative to the notebook's working directory)
DATA_DIR = Path('data/cv-corpus-21.0-2025-03-14/uz')
TRAIN_TSV = DATA_DIR / 'train.tsv'
DEV_TSV = DATA_DIR / 'dev.tsv'
CLIPS_DIR = DATA_DIR / 'clips'

# Load TSV files.
# QUOTE_NONE stops pandas from treating a double quote inside a sentence as the
# start of a quoted field, which could otherwise swallow tabs/newlines and merge
# rows. NOTE(review): assumes the Common Voice TSVs are written without CSV
# quoting -- confirm the row counts still match the release notes.
train_df = pd.read_csv(TRAIN_TSV, sep='\t', quoting=csv.QUOTE_NONE)
dev_df = pd.read_csv(DEV_TSV, sep='\t', quoting=csv.QUOTE_NONE)

print(f'Training samples: {len(train_df)}')
print(f'Validation samples: {len(dev_df)}')

Training samples: 46256
Validation samples: 12254


In [16]:
# Inspect the schema of the training split and one sample record
train_columns = train_df.columns
first_train_row = train_df.iloc[0]

print("Training DataFrame columns:", train_columns)
print("\nFirst row of training data:")
print(first_train_row)

Training DataFrame columns: Index(['client_id', 'path', 'sentence_id', 'sentence', 'sentence_domain',
       'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant',
       'locale', 'segment'],
      dtype='object')

First row of training data:
client_id          2160561702bac0e2048d2dc79810c2d8a6e6942a6dcac8...
path                                    common_voice_uz_28907218.mp3
sentence_id        bc07db8b7a00a0b99d7c0ef267d1de330e807c9810eef6...
sentence                 Bugun ertalab Gyotenikiga taklifnoma oldim.
sentence_domain                                                  NaN
up_votes                                                           2
down_votes                                                         0
age                                                         twenties
gender                                                male_masculine
accents                                                          NaN
variant                                                 

In [17]:
!pip install librosa



In [18]:
# Create a function to process audio files
def prepare_dataset(batch):
    """Attach a lower-cased transcript and 16 kHz audio samples to one example.

    Args:
        batch: A single example (mapping) holding a ``"sentence"`` transcript
            and a ``"path"`` clip file name relative to ``CLIPS_DIR``.

    Returns:
        The same mapping with three added keys: ``"text"`` (lower-cased
        sentence), ``"audio"`` (list of float samples) and ``"sampling_rate"``.
        If the clip cannot be decoded, the error is printed and one second of
        silence at 16 kHz is substituted so that the surrounding ``map`` call
        keeps running.
    """
    target_rate = 16000

    # Get the text (sentence is the column with the transcription)
    batch["text"] = batch["sentence"].lower()

    # Create the full audio path - Common Voice MP3 files are in the clips directory
    audio_path = str(CLIPS_DIR / batch["path"])

    try:
        # Only the decode step is guarded: librosa is imported at module level so
        # a missing installation fails loudly instead of being swallowed here and
        # silently turning every clip into silence.
        audio_data, sample_rate = librosa.load(audio_path, sr=target_rate)  # Resample to 16kHz
    except Exception as e:
        print(f"Error processing {audio_path}: {str(e)}")
        # Best-effort fallback for an unreadable clip. The transcript is kept, so
        # such rows pair real text with silence -- watch the log for these lines.
        batch["audio"] = [0.0] * target_rate  # 1 second of silence
        batch["sampling_rate"] = target_rate
    else:
        # Stored as a plain Python list of floats
        batch["audio"] = audio_data.tolist()
        batch["sampling_rate"] = sample_rate

    return batch

# Prepare the datasets
train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)

# Number of examples per split used for the quick end-to-end check below
N_SMOKE_SAMPLES = 5

# Metadata columns dropped from both splits; defined once so the two map calls
# cannot drift apart.
METADATA_COLUMNS = ['client_id', 'sentence_id', 'sentence_domain',
                    'up_votes', 'down_votes', 'age', 'gender',
                    'accents', 'variant', 'locale', 'segment']

# Process a small subset first to test
print(f"Processing first {N_SMOKE_SAMPLES} examples...")
# min() keeps select() in range when a split has fewer rows than requested
small_train = train_dataset.select(range(min(N_SMOKE_SAMPLES, len(train_dataset))))
small_dev = dev_dataset.select(range(min(N_SMOKE_SAMPLES, len(dev_dataset))))

# Process datasets
processed_train = small_train.map(prepare_dataset, remove_columns=METADATA_COLUMNS)
processed_dev = small_dev.map(prepare_dataset, remove_columns=METADATA_COLUMNS)

print(f"Processed {len(processed_train)} training samples")
print(f"Processed {len(processed_dev)} validation samples")

# Show the first example to verify the data (fetch the row once, reuse it below)
first_example = processed_train[0]
print("\nFirst example:")
print("Text:", first_example["text"])
print("Audio shape:", np.array(first_example["audio"]).shape)  # Convert to numpy array before checking shape
print("Sampling rate:", first_example["sampling_rate"])

# Verify the data types
print("\nData types:")
print("Audio type:", type(first_example["audio"]))
# This value is the number of samples, so label it as a length rather than a dtype
print("Audio length:", len(first_example["audio"]))



Processing first 5 examples...


Map: 100%|██████████| 5/5 [00:00<00:00, 124.11 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 212.46 examples/s]

Processed 5 training samples
Processed 5 validation samples

First example:
Text: bugun ertalab gyotenikiga taklifnoma oldim.
Audio shape: (76608,)
Sampling rate: 16000

Data types:
Audio type: <class 'list'>
Audio dtype: 76608





In [19]:
# Persist both processed splits on disk so they do not have to be recomputed
for target_dir, split_dataset in (
    ("data/processed_train", processed_train),
    ("data/processed_dev", processed_dev),
):
    split_dataset.save_to_disk(target_dir)

# Prepare tokenizer
def extract_all_chars(batch):
    """Collect the distinct characters occurring in a batch of transcripts.

    The strings in ``batch["text"]`` are joined with single spaces. The result
    maps ``"vocab"`` to a one-element list holding the list of distinct
    characters of that joined string, and ``"all_text"`` to a one-element list
    holding the joined string itself.
    """
    joined_text = " ".join(batch["text"])
    unique_chars = set(joined_text)
    return {"vocab": [list(unique_chars)], "all_text": [joined_text]}

# Get vocabulary from both datasets.
# batch_size=-1 passes the whole split to extract_all_chars as a single batch,
# so each result contains exactly one row with the split's full character set.
vocab_map_kwargs = dict(batched=True, batch_size=-1)

vocab_train = processed_train.map(
    extract_all_chars,
    remove_columns=processed_train.column_names,
    **vocab_map_kwargs
)

vocab_test = processed_dev.map(
    extract_all_chars,
    remove_columns=processed_dev.column_names,
    **vocab_map_kwargs
)

# Create vocabulary from all data
vocab_list = list(set(
    vocab_train["vocab"][0] +
    vocab_test["vocab"][0]
))

# Create vocabulary dictionary with special tokens
vocab_dict = {
    "<pad>": 0,
    "<unk>": 1,
    "<s>": 2,
    "</s>": 3,
}

# Add all characters from our text data to the vocabulary.
# Sorting makes the character -> id assignment reproducible between runs;
# each new character takes the next free id after the special tokens.
for char in sorted(vocab_list):
    vocab_dict[char] = len(vocab_dict)

print(f"Vocabulary size: {len(vocab_dict)}")

# Save vocab dict to disk (ensure_ascii=False keeps non-ASCII characters readable)
with open('vocab.json', 'w', encoding='utf-8') as vocab_file:
    json.dump(vocab_dict, vocab_file, ensure_ascii=False)

# Create tokenizer
tokenizer = Wav2Vec2CTCTokenizer(
    'vocab.json',
    unk_token="<unk>",
    pad_token="<pad>",
    word_delimiter_token=" "
)

# Create feature extractor
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True
)

# Create processor (combines tokenizer and feature extractor)
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer
)

# Save processor
processor.save_pretrained("./wav2vec2-large-xlsr-uzbek")

print("Processor saved successfully!")

Saving the dataset (1/1 shards): 100%|██████████| 5/5 [00:00<00:00, 1418.82 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 5/5 [00:00<00:00, 1269.39 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 3280.90 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 3408.34 examples/s]

Vocabulary size: 34
Processor saved successfully!





In [23]:
pip install 'accelerate>=0.26.0'

Collecting accelerate>=0.26.0
  Using cached accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Using cached accelerate-1.7.0-py3-none-any.whl (362 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.7.0
Note: you may need to restart the kernel to use updated packages.


In [24]:
# Prepare data for training by creating a data collator
from dataclasses import dataclass
from typing import Dict, List, Union
import torch

@dataclass
class DataCollatorCTCWithPadding:
    """Collate raw examples into a padded batch for Wav2Vec2 CTC training.

    Each incoming feature must provide an ``"audio"`` entry (sequence of float
    samples) and a ``"text"`` entry (transcript string). Calling the collator
    returns the feature extractor's output with an added ``"labels"`` tensor in
    which padded label positions are set to ``-100``.
    """

    # Processor whose feature extractor prepares the audio and whose tokenizer
    # encodes and pads the transcripts.
    processor: Wav2Vec2Processor
    # Padding strategy forwarded to both the feature extractor and the tokenizer.
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        feature_extractor = self.processor.feature_extractor
        tokenizer = self.processor.tokenizer

        # Run the waveforms through the feature extractor itself (not just its
        # padding) so its configured preprocessing, e.g. normalization, is
        # applied before padding. The audio is assumed to already be at the
        # extractor's sampling rate.
        batch = feature_extractor(
            [feature["audio"] for feature in features],
            sampling_rate=feature_extractor.sampling_rate,
            padding=self.padding,
            return_tensors="pt",
        )

        # Inputs and labels have different lengths and need different padding
        # methods: label dicts only carry `input_ids`, so they are padded by the
        # tokenizer rather than by the audio feature extractor.
        label_features = [{"input_ids": tokenizer(feature["text"]).input_ids} for feature in features]
        labels_batch = tokenizer.pad(
            label_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # Replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

# Create the data collator
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

# Load the pre-trained Wav2Vec2 model
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)

# Freeze the feature encoder
model.freeze_feature_encoder()

# Set training arguments
training_args = TrainingArguments(
    output_dir="./wav2vec2-large-xlsr-uzbek",
    # The collator reads the raw "audio"/"text" columns. They are not arguments
    # of model.forward, so the Trainer must be told not to drop them.
    remove_unused_columns=False,
    # Length grouping needs a precomputed "length" column or an "input_values"
    # column in the dataset; neither exists here. Re-enable after adding one.
    group_by_length=False,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    # eval_steps only takes effect with a step-based strategy; without it no
    # evaluation runs. (Named `evaluation_strategy` in older transformers releases.)
    eval_strategy="steps",
    eval_steps=400,
    save_steps=400,
    logging_steps=400,
    learning_rate=3e-4,
    num_train_epochs=30,
    weight_decay=0.005,
    warmup_steps=500,
    save_total_limit=2,
    push_to_hub=False,
    fp16=torch.cuda.is_available(),   # Mixed precision training only when a CUDA GPU is present
    dataloader_num_workers=4,         # Adjust based on your CPU
    gradient_checkpointing=True,      # To save memory
    eval_accumulation_steps=1
)

# Create Trainer instance
# NOTE(review): newer transformers releases call the `tokenizer` argument
# `processing_class` -- confirm against the installed version.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    train_dataset=processed_train,
    eval_dataset=processed_dev,
    tokenizer=processor.feature_extractor,
)

print("Training setup complete! Ready to start training.")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`