# wav2vec 2.0 finetuning 

## Table of contents
1. [Environment & Imports](#Environment-&-Imports)
2. [Datasets and Paths](#Datasets-and-Paths)
3. [Preprocessing and Model Definition](#Preprocessing-and-Model-Definition)
4. [Training](#Training)
5. [Evaluation](#Evaluation)

## Placeholders introduced
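- `PATH_TO_DATASET` — where to place your dataset (e.g., `data/`)
- `PATH_TO_CHECKPOINT` — outputs/logs (Trainer checkpoints; Trainer logs go to its `logs/` subfolder)
- `PATH_TO_LOGS` — general logging directory
- `PATH_TO_MODEL` — saved model, vocabulary
- `PATH_TO_REPO` — Hugging Face Hub model repository id
- `WANDB_PROJECT_NAME` — Weights & Biases project name
- `KAGGLE_DATASET_LINK` — redacted private Kaggle link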



## Environment & Imports



In [None]:
!pip install datasets soundfile librosa evaluate jiwer kaggle pandas wandb
!pip install huggingface_hub
!pip install transformers accelerate
!pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118

In [None]:
import torch, transformers
print(f"Transformers: {transformers.__version__}")

# Fail fast when the installed PyTorch does not expose the public DTensor API.
try:
    from torch.distributed.tensor import DTensor
except ImportError as exc:
    # Chain the original error so the real import failure stays in the traceback.
    raise RuntimeError(" DTensor not found - upgrade PyTorch!") from exc
else:
    print(" DTensor available")

In [None]:
# Report the PyTorch build and whether it can see a CUDA device.
torch_report = (
    ("Torch version:", torch.__version__),
    ("CUDA available:", torch.cuda.is_available()),
    ("CUDA version (torch):", torch.version.cuda),
)
for report_label, report_value in torch_report:
    print(report_label, report_value)


In [None]:
import transformers
import accelerate

# Show the versions of the two Hugging Face libraries used for training.
transformers_version = transformers.__version__
accelerate_version = accelerate.__version__
print(f"Transformers version: {transformers_version}")
print(f"Accelerate version: {accelerate_version}")

In [None]:
import importlib.util

reqs = ['transformers', 'datasets', 'soundfile', 'librosa', 'evaluate', 'jiwer', 'accelerate', 'pandas', 'wandb']


def check_installed(packages):
    """Print, for each name in *packages*, whether it is installed.

    The lookup uses ``importlib.util.find_spec`` so the packages are only
    located, not imported: the check stays fast and an error raised while
    importing a heavy package cannot abort it.

    Args:
        packages: Iterable of top-level importable package names.
    """
    for pkg in packages:
        try:
            found = importlib.util.find_spec(pkg) is not None
        except (ImportError, ValueError):
            # find_spec raises for a missing parent package or a malformed name.
            found = False
        if found:
            print(f"{pkg} is installed")
        else:
            print(f"{pkg} is NOT installed")


check_installed(reqs)

In [None]:
#All necessary imports

import os, json, numpy as np, pandas as pd, random,re
import zipfile
import json
import torch
from pathlib import Path
from datasets import Dataset, Audio, load_dataset
from transformers import (
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
    Wav2Vec2ForCTC,
    Wav2Vec2CTCTokenizer,
    TrainingArguments,
    Trainer
)
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
from typing import List, Dict, Union
from transformers import EarlyStoppingCallback
from huggingface_hub import HfApi
import evaluate
import numpy as np
import wandb
from huggingface_hub import login
from jiwer import wer, cer
from transformers import AutoProcessor, AutoModelForCTC


## Datasets and Paths



In [None]:
#setting up storage directories
storage_dirs = (
    "PATH_TO_DATASET",     # Store original dataset
    "PATH_TO_MODEL",       # Final models, tokenizer, processor
    "PATH_TO_CHECKPOINT",  # Trainer checkpoints
    "PATH_TO_LOGS",        # Logging
)
for storage_dir in storage_dirs:
    os.makedirs(storage_dir, exist_ok=True)


In [None]:
# Credentials read by the kaggle CLI call below; the values are redacted placeholders.
# NOTE(review): avoid committing real credentials in the notebook; prefer a secret store.
os.environ['KAGGLE_USERNAME'] = "*****"
os.environ['KAGGLE_KEY'] = "*****"
!kaggle datasets download -d KAGGLE_DATASET_LINK -p PATH_TO_DATASET

In [None]:
zip_path = "PATH_TO_DATASET/your-dataset-name.zip"
extract_path = "PATH_TO_DATASET/extracted"

# Make sure the destination exists, then unpack the whole archive into it.
os.makedirs(extract_path, exist_ok=True)
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

print(f"Extracted to: {extract_path}")


In [None]:
#To verify by printing transcriptions file

csv_path = "PATH_TO_DATASET/extracted/transcriptions.csv"
df = pd.read_csv(csv_path)

# Show the column names and the first rows as a quick check.
column_names = df.columns.tolist()
first_rows = df.head()
print("Available columns:", column_names)
print("\nFirst 5 rows:")
print(first_rows)

In [None]:
# Final trained model
SAVE_DIR = Path("PATH_TO_MODEL/trained_model")
SAVE_DIR.mkdir(parents=True, exist_ok=True)

## Preprocessing and Model Definition



In [None]:
audio_base_path = "PATH_TO_DATASET/extracted/audio_files"
json_output_path = "PATH_TO_DATASET/processed_json"
os.makedirs(json_output_path, exist_ok=True)


#preparing the JSON file

# One record per CSV row: the audio path plus the whitespace-trimmed transcript.
json_data = [
    {
        "audio": os.path.join(audio_base_path, filename),
        "transcription": str(transcription).strip(),
    }
    for filename, transcription in zip(df["filename"], df["transcription"])
]

json_save_path = os.path.join(json_output_path, "combined_dataset.json")
# Written one JSON object per line (JSON Lines), despite the .json suffix.
with open(json_save_path, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(record, ensure_ascii=False) + "\n" for record in json_data)

print(f" Saved combined JSON dataset to: {json_save_path}")


In [None]:
json_path = "PATH_TO_DATASET/processed_json/combined_dataset.json"

print("Previewing first 3 entries in the JSON file:")
with open(json_path, "r", encoding="utf-8") as f:
    # zip() stops at whichever runs out first: three samples or the end of the file.
    for sample_number, line in zip(range(1, 4), f):
        sample = json.loads(line)
        print(f"Sample {sample_number}: {sample}")


In [None]:
#Audio decoding
audio_base_path = "PATH_TO_DATASET/extracted/audio_files"
SAMPLING_RATE = 16000

print(f"Loading dataset from: {json_path}")
dataset = load_dataset("json", data_files=json_path, split="train")
print(f"Loaded dataset with {len(dataset)} samples")


def normalize_audio_path(example):
    """Rewrite ``example["audio"]`` to ``<audio_base_path>/clips/<file name>``.

    Only the final path component of the stored value is kept. It is taken
    with ``os.path.basename`` so that paths built with ``os.path.join`` are
    split on the platform's own separator, not only on "/".

    Args:
        example: One dataset row with an ``"audio"`` path string.

    Returns:
        The same row, with ``"audio"`` replaced by the normalized path.
    """
    relative_filename = os.path.basename(example["audio"])
    example["audio"] = os.path.join(audio_base_path, "clips", relative_filename)
    return example


dataset = dataset.map(normalize_audio_path)
# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible.
train_val_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_val_split["train"]
val_dataset = train_val_split["test"]

# Characters stripped from every transcription before the vocabulary is built.
telugu_special_unwanted_characters = [
    'ఁ', 'ౄ', 'ౢ', 'ౣ', 'ౠ', 'ఽ',
    '౦', '౧', '౨', '౩', '౪', '౫', '౬', '౭', '౮', '౯',
    'ఀ', 'ౘ', 'ౙ', 'ౚ', '౷',
    '‘', '’', '“', '”', '%', '.', ';', '-', ',', '/', '\\', '_', '&',
    'G', 'P', 'S', 'e', 'l', 'n', 'r', 't', '\u200c', '\n'
]

chars_to_remove_regex = f"[{re.escape(''.join(telugu_special_unwanted_characters))}]"
# Compiled once: the cleaning function runs for every dataset row.
_unwanted_chars_pattern = re.compile(chars_to_remove_regex)
_whitespace_run_pattern = re.compile(r"\s+")


def clean_and_rename_text(batch):
    """Store a cleaned copy of ``batch["transcription"]`` in ``batch["sentence"]``.

    Unwanted characters are deleted, then every run of whitespace is collapsed
    to a single space. Deleting a standalone symbol (as in ``"a - b"``) would
    otherwise leave consecutive spaces in the cleaned text.

    Args:
        batch: One dataset row with a ``"transcription"`` string.

    Returns:
        The same row with a new ``"sentence"`` key; ``"transcription"`` is
        left in place for the caller to drop.
    """
    cleaned_text = _unwanted_chars_pattern.sub('', batch["transcription"])
    batch["sentence"] = _whitespace_run_pattern.sub(' ', cleaned_text).strip()
    return batch

# Clean the transcripts of both splits and drop the raw column afterwards.
train_dataset = train_dataset.map(clean_and_rename_text, remove_columns=["transcription"])
val_dataset = val_dataset.map(clean_and_rename_text, remove_columns=["transcription"])

# Use the SAMPLING_RATE constant rather than a second hard-coded 16000.
audio_feature = Audio(sampling_rate=SAMPLING_RATE)
train_dataset = train_dataset.cast_column("audio", audio_feature)
val_dataset = val_dataset.cast_column("audio", audio_feature)

print("Audio decoding complete!")
print("Sample entry:", train_dataset[0]) #to verify compatibility with datasets library

def get_all_text(dataset):
    """Return every entry of the ``"sentence"`` column joined by single spaces."""
    sentences = dataset["sentence"]
    separator = " "
    return separator.join(sentences)

all_text = get_all_text(train_dataset) + get_all_text(val_dataset)
vocab_chars = sorted(set(all_text))
vocab_dict = dict(zip(vocab_chars, range(len(vocab_chars))))

# "|" takes over the id of the space character; the tokenizer is later built
# with "|" as its word delimiter token.
vocab_dict["|"] = vocab_dict.pop(" ")
# The unknown and padding tokens get the next free ids, in this order.
for special_token in ("[UNK]", "[PAD]"):
    vocab_dict[special_token] = len(vocab_dict)

print(f"Clean vocab size: {len(vocab_dict)}")


In [None]:
# Authenticate this session with the Hugging Face Hub; the token is a redacted placeholder.
# NOTE(review): avoid committing a real token in the notebook; prefer a secret store.
login(token="*****")

In [None]:
#Enter model repo
repo_name = "PATH_TO_REPO"

api = HfApi()
# exist_ok=True is passed so that re-running the cell does not fail on an existing repo.
api.create_repo(
    repo_id=repo_name,
    repo_type="model",
    exist_ok=True,
)
print(f"Created (or found) repo: {repo_name}")


In [None]:
VOCAB_DIR = Path("PATH_TO_MODEL/Vocab")
VOCAB_DIR.mkdir(parents=True, exist_ok=True)
VOCAB_FILE = VOCAB_DIR / "vocab.json"
repo_name = "PATH_TO_REPO"

# Persist the vocabulary; ensure_ascii=False writes the characters as-is, not as \u escapes.
VOCAB_FILE.write_text(json.dumps(vocab_dict, ensure_ascii=False), encoding="utf-8")
print(f"Vocab saved to: {VOCAB_FILE}")

# Special tokens must match the entries added to the vocabulary.
special_tokens = {
    "unk_token": "[UNK]",
    "pad_token": "[PAD]",
    "word_delimiter_token": "|",
}
tokenizer = Wav2Vec2CTCTokenizer(vocab_file=str(VOCAB_FILE), **special_tokens)
print(" Tokenizer created.")
tokenizer.push_to_hub(repo_name)

feature_extractor_settings = {
    "feature_size": 1,
    "sampling_rate": 16000,
    "padding_value": 0.0,
    "do_normalize": True,
    "return_attention_mask": True,
}
feature_extractor = Wav2Vec2FeatureExtractor(**feature_extractor_settings)
print(" Feature extractor created.")

# Bundle both pieces into one processor and keep a local copy next to the vocab file.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

processor.save_pretrained(str(VOCAB_DIR))
print(f"Processor saved locally at: {VOCAB_DIR}")


In [None]:
# Both classes are already imported in the main import cell; repeating the import is harmless.
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2Processor
vocab_dir = "PATH_TO_MODEL/Vocab"

# Pull the tokenizer from the Hub repo and save it into the local vocab folder.
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(repo_name)
tokenizer.save_pretrained(vocab_dir)
print("Tokenizer loaded from Hub")
# The processor is rebuilt from the local folder, not from the Hub.
processor = Wav2Vec2Processor.from_pretrained(vocab_dir)
print("Processor loaded from local directory.")


In [None]:
vocab_path = Path("PATH_TO_MODEL/Vocab/vocab.json")

# Read the saved vocabulary back into a plain dict.
vocab = json.loads(vocab_path.read_text(encoding="utf-8"))


## Training



In [None]:
def prepare_dataset(batch):
    """Turn one decoded example into model inputs and label ids.

    Adds three keys to *batch*: ``"input_values"`` (the processor's output for
    the waveform), ``"input_length"`` (its length) and ``"labels"`` (the
    tokenizer ids of ``batch["sentence"]``).

    Args:
        batch: One dataset row with a decoded ``"audio"`` entry and a
            ``"sentence"`` string.

    Returns:
        The same row with the three keys added.
    """
    audio_sample = batch["audio"]
    waveform = audio_sample["array"]
    rate = audio_sample["sampling_rate"]

    features = processor(waveform, sampling_rate=rate)
    batch["input_values"] = features.input_values[0]
    batch["input_length"] = len(batch["input_values"])

    encoded_text = processor.tokenizer(batch["sentence"])
    batch["labels"] = encoded_text.input_ids
    return batch
    
# Both splits go through the same preprocessing with the same worker count;
# the original columns are dropped so only the prepared fields remain.
shared_map_options = {"num_proc": 4}

train_dataset = train_dataset.map(
    prepare_dataset,
    remove_columns=train_dataset.column_names,
    **shared_map_options,
)
val_dataset = val_dataset.map(
    prepare_dataset,
    remove_columns=val_dataset.column_names,
    **shared_map_options,
)

print("Datasets prepared and tokenized.")


In [None]:
#preparing data collator
@dataclass
class DataCollatorCTCWithPadding:
    """Pad a list of prepared examples into one training batch.

    Audio inputs and label ids are padded separately, and padded label
    positions are replaced with -100. The feature extractor and tokenizer are
    called directly instead of going through the deprecated
    ``processor.as_target_processor()`` context manager.
    """

    # Processor whose feature extractor pads the audio and whose tokenizer pads the labels.
    processor: Wav2Vec2Processor
    # Padding strategy forwarded unchanged to both pad calls.
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate *features* into padded tensors.

        Args:
            features: Examples that each carry ``"input_values"`` and ``"labels"``.

        Returns:
            The padded audio batch with an added ``"labels"`` tensor in which
            every padded position is -100.
        """
        input_features = [{"input_values": f["input_values"]} for f in features]
        label_features = [{"input_ids": f["labels"]} for f in features]

        batch = self.processor.feature_extractor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt"
        )
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            return_tensors="pt"
        )

        # Positions whose attention mask is not 1 are padding; mark them with -100.
        labels = labels_batch["input_ids"].masked_fill(labels_batch["attention_mask"].ne(1), -100)

        batch["labels"] = labels
        return batch

data_collator = DataCollatorCTCWithPadding(
    processor=processor,
    padding=True
)
print("Data collator ready.")

In [None]:

wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")


def compute_metrics(pred):
    """Compute word and character error rate for one evaluation pass.

    Args:
        pred: Object with ``predictions`` (per-frame scores; argmax is taken
            over the last axis) and ``label_ids`` (label ids with -100 at
            padded positions).

    Returns:
        Dict with the keys ``"wer"`` and ``"cer"``.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)
    # Build a new array rather than writing into pred.label_ids, so the
    # caller's labels keep their -100 markers.
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    # group_tokens=False: repeated characters in the references are kept as written.
    label_str = processor.batch_decode(label_ids, group_tokens=False)
    return {
        "wer": wer_metric.compute(predictions=pred_str, references=label_str),
        "cer": cer_metric.compute(predictions=pred_str, references=label_str),
    }


In [None]:


# Load the pretrained checkpoint; the pad token id and vocabulary size are
# taken from the processor's tokenizer.
# NOTE(review): trust_remote_code=True permits repository-supplied Python to run
# at load time; confirm it is actually required for this checkpoint.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-xls-r-300m", #replace with required model 
    trust_remote_code=True,
    use_safetensors=True, 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer) 
)
# Presumably keeps the feature encoder's weights fixed during fine-tuning;
# confirm against the Transformers documentation.
model.freeze_feature_encoder()


In [None]:
# Hyperparameters for the run. The TrainingArguments cell reads learning_rate,
# epochs and batch_size; hidden_dropout and sampling_rate are not referenced
# anywhere else in the visible notebook.
PARAMS = {
    "learning_rate": 3e-4,
    "epochs": 15,
    "batch_size": 8,
    "hidden_dropout": 0.1,
    "sampling_rate": 16000
}

In [None]:
#wandb logging

# Authenticate with Weights & Biases and open a run; the TrainingArguments
# cell sets report_to="wandb".
wandb.login()
wandb.init(project="WANDB_PROJECT_NAME", name="xlsr-300m-medium")
# Passed to TrainingArguments as output_dir; its "logs" subfolder is used as logging_dir.
OUTPUT_DIR = "PATH_TO_CHECKPOINT"

In [None]:
#base model configuration
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    group_by_length=True,
    per_device_train_batch_size=PARAMS["batch_size"],
    gradient_accumulation_steps=2,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="steps",
    num_train_epochs=PARAMS["epochs"],
    gradient_checkpointing=True,
    fp16=True,
    # Checkpoints and evaluations share the same 200-step interval.
    save_steps=200,
    eval_steps=200,
    logging_steps=50,
    learning_rate=PARAMS["learning_rate"],
    warmup_ratio=0.1,
    save_total_limit=2,
    report_to="wandb",
    push_to_hub=True,
    hub_model_id=repo_name,
    disable_tqdm=False,
    logging_dir=f"{OUTPUT_DIR}/logs"
)


In [None]:
# Wire the model, data, collator and metrics into the Trainer.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `processing_class` is the current name of the deprecated `tokenizer` argument.
    processing_class=processor
)

In [None]:
# Run fine-tuning with the Trainer built in the previous cell.
trainer.train()

In [None]:
# Processor and model are written to the same folder.
save_target = str(SAVE_DIR)

processor.save_pretrained(save_target)
print(f"Processor saved locally at: {SAVE_DIR}")

model.save_pretrained(save_target)
print(f"Model saved locally at: {SAVE_DIR}")

In [None]:
# Upload via the Trainer; TrainingArguments sets hub_model_id=repo_name.
trainer.push_to_hub()

## Evaluation

In [None]:
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

# Fall back to CPU so this cell also runs on a machine without a GPU.
eval_device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(repo_name)
model = AutoModelForCTC.from_pretrained(repo_name).to(eval_device)


def map_to_prediction(batch):
    """Transcribe one prepared example and decode its reference text.

    Args:
        batch: One row with ``"input_values"`` and ``"labels"``.

    Returns:
        The same row with ``"pred_str"`` (argmax decoding of the model output)
        and ``"text"`` (the decoded labels) added.
    """
    with torch.no_grad():
        # unsqueeze(0) adds the batch dimension for a single example.
        input_values = torch.tensor(batch["input_values"], device=eval_device).unsqueeze(0)
        logits = model(input_values).logits
        pred_ids = torch.argmax(logits, dim=-1)
        batch["pred_str"] = processor.batch_decode(pred_ids)[0]
        batch["text"] = processor.decode(batch["labels"], group_tokens=False)
    return batch


results = val_dataset.map(map_to_prediction, remove_columns=val_dataset.column_names)

fW2 = wer_metric.compute(predictions=results["pred_str"], references=results["text"])
fC2 = cer_metric.compute(predictions=results["pred_str"], references=results["text"])

In [None]:
# NOTE(review): these scores are computed on val_dataset (the 10% hold-out from
# train_test_split), not on a separate test set, despite the "Test" label.
print(f"\nFinal Test WER: {fW2:.4f}")
print(f"Final Test CER: {fC2:.4f}")
