# wav2vec 2.0 optimizer ablation

## Placeholders introduced
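- `PATH_TO_DATASET` — root directory of your dataset (e.g., `data/`)
- `PATH_TO_OUTPUT_DIR` — directory where training outputs (checkpoints, logs) are written
- `PATH_TO_REPO` — repository of the base model and processor; also used as the prefix of the repositories the trained models are pushed to
- `PATH_TO_NEW_REPO_1`, `PATH_TO_NEW_REPO_2` — repositories of the fine-tuned models to evaluate
- `PARAMS` — dictionary of hyperparameter values (learning rate, number of epochs, hidden dropout)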

In [None]:
# Environment setup: install or upgrade the libraries this notebook uses.
!pip install transformers --upgrade
# Remove any existing PyTorch stack, then install the pre-release (nightly)
# build from the cu118 wheel index.
!pip uninstall torch torchvision torchaudio -y
!pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118
# Dataset handling, audio I/O, metrics and training utilities.
# NOTE(review): scikit-learn (train_test_split) is imported later but is not
# installed in this cell - presumably provided by the runtime; confirm.
!pip install datasets soundfile librosa evaluate jiwer accelerate
%pip install pandas
!pip install huggingface_hub
!pip install wandb 

In [None]:
import importlib.util
reqs=['transformers', 'datasets', 'soundfile', 'librosa', 'evaluate', 'jiwer', 'accelerate','pandas','wandb']
def check_installed(packages):
    """Print, for each package name, whether it can be located for import.

    Availability is probed with ``importlib.util.find_spec`` rather than by
    importing the package, so a heavy or partially broken package (for
    example one whose import raises something other than ``ImportError``)
    cannot abort the check half-way through the list.

    Args:
        packages: Iterable of top-level importable module names.

    Returns:
        list: The names that could not be found, in input order.
    """
    missing = []
    for pkg in packages:
        try:
            found = importlib.util.find_spec(pkg) is not None
        except (ImportError, ValueError):
            # find_spec raises for a missing parent package or for an
            # already-imported module without a spec; treat both as missing.
            found = False
        if found:
            print(f"{pkg} is installed")
        else:
            print(f"{pkg} is NOT installed")
            missing.append(pkg)
    return missing
# Report the availability of every required package before continuing.
check_installed(reqs)
import torch, transformers
print(f"PyTorch: {torch.__version__}")  # Must show 2.1.0+
print(f"Transformers: {transformers.__version__}")  # 4.36.0+

# Fail fast when this PyTorch build does not expose DTensor at the
# `torch.distributed.tensor` import path.
# NOTE(review): the minimum PyTorch version that provides this path may be
# newer than the 2.1.0 mentioned above - confirm.
try:
    from torch.distributed.tensor import DTensor
    print(" DTensor available")
except ImportError:
    raise RuntimeError(" DTensor not found - upgrade PyTorch!")

In [None]:
import os, json, numpy as np, pandas as pd
from pathlib import Path
from datasets import Dataset, Audio
from transformers import (
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
    Wav2Vec2ForCTC,
    Wav2Vec2CTCTokenizer,
    TrainingArguments,
    Trainer,
    get_scheduler,
    # Previously spelled `Auto_processor`, which transformers does not export
    # and which made this whole import statement raise ImportError.
    AutoProcessor,
    AutoModelForCTC,
)
import torch
from dataclasses import dataclass
from typing import List, Dict, Union
import evaluate 
from sklearn.model_selection import train_test_split
from datasets import load_dataset
import wandb
from datetime import datetime
import re
from huggingface_hub import login
import torch.nn as nn
from torch.optim import AdamW, Adam
from transformers.optimization import Adafactor
import torch
from transformers import AutoProcessor, AutoModelForCTC
import evaluate
import wandb

In [None]:
# Authenticate with Weights & Biases; the training runs in this notebook
# report their metrics to it.
wandb.login()

In [None]:
# Dataset locations (replace PATH_TO_DATASET with your dataset root) and the
# sampling rate used when the "audio" column is cast later in this cell.
json_path = "PATH_TO_DATASET/processed_json/combined_dataset.json"
AUDIO_BASE_PATH = "PATH_TO_DATASET/extracted/audio_files"
SAMPLING_RATE = 16000
print(f"Loading dataset from: {json_path}")
# Load the whole JSON manifest as one "train" split; the train/validation
# split is made afterwards with train_test_split.
dataset = load_dataset("json", data_files=json_path, split="train")
print(f"Loaded dataset with {len(dataset)} samples")

print("Normalizing absolute paths to real audio files...")

def normalize_audio_path(example):
    """Re-root one record's audio path under ``AUDIO_BASE_PATH/clips``.

    Everything up to and including the last ``"clips/"`` in the stored path
    is discarded; when the marker is absent the whole value is kept as the
    relative part. The record is updated in place and returned.
    """
    _, _, clip_relpath = example["audio"].rpartition("clips/")
    example["audio"] = os.path.join(AUDIO_BASE_PATH, "clips", clip_relpath)
    return example

# Rewrite every record's "audio" path so it points under AUDIO_BASE_PATH/clips.
dataset = dataset.map(normalize_audio_path)

# Keep only the audio path and the transcript, renaming "transcription" to
# "sentence" (the column name the later cells read).
data = [{"audio": x["audio"], "sentence": x["transcription"]} for x in dataset]
# Hold out 10% of the records for validation; the fixed random_state keeps
# the split reproducible between runs.
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
# Declare "audio" as an Audio feature at SAMPLING_RATE.
# NOTE(review): presumably the actual decoding/resampling happens lazily when
# a row is accessed (e.g. the sample print below), not at cast time - confirm.
train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=SAMPLING_RATE))
val_dataset = val_dataset.cast_column("audio", Audio(sampling_rate=SAMPLING_RATE))
print("Audio decoding complete!")
print("Sample entry:", train_dataset[0])

In [None]:
# Sanity check: number of examples in each split.
print(f"Train set size: {len(train_dataset)}")
print(f" Validation set size: {len(val_dataset)}")

In [None]:
# Characters stripped from every transcript before tokenization. The order is
# kept stable because it determines the text of `chars_to_remove_regex`.
telugu_special_unwanted_characters = [
    'ఁ', 'ౄ', 'ౢ', 'ౣ', 'ౠ', 'ఽ',
    # Telugu digits 0-9
    '౦', '౧', '౨', '౩', '౪', '౫', '౬', '౭', '౮', '౯',
    'ఀ', 'ౘ', 'ౙ', 'ౚ', '౷',
    # typographic quotes and ASCII punctuation
    '‘', '’', '“', '”', '%', '.', ';', '-', ',', '/', '\\', '_', '&',
    # a handful of Latin letters, the zero-width non-joiner and newline
    'G', 'P', 'S', 'e', 'l', 'n', 'r', 't', '\u200c', '\n'
]

# One character class matching any single unwanted character.
chars_to_remove_regex = "[" + re.escape("".join(telugu_special_unwanted_characters)) + "]"
_unwanted_chars_pattern = re.compile(chars_to_remove_regex)


def remove_special_characters(batch):
    """Delete every unwanted character from ``batch["sentence"]``.

    The record is modified in place and returned, which is the shape
    ``Dataset.map`` expects from a per-example function.
    """
    cleaned_sentence = _unwanted_chars_pattern.sub('', batch["sentence"])
    batch["sentence"] = cleaned_sentence
    return batch

# Clean the transcripts of both splits with the same character filter so
# training and validation references stay comparable.
train_dataset = train_dataset.map(remove_special_characters)
val_dataset   = val_dataset.map(remove_special_characters)

print("Special characters removed from 'sentence' column.")


In [None]:
# Base checkpoint (replace the placeholder). Its processor is used for
# feature extraction, label tokenization and decoding in the cells below.
base_repo_name = "PATH_TO_REPO"
processor = AutoProcessor.from_pretrained(base_repo_name)

In [None]:
def prepare_dataset(batch):
    """Turn one raw example into model-ready fields.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"``) and
    ``batch["sentence"]`` and adds three keys to the same record:
    ``"input_values"`` from the processor, ``"input_length"`` (the length of
    those values) and ``"labels"`` (token ids of the sentence).
    """
    sample = batch["audio"]
    features = processor(sample["array"], sampling_rate=sample["sampling_rate"])

    batch["input_values"] = features.input_values[0]
    batch["input_length"] = len(batch["input_values"])

    encoded_text = processor.tokenizer(batch["sentence"])
    batch["labels"] = encoded_text.input_ids
    return batch
    
# Convert both splits to model inputs. All original columns are dropped, so
# only the fields written by prepare_dataset remain; the work is spread over
# 4 worker processes.
train_dataset = train_dataset.map(
    prepare_dataset,
    remove_columns=train_dataset.column_names,
    num_proc=4
)

val_dataset = val_dataset.map(
    prepare_dataset,
    remove_columns=val_dataset.column_names,
    num_proc=4
)

print("Datasets prepared and tokenized.")


In [None]:
@dataclass
class DataCollatorCTCWithPadding:
    """Pad a list of CTC training examples into one batch of tensors.

    Audio inputs and label sequences have different lengths and need
    different padding, so they are padded separately: the inputs through the
    processor and the labels through the processor's tokenizer.
    """

    # Processor providing `pad` for audio inputs and `tokenizer.pad` for labels.
    processor: Wav2Vec2Processor
    # Padding strategy forwarded unchanged to both pad calls.
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` into a padded batch.

        Args:
            features: Examples that each carry ``"input_values"`` and
                ``"labels"``.

        Returns:
            The padded input batch with an added ``"labels"`` tensor in which
            padded positions are set to -100.
        """
        input_features = [{"input_values": f["input_values"]} for f in features]
        label_features = [{"input_ids": f["labels"]} for f in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt"
        )

        # Pad the labels with the tokenizer directly instead of going through
        # the deprecated `processor.as_target_processor()` context manager,
        # which only redirected `processor.pad` to the tokenizer.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            return_tensors="pt"
        )

        # -100 marks padded label positions so the loss ignores them.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels
        return batch

# Collator handed to the Trainer; it uses the base checkpoint's processor.
data_collator = DataCollatorCTCWithPadding(
    processor=processor,
    padding=True  
)
print("Data collator ready.")

In [None]:
# Log in to the Hugging Face Hub; training below pushes models to the Hub.
# NOTE(review): avoid committing a real token in the notebook - consider an
# interactive `login()` or an environment variable instead.
login(token="***************") #insert your hugging face token

In [None]:
# Word and character error rate metrics shared by training-time evaluation.
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")


def compute_metrics(pred):
    """Compute WER and CER for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-frame logits) and
            ``label_ids`` (label ids with -100 at padded positions).

    Returns:
        dict: ``{"wer": ..., "cer": ...}`` as returned by the two metrics.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)
    # Substitute the pad id for the -100 markers on a copy, so the caller's
    # label array is not modified as a side effect of computing metrics.
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    # References must not have repeated tokens collapsed the way CTC
    # predictions are, hence group_tokens=False.
    label_str = processor.batch_decode(label_ids, group_tokens=False)
    return {
        "wer": wer_metric.compute(predictions=pred_str, references=label_str),
        "cer": cer_metric.compute(predictions=pred_str, references=label_str),
    }


In [None]:
# Hyperparameters read by train_with_optimizer; edit the values before
# launching the runs.
PARAMS={
  "learning_rate":1e-4,  #Enter required learning rate value
  "epoch":15, #enter required epoch value
  "hidden_dropout":0.3 #enter required hidden dropout value
}

In [None]:
from transformers import TrainingArguments, Trainer, AutoModelForCTC, get_scheduler
from torch.optim import AdamW, Adam 
from transformers.optimization import Adafactor 
import wandb
import torch.nn as nn

def train_with_optimizer(opt_name, run_id):
    """Fine-tune the base CTC model with one optimizer and push it to the Hub.

    Every optimizer is built with ``PARAMS["learning_rate"]`` so the runs of
    the ablation differ only in the optimizer, and so the learning rate
    recorded in ``TrainingArguments`` is the one actually applied. The run is
    logged to Weights & Biases and the W&B run is closed even when training
    fails.

    Args:
        opt_name: One of ``"adamw"``, ``"adam"`` or ``"adafactor"``.
        run_id: Suffix used in the output directory and Hub repository names.

    Raises:
        ValueError: If ``opt_name`` is not a supported optimizer name.
    """
    import inspect
    import math

    print(f"Training with optimizer: {opt_name}")

    learning_rate = PARAMS["learning_rate"]

    # The dropout override is passed to from_pretrained so it is part of the
    # config the model is built from; assigning model.config.hidden_dropout
    # after construction would not change layers that already exist.
    model = AutoModelForCTC.from_pretrained(
        base_repo_name,
        hidden_dropout=PARAMS["hidden_dropout"],
    ).to("cuda")
    model.freeze_feature_encoder()

    output_dir = f"PATH_TO_OUTPUT_DIR/optim_ablation{run_id}"
    repo_name = f"PATH_TO_REPO/telugu_wav2vec_optimizerablation_{run_id}"

    if opt_name == "adamw":
        optimizer = AdamW(model.parameters(), lr=learning_rate)
    elif opt_name == "adam":
        optimizer = Adam(model.parameters(), lr=learning_rate)
    elif opt_name == "adafactor":
        optimizer = Adafactor(
            model.parameters(),
            scale_parameter=True,
            relative_step=False,
            warmup_init=False,
            # Was the undefined name LEARNING_RATE (NameError at call time).
            lr=learning_rate
        )
    else:
        raise ValueError(f"Unknown optimizer: {opt_name}")

    wandb.init(
        project="telugu-asr-wav2vec_ablation",
        name=f"optimizer_ablation_{opt_name}",
        config={"optimizer": opt_name}
    )

    try:
        training_args = TrainingArguments(
            output_dir=output_dir,
            group_by_length=True,
            learning_rate=learning_rate,
            per_device_train_batch_size=8,
            gradient_accumulation_steps=2,
            eval_strategy="epoch",
            num_train_epochs=PARAMS["epoch"],
            gradient_checkpointing=True,
            fp16=True,
            logging_steps=50,
            save_strategy="epoch",
            report_to="wandb",
            run_name=f"optimizer_ablation_{opt_name}",
            push_to_hub=True,
            hub_model_id=repo_name,
            logging_dir=f"{output_dir}/logs"
        )

        # The scheduler advances once per optimizer update, not once per
        # batch, so gradient accumulation has to be divided out; otherwise
        # the schedule is longer than training and the decay never finishes.
        # NOTE(review): assumes single-process training - confirm before
        # running under a distributed launcher.
        batches_per_epoch = math.ceil(
            len(train_dataset) / training_args.train_batch_size
        )
        updates_per_epoch = max(
            1,
            math.ceil(batches_per_epoch / training_args.gradient_accumulation_steps),
        )
        num_training_steps = math.ceil(
            updates_per_epoch * training_args.num_train_epochs
        )
        lr_scheduler = get_scheduler(
            name="linear",
            optimizer=optimizer,
            num_warmup_steps=int(0.1 * num_training_steps),
            num_training_steps=num_training_steps
        )

        # Newer transformers releases take the processor as
        # `processing_class`; older ones only know `tokenizer`. Pick whichever
        # keyword the installed Trainer accepts.
        trainer_params = inspect.signature(Trainer.__init__).parameters
        processor_kwarg = (
            "processing_class" if "processing_class" in trainer_params else "tokenizer"
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
            optimizers=(optimizer, lr_scheduler),
            **{processor_kwarg: processor}
        )

        trainer.train()
        trainer.push_to_hub(commit_message=f"Trained with optimizer: {opt_name}")
    finally:
        # Close the W&B run even on failure so the next call starts a new one.
        wandb.finish()


In [None]:
# Run the ablation: one full training run per optimizer, using the optimizer
# name as the run id. The "adam" option of train_with_optimizer is not
# exercised here.
train_with_optimizer("adamw", "adamw")
train_with_optimizer("adafactor", "adafactor")

In [None]:
# Metrics used to compare the trained models.
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

# Repositories of the fine-tuned models to compare; replace the placeholders.
repo_names = [
    "PATH_TO_NEW_REPO_1",
    "PATH_TO_NEW_REPO_2"
]
def evaluate_model(repo_name, dataset):
    """Score one trained checkpoint on ``dataset`` and return ``(wer, cer)``.

    The processor and model are loaded from ``repo_name`` and the model is
    run on the GPU in eval mode. Each example is decoded greedily (argmax
    over the logits) and compared against its decoded ``"labels"``.
    """
    print(f" Evaluating {repo_name}")
    asr_processor = AutoProcessor.from_pretrained(repo_name)
    asr_model = AutoModelForCTC.from_pretrained(repo_name).to("cuda")
    asr_model.eval()

    def map_to_prediction(batch):
        # One example at a time: add a leading batch dimension of size 1.
        with torch.no_grad():
            features = torch.tensor(batch["input_values"], device="cuda").unsqueeze(0)
            best_ids = asr_model(features).logits.argmax(dim=-1)
            batch["pred_str"] = asr_processor.batch_decode(best_ids)[0]
            batch["text"] = asr_processor.decode(batch["labels"], group_tokens=False)
        return batch

    decoded = dataset.map(map_to_prediction, remove_columns=dataset.column_names)
    hypotheses = decoded["pred_str"]
    references = decoded["text"]
    word_error = wer_metric.compute(predictions=hypotheses, references=references)
    char_error = cer_metric.compute(predictions=hypotheses, references=references)
    return word_error, char_error

# Evaluate every listed repository on the validation split and collect the
# scores keyed by repository name.
results_dict = {}
for repo in repo_names:
    wer, cer = evaluate_model(repo, val_dataset)
    results_dict[repo] = {"WER": wer, "CER": cer}

# Print a short WER/CER summary for each model, to four decimal places.
for model_name, metrics in results_dict.items():
    print(f"\nResults for {model_name}:\nWER: {metrics['WER']:.4f} | CER: {metrics['CER']:.4f}")
