In [None]:
!pip install datasets soundfile librosa evaluate jiwer kaggle accelerate
!pip install pandas
!pip install huggingface_hub
!pip install wandb
!pip install transformers
!pip uninstall torch torchvision torchaudio -y
!pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118


import importlib.util
reqs=['transformers', 'datasets', 'soundfile', 'librosa', 'evaluate', 'jiwer', 'accelerate','pandas','wandb']
def check_installed(packages):
    for pkg in packages:
        try:
            importlib.import_module(pkg)
            print(f"{pkg} is installed")
        except ImportError:
            print(f"{pkg} is NOT installed")
check_installed(reqs)

import torch, transformers
print(f"PyTorch: {torch.__version__}")  
print(f"Transformers: {transformers.__version__}")

# DTensor check
try:
    from torch.distributed.tensor import DTensor
    print(" DTensor available")
except ImportError:
    raise RuntimeError(" DTensor not found - upgrade PyTorch!")

transformers is installed
datasets is installed
soundfile is installed
librosa is installed
evaluate is installed
jiwer is installed
accelerate is installed
pandas is installed
wandb is installed


In [None]:
import os, json, numpy as np, pandas as pd
from pathlib import Path
from datasets import Dataset, Audio
from transformers import (
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
    Wav2Vec2ForCTC,
    Wav2Vec2CTCTokenizer,
    TrainingArguments,
    Trainer
)
import torch
from dataclasses import dataclass
from typing import List, Dict, Union
import evaluate 
from sklearn.model_selection import train_test_split
from datasets import load_dataset
import torch
from transformers import AutoProcessor, AutoModelForCTC
import wandb
from datetime import datetime
import re
from huggingface_hub import login

In [None]:
json_path = "PATH_TO_DATASET/processed_json/combined_dataset.json"
AUDIO_BASE_PATH = "PATH_TO_DATASET/extracted/audio_files"
SAMPLING_RATE = 16000

print(f"Loading dataset from: {json_path}")
dataset = load_dataset("json", data_files=json_path, split="train")
print(f"Loaded dataset with {len(dataset)} samples")

print("Normalizing absolute paths to real audio files...")

def normalize_audio_path(example):
    relative_filename = example["audio"].split("clips/")[-1]
    example["audio"] = os.path.join(AUDIO_BASE_PATH, "clips", relative_filename)
    return example

dataset = dataset.map(normalize_audio_path)

data = [{"audio": x["audio"], "sentence": x["transcription"]} for x in dataset]
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=SAMPLING_RATE))
val_dataset = val_dataset.cast_column("audio", Audio(sampling_rate=SAMPLING_RATE))
print("Audio decoding complete!")
print("Sample entry:", train_dataset[0])
print(f"Train set size: {len(train_dataset)}")
print(f" Validation set size: {len(val_dataset)}")

Loading dataset from: /workspace/datasets/processed_json_dataset/combined_dataset.json


Generating train split: 0 examples [00:00, ? examples/s]

Loaded dataset with 20476 samples
Normalizing absolute paths to real audio files...


Map:   0%|          | 0/20476 [00:00<?, ? examples/s]

Decoding audio...
Audio decoding complete!
Sample entry: {'audio': {'path': '/workspace/datasets/preparedDataset/dataset_new/clips/telugu_000001.wav', 'array': array([-0.00268555, -0.0017395 , -0.00143433, ...,  0.19799805,
        0.19128418,  0.18591309]), 'sampling_rate': 16000}, 'transcription': 'పరీక్షల అంశంపై పునరాలోచించాలని రాష్ట్ర ప్రభుత్వానికి సూచించింది'}


In [None]:
telugu_special_unwanted_characters = [
    'ఁ', 'ౄ', 'ౢ', 'ౣ', 'ౠ', 'ఽ',
    '౦', '౧', '౨', '౩', '౪', '౫', '౬', '౭', '౮', '౯',
    'ఀ', 'ౘ', 'ౙ', 'ౚ', '౷',
    '‘', '’', '“', '”', '%', '.', ';', '-', ',', '/', '\\', '_', '&',
    'G', 'P', 'S', 'e', 'l', 'n', 'r', 't', '\u200c', '\n'
]

chars_to_remove_regex = f"[{re.escape(''.join(telugu_special_unwanted_characters))}]"
def remove_special_characters(batch):
    batch["sentence"] = re.sub(chars_to_remove_regex, '', batch["sentence"])
    return batch

train_dataset = train_dataset.map(remove_special_characters)
val_dataset   = val_dataset.map(remove_special_characters)

print("Special characters removed from 'sentence' column.")

Map:   0%|          | 0/20476 [00:00<?, ? examples/s]

In [None]:
base_repo_name = "PATH_TO_REPO"

processor = AutoProcessor.from_pretrained(base_repo_name)
model = AutoModelForCTC.from_pretrained(base_repo_name).to("cuda")
model.freeze_feature_encoder()

preprocessor_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json:   0%|          | 0.00/985 [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/45.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

In [12]:
def prepare_dataset(batch):
    audio = batch["audio"]

    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids
    return batch
    
train_dataset = train_dataset.map(
    prepare_dataset,
    remove_columns=train_dataset.column_names,
    num_proc=4
)

val_dataset = val_dataset.map(
    prepare_dataset,
    remove_columns=val_dataset.column_names,
    num_proc=4
)

print("Datasets prepared and tokenized.")


Map (num_proc=4):   0%|          | 0/18428 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2048 [00:00<?, ? examples/s]

Datasets prepared and tokenized.


In [None]:
@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": f["input_values"]} for f in features]
        label_features = [{"input_ids": f["labels"]} for f in features]
        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt"
        )

        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt"
            )

        
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels
        return batch

data_collator = DataCollatorCTCWithPadding(
    processor=processor,
    padding=True  
)
print("Data collator ready.")

Data collator ready.


In [None]:
login(token="*****")<insert hugging face hub login token>

In [None]:
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

def compute_metrics(pred):
    pred_ids = np.argmax(pred.predictions, axis=-1)
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, group_tokens=False)
    return {
        "wer": wer_metric.compute(predictions=pred_str, references=label_str),
        "cer": cer_metric.compute(predictions=pred_str, references=label_str),
    }


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

  ········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnikhita-james2[0m ([33mnikhita-james2-vellore-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
def train_with_hidden_dropout(hidden_dropout, run_id):
    print(f"Training with hidden dropout: {hidden_dropout}")
    
    model = AutoModelForCTC.from_pretrained(base_repo_name).to("cuda")
    model.freeze_feature_encoder()
    model.config.hidden_dropout = hidden_dropout
    
    print(" Updated classifier dropout to:", hidden_dropout)
    print(" New hidden dropout:", model.config.hidden_dropout)
    output_dir = f"/workspace/models/hidden_dropout_ablation_{run_id}"
    repo_name = f"nik1509/telugu_wav2vec_hiddendropout_ablation_{run_id}"
    PARAMS = {
        "learning_rate":5e-5,
        "epochs": 15,
        "batch_size": 8,
        "sampling_rate": 16000
    }
    wandb.init(
        project="telugu-asr-wav2vec_ablation",
        name=f"dropoutablation_{hidden_dropout}",
        config={
            "learning_rate": 5e-5, 
            "hidden_dropout": hidden_dropout,
            "freeze_feature_encoder": True,
            "base_model": base_repo_name
        }
    )

    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=PARAMS["learning_rate"],
        per_device_train_batch_size=PARAMS["batch_size"],
        gradient_accumulation_steps=2,
        eval_strategy="steps",
        eval_steps=700,
        logging_steps=50,
        save_steps=1500,
        num_train_epochs=PARAMS["epochs"],
        gradient_checkpointing=True,
        fp16=True,
        warmup_ratio=0.1,
        save_total_limit=2,
        report_to="wandb",
        run_name=f"dropout_ablation_{hidden_dropout}",
        push_to_hub=True,
        hub_model_id=repo_name,
        logging_dir=f"{output_dir}/logs"
    )


    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=processor,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()
    trainer.push_to_hub(commit_message=f"Trained with hidden dropout {hidden_dropout}")
    wandb.finish()

In [26]:
train_with_hidden_dropout(0.2, "hd_02")

Training with hidden dropout: 0.2
 Updated classifier dropout to: 0.2
 New hidden dropout: 0.2


  trainer = Trainer(


Step,Training Loss,Validation Loss,Wer,Cer
200,309.4665,452.549011,0.422213,0.173327
400,286.3686,454.056213,0.418434,0.171664
600,306.5472,459.250671,0.412286,0.169973
800,301.9841,461.25058,0.407717,0.169386
1000,320.2392,454.949524,0.409973,0.169551
1200,325.4418,464.644501,0.407717,0.168275
1400,276.4071,466.692352,0.40766,0.168598
1600,300.9258,461.080048,0.406701,0.16872
1800,287.1495,464.857697,0.406137,0.168899
2000,301.0417,464.345337,0.404671,0.168598




0,1
eval/cer,█▅▃▄▄▃▂▁▂▃▃▃▄▂▁▃▁▂▃▂▄▃▃▃▃▃▄▂▃▃▄▄▅▅▅▆▆▆▆▆
eval/loss,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇██
eval/runtime,█▄▄▃▄▄▇▄▇▄▃▅▅▄▂▂▂▁▂▁▁▂▁▁▂▁▂▁▂▃▂▃▄▂▂▂▂▂▂▁
eval/samples_per_second,▅▄▅▄▁▄▃▆▁▄▄▆▄▃▅▅▆▆▇▇▇█▇▆▆▆▇▇▇▇▆▅▆▆▇▅▆▆▆▅
eval/steps_per_second,▄▅▂▁▅▄▇▂▅▅▅▆▆▅▅▇▆▆▇██▇▇▇▆▇▇▇▇▇▆▆▆▅▅▇▆▇▆▇
eval/wer,█▆▆▆▆▃▄▄▃▂▃▄▄▃▂▂▃▁▃▄▄▂▃▃▃▂▁▃▂▄▁▁▁▃▂▂▂▂▁▂
train/epoch,▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇████
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇█
train/grad_norm,▆▃▆▃▅▂▂▂▂▂▂▂▃▂▃▂▃▃▇▂█▁▄▃▅▅█▁▂▇▃▃▂▂▄▄▅▂█▅
train/learning_rate,▁▂▂▆██████▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▅▅▅▅▅▅▅▄

0,1
eval/cer,0.17063
eval/loss,584.24561
eval/runtime,99.8038
eval/samples_per_second,20.52
eval/steps_per_second,2.565
eval/wer,0.39801
total_flos,5.995827423689016e+19
train/epoch,15.0
train/global_step,17280.0
train/grad_norm,308.49826


In [34]:
train_with_hidden_dropout(0.3, "hd_03")

Training with hidden dropout: 0.3
 Updated classifier dropout to: 0.3
 New hidden dropout: 0.3


  trainer = Trainer(


Step,Training Loss,Validation Loss,Wer,Cer
700,336.5259,454.976227,0.41347,0.17107
1400,275.6354,472.184418,0.406137,0.168619
2100,288.2869,468.271118,0.407604,0.1694
2800,285.0092,474.086578,0.407547,0.169307
3500,286.3706,487.920776,0.403768,0.1671
4200,227.2902,485.690979,0.398635,0.168024
4900,253.1653,494.035461,0.403035,0.16796
5600,285.7851,498.219604,0.399481,0.167286
6300,252.8042,500.894287,0.401173,0.167946
7000,260.7867,514.076294,0.390963,0.166104




0,1
eval/cer,█▅▆▆▂▄▄▃▄▁▂▂▃▄▄▄▄▄▄▅▅▄▅▅
eval/loss,▁▂▂▂▃▃▃▃▃▄▄▅▅▅▆▆▆▆▆▇▇███
eval/runtime,▂▁▄▆▁▄▄▄█▅▂▄█▃▆▆▇▂▂▄▄▅▃▇
eval/samples_per_second,▇█▅▃█▅▅▅▁▄▇▅▁▆▃▃▂▇▇▅▅▄▆▂
eval/steps_per_second,▇█▅▃█▅▅▅▁▄▇▅▁▆▃▃▂▇▇▅▄▄▆▂
eval/wer,█▆▆▆▅▃▅▄▄▁▃▂▂▂▂▃▃▂▂▃▂▁▃▂
train/epoch,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇███
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇███
train/grad_norm,▃▂▄▆▁▁▁▂▆▂▆▂▁▂▂▃▁▃▄▅▄▂▁█▇▄▁▃▄▂▄▃▅▇▂▁▁▂▃▁
train/learning_rate,▁▂▃▄██████▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▄

0,1
eval/cer,0.16916
eval/loss,583.59338
eval/runtime,106.243
eval/samples_per_second,19.277
eval/steps_per_second,2.41
eval/wer,0.39457
total_flos,5.995827423689016e+19
train/epoch,15.0
train/global_step,17280.0
train/grad_norm,341.66656


In [35]:
import torch
from transformers import AutoProcessor, AutoModelForCTC
import evaluate

wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

def evaluate_model(repo_name: str, test_dataset):
    print(f"\nEvaluating: {repo_name}")
    processor = AutoProcessor.from_pretrained(repo_name)
    model = AutoModelForCTC.from_pretrained(repo_name).to("cuda")
    model.eval()

    def map_to_prediction(batch):
        with torch.no_grad():
            input_values = torch.tensor(batch["input_values"], device="cuda").unsqueeze(0)
            logits = model(input_values).logits
            pred_ids = torch.argmax(logits, dim=-1)
            batch["pred_str"] = processor.batch_decode(pred_ids)[0]
            batch["text"] = processor.decode(batch["labels"], group_tokens=False)
        return batch

    results = test_dataset.map(map_to_prediction, remove_columns=test_dataset.column_names)

    wer = wer_metric.compute(predictions=results["pred_str"], references=results["text"])
    cer = cer_metric.compute(predictions=results["pred_str"], references=results["text"])
    
    print(f"WER for {repo_name}: {wer:.4f}")
    print(f"CER for {repo_name}: {cer:.4f}")
    
    return wer, cer


In [37]:
repo_names = [
    "nik1509/telugu_wav2vec_hidden_dropout_ablation_hd_02",
    "nik1509/telugu_wav2vec_hiddendropout_ablation_hd_03"
]

results_dict = {}
for repo in repo_names:
    wer, cer = evaluate_model(repo, val_dataset) 
    results_dict[repo] = {"WER": wer, "CER": cer}

for k, v in results_dict.items():
    print(f"Evaluation for {k}:\nWER: {v['WER']:.4f} | CER: {v['CER']:.4f}\n")


Evaluating: nik1509/telugu_wav2vec_hidden_dropout_ablation_hd_02


preprocessor_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json:   0%|          | 0.00/985 [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/45.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]



Map:   0%|          | 0/2048 [00:00<?, ? examples/s]

WER for nik1509/telugu_wav2vec_hidden_dropout_ablation_hd_02: 0.3779
CER for nik1509/telugu_wav2vec_hidden_dropout_ablation_hd_02: 0.1617

Evaluating: nik1509/telugu_wav2vec_hiddendropout_ablation_hd_03


preprocessor_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json:   0%|          | 0.00/985 [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/45.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Map:   0%|          | 0/2048 [00:00<?, ? examples/s]

WER for nik1509/telugu_wav2vec_hiddendropout_ablation_hd_03: 0.3747
CER for nik1509/telugu_wav2vec_hiddendropout_ablation_hd_03: 0.1612
Evaluation for nik1509/telugu_wav2vec_hidden_dropout_ablation_hd_02:
WER: 0.3779 | CER: 0.1617

Evaluation for nik1509/telugu_wav2vec_hiddendropout_ablation_hd_03:
WER: 0.3747 | CER: 0.1612

