In [1]:
!pip install -U transformers
%pip install evaluate
%pip install jiwer
%pip install --upgrade torchcodec



## Local Inference on GPU
Model page: https://huggingface.co/jonatasgrosman/exp_w2v2t_fa_hubert_s801

⚠️ If the generated code snippets do not work, please open an issue on either the [model repo](https://huggingface.co/jonatasgrosman/exp_w2v2t_fa_hubert_s801)
			and/or on [huggingface.js](https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/model-libraries-snippets.ts) 🙏

In [2]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("automatic-speech-recognition", model="jonatasgrosman/exp_w2v2t_fa_hubert_s801")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cuda:0


In [None]:
# Load model directly
from transformers import AutoProcessor, AutoModelForCTC

processor = AutoProcessor.from_pretrained("jonatasgrosman/exp_w2v2t_fa_hubert_s801")
model = AutoModelForCTC.from_pretrained("jonatasgrosman/exp_w2v2t_fa_hubert_s801")

In [3]:
!pip install --upgrade datasets

import torch
import re
from tqdm.auto import tqdm
from datasets import load_dataset, Audio
from transformers import AutoProcessor, AutoModelForCTC
from evaluate import load
import numpy as np

# --- 1. CONFIGURATION ---
MODEL_ID = "jonatasgrosman/exp_w2v2t_fa_hubert_s801"
DATASET_ID = "hezarai/common-voice-13-fa"
SPLIT = "test"  # Assessing performance on the held-out test set
SAMPLING_RATE = 16000
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 8 # Adjust based on your GPU memory (lower if OOM)


# --- 2. PERSIAN NORMALIZATION FUNCTION (CRITICAL FOR METRICS) ---
def normalize_text(text):
    """
    Applies standard normalization rules for ASR metric comparison:
    1. Unifies common Persian character variants (Kaf, Yeh).
    2. Removes punctuation and multiple spaces.
    """
    if not text: return ""
    text = str(text).lower().strip()

    # Unify ambiguous characters
    text = text.replace('ي', 'ی').replace('ك', 'ک')

    # Remove punctuation, symbols, and non-letter characters
    text = re.sub(r'[^\w\s]', '', text)

    # Handle zero-width non-joiner (ZWNJ) which can interfere with tokenization
    text = text.replace('\u200c', ' ')

    # Consolidate and clean whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text


# --- 3. LOAD MODEL, PROCESSOR, AND DATA ---
print(f"Loading model: {MODEL_ID} and running on {DEVICE}...")

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForCTC.from_pretrained(MODEL_ID).to(DEVICE)

# Load the test dataset and cast to 16kHz audio format (required by HuBERT)
raw_dataset = load_dataset(DATASET_ID, split=SPLIT)
raw_dataset = raw_dataset.cast_column("audio", Audio(sampling_rate=SAMPLING_RATE))

# Load metrics
wer_metric = load("wer")
cer_metric = load("cer")


# --- 4. DATA PREPROCESSING FUNCTION ---
def prepare_dataset(batch):
    # Process the audio array for the model
    audio = batch["audio"]
    batch["input_features"] = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt"
    ).input_values[0]

    # Normalize and encode the target text (labels)
    normalized_text = normalize_text(batch["sentence"])
    batch["labels"] = processor.tokenizer(normalized_text).input_ids

    return batch

# Apply preprocessing
processed_dataset = raw_dataset.map(
    prepare_dataset,
    remove_columns=raw_dataset.column_names,
    num_proc=1 # Use 1 process to avoid multiprocessing issues with torchcodec
)

# --- 5. DATA COLLATOR (for batching padded sequences) ---
def data_collator(features):
    input_features = [{"input_values": feature["input_features"]} for feature in features]
    label_features = [{"input_ids": feature["labels"]} for feature in features]

    batch = processor.feature_extractor.pad(input_features, return_tensors="pt")

    # Pad labels manually
    labels_batch = processor.tokenizer.pad(label_features, return_tensors="pt")

    batch["labels"] = labels_batch["input_ids"]
    return batch


# --- 6. EVALUATION LOOP ---
dataloader = torch.utils.data.DataLoader(
    processed_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator
)

all_predictions = []
all_references = []

model.eval()
print(f"\nStarting inference on {len(processed_dataset)} test samples...")

with torch.no_grad():
    for batch in tqdm(dataloader):
        # Move tensors to the appropriate device (CPU/GPU)
        input_values = batch["input_values"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        # Forward pass to get logits (raw predictions)
        logits = model(input_values).logits

        # CTC Greedy Decoding
        predicted_ids = torch.argmax(logits, dim=-1)

        # Decode the predicted IDs to text
        predictions_text = processor.batch_decode(predicted_ids)

        # Decode the reference IDs back to text for comparison
        # (This is safe because the labels were encoded using the same normalization)
        labels_ids = np.where(labels.cpu().numpy() != -100, labels.cpu().numpy(), processor.tokenizer.pad_token_id)
        references_text = processor.batch_decode(labels_ids, group_tokens=False)

        # Store normalized results
        all_predictions.extend([normalize_text(p) for p in predictions_text])
        all_references.extend([normalize_text(r) for r in references_text])

# --- 7. CALCULATE FINAL METRICS ---
final_wer = wer_metric.compute(predictions=all_predictions, references=all_references)
final_cer = cer_metric.compute(predictions=all_predictions, references=all_references)

print(f"\n---------------------------------------------------------")
print(f"ASR Model: {MODEL_ID}")
print(f"Dataset: {DATASET_ID} | Split: {SPLIT} ({len(processed_dataset)} samples)")
print(f"---------------------------------------------------------")
print(f"Word Error Rate (WER):      {final_wer * 100:.2f}%")
print(f"Character Error Rate (CER): {final_cer * 100:.2f}%")
print(f"---------------------------------------------------------")

Loading model: jonatasgrosman/exp_w2v2t_fa_hubert_s801 and running on cuda...


Map (num_proc=1):   0%|          | 0/10440 [00:00<?, ? examples/s]


Starting inference on 10440 test samples...


  0%|          | 0/1305 [00:00<?, ?it/s]


---------------------------------------------------------
ASR Model: jonatasgrosman/exp_w2v2t_fa_hubert_s801
Dataset: hezarai/common-voice-13-fa | Split: test (10440 samples)
---------------------------------------------------------
Word Error Rate (WER):      55.20%
Character Error Rate (CER): 16.16%
---------------------------------------------------------
