Install dependencies

In [52]:
%pip install hezar

Collecting hezar
  Downloading hezar-0.42.0-py3-none-any.whl.metadata (18 kB)
Downloading hezar-0.42.0-py3-none-any.whl (195 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.2/195.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hezar
Successfully installed hezar-0.42.0


Import dependencies and evaluate the pretrained Whisper model (WER / CER)

In [2]:
from hezar.models import Model
from datasets import load_dataset, Audio
from evaluate import load
import re
from tqdm import tqdm

# --- 1. CONFIGURATION ---
# Everything a reader might want to tune lives here.
MODEL_ID = "hezarai/whisper-small-fa"
DATASET_ID = "hezarai/common-voice-13-fa"
# NOTE(review): scores on the "train" split are only a smoke test; they may be
# optimistic if the model saw this data during fine-tuning — TODO confirm.
SPLIT = "train" # Change to 'test' for real performance assessment
MAX_SAMPLES = 50  # Set to None to run on the full dataset

# --- 2. LOAD MODEL & METRICS ---
print(f"Loading Hezar model: {MODEL_ID}...")
# The Hezar Model wrapper handles the feature extractor and tokenizer automatically
whisper = Model.load(MODEL_ID)

print("Loading metrics...")
wer_metric = load("wer")
cer_metric = load("cer")

# --- 3. LOAD DATASET ---
print("Loading dataset...")
dataset = load_dataset(DATASET_ID, split=SPLIT)

# Important: Hezar expects 16kHz audio.
# We cast the column so 'datasets' handles resampling automatically.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

if MAX_SAMPLES:
    # Clamp to the split size: select() raises IndexError for out-of-range
    # indices, so a MAX_SAMPLES larger than the split would otherwise crash.
    sample_count = min(MAX_SAMPLES, len(dataset))
    print(f"Selecting first {sample_count} samples for faster evaluation...")
    dataset = dataset.select(range(sample_count))

# --- 4. PERSIAN NORMALIZATION ---
# Even with a fine-tuned model, you must normalize to get valid WER/CER
def normalize_text(text):
    """Normalize a Persian transcript so WER/CER compare like with like.

    Steps, in order:
      1. Lower-case and trim (only affects embedded Latin text).
      2. Map Arabic Yeh/Kaf to their Persian forms.
      3. Drop every character that is neither a word character nor
         whitespace. This also removes the zero-width non-joiner, which is
         neither ``\\w`` nor ``\\s``.
      4. Drop underscore and tatweel (U+0640). Both count as ``\\w`` in
         Python, so step 3 alone leaves them in and a purely cosmetic
         elongation would be scored as an error.
      5. Collapse whitespace runs to a single space.

    Args:
        text: Raw transcript; ``None`` or an empty string is accepted.

    Returns:
        The normalized string, or ``""`` for falsy input.
    """
    if not text:
        return ""
    text = text.lower().strip()
    text = text.replace('ي', 'ی').replace('ك', 'ک')
    # Remove punctuation, plus the two \w characters that are not real letters
    # for scoring purposes: "_" and the tatweel elongation mark.
    text = re.sub(r'[^\w\s]|[_\u0640]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# --- 5. EVALUATION LOOP ---
# Collect normalized (prediction, reference) pairs for the metric step.
predictions = []
references = []
skipped = 0  # samples whose reference is empty after normalization

print("Starting inference...")
# We iterate one by one (or you can create small batches if you have a GPU)
for sample in tqdm(dataset):
    # 1. Get Reference
    # Checked first: the jiwer-backed WER/CER metrics raise on an empty
    # reference string, and skipping here also avoids a wasted inference.
    ref_text = normalize_text(sample["sentence"])
    if not ref_text:
        skipped += 1
        continue

    # 2. Get Audio Array
    # Hezar's predict() can take a numpy array directly
    audio_array = sample["audio"]["array"]

    # 3. Predict
    # Hezar returns a list of dictionaries, e.g., [{'text': '...'}]
    result = whisper.predict(audio_array)
    pred_text = result[0]['text']

    # 4. Normalize & Store
    predictions.append(normalize_text(pred_text))
    references.append(ref_text)

if skipped:
    print(f"Skipped {skipped} sample(s) with an empty reference after normalization.")

# --- 6. COMPUTE METRICS ---
# Score the normalized transcripts, then print a small report.
print("\nComputing metrics...")
wer = wer_metric.compute(references=references, predictions=predictions)
cer = cer_metric.compute(references=references, predictions=predictions)

# One print per report section; sep="\n" reproduces the line-by-line layout.
print(
    "--------------------------------------",
    f"Model: {MODEL_ID}",
    f"Split: {SPLIT} | Samples: {len(predictions)}",
    "--------------------------------------",
    f"WER: {wer * 100:.2f}%",
    f"CER: {cer * 100:.2f}%",
    "--------------------------------------",
    sep="\n",
)

# Sample output for sanity check: first reference next to its prediction
print(
    f"Ref: {references[0]}",
    f"Pred: {predictions[0]}",
    sep="\n",
)

Loading Hezar model: hezarai/whisper-small-fa...
Loading metrics...
Loading dataset...
Selecting first 50 samples for faster evaluation...
Starting inference...


100%|██████████| 50/50 [23:02<00:00, 27.66s/it]


Computing metrics...
--------------------------------------
Model: hezarai/whisper-small-fa
Split: train | Samples: 50
--------------------------------------
WER: 6.88%
CER: 1.65%
--------------------------------------
Ref: تا حرف پول به میان میآید گوشهایش را تیز میکند
Pred: تا حرف پول به میان میآید گوشهایش را تیز میکند





Load dataset

Preprocess

Training HMM