# ASR for Farsi


* [Setup and Imports](#setup-and-imports)
* [Load and Preprocess Data](#load-and-preprocess-data)
* [Fine-Tuning the Model](#fine-tuning-the-model)
* [Load and Use Fine-Tuned Model](#load-and-use-fine-tuned-model)

## Setup and Imports

---

### Import required libraries

In [None]:
!pip install -U datasets
!pip install -U transformers
!pip install torchaudio
!pip install librosa
!pip install jiwer
!pip install hazm
!pip install num2fawords
!pip install accelerate

In [2]:
import json
import time

import re
from hazm import Normalizer

import torch
import torchaudio
import torchaudio.transforms as transforms
import librosa

import numpy as np
from jiwer import wer

from datasets import load_dataset, Audio, load_metric

from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor
from transformers import Wav2Vec2ForCTC, TrainingArguments, Trainer

## Load and Preprocess Data

---

### Load dataset from HuggingFace

In [3]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
common_voice_train = load_dataset("mozilla-foundation/common_voice_6_1", "fa", split="train")
common_voice_test = load_dataset("mozilla-foundation/common_voice_6_1", "fa", split="test")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.8k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.29k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/39.9k [00:00<?, ?B/s]

The repository for mozilla-foundation/common_voice_6_1 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/mozilla-foundation/common_voice_6_1.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/8.88G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating other split: 0 examples [00:00, ? examples/s]

Generating invalidated split: 0 examples [00:00, ? examples/s]

### Remove Unnecessary Columns

In [5]:
columns_to_remove = ['down_votes', 'gender', 'locale', 'segment', 'up_votes', 'accent', 'age', 'client_id']

print("Train Set Columns (Before Drop):", common_voice_train.column_names)
print("Test Set Columns (Before Drop):", common_voice_test.column_names)

common_voice_train = common_voice_train.remove_columns(columns_to_remove)
common_voice_test = common_voice_test.remove_columns(columns_to_remove)

print("Train Set Columns (After Drop):", common_voice_train.column_names)
print("Test Set Columns (After Drop):", common_voice_test.column_names)

Train Set Columns (Before Drop): ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment']
Test Set Columns (Before Drop): ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment']
Train Set Columns (After Drop): ['path', 'audio', 'sentence']
Test Set Columns (After Drop): ['path', 'audio', 'sentence']


### Filter Audio

In [6]:
def filter_train_audio(batch):
    duration = len(batch["audio"]["array"]) / batch["audio"]["sampling_rate"]
    return 4.0 <= duration <= 6.0

def filter_test_audio(batch):
    duration = len(batch["audio"]["array"]) / batch["audio"]["sampling_rate"]
    return duration < 15.0

print("Common Voice Train Length (Before Filter):", len(common_voice_train))
print("Common Voice Test Length (Before Filter):", len(common_voice_test))

common_voice_train = common_voice_train.filter(filter_train_audio)
common_voice_test = common_voice_test.filter(filter_test_audio)

print("Common Voice Train Length (After Filter):", len(common_voice_train))
print("Common Voice Test Length (After Filter):", len(common_voice_test))

Common Voice Train Length (Before Filter): 7593
Common Voice Test Length (Before Filter): 5213


Filter:   0%|          | 0/7593 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5213 [00:00<?, ? examples/s]

Common Voice Train Length (After Filter): 2217
Common Voice Test Length (After Filter): 5212


### Preprocessing

In [7]:
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�", "&",
    "#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬",'ٔ', ",", "?",
    ".", "!", "-", ";", ":",'"',"“", "%", "‘", "”", "=", "–", "…", "_", "”", '“', '„',
    'ā', 'š'
]

import string
chars_to_ignore = chars_to_ignore + list(string.ascii_lowercase + string.digits)


chars_to_mapping = {
    'ك': 'ک', 'دِ': 'د', 'بِ': 'ب', 'زِ': 'ز', 'ذِ': 'ذ', 'شِ': 'ش', 'سِ': 'س', 'ى': 'ی',
    'ي': 'ی', 'أ': 'ا', 'ؤ': 'و', "ے": "ی", "ۀ": "ه", "ﭘ": "پ", "ﮐ": "ک", "ﯽ": "ی",
    "ﺎ": "ا", "ﺑ": "ب", "ﺘ": "ت", "ﺧ": "خ", "ﺩ": "د", "ﺱ": "س", "ﻀ": "ض", "ﻌ": "ع",
    "ﻟ": "ل", "ﻡ": "م", "ﻢ": "م", "ﻪ": "ه", "ﻮ": "و", 'ﺍ': "ا", 'ة': "ه",
    'ﯾ': "ی", 'ﯿ': "ی", 'ﺒ': "ب", 'ﺖ': "ت", 'ﺪ': "د", 'ﺮ': "ر", 'ﺴ': "س", 'ﺷ': "ش",
    'ﺸ': "ش", 'ﻋ': "ع", 'ﻤ': "م", 'ﻥ': "ن", 'ﻧ': "ن", 'ﻭ': "و", 'ﺭ': "ر", "ﮔ": "گ",

    # "ها": "  ها", "ئ": "ی",
    "۱۴ام": "۱۴ ام",

    "a": " ای ", "b": " بی ", "c": " سی ", "d": " دی ", "e": " ایی ", "f": " اف ",
    "g": " جی ", "h": " اچ ", "i": " آی ", "j": " جی ", "k": " کی ", "l": " ال ",
    "m": " ام ", "n": " ان ", "o": " او ", "p": " پی ", "q": " کیو ", "r": " آر ",
    "s": " اس ", "t": " تی ", "u": " یو ", "v": " وی ", "w": " دبلیو ", "x": " اکس ",
    "y": " وای ", "z": " زد ",
    "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
}

In [8]:
_normalizer = Normalizer()

def multiple_replace(text, chars_to_mapping):
    pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
    return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))

def remove_special_characters(text, chars_to_ignore_regex):
    text = re.sub(chars_to_ignore_regex, '', text).lower() + " "
    return text

def normalizer(batch, chars_to_ignore, chars_to_mapping):
    chars_to_ignore_regex = f"""[{"".join(chars_to_ignore)}]"""
    text = batch["sentence"].lower().strip()

    text = _normalizer.normalize(text)
    text = multiple_replace(text, chars_to_mapping)
    text = remove_special_characters(text, chars_to_ignore_regex)
    text = re.sub(" +", " ", text)
    text = text.strip() + " "

    batch["sentence"] = text
    return batch

common_voice_train = common_voice_train.map(
    normalizer,
    fn_kwargs={"chars_to_ignore": chars_to_ignore, "chars_to_mapping": chars_to_mapping}
)

common_voice_test = common_voice_test.map(
    normalizer,
    fn_kwargs={"chars_to_ignore": chars_to_ignore, "chars_to_mapping": chars_to_mapping}
)

Map:   0%|          | 0/2217 [00:00<?, ? examples/s]

Map:   0%|          | 0/5212 [00:00<?, ? examples/s]

In [9]:
for i in range(3):
  print("Data", i)
  print(common_voice_train[i]["sentence"])
  print(common_voice_test[i]["sentence"])

Data 0
چه جوری آخه برانکو با دست خالی تیمشو برد فینال 
از هم جداشدن خیلی سخته 
Data 1
اون میوه هات رو بردار 
بله مطمئن باشید هستم 
Data 2
خوبه که جامعه پزشکی 
تقریبا صدو پنجاه گز دورتر از جاده 


### Create Character Dictionary

In [10]:
def extract_all_chars(batch):
    all_text = " ".join(batch["sentence"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}

vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)
vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)

vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))
vocab_list.sort()

if " " in vocab_list:
    vocab_list.remove(" ")

print("Vocab List Length:", len(vocab_list))
print(vocab_list)

vocab_dict = {
    "<pad>": 0,
    "<s>": 1,
    "</s>": 2,
    "<unk>": 3,
    "|": 4
}

current_index = 5
for char in vocab_list:
    if char not in vocab_dict:
        vocab_dict[char] = current_index
        current_index += 1

Map:   0%|          | 0/2217 [00:00<?, ? examples/s]

Map:   0%|          | 0/5212 [00:00<?, ? examples/s]

Vocab List Length: 35
['ء', 'آ', 'ئ', 'ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ل', 'م', 'ن', 'ه', 'و', 'پ', 'چ', 'ژ', 'ک', 'گ', 'ی']


In [11]:
print("Dictionary Length:", len(vocab_dict))
print(vocab_dict)

Dictionary Length: 40
{'<pad>': 0, '<s>': 1, '</s>': 2, '<unk>': 3, '|': 4, 'ء': 5, 'آ': 6, 'ئ': 7, 'ا': 8, 'ب': 9, 'ت': 10, 'ث': 11, 'ج': 12, 'ح': 13, 'خ': 14, 'د': 15, 'ذ': 16, 'ر': 17, 'ز': 18, 'س': 19, 'ش': 20, 'ص': 21, 'ض': 22, 'ط': 23, 'ظ': 24, 'ع': 25, 'غ': 26, 'ف': 27, 'ق': 28, 'ل': 29, 'م': 30, 'ن': 31, 'ه': 32, 'و': 33, 'پ': 34, 'چ': 35, 'ژ': 36, 'ک': 37, 'گ': 38, 'ی': 39}


### Save unique characters in JSON

In [12]:
vocab_file = "vocab.json"
with open(vocab_file, 'w') as vf:
    json.dump(vocab_dict, vf)

### Resample audio to 16kHz

In [13]:
def speech_file_to_array_fn(batch):
    target_sampling_rate = 16000
    speech_array, sampling_rate = librosa.load(batch["path"])
    speech_array = speech_array.squeeze()
    speech_tensor = torch.tensor(speech_array)
    transform = transforms.Resample(sampling_rate, target_sampling_rate)
    resampled_tensor = transform(speech_tensor)

    batch["speech"] = resampled_tensor.numpy()
    batch["sampling_rate"] = target_sampling_rate
    batch["duration_in_seconds"] = len(batch["speech"]) / target_sampling_rate
    batch["target_text"] = batch["sentence"]
    return batch

common_voice_train = common_voice_train.map(speech_file_to_array_fn)
common_voice_test = common_voice_test.map(speech_file_to_array_fn)

Map:   0%|          | 0/2217 [00:00<?, ? examples/s]

Map:   0%|          | 0/5212 [00:00<?, ? examples/s]

## Fine-Tuning the Model

---

### Tokenizer



In [14]:
tokenizer = Wav2Vec2CTCTokenizer(
    vocab_file=vocab_file,
    unk_token="<unk>",
    pad_token="<pad>",
    bos_token="<s>",
    eos_token="</s>",
    word_delimiter_token="|",
    do_lower_case=False
)

text = "سلام من علیرضا دستمالچی ساعی هستم"
print(" ".join(tokenizer.tokenize(text)))
print(tokenizer.decode(tokenizer.encode(text)))

س ل ا م | م ن | ع ل ی ر ض ا | د س ت م ا ل چ ی | س ا ع ی | ه س ت م
سلام من علیرضا دستمالچی ساعی هستم


### Feature Extracture

In [15]:
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True
)

### Processor

In [16]:
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

### Data Collator

In [17]:
def prepare_dataset(batch):
    assert (
        len(set(batch["sampling_rate"])) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    batch["input_values"] = processor(batch["speech"], sampling_rate=16000).input_values

    with processor.as_target_processor():
        batch["labels"] = processor(batch["target_text"]).input_ids
    return batch


common_voice_train = common_voice_train.map(prepare_dataset, remove_columns=common_voice_train.column_names, batch_size=12, num_proc=4, batched=True)
common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names, batch_size=12, num_proc=4, batched=True)

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/2217 [00:00<?, ? examples/s]

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/5212 [00:00<?, ? examples/s]



In [18]:
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [19]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

### Evaluation Metric

In [20]:
import random

wer_metric = load_metric("wer")

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)

    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    if isinstance(label_str, list):
        if isinstance(pred_str, list) and len(pred_str) == len(label_str):
            for index in random.sample(range(len(label_str)), 3):
                print(f'reference: "{label_str[index]}"')
                print(f'predicted: "{pred_str[index]}"')

        else:
            for index in random.sample(range(len(label_str)), 3):
                print(f'reference: "{label_str[index]}"')
                print(f'predicted: "{pred_str}"')

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

  wer_metric = load_metric("wer")


Downloading builder script:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

The repository for wer contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/wer.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


### Load Model

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-xlsr-53",
                                       attention_dropout=0.1,
                                       gradient_checkpointing=True,
                                       ctc_loss_reduction="mean",
                                       ctc_zero_infinity=True,
                                       bos_token_id=processor.tokenizer.bos_token_id,
                                       eos_token_id=processor.tokenizer.eos_token_id,
                                       pad_token_id=processor.tokenizer.pad_token_id,
                                       vocab_size=len(processor.tokenizer.get_vocab())).to(device)

model.freeze_feature_extractor()

Using device: cuda


config.json:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Define Trainer

In [22]:
training_args = TrainingArguments(
    output_dir="./wav2vec2-large-xlsr-persian-demo",
    group_by_length=True,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    gradient_accumulation_steps=2,
    eval_strategy="steps",
    num_train_epochs=5,
    fp16=True,
    save_steps=10,
    eval_steps=10,
    logging_steps=10,
    learning_rate=1e-4,
    warmup_steps=1000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=common_voice_train,
    eval_dataset=common_voice_test,
    tokenizer=processor.feature_extractor,
)

### Train Model

In [None]:
start_time = time.time()
trainer.train()
end_time = time.time()
train_time = end_time - start_time

print(f"Training time: {train_time/60} minutes")



Step,Training Loss,Validation Loss,Wer
10,20.0114,26.213902,1.000869
20,21.7949,26.202385,1.000869
30,21.2387,26.167309,1.001176
40,21.2613,26.117338,1.001406
50,22.695,26.052984,1.001457
60,20.6855,25.981907,1.002147
70,21.6819,25.887863,1.001815
80,20.3627,25.78512,1.000971
90,20.843,25.654993,1.000051
100,20.4746,25.492807,1.0


reference: "همین الان راننده تاکسی گفت از بالا رسوندن بازی دیشب مساوی شه تا شلوغ نشه"
predicted: "شثثشثشثشثثثثثثشثشثشثشثشثشثقث قثثثثثثشثث"
reference: "یه مادرشوهر دارم گفته بریم از دوستش مبل بخریم و خودش برامون تخفیف آنچنانی میگیره"
predicted: "شثشثثشثثشثثثشثقذبذقثقذ"
reference: "و ادعای خرید کودکان از آمریکای جنوبی توسط ثروتمندان آمریکایی"
predicted: "شثشثشثشثثثششثشثثثشثشبقاپ"




reference: "من با اون بیرون نمی رم"
predicted: "شثثشثشثرذبقجبقبقبجب"
reference: "هرکس سرش تو کار خودشه به کجای عالم هستی برمیخوره"
predicted: "شثثشثثشثشثشثشثبقبذقشش"
reference: "معلومه حسابی قاطی کردی"
predicted: "شثشثشثشثثعذبپقشپثثث"




reference: "راستش هیچ سالی سابقه نداشته من زودتر از ایام امتحانات درس بخونم"
predicted: "شثشثشثشثثثثثثشثثثثثثشثذقذقذقذقذ"
reference: "دکتر خوردن جگر را توصیه کرد"
predicted: "قثثشثثثشثشثشق"
reference: "شبیه نیس واقعا یه سگه"
predicted: "شثثثثثثثثشثثثشثثثثثشب بسبقبقبقثثثثثث"




reference: "فقط قربانی تان را تخریب نکردید شما خودتان را تخریب کردید"
predicted: "شثشثثشثشثشثقذقذ"
reference: "می خواهم بلیط برای ریگا بخرم لطفا"
predicted: "شثثشثشثشثثثثثشثشثثشثث"
reference: "اتفاق دیگری که نشانهای از پیشرفت والیبال بانوان می دهد لژیونر شدن سه ملی پوش ایرانی است"
predicted: "شثثشثثثثثثثثثثشثثثثثثثثثثثثثثثثثثثثثثثثثثثثشثثث"




reference: "ما اونو دور نمیندازیم"
predicted: "شقذقبج"
reference: "از آخرین تلاش های صداسیما برای جذب مخاطب"
predicted: "شششذبقش"
reference: "برای توضیح بیشتر خوب است عرض کنم که من روز گذشته خواسته بودم او را دستگیر کنم"
predicted: "شثششششثثثثشثیقاق"




reference: "من غذا چینی سفارش دادم"
predicted: "ششششقذاذش"
reference: "ما یک گروه چهار نفره هستیم"
predicted: "ششششقذبذ"
reference: "آذرشب"
predicted: "قق"




reference: "هم اکنون سفارش دهید"
predicted: "ششششابق"
reference: "به تدریج منجربه ممنوعیت کلی الکل شد"
predicted: "رشششذ"
reference: "خب مجبور شدم بیارمشون"
predicted: "شششاقاقبعبقا"




reference: "به یک نفر از چهار نفر فرد بالغ"
predicted: "شششششذبر"
reference: "من تو را به عروسی دعوت می کنم"
predicted: "شششش"
reference: "چرا باید عصبانی باشم"
predicted: "ش"




reference: "یک جعبه پودر صورت به من بدهید لطفا"
predicted: "شش"
reference: "ما ویروس روحانا گرفتیم نمردیم کرونا که چیزی نیست"
predicted: "شش"
reference: "این یکی خیلی مهم است همپیمانی را به روال عادی تبدیل کنید"
predicted: "شش"




reference: "نمیزارین بگم بابا یه صدایی از اون تو اومد خودم شنیدم"
predicted: "شششش"
reference: "و دوست تو از دست بدی"
predicted: "گشش"
reference: "آیا غلط است"
predicted: "ششش"




reference: "پاپ کجا زندگی می کنه"
predicted: "شش"
reference: "شکلشو به خاطر بسپرین"
predicted: "ش"
reference: "متاسفم این کارو کردم"
predicted: "ش"




reference: "همش تقصیر تو شد تو باعث این دردسرا هستی"
predicted: ""
reference: "تو چیکار میخوای بکنی"
predicted: ""
reference: "نمیخای که از دست بدیش"
predicted: ""




reference: "من به کمکت احتیاجی ندارم"
predicted: ""
reference: "اگه بخواین میتونم حقایق بیشتری رو بهتون بگم"
predicted: ""
reference: "جوانان از ما کارآموزی می خواهند برای فرصت کار برای کارآموزی"
predicted: ""




reference: "رویکردی بر اساس اهداف و مهلتهای از پیش تعیین شده"
predicted: ""
reference: "این معلوم هست چی میگه تو میری اون تو چیکار هان زود باش جواب بده"
predicted: ""
reference: "اللهیار"
predicted: ""




reference: "اخه اونجا جنگ بود اینجا زندگی"
predicted: ""
reference: "حوض رو هم باید تعمیر کنم یه رنگ ابی بزنم دوتا ماهی قرمز بندازم توش"
predicted: ""
reference: "آلو ارغوانی است"
predicted: ""




reference: "تا آبجو شراب و سایر نوشیدنیهای الکلی را خلق کند"
predicted: ""
reference: "خب ببریمش پایین پله ها"
predicted: ""
reference: "هنوزم اونو دوست داری"
predicted: ""




Step,Training Loss,Validation Loss,Wer
10,20.0114,26.213902,1.000869
20,21.7949,26.202385,1.000869
30,21.2387,26.167309,1.001176
40,21.2613,26.117338,1.001406
50,22.695,26.052984,1.001457
60,20.6855,25.981907,1.002147
70,21.6819,25.887863,1.001815
80,20.3627,25.78512,1.000971
90,20.843,25.654993,1.000051
100,20.4746,25.492807,1.0


reference: "که اکنون در سرویسهای امنیتی روسیه استفاده می شوند پرداخته است"
predicted: ""
reference: "رهبران جهان از استقلال سودان جنوبی به حیث تازهترین کشور دنیا استقبال کردند"
predicted: ""
reference: "به کشور ما خوش آمدید"
predicted: ""




reference: "یک سال بخور نون و تره یک عمر بخور نون و کره"
predicted: ""
reference: "مکزیکو سیتی"
predicted: ""
reference: "غذا نپختن یا اتاق عوض کردنت"
predicted: ""




reference: "با این حال مردم برای قرنها انتقال خون را آزمایش کرده بودند"
predicted: ""
reference: "چون به ما می گوید که آنها نگرانند"
predicted: ""
reference: "بالاخره تونستم قهرمان دوران بچه گیمو زیارت کنم"
predicted: ""




reference: "سعیداذیت نکن"
predicted: ""
reference: "نزدیک و صمیمی و"
predicted: ""
reference: "آره ولی کم بود باز هم می خوام"
predicted: ""




reference: "دقیقا برای همین پا شدم"
predicted: ""
reference: "خوب مجانی انجامشون بده"
predicted: ""
reference: "اون یه تیکه طالبی هست"
predicted: ""




reference: "برای مقایسه به ریال تبدیل نکنین"
predicted: ""
reference: "در زمانی که هولفودز در مخمصه گرفتار بود"
predicted: ""
reference: "من تشنه هستم"
predicted: ""




## Model Evaluation

In [None]:
eval_results = trainer.evaluate()

print(f"Evaluation Results: {eval_results}")

### Print Predictions

In [None]:
t = load_dataset("mozilla-foundation/common_voice_6_1", "fa", split="test")
t = t.map(speech_file_to_array_fn)

Map:   0%|          | 0/5213 [00:00<?, ? examples/s]

In [None]:
def predict(batch):
    features = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

    input_values = features.input_values.to(device)
    attention_mask = features.attention_mask.to(device)

    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits

    pred_ids = torch.argmax(logits, dim=-1)

    batch["predicted"] = processor.batch_decode(pred_ids)[0]
    return batch

result = t.map(predict)

Map:   0%|          | 0/5213 [00:00<?, ? examples/s]

  return F.conv1d(input, weight, bias, self.stride,


In [None]:
max_items = np.random.randint(0, len(result), 20).tolist()
for i in max_items:
    reference, predicted =  result["sentence"][i], result["predicted"][i]
    print("reference:", reference)
    print("predicted:", predicted)
    print('---')

reference: ،حتا به پسر فراری عمم هم پول داده بااینکه میدونسته ممکنه به دستش نرسه هیچوقت
predicted: ا
---
reference: كسی كه پدر خود را مسخره كند و مادرش را تحقير نمايد
predicted: ا
---
reference: .من آدم خیلی خیلی بدیم
predicted: ا
---
reference: به نظر مسخره میرسد
predicted: ا
---
reference: پسرم مدرسه میرود
predicted: ا
---
reference: آیسودا
predicted: ا
---
reference: همون دختر همسایه که ماه دیگه میخوام برم خواستگاریش
predicted: ا
---
reference: من نتوانستم این صبح پیوست را باز کنم.
predicted: ا
---
reference: چی؟ ما هنوزم همونطوریم
predicted: ا
---
reference: آیا فردا عصر آزاد هستی؟
predicted: ا
---
reference: شرکتهایی که برای محیط زیست و جامعه نقش مثبت دارند، همچنین از لحاظ مالی هم موفق هستند
predicted: ا
---
reference: بدون مزر با من باش، اگر چه دیگه وقتی نیست
predicted: ا
---
reference: امروز کیف پولتون رو.
predicted: ا
---
reference: و من، اعتقاد داشتم که او در آن زمان خارج از بدنش قرار دارد
predicted: ا
---
reference: مرد: آه
predicted: ا
---
reference: اما این بار، بجای سوق دا

### Final Report

In [None]:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

num_train_files = len(common_voice_train)
num_test_files = len(common_voice_test)

wer = eval_results['eval_wer']

report = {
    "Batch Size": training_args.per_device_train_batch_size,
    "Num Epochs": training_args.num_train_epochs,
    "Training Files": num_train_files,
    "Testing Files": num_test_files,
    "Training Time (seconds)": train_time,
    "Total Parameters": total_params,
    "Trainable Parameters": trainable_params,
    "WER (Word Error Rate)": wer
}

for key, value in report.items():
    print(f"{key}: {value}")

Batch Size: 16
Num Epochs: 5
Training Files: 2217
Testing Files: 5212
Training Time (seconds): 1640.0276243686676
Total Parameters: 315479720
Trainable Parameters: 311269544
WER (Word Error Rate): 1.0025561065385205


### Save Model

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
save_directory = "/content/drive/My Drive/wav2vec2-large-xlsr-persian-demo"

model.save_pretrained(save_directory)

processor.save_pretrained(save_directory)

print(f"Model, tokenizer, and processor have been saved to {save_directory}")

Model, tokenizer, and processor have been saved to /content/drive/My Drive/wav2vec2-large-xlsr-persian-demo


## Load and Use Fine-Tuned Model

---

In [None]:
!pip install datasets transformers librosa torchaudio hazm num2fawords jiwer accelerate

In [None]:
from huggingface_hub import login
login()

In [None]:
import json
import re
import torch
import librosa
import numpy as np
from hazm import Normalizer
from jiwer import wer
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from google.colab import drive


drive.mount('/content/drive')
model_path = "/content/drive/My Drive/wav2vec2-large-xlsr-persian-demo"
model = Wav2Vec2ForCTC.from_pretrained(model_path)
processor = Wav2Vec2Processor.from_pretrained(model_path)


wer_metric = load_metric("wer")

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(pred.label_ids, skip_special_tokens=True)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}


def preprocess_audio(audio_path):
    y, sr = librosa.load(audio_path, sr=16000)
    input_values = processor(y, sampling_rate=sr).input_values[0]
    return input_values


def transcribe(audio_path):
    input_values = preprocess_audio(audio_path)
    with torch.no_grad():
        logits = model(torch.tensor(input_values).unsqueeze(0)).logits
    pred_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(pred_ids)[0]
    return transcription


common_voice_test = load_dataset("mozilla-foundation/common_voice_6_1", "fa", split="test[:10]")


selected_sample = None
for sample in common_voice_test:
    duration = len(sample["audio"]["array"]) / sample["audio"]["sampling_rate"]
    if 4.0 <= duration <= 6.0:
        selected_sample = sample
        break


if selected_sample:
    audio_path = selected_sample['audio']['path']
    transcription = transcribe(audio_path)
    print(f"Transcription: {transcription}")
    print(f"Reference: {selected_sample['sentence']}")
else:
    print("No suitable sample found within the specified duration range (4-6 seconds).")