# ASR for Farsi

### Import required libraries

In [None]:
!pip install datasets transformers librosa torchaudio hazm num2fawords jiwer accelerate

In [2]:
import json
import time

import re
from hazm import Normalizer

import torch
import torchaudio
import torchaudio.transforms as transforms
import librosa

import numpy as np
from jiwer import wer

from datasets import load_dataset, Audio, load_metric

from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, DataCollatorWithPadding
from transformers import Wav2Vec2ForCTC, TrainingArguments, Trainer

### Load dataset from HuggingFace

In [3]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
common_voice_train = load_dataset("mozilla-foundation/common_voice_6_1", "fa", split="train")
common_voice_test = load_dataset("mozilla-foundation/common_voice_6_1", "fa", split="test")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.8k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.29k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/39.9k [00:00<?, ?B/s]

The repository for mozilla-foundation/common_voice_6_1 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/mozilla-foundation/common_voice_6_1.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/8.88G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating other split: 0 examples [00:00, ? examples/s]

Generating invalidated split: 0 examples [00:00, ? examples/s]

### Remove Unnecessary Columns

In [5]:
columns_to_remove = ['down_votes', 'gender', 'locale', 'segment', 'up_votes', 'accent', 'age', 'client_id']

print("Train Set Columns (Before Drop):", common_voice_train.column_names)
print("Test Set Columns (Before Drop):", common_voice_test.column_names)

common_voice_train = common_voice_train.remove_columns(columns_to_remove)
common_voice_test = common_voice_test.remove_columns(columns_to_remove)

print("Train Set Columns (After Drop):", common_voice_train.column_names)
print("Test Set Columns (After Drop):", common_voice_test.column_names)

Train Set Columns (Before Drop): ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment']
Test Set Columns (Before Drop): ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment']
Train Set Columns (After Drop): ['path', 'audio', 'sentence']
Test Set Columns (After Drop): ['path', 'audio', 'sentence']


### Filter Audio

In [6]:
def filter_train_audio(batch):
    duration = len(batch["audio"]["array"]) / batch["audio"]["sampling_rate"]
    return 4.0 <= duration <= 6.0

def filter_test_audio(batch):
    duration = len(batch["audio"]["array"]) / batch["audio"]["sampling_rate"]
    return duration < 15.0

print("Common Voice Train Length (Before Filter):", len(common_voice_train))
print("Common Voice Test Length (Before Filter):", len(common_voice_test))

common_voice_train = common_voice_train.filter(filter_train_audio)
common_voice_test = common_voice_test.filter(filter_test_audio)

print("Common Voice Train Length (After Filter):", len(common_voice_train))
print("Common Voice Test Length (After Filter):", len(common_voice_test))

Common Voice Train Length (Before Filter): 7593
Common Voice Test Length (Before Filter): 5213


Filter:   0%|          | 0/7593 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5213 [00:00<?, ? examples/s]

Common Voice Train Length (After Filter): 2217
Common Voice Test Length (After Filter): 5212


### Preprocessing

In [7]:
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�", "&",
    "#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬",'ٔ', ",", "?",
    ".", "!", "-", ";", ":",'"',"“", "%", "‘", "”", "=", "–", "…", "_", "”", '“', '„',
    'ā', 'š'
]


chars_to_mapping = {
    'ك': 'ک', 'دِ': 'د', 'بِ': 'ب', 'زِ': 'ز', 'ذِ': 'ذ', 'شِ': 'ش', 'سِ': 'س', 'ى': 'ی',
    'ي': 'ی', 'أ': 'ا', 'ؤ': 'و', "ے": "ی", "ۀ": "ه", "ﭘ": "پ", "ﮐ": "ک", "ﯽ": "ی",
    "ﺎ": "ا", "ﺑ": "ب", "ﺘ": "ت", "ﺧ": "خ", "ﺩ": "د", "ﺱ": "س", "ﻀ": "ض", "ﻌ": "ع",
    "ﻟ": "ل", "ﻡ": "م", "ﻢ": "م", "ﻪ": "ه", "ﻮ": "و", 'ﺍ': "ا", 'ة': "ه",
    'ﯾ': "ی", 'ﯿ': "ی", 'ﺒ': "ب", 'ﺖ': "ت", 'ﺪ': "د", 'ﺮ': "ر", 'ﺴ': "س", 'ﺷ': "ش",
    'ﺸ': "ش", 'ﻋ': "ع", 'ﻤ': "م", 'ﻥ': "ن", 'ﻧ': "ن", 'ﻭ': "و", 'ﺭ': "ر", "ﮔ": "گ",

    # "ها": "  ها", "ئ": "ی",
    "۱۴ام": "۱۴ ام",

    "a": " ای ", "b": " بی ", "c": " سی ", "d": " دی ", "e": " ایی ", "f": " اف ",
    "g": " جی ", "h": " اچ ", "i": " آی ", "j": " جی ", "k": " کی ", "l": " ال ",
    "m": " ام ", "n": " ان ", "o": " او ", "p": " پی ", "q": " کیو ", "r": " آر ",
    "s": " اس ", "t": " تی ", "u": " یو ", "v": " وی ", "w": " دبلیو ", "x": " اکس ",
    "y": " وای ", "z": " زد ",
    "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
}

In [8]:
normalizer = Normalizer()

def preprocess_text(text):
    text = normalizer.normalize(text)
    text = text.lower()
    escaped_chars_to_ignore = [re.escape(char) for char in chars_to_ignore]
    text = re.sub(f"[{''.join(escaped_chars_to_ignore)}]", '', text)
    for old, new in chars_to_mapping.items():
        text = text.replace(old, new)
    return text

common_voice_train = common_voice_train.map(lambda batch: {"sentence": preprocess_text(batch["sentence"])})
common_voice_test = common_voice_test.map(lambda batch: {"sentence": preprocess_text(batch["sentence"])})

Map:   0%|          | 0/2217 [00:00<?, ? examples/s]

Map:   0%|          | 0/5212 [00:00<?, ? examples/s]

In [9]:
for i in range(3):
  print("Data", i)
  print(common_voice_train[i]["sentence"])
  print(common_voice_test[i]["sentence"])

Data 0
چه جوری آخه برانکو با دست خالی تیمشو برد فینال
از هم جداشدن خیلی سخته
Data 1
اون میوه هات رو بردار
بله مطمئن باشید هستم
Data 2
 خوبه که جامعه پزشکی
تقریبا صدو پنجاه گز دورتر از جاده


### Create Character Dictionary

In [10]:
def extract_all_chars(batch):
    all_text = " ".join(batch["sentence"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}

vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)
vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)

vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))
vocab_dict = {v: k for k, v in enumerate(vocab_list)}

vocab_dict["|"] = vocab_dict.pop(" ")
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
vocab_dict["<s>"] = len(vocab_dict)
vocab_dict["</s>"] = len(vocab_dict)

Map:   0%|          | 0/2217 [00:00<?, ? examples/s]

Map:   0%|          | 0/5212 [00:00<?, ? examples/s]

In [11]:
print("Dictionary Length:", len(vocab_dict))
print(vocab_dict)

Dictionary Length: 40
{'ف': 0, 'ر': 1, 'ن': 2, 'ق': 3, 'ظ': 4, 'ب': 5, 'ه': 6, 'پ': 7, 'ز': 8, 'س': 9, 'ع': 10, 'ج': 11, 'م': 12, 'ی': 13, 'ش': 14, 'ء': 15, 'ص': 16, 'ث': 17, 'ا': 18, 'آ': 19, 'چ': 20, 'ت': 21, 'و': 22, 'ح': 23, 'ط': 24, 'ژ': 25, 'ل': 26, 'خ': 27, 'گ': 28, 'ئ': 29, 'ک': 30, 'د': 31, 'غ': 32, 'ذ': 33, 'ض': 34, '|': 35, '[UNK]': 36, '[PAD]': 37, '<s>': 38, '</s>': 39}


### Save unique characters in JSON

In [12]:
vocab_file = "vocab.json"
with open(vocab_file, 'w') as vf:
    json.dump(vocab_dict, vf)

### Resample audio to 16kHz

In [13]:
def resample_audio(batch):
    waveform = torch.tensor(batch["audio"]["array"], dtype=torch.float32).unsqueeze(0)
    resampled_waveform = transforms.Resample(orig_freq=48000, new_freq=16000)(waveform)
    batch["audio"]["array"] = resampled_waveform.squeeze(0).numpy()
    batch["audio"]["sampling_rate"] = 16000
    return batch

def resample_audio_librosa(batch):
    y, sr = librosa.load(batch["audio"]["path"], sr=48000)
    y_resampled = librosa.resample(y, orig_sr=48000, target_sr=16000)
    batch["audio"]["array"] = y_resampled
    batch["audio"]["sampling_rate"] = 16000
    return batch


common_voice_train = common_voice_train.cast_column("audio", Audio(sampling_rate=48000))
common_voice_train = common_voice_train.map(resample_audio_librosa)
common_voice_test = common_voice_test.cast_column("audio", Audio(sampling_rate=48000))
common_voice_test = common_voice_test.map(resample_audio_librosa)

Map:   0%|          | 0/2217 [00:00<?, ? examples/s]

Map:   0%|          | 0/5212 [00:00<?, ? examples/s]

### Tokenization and Feature Extraction -> Processor

---



In [14]:
tokenizer = Wav2Vec2CTCTokenizer(
    vocab_file=vocab_file,
    unk_token="[UNK]",
    pad_token="[PAD]",
    bos_token="<s>",
    eos_token="</s>",
    word_delimiter_token="|"
)

feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True
)

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

### Data Collator

In [15]:
def prepare_dataset(batch):

    audio = batch["audio"]
    batch["input_values"] = processor(audio["array"], sampling_rate=16000).input_values[0]
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids

    return batch

common_voice_train = common_voice_train.map(prepare_dataset, remove_columns=common_voice_train.column_names)
common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names)

Map:   0%|          | 0/2217 [00:00<?, ? examples/s]

Map:   0%|          | 0/5212 [00:00<?, ? examples/s]

In [16]:
class CustomDataCollatorWithPadding:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, features):
        input_values = [feature["input_values"] for feature in features]
        labels = [feature["labels"] for feature in features]

        batch = self.processor.feature_extractor.pad({"input_values": input_values}, padding=True, return_tensors="pt")

        labels_batch = self.processor.tokenizer.pad({"input_ids": labels}, padding=True, return_tensors="pt")

        batch["labels"] = labels_batch["input_ids"]

        return batch

data_collator = CustomDataCollatorWithPadding(processor)

In [None]:
# data_collator = DataCollatorWithPadding(tokenizer=processor.tokenizer, padding=True)

### Evaluation Metric

In [17]:
wer_metric = load_metric("wer")

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
    pred_str = processor.batch_decode(pred_ids)
    labels_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    wer = wer_metric.compute(predictions=pred_str, references=labels_str)
    return {"wer": wer}

  wer_metric = load_metric("wer")


Downloading builder script:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

The repository for wer contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/wer.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


### Load Model

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-xlsr-53",
                                       gradient_checkpointing=True,
                                       ctc_loss_reduction="mean",
                                       bos_token_id=processor.tokenizer.bos_token_id,
                                       eos_token_id=processor.tokenizer.eos_token_id,
                                       pad_token_id=processor.tokenizer.pad_token_id,
                                       vocab_size=len(processor.tokenizer.get_vocab())).to(device)

model.freeze_feature_extractor()


if model.config.vocab_size != processor.tokenizer.vocab_size:
    model.config.vocab_size = processor.tokenizer.vocab_size
    print(f"Updated model vocabulary size: {model.config.vocab_size}")

print(f"Model's vocabulary size: {model.config.vocab_size}")

Using device: cuda


config.json:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model's vocabulary size: 40




In [19]:
!pip show accelerate

Name: accelerate
Version: 0.31.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 


### Define Trainer

In [20]:
training_args = TrainingArguments(
    output_dir="./wav2vec2-large-xlsr-persian-demo",
    group_by_length=True,
    per_device_train_batch_size=16,
    eval_strategy="steps",
    num_train_epochs=5,
    fp16=True,
    gradient_checkpointing=True,
    save_steps=500,
    eval_steps=500,
    logging_steps=500,
    learning_rate=1e-4,
    warmup_steps=1000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=common_voice_train,
    eval_dataset=common_voice_test,
    tokenizer=processor.feature_extractor,
)



### Train Model

In [21]:
start_time = time.time()
trainer.train()
end_time = time.time()
train_time = end_time - start_time
print(f"Training time: {train_time} seconds")



Step,Training Loss,Validation Loss,Wer
500,15.4219,-0.208385,0.999974




Training time: 4741.213932514191 seconds


In [22]:
print(f"Training time: {train_time/60} minutes")

Training time: 79.02023220856985 minutes


### Model Evaluation

In [23]:
eval_results = trainer.evaluate()

print(f"Evaluation Results: {eval_results}")

Evaluation Results: {'eval_loss': -0.6472992300987244, 'eval_wer': 0.9999744389346148, 'eval_runtime': 1243.2066, 'eval_samples_per_second': 4.192, 'eval_steps_per_second': 0.524, 'epoch': 5.0}


In [24]:
print("Trainable weights:")
total_trainable_params = 0
for name, param in model.named_parameters():
    if param.requires_grad:
        num_params = param.numel()
        print(f"{name}: {num_params}")
        total_trainable_params += num_params

print(f"Total Trainable Parameters: {total_trainable_params}")

Trainable weights:
wav2vec2.masked_spec_embed: 1024
wav2vec2.feature_projection.layer_norm.weight: 512
wav2vec2.feature_projection.layer_norm.bias: 512
wav2vec2.feature_projection.projection.weight: 524288
wav2vec2.feature_projection.projection.bias: 1024
wav2vec2.encoder.pos_conv_embed.conv.bias: 1024
wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0: 128
wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1: 8388608
wav2vec2.encoder.layer_norm.weight: 1024
wav2vec2.encoder.layer_norm.bias: 1024
wav2vec2.encoder.layers.0.attention.k_proj.weight: 1048576
wav2vec2.encoder.layers.0.attention.k_proj.bias: 1024
wav2vec2.encoder.layers.0.attention.v_proj.weight: 1048576
wav2vec2.encoder.layers.0.attention.v_proj.bias: 1024
wav2vec2.encoder.layers.0.attention.q_proj.weight: 1048576
wav2vec2.encoder.layers.0.attention.q_proj.bias: 1024
wav2vec2.encoder.layers.0.attention.out_proj.weight: 1048576
wav2vec2.encoder.layers.0.attention.out_proj.bias: 1024
wav2