# ASR for Farsi

### Import required libraries

In [None]:
!pip install datasets transformers librosa torchaudio hazm num2fawords jiwer

In [2]:
import re
import torch
import torchaudio
import torchaudio.transforms as transforms
import librosa
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, DataCollatorWithPadding
from datasets import load_dataset, Audio
from hazm import Normalizer

### Load dataset from HuggingFace

In [3]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
common_voice_train = load_dataset("mozilla-foundation/common_voice_6_1", "fa", split="train")
common_voice_test = load_dataset("mozilla-foundation/common_voice_6_1", "fa", split="test")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.8k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.29k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/39.9k [00:00<?, ?B/s]

The repository for mozilla-foundation/common_voice_6_1 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/mozilla-foundation/common_voice_6_1.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/8.88G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating other split: 0 examples [00:00, ? examples/s]

Generating invalidated split: 0 examples [00:00, ? examples/s]

### Remove Unnecessary Columns

In [5]:
columns_to_remove = ['down_votes', 'gender', 'locale', 'segment', 'up_votes', 'accent', 'age', 'client_id']

print("Train Set Columns (Before Drop):", common_voice_train.column_names)
print("Test Set Columns (Before Drop):", common_voice_test.column_names)

common_voice_train = common_voice_train.remove_columns(columns_to_remove)
common_voice_test = common_voice_test.remove_columns(columns_to_remove)

print("Train Set Columns (After Drop):", common_voice_train.column_names)
print("Test Set Columns (After Drop):", common_voice_test.column_names)

Train Set Columns (Before Drop): ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment']
Test Set Columns (Before Drop): ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment']
Train Set Columns (After Drop): ['path', 'audio', 'sentence']
Test Set Columns (After Drop): ['path', 'audio', 'sentence']


### Filter Audio

In [6]:
def filter_train_audio(batch):
    duration = len(batch["audio"]["array"]) / batch["audio"]["sampling_rate"]
    return 4.0 <= duration <= 6.0

def filter_test_audio(batch):
    duration = len(batch["audio"]["array"]) / batch["audio"]["sampling_rate"]
    return duration < 15.0

print("Common Voice Train Length (Before Filter):", len(common_voice_train))
print("Common Voice Test Length (Before Filter):", len(common_voice_test))

common_voice_train = common_voice_train.filter(filter_train_audio)
common_voice_test = common_voice_test.filter(filter_test_audio)

print("Common Voice Train Length (After Filter):", len(common_voice_train))
print("Common Voice Test Length (After Filter):", len(common_voice_test))

Common Voice Train Length (Before Filter): 7593
Common Voice Test Length (Before Filter): 5213


Filter:   0%|          | 0/7593 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5213 [00:00<?, ? examples/s]

Common Voice Train Length (After Filter): 2217
Common Voice Test Length (After Filter): 5212


### Preprocessing

In [7]:
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�", "&",
    "#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬",'ٔ', ",", "?",
    ".", "!", "-", ";", ":",'"',"“", "%", "‘", "”", "=", "–", "…", "_", "”", '“', '„',
    'ā', 'š'
]


chars_to_mapping = {
    'ك': 'ک', 'دِ': 'د', 'بِ': 'ب', 'زِ': 'ز', 'ذِ': 'ذ', 'شِ': 'ش', 'سِ': 'س', 'ى': 'ی',
    'ي': 'ی', 'أ': 'ا', 'ؤ': 'و', "ے": "ی", "ۀ": "ه", "ﭘ": "پ", "ﮐ": "ک", "ﯽ": "ی",
    "ﺎ": "ا", "ﺑ": "ب", "ﺘ": "ت", "ﺧ": "خ", "ﺩ": "د", "ﺱ": "س", "ﻀ": "ض", "ﻌ": "ع",
    "ﻟ": "ل", "ﻡ": "م", "ﻢ": "م", "ﻪ": "ه", "ﻮ": "و", 'ﺍ': "ا", 'ة': "ه",
    'ﯾ': "ی", 'ﯿ': "ی", 'ﺒ': "ب", 'ﺖ': "ت", 'ﺪ': "د", 'ﺮ': "ر", 'ﺴ': "س", 'ﺷ': "ش",
    'ﺸ': "ش", 'ﻋ': "ع", 'ﻤ': "م", 'ﻥ': "ن", 'ﻧ': "ن", 'ﻭ': "و", 'ﺭ': "ر", "ﮔ": "گ",

    # "ها": "  ها", "ئ": "ی",
    "۱۴ام": "۱۴ ام",

    "a": " ای ", "b": " بی ", "c": " سی ", "d": " دی ", "e": " ایی ", "f": " اف ",
    "g": " جی ", "h": " اچ ", "i": " آی ", "j": " جی ", "k": " کی ", "l": " ال ",
    "m": " ام ", "n": " ان ", "o": " او ", "p": " پی ", "q": " کیو ", "r": " آر ",
    "s": " اس ", "t": " تی ", "u": " یو ", "v": " وی ", "w": " دبلیو ", "x": " اکس ",
    "y": " وای ", "z": " زد ",
    "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
}

In [8]:
normalizer = Normalizer()

def preprocess_text(text):
    text = normalizer.normalize(text)
    text = text.lower()
    escaped_chars_to_ignore = [re.escape(char) for char in chars_to_ignore]
    text = re.sub(f"[{''.join(escaped_chars_to_ignore)}]", '', text)
    for old, new in chars_to_mapping.items():
        text = text.replace(old, new)
    return text

common_voice_train = common_voice_train.map(lambda batch: {"sentence": preprocess_text(batch["sentence"])})
common_voice_test = common_voice_test.map(lambda batch: {"sentence": preprocess_text(batch["sentence"])})

Map:   0%|          | 0/2217 [00:00<?, ? examples/s]

Map:   0%|          | 0/5212 [00:00<?, ? examples/s]

In [11]:
for i in range(3):
  print("Data", i)
  print(common_voice_train[i]["sentence"])
  print(common_voice_test[i]["sentence"])

Data 0
چه جوری آخه برانکو با دست خالی تیمشو برد فینال
از هم جداشدن خیلی سخته
Data 1
اون میوه هات رو بردار
بله مطمئن باشید هستم
Data 2
 خوبه که جامعه پزشکی
تقریبا صدو پنجاه گز دورتر از جاده


### Create Character Dictionary

In [13]:
def extract_unique_chars(dataset):
    all_text = " ".join(dataset["sentence"])
    return set(all_text.replace(" ", "")) # Do not consider space

unique_chars_train = extract_unique_chars(common_voice_train)
unique_chars_test = extract_unique_chars(common_voice_test)

# Merged
unique_chars = unique_chars_train.union(unique_chars_test)

print(f"Total unique characters: {len(unique_chars)}")

Total unique characters: 35


In [14]:
unique_chars.add("|")
unique_chars.add("<s>")
unique_chars.add("</s>")
unique_chars.add("<unk>")

print(f"Unique characters: {unique_chars}")

Unique characters: {'ر', 'ح', 'ژ', 'ث', 'ج', 'ق', 'م', 'گ', 'و', 'ص', 'ن', 'ض', 'ز', 'چ', 'ئ', 'غ', 'ه', '<unk>', 'ذ', '</s>', 'ء', 'پ', 'ع', 'ظ', 'ت', 'ف', 'ط', 'ب', 'خ', 'د', 'آ', 'ی', 'س', 'ل', '|', 'ک', 'ش', 'ا', '<s>'}


### Resample audio to 16kHz

In [15]:
def resample_audio(batch):
    waveform = torch.tensor(batch["audio"]["array"], dtype=torch.float32).unsqueeze(0)
    resampled_waveform = transforms.Resample(orig_freq=48000, new_freq=16000)(waveform)
    batch["audio"]["array"] = resampled_waveform.squeeze(0).numpy()
    batch["audio"]["sampling_rate"] = 16000
    return batch

def resample_audio_librosa(batch):
    y, sr = librosa.load(batch["audio"]["path"], sr=48000)
    y_resampled = librosa.resample(y, orig_sr=48000, target_sr=16000)
    batch["audio"]["array"] = y_resampled
    batch["audio"]["sampling_rate"] = 16000
    return batch


common_voice_train = common_voice_train.cast_column("audio", Audio(sampling_rate=48000))
common_voice_train = common_voice_train.map(resample_audio_librosa)
common_voice_test = common_voice_test.cast_column("audio", Audio(sampling_rate=48000))
common_voice_test = common_voice_test.map(resample_audio_librosa)

Map:   0%|          | 0/2217 [00:00<?, ? examples/s]

Map:   0%|          | 0/5212 [00:00<?, ? examples/s]

### Tokenization and Feature Extraction + Processor

In [19]:
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("facebook/wav2vec2-base-960h")

feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    do_normalize=True,
    return_attention_mask=True
)

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

### Data Collator

In [20]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

### Evaluation Metric

In [21]:
from jiwer import wer

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = torch.argmax(pred_logits, dim=-1)
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    wer_metric = wer(label_str, pred_str)
    return {"wer": wer_metric}

### Load Model

In [28]:
from transformers import Wav2Vec2ForCTC, TrainingArguments, Trainer

model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-xlsr-53")
model.freeze_feature_extractor()

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
