# ASR for Farsi

### Import required libraries

In [None]:
!pip install datasets hazm

In [2]:
import re
from hazm import Normalizer
from datasets import load_dataset

### Load dataset from HuggingFace

In [3]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
common_voice_train = load_dataset("mozilla-foundation/common_voice_6_1", "fa", split="train")
common_voice_test = load_dataset("mozilla-foundation/common_voice_6_1", "fa", split="test")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


### Remove Unnecessary Columns

In [5]:
print(common_voice_train.column_names)
print(common_voice_test.column_names)

['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment']
['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment']


In [6]:
columns_to_remove = ['down_votes', 'gender', 'locale', 'segment', 'up_votes', 'accent', 'age', 'client_id']

common_voice_train = common_voice_train.remove_columns(columns_to_remove)
common_voice_test = common_voice_test.remove_columns(columns_to_remove)

print(common_voice_train.column_names)
print(common_voice_test.column_names)

['path', 'audio', 'sentence']
['path', 'audio', 'sentence']


### Preprocessing

In [7]:
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
    "#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬",'ٔ', ",", "?",
    ".", "!", "-", ";", ":",'"',"“", "%", "‘", "”", "=", "–", "…", "_", "”", '“', '„',
    'ā', 'š'
]


chars_to_mapping = {
    'ك': 'ک', 'دِ': 'د', 'بِ': 'ب', 'زِ': 'ز', 'ذِ': 'ذ', 'شِ': 'ش', 'سِ': 'س', 'ى': 'ی',
    'ي': 'ی', 'أ': 'ا', 'ؤ': 'و', "ے": "ی", "ۀ": "ه", "ﭘ": "پ", "ﮐ": "ک", "ﯽ": "ی",
    "ﺎ": "ا", "ﺑ": "ب", "ﺘ": "ت", "ﺧ": "خ", "ﺩ": "د", "ﺱ": "س", "ﻀ": "ض", "ﻌ": "ع",
    "ﻟ": "ل", "ﻡ": "م", "ﻢ": "م", "ﻪ": "ه", "ﻮ": "و", 'ﺍ': "ا", 'ة': "ه",
    'ﯾ': "ی", 'ﯿ': "ی", 'ﺒ': "ب", 'ﺖ': "ت", 'ﺪ': "د", 'ﺮ': "ر", 'ﺴ': "س", 'ﺷ': "ش",
    'ﺸ': "ش", 'ﻋ': "ع", 'ﻤ': "م", 'ﻥ': "ن", 'ﻧ': "ن", 'ﻭ': "و", 'ﺭ': "ر", "ﮔ": "گ",

    # "ها": "  ها", "ئ": "ی",
    "۱۴ام": "۱۴ ام",

    "a": " ای ", "b": " بی ", "c": " سی ", "d": " دی ", "e": " ایی ", "f": " اف ",
    "g": " جی ", "h": " اچ ", "i": " آی ", "j": " جی ", "k": " کی ", "l": " ال ",
    "m": " ام ", "n": " ان ", "o": " او ", "p": " پی ", "q": " کیو ", "r": " آر ",
    "s": " اس ", "t": " تی ", "u": " یو ", "v": " وی ", "w": " دبلیو ", "x": " اکس ",
    "y": " وای ", "z": " زد ",
    "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
}

In [8]:
normalizer = Normalizer()

def preprocess_text(text):
    text = normalizer.normalize(text)
    text = text.lower()
    escaped_chars_to_ignore = [re.escape(char) for char in chars_to_ignore]
    text = re.sub(f"[{''.join(escaped_chars_to_ignore)}]", '', text)
    for old, new in chars_to_mapping.items():
        text = text.replace(old, new)
    return text

common_voice_train = common_voice_train.map(lambda batch: {"sentence": preprocess_text(batch["sentence"])})
common_voice_test = common_voice_test.map(lambda batch: {"sentence": preprocess_text(batch["sentence"])})

Map:   0%|          | 0/7593 [00:00<?, ? examples/s]

Map:   0%|          | 0/5213 [00:00<?, ? examples/s]

In [9]:
print(common_voice_train[0]["sentence"])
print(common_voice_test[0]["sentence"])

زود باش بگو همه رو دارن میکشن الانه که برسن صدای ضربه های محکم به در خانه به گوش می رسد
از هم جداشدن خیلی سخته


### Create Character Dictionary

In [10]:
def extract_unique_chars(dataset):
    all_text = " ".join(dataset["sentence"])
    return set(all_text)

unique_chars_train = extract_unique_chars(common_voice_train)
unique_chars_test = extract_unique_chars(common_voice_test)

# Merged
unique_chars = unique_chars_train.union(unique_chars_test)

In [11]:
unique_chars.add("|")
unique_chars.add("<s>")
unique_chars.add("</s>")
unique_chars.add("<unk>")

# ' ' and '&' are included too
print(f"Total unique characters (including special tokens): {len(unique_chars)}")
print("Unique characters:", unique_chars)

Total unique characters (including special tokens): 41
Unique characters: {'ج', '|', 'ء', ' ', 'ه', 'م', 'ئ', 'ذ', 'و', 'ف', 'ژ', '</s>', 'ص', 'ث', 'غ', 'ض', 'ک', 'ز', 'ت', 'ر', 'ح', 'پ', '<unk>', 'ل', 'ب', 'خ', 'گ', 'ظ', 'ط', 'ی', '&', 'ا', '<s>', 'چ', 'آ', 'س', 'ق', 'ن', 'ع', 'د', 'ش'}
