# Download Dataset

The custom dataset collected by the "[speech2text](https://github.com/shenasa-ai/speech2text)" repository comprises audio recordings sourced from radio archives, gathered using a dedicated data crawler script. This dataset was likely curated to diversify training data for speech recognition models, potentially offering unique linguistic patterns and contexts not found in other publicly available datasets like Common Voice.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Download all_wav.zip (the audio archive) from Google Drive by file id, then
# unpack it quietly into the working directory.
!gdown 1jyvhdZHn0s5Owkr21k5Ff-c96sIQLtEu
!unzip -qq all_wav.zip
# Download Hamtech_VoiceDataset_Slice1.csv (the transcript table read in the preprocessing step).
!gdown 1vqvn0F0YYhEFbzLgP9wJ36vyInUnO5b5

Downloading...
From (original): https://drive.google.com/uc?id=1jyvhdZHn0s5Owkr21k5Ff-c96sIQLtEu
From (redirected): https://drive.google.com/uc?id=1jyvhdZHn0s5Owkr21k5Ff-c96sIQLtEu&confirm=t&uuid=8200fc56-76a8-45f3-817d-9b853928e9ad
To: /content/all_wav.zip
100% 2.48G/2.48G [00:32<00:00, 76.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=1vqvn0F0YYhEFbzLgP9wJ36vyInUnO5b5
To: /content/Hamtech_VoiceDataset_Slice1.csv
100% 2.87M/2.87M [00:00<00:00, 24.2MB/s]


# Preprocessing

In [None]:
# Install necessary libraries
!pip install datasets==2.10.0 --quiet
!pip install transformers --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Import necessary libraries
import pandas as pd
from datasets import Dataset
from functools import reduce
import re
import string
from transformers import Wav2Vec2CTCTokenizer,Wav2Vec2FeatureExtractor,Wav2Vec2Processor
import json
import librosa
import warnings
from os import path,system,mkdir


In [None]:
# Keep the 6000 rows with the highest transcription confidence.
transcripts = (
    pd.read_csv('Hamtech_VoiceDataset_Slice1.csv')
    .sort_values(by='confidence_level', ascending=False)
    .head(6000)
)
ds = Dataset.from_pandas(transcripts)
transcripts.head()

Unnamed: 0,wav_filename,wav_filesize,transcript,confidence_level
8029,./all_wav/Varzesh_ShabHayeNarenji21_198.wav,112420,گل اول را به زیبایی با ضربه سر,0.927557
680,./all_wav/Tehran_SayeRoshan18_127.wav,133412,شهر ورزنه را تبدیل به یک شهر,0.927557
20490,./all_wav/Varzesh_BaharVarzesh13_12.wav,134852,و یک کلمه هم وزن توماس مولر,0.927557
4824,./all_wav/Varzesh_PanjareirooBeShab1_66.wav,176164,آرزوی ورزشی شخصی و یا آرزوی ورزشی کلی,0.927557
18783,./all_wav/Varzesh_SakooyeMann13_15.wav,124996,صلی الله علیه و آله و سلم فرمودند,0.927557


In [None]:
transcripts.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6000 entries, 8029 to 21269
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   wav_filename      6000 non-null   object 
 1   wav_filesize      6000 non-null   int64  
 2   transcript        6000 non-null   object 
 3   confidence_level  6000 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 234.4+ KB


In [None]:
# Take a look at unique letters in our dataset.
# set().union(*strings) collects every character in one pass; it also yields a
# (possibly empty) set when there are zero or one transcripts, where a bare
# reduce() would raise or hand back a plain string.
present_chars = set().union(*ds['transcript'])
# Sorted so the printed list is the same on every run (set order is not).
for char in sorted(present_chars):
  print(char, end=",")

 ,ء,۶,i,ج,ث,ح,۷,e,ب,ه,b,گ,.,ق,ظ,۳,‏,ش,آ,س,۱,r,:,۲,۰,*,a,۵,ص,۸,پ,چ,ً,s,ز,ل,م,أ,n,ع,۹,غ,ض,ا,c,و,ی,د,ک,ف,ذ,ئ,ر,ژ,w,خ,‌,۴,u,ن,ت,ط,

In [None]:
# Character normalization table used by multiple_replace().
# Keys are variant spellings found in Arabic-script text; values are the single
# letter each variant is rewritten to. The keys visible here fall into a few
# groups:
#   * positional/presentation glyphs of a letter (e.g. 'ﺑ', 'ﺒ' -> 'ب'),
#   * Arabic letters replaced by the form used in this corpus
#     ('ك' -> 'ک', 'ي'/'ى' -> 'ی', 'ة' -> 'ه', 'ؤ' -> 'و', 'أ' -> 'ا', 'ئ' -> 'ی'),
#   * a letter followed by a kasra mark, reduced to the bare letter (e.g. 'بِ' -> 'ب'),
#   * '٥' (Arabic-Indic digit five) rewritten to 'ه'
#     (NOTE(review): presumably because it is used as a look-alike — confirm).
# Several keys are written more than once; Python keeps the last occurrence,
# and every repeat maps to the same value, so the repeats have no effect.
# NOTE(review): some entries look like identity mappings ('ص': 'ص'); they may
# differ in underlying code points — verify before pruning.
char_mappings = {'ﺁ': 'آ',
                 'أ': 'ا', 'ﺍ': "ا", "ﺎ": "ا",
                 "ﺑ": "ب", 'بِ': 'ب', 'ﺒ': "ب",
                 "ﭘ": "پ",
                 "ﺘ": "ت", 'ﺖ': "ت", 'ﺕ': "ت", 'ﺗ': "ت",
                 'ﺛ': 'ث',
                 'ﺟ': 'ج',
                 'ﭽ': 'چ',
                 'ﺣ': 'ح', 'ﺤ': 'ح',
                 "ﺧ": "خ", 'ﺨ': "خ", 'ﺥ': "خ",
                 'دِ': 'د', "ﺩ": "د", 'ﺪ': "د",
                 'ذِ': 'ذ', 'ﺫ': 'ذ',
                 'ﺮ': "ر", 'ﺭ': "ر",
                 'زِ': 'ز', 'ﺯ': 'ز',
                 'ژ': 'ژ',
                 'سِ': 'س', "ﺱ": "س", 'ﺴ': "س", 'ﺴ': "س",  'ﺲ': "س",  'ﺳ': "س",
                 'شِ': 'ش', 'ﺷ': "ش", 'ﺸ': "ش", 'ش': 'ش',  'ﺶ': 'ش',  'ﺷ': 'ش',
                 'ﺼ': 'ص', 'ص': 'ص',
                 "ﻀ": "ض",
                 'ﻃ': 'ط', 'ط': 'ط',
                 'ظ': 'ظ',
                 "ﻌ": "ع", 'ﻋ': "ع", 'ﻌ': "ع",  'ع': "ع",
                 'ﻏ': 'غ', 'غ': 'غ',
                 'ﻒ':'ف' ,'ﻓ':'ف' , 'ﻔ':'ف' , 'ف':'ف' ,
                 'ق': 'ق', 'ﻖ': 'ق',
                 'ك': 'ک', "ﮐ": "ک", 'ﮐ': 'ک',  'ﮏ': 'ک',  'ﮑ': 'ک',  'ك': 'ک',  'ک': 'ک',
                 "ﮔ": "گ", 'ﮔ': "گ",  'گ': "گ",
                 "ﻟ": "ل", 'ل': "ل",  'ﻠ': "ل",  'ﻝ': "ل",
                 "ﻡ": "م", "ﻢ": "م", 'ﻤ': "م", 'م': "م",  'ﻣ': "م",  'ﻢ': "م",  'ﻤ': "م",  'ﻡ': "م",
                 'ﻥ': "ن", 'ﻧ': "ن", 'ﻥ': "ن", 'ﻦ': "ن",  'ﻧ': "ن",  'ن': "ن",'ﻨ' : "ن",
                 'ﻭ': "و", 'ؤ': 'و', "ﻮ": "و", 'ﻮ': "و",  'ؤ': "و",  'و': "و",  'ﻭ': "و",
                 "ۀ": "ه", "ﻪ": "ه", 'ة': "ه", 'ﻬ': "ه",  'ﻫ': "ه",  '٥': "ه",  'ۀ': "ه",  'ه': "ه",  'ﻩ': "ه",  'ﻪ': "ه",
                 'ى': 'ی', 'ي': 'ی', "ے": "ی", "ﯽ": "ی", 'ﯾ': "ی", 'ﯿ': "ی", 'ﯿ': "ی",  'ﯾ': "ی",  'ی': "ی",  'ئ': "ی",  'ى': "ی",  'ي': "ی",  'ﯽ': "ی",  'ﯼ': "ی",
                }


def multiple_replace(batch, chars_to_mapping):
    """Rewrite the transcript by replacing every mapping key with its value.

    Args:
        batch: A single example (dict-like) holding a string under
            ``'transcript'``. Despite the name, one row is processed per call.
        chars_to_mapping: Dict mapping source substrings (one or more
            characters) to their replacement strings.

    Returns:
        The same ``batch`` object, with ``batch['transcript']`` replaced.
    """
    # An empty mapping would yield the empty pattern, which matches at every
    # position and then fails the dictionary lookup on ''.
    if not chars_to_mapping:
        return batch

    # Regex alternation picks the first alternative that matches, so longer
    # keys must come first; otherwise a one-character key would shadow a
    # multi-character key that starts with it (e.g. letter + diacritic).
    # sorted() is stable, so equal-length keys keep their dictionary order.
    keys_longest_first = sorted(chars_to_mapping, key=len, reverse=True)
    pattern = "|".join(map(re.escape, keys_longest_first))
    batch['transcript'] = re.sub(pattern, lambda m: chars_to_mapping[m.group()], batch['transcript'])
    return batch

ds = ds.map(lambda batch: multiple_replace(batch,char_mappings))

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [None]:
# char_removals = ['ِ','\u200c','(',')','!','،','\u202c','«','…','ٍ','\n','ـ', '\u200f', '%', '*', '&'] + list(string.ascii_letters + string.digits) + [',','/','٪','ُ','-','ّ','\u202b','َ','ٔ', 'ْ', 'ء','"', '\xa0', '“', '\xad', '٬','–', '٫', '؛', '\n',':', '»', '،', '”', '\t', 'ً', '۱', '۲', '۳', '۴', '۵', '۶', '۷', '۸', '٨', '۹', '۰']

# def remove_special_characters(batch,char_removals):
#     chars_to_ignore_regex = f"""[{"".join(char_removals)}]"""
#     batch['transcript'] = re.sub(chars_to_ignore_regex, '', batch['transcript']).lower() + " "
#     return batch

# ds = ds.map(lambda batch: remove_special_characters(batch,char_removals))

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
char_removals = list(string.ascii_letters + string.digits) + ['۱', '۲', '۳', '۴', '۵', '۶', '۷', '۸', '٨', '۹', '۰', '*', 'ء', 'ً', ':', '\u200f', '\u200c']

def remove_special_characters(batch,char_removals):
    """Strip unwanted characters from the transcript and normalise its ending.

    Args:
        batch: A single example (dict-like) holding a string under
            ``'transcript'``.
        char_removals: Iterable of characters to delete from the transcript.

    Returns:
        The same ``batch`` object; ``batch['transcript']`` has the listed
        characters removed, is lower-cased, and ends with one extra space.
    """
    transcript = batch['transcript']
    if char_removals:
        # Escape each entry before building the character class: a raw '-',
        # '^', ']' or '\\' in the list would otherwise be read as a range,
        # a negation, the end of the class, or an escape.
        chars_to_ignore_regex = "[" + "".join(map(re.escape, char_removals)) + "]"
        transcript = re.sub(chars_to_ignore_regex, '', transcript)
    # With nothing to remove, "[]" is not a valid pattern, so the substitution
    # is skipped and only the normalisation below is applied.
    batch['transcript'] = transcript.lower() + " "
    return batch

ds = ds.map(lambda batch: remove_special_characters(batch,char_removals))

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [None]:
# Every distinct character left in the cleaned transcripts. One union call
# replaces the per-row set rebuild and still yields a set when the dataset has
# zero or one rows.
vocab = set().union(*ds['transcript'])
print(vocab)

{' ', 'ج', 'ث', 'ح', 'ب', 'ه', 'گ', '.', 'ق', 'ظ', 'ش', 'آ', 'س', 'ص', 'پ', 'چ', 'ز', 'ل', 'م', 'ع', 'غ', 'ض', 'ا', 'و', 'ی', 'د', 'ک', 'ف', 'ذ', 'ر', 'ژ', 'خ', 'ن', 'ت', 'ط'}


In [None]:
# Number the characters in sorted order. Iterating the raw set would give ids
# that depend on string hashing, which changes between interpreter runs, so a
# regenerated vocab.json would no longer match previously trained weights.
vocab_dict = {v: k for k, v in enumerate(sorted(vocab))}

# The tokenizer is configured with "|" as the word delimiter, so the space
# character hands its id over to "|".
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

# Special tokens take the next free ids.
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

with open('vocab.json', 'w') as vocab_file:
  json.dump(vocab_dict, vocab_file)

print(vocab_dict)

{'ج': 1, 'ث': 2, 'ح': 3, 'ب': 4, 'ه': 5, 'گ': 6, '.': 7, 'ق': 8, 'ظ': 9, 'ش': 10, 'آ': 11, 'س': 12, 'ص': 13, 'پ': 14, 'چ': 15, 'ز': 16, 'ل': 17, 'م': 18, 'ع': 19, 'غ': 20, 'ض': 21, 'ا': 22, 'و': 23, 'ی': 24, 'د': 25, 'ک': 26, 'ف': 27, 'ذ': 28, 'ر': 29, 'ژ': 30, 'خ': 31, 'ن': 32, 'ت': 33, 'ط': 34, '|': 0, '[UNK]': 35, '[PAD]': 36}


In [None]:
len(vocab_dict)

37

In [None]:

tokenizer = Wav2Vec2CTCTokenizer("vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [None]:
def prepare_dataset(batch):
  """Turn one dataset row into model inputs and CTC labels.

  Args:
    batch: A single example with ``'wav_filename'`` (path to the audio file)
      and ``'transcript'`` (cleaned text).

  Returns:
    The same example with three added fields: ``'input_values'`` (processed
    waveform), ``'input_length'`` (number of samples in ``'input_values'``)
    and ``'labels'`` (token ids of the transcript).
  """
  file_path = batch['wav_filename']
  with warnings.catch_warnings():
    # Warnings raised while loading and processing are silenced, as before.
    warnings.simplefilter("ignore")
    # Load as mono at 16 kHz, the rate the feature extractor is configured for.
    speech_array, sampling_rate = librosa.load(file_path,mono=True,sr=16000)

    batch["input_values"] = processor(speech_array, sampling_rate=16000).input_values[0]
    batch["input_length"] = len(batch["input_values"])

    # Tokenize the text with the processor's tokenizer directly; this replaces
    # the deprecated `processor.as_target_processor()` context manager.
    batch["labels"] = processor.tokenizer(batch["transcript"]).input_ids

  return batch

ds = ds.map(prepare_dataset)

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [None]:
# Drop clips of 15 seconds or longer, measured in samples at the extractor's rate.
max_input_length_in_sec = 15
ds = ds.filter(
    lambda num_samples: num_samples < max_input_length_in_sec * processor.feature_extractor.sampling_rate,
    input_columns=["input_length"],
)

Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [None]:
# Hold out 20% for evaluation. The fixed seed keeps the split identical across
# runs, so evaluation numbers stay comparable after a kernel restart.
ds = ds.train_test_split(test_size=0.2, seed=42)
ds

DatasetDict({
    train: Dataset({
        features: ['wav_filename', 'wav_filesize', 'transcript', 'confidence_level', '__index_level_0__', 'input_values', 'input_length', 'labels'],
        num_rows: 4800
    })
    test: Dataset({
        features: ['wav_filename', 'wav_filesize', 'transcript', 'confidence_level', '__index_level_0__', 'input_values', 'input_length', 'labels'],
        num_rows: 1200
    })
})

In [None]:
# Save for later use
ds.save_to_disk("dataset.hf")

Flattening the indices:   0%|          | 0/4800 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/4800 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

# Training

In [None]:
# Install necessary libraries
!pip install datasets --quiet
!pip install transformers[torch] --quiet
!pip install --upgrade accelerate --quiet
!pip install evaluate --quiet
!pip install jiwer --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from datasets import DatasetDict

# Reload the dataset written by the preprocessing section.
ds = DatasetDict.load_from_disk("dataset.hf")

# Keep only the columns the model and collator consume; drop everything else
# from both splits.
model_columns = ['input_values', 'input_length', 'labels']
for split_name in ('train', 'test'):
    unused_columns = [name for name in ds[split_name].column_names if name not in model_columns]
    ds[split_name] = ds[split_name].remove_columns(unused_columns)

  table = cls._concat_blocks(blocks, axis=0)


In [None]:
from transformers import Wav2Vec2CTCTokenizer,Wav2Vec2FeatureExtractor,Wav2Vec2Processor


tokenizer = Wav2Vec2CTCTokenizer("vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [None]:
from IPython.display import Audio

example = ds['train'][0]

print(example["labels"])
print(example["input_values"][0:10])

print(tokenizer.decode(example["labels"]))
Audio(example["input_values"], rate=16000)

[11, 4, 0, 23, 0, 5, 23, 22, 0]
[0.0022627527359873056, 0.0022627527359873056, 0.0022627527359873056, 0.0022627527359873056, 0.0022627527359873056, 0.0022627527359873056, 0.0022627527359873056, 0.0022627527359873056, 0.0022627527359873056, 0.0022627527359873056]
آب و هوا


In [None]:
import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """Pad a list of examples into one batch for CTC training.

    Audio inputs and label ids have different lengths and padding values, so
    they are padded separately: inputs with the processor's feature extractor,
    labels with its tokenizer.
    """
    # Processor providing `.feature_extractor` (audio) and `.tokenizer` (text).
    processor: Wav2Vec2Processor
    # Padding strategy forwarded to both pad() calls.
    padding: Union[bool, str] = True
    # Optional maximum length for the padded audio inputs.
    max_length: Optional[int] = None
    # Optional maximum length for the padded label sequences.
    max_length_labels: Optional[int] = None
    # Optional multiple to pad the audio inputs to.
    pad_to_multiple_of: Optional[int] = None
    # Optional multiple to pad the label sequences to.
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate examples carrying ``input_values`` and ``labels``.

        Args:
            features: Examples, each with ``"input_values"`` and ``"labels"``.

        Returns:
            The padded input batch (PyTorch tensors) with an added ``"labels"``
            tensor in which padded positions are set to -100.
        """
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Pad the audio with the feature extractor.
        batch = self.processor.feature_extractor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Pad the labels with the tokenizer directly; this replaces the
        # deprecated `processor.as_target_processor()` context manager.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # Positions the attention mask marks as padding become -100
        # (presumably so the loss skips them — see the Wav2Vec2ForCTC docs).
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

# Collate three training rows and show the padded tensor shapes.
example_features = [ds["train"][row] for row in (0, 1, 20)]
example_batch = data_collator(example_features)
{name: tensor.shape for name, tensor in example_batch.items()}



{'input_values': torch.Size([3, 77788]),
 'attention_mask': torch.Size([3, 77788]),
 'labels': torch.Size([3, 18])}

In [None]:
root_dir = "./drive/MyDrive/ASR_Colab/" #change it to your own path

In [None]:
from transformers import Wav2Vec2ForCTC
from os import path

# Load weights stored under <root_dir>/model_weights and move the model to the GPU.
# NOTE(review): the following cell assigns `model` again from the base
# "facebook/wav2vec2-large-xlsr-53" checkpoint, so running both cells in order
# discards the model loaded here — run only one of the two.
model = Wav2Vec2ForCTC.from_pretrained(path.join(root_dir ,"model_weights")).to('cuda')

In [None]:
from transformers import Wav2Vec2ForCTC

# Start from the pretrained XLSR-53 checkpoint. The keyword arguments override
# configuration values; the output head is sized from the tokenizer, so its
# weights are newly initialized (see the warning printed below the cell).
# NOTE: this assigns `model`, replacing any model loaded by an earlier cell.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    gradient_checkpointing=True,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=processor.tokenizer.vocab_size
)
# Set after loading rather than passed to from_pretrained().
model.config.ctc_zero_infinity = True
# Presumably stops the convolutional feature encoder from being updated during
# fine-tuning — confirm against the Wav2Vec2ForCTC documentation.
model.freeze_feature_encoder()

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!pip install accelerate -U



In [None]:
from transformers import TrainingArguments

# Training configuration: evaluate and log every 20 steps, checkpoint every 60
# steps, and keep a single checkpoint on disk in `save_dir`.
save_dir = 'wav2vec_cache'
training_args = TrainingArguments(
    output_dir=save_dir,
    group_by_length=False,
    per_device_train_batch_size=1,  # one example per forward pass to keep GPU memory low
    gradient_accumulation_steps=10, # accumulate 10 steps per optimizer update (10 examples per device per update)
    evaluation_strategy="steps",
    num_train_epochs=5,             # more epochs can improve the model but take longer to train
    fp16=True,
    save_steps=60,
    eval_steps=20,
    logging_steps=20,
    learning_rate=3e-4,
    warmup_steps=20,
    save_total_limit=1,
    load_best_model_at_end = True
)

In [None]:
# import evaluate
# import numpy as np

# wer_metric = evaluate.load("wer")
# def compute_metrics(pred):
#     pred_logits = pred.predictions
#     pred_ids = np.argmax(pred_logits, axis=-1)

#     pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

#     pred_str = processor.batch_decode(pred_ids)
#     # we do not want to group tokens when computing the metrics
#     label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
#     # print(label_str)
#     wer = wer_metric.compute(predictions=pred_str, references=label_str)

#     return {"wer": wer}

import evaluate
import numpy as np

wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    """Compute word error rate for one evaluation pass.

    Args:
        pred: Prediction object exposing ``predictions`` (logits) and
            ``label_ids``; ``label_ids`` is modified in place.

    Returns:
        A dict with a single ``"wer"`` entry.
    """
    # Most likely token per frame.
    predicted_ids = np.argmax(pred.predictions, axis=-1)

    # Padded label positions hold -100; put the pad token id back (in place)
    # so the labels can be decoded.
    padding_positions = pred.label_ids == -100
    pred.label_ids[padding_positions] = processor.tokenizer.pad_token_id

    predicted_text = processor.batch_decode(predicted_ids)

    # Labels are decoded without grouping repeated tokens; an empty reference
    # is replaced with "|".
    reference_text = []
    for decoded in processor.batch_decode(pred.label_ids, group_tokens=False):
        reference_text.append(decoded if decoded else "|")

    return {"wer": wer_metric.compute(predictions=predicted_text, references=reference_text)}


In [None]:
from transformers import Trainer
import numpy as np

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    tokenizer=processor.feature_extractor,
)

In [None]:
trainer.train()



Step,Training Loss,Validation Loss,Wer
20,5.0178,3.874823,1.0
40,3.461,3.371347,1.0
60,3.2614,3.329983,1.0
80,3.3219,3.260933,1.0
100,3.3271,3.274812,1.0
120,3.2613,3.221734,1.0
140,3.2786,3.285271,1.082387
160,3.3088,3.150677,1.291121
180,3.1183,3.116758,1.112373
200,3.8225,3.294054,1.056186




In [None]:
# Save to the same <root_dir>/model_weights location the resume cell loads
# from. A bare "model_weights" would be written to the local working directory
# instead, where that cell does not look for it.
trainer.save_model(path.join(root_dir, "model_weights"))