In [None]:
!pip install datasets



## Connect to the Hugging Face Hub

In [None]:
from google.colab import drive
drive.mount('/content/drive')
!pip install huggingface_hub

import huggingface_hub
huggingface_hub.login(token="hf_token")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## Load dataset

In [None]:
from datasets import load_dataset, load_metric
# Load the "fa" configuration of Common Voice 6.1, one call per split
# (train first, then test).
common_voice_train, common_voice_test = (
    load_dataset("mozilla-foundation/common_voice_6_1", "fa", split=split_name)
    for split_name in ("train", "test")
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## Time filtering

In [None]:
def filter_audio_files(dataset, min_length, max_length):
    """Keep only the examples whose clip duration lies in a closed interval.

    Args:
        dataset: Object exposing ``filter(predicate)``; each example must provide
            ``example["audio"]["array"]`` and ``example["audio"]["sampling_rate"]``.
        min_length: Shortest accepted duration in seconds (inclusive).
        max_length: Longest accepted duration in seconds (inclusive).

    Returns:
        Whatever ``dataset.filter`` returns for the duration predicate.
    """
    def _within_bounds(example):
        clip = example["audio"]
        # Duration in seconds = number of samples / samples per second.
        duration_s = len(clip["array"]) / clip["sampling_rate"]
        return min_length <= duration_s <= max_length

    return dataset.filter(_within_bounds)

In [None]:
# Training keeps clips of 4 to 6 seconds; the test split accepts clips of up to 15 seconds.
common_voice_train_filtered = filter_audio_files(
    common_voice_train,
    min_length=4,
    max_length=6,
)
common_voice_test_filtered = filter_audio_files(
    common_voice_test,
    min_length=0,
    max_length=15,
)

In [None]:
# Report how many examples remain in each split after the duration filter.
print(f"Number of training files after filtering: {len(common_voice_train_filtered)}")
print(f"Number of test files after filtering: {len(common_voice_test_filtered)}")

Number of training files after filtering: 2217
Number of test files after filtering: 5212


In [None]:
# From this cell on, the original names refer to the duration-filtered splits;
# the unfiltered datasets are no longer reachable through these names.
common_voice_train = common_voice_train_filtered
common_voice_test = common_voice_test_filtered

In [None]:
# Sanity check: the rebound names should report the same sizes as the filtered splits.
print(f"Number of training files after filtering: {len(common_voice_train)}")
print(f"Number of test files after filtering: {len(common_voice_test)}")

Number of training files after filtering: 2217
Number of test files after filtering: 5212


In [None]:
# Show the training split's columns and row count before any column is dropped.
print(common_voice_train)

Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
    num_rows: 2217
})


## Remove redundant information from the dataset

In [None]:
# Drop the same metadata columns from both splits; 'path', 'audio' and 'sentence' remain.
common_voice_train, common_voice_test = (
    split.remove_columns(
        ["down_votes", "gender", "locale", "segment", "up_votes" , "accent", "age", "client_id"]
    )
    for split in (common_voice_train, common_voice_test)
)

In [None]:
# Confirm which columns are left after dropping the metadata columns.
print(common_voice_train)

Dataset({
    features: ['path', 'audio', 'sentence'],
    num_rows: 2217
})


## Data preprocessing

In [None]:
!pip install hazm



In [None]:
from hazm import Normalizer
chars_to_mapping = {
    'ى': 'ی', 'ك': 'ک', 'أ': 'ا', 'ؤ': 'و', 'إ': 'ا', 'ة': 'ه', 'ۀ': 'ه', 'ہ': 'ه',
    'ۂ': 'ه', 'ے': 'ی', 'ۓ': 'ی', 'ې': 'ی', 'ي': 'ی', 'آ': 'ا'
}

chars_to_ignore = [
    '!', '#', '�', '’', "'", '’', '%', '’’', ':', ';', '-', '!', '.', '?', ',', '؟', '؟', '-', '!', '.', '?', ',', 'ٔ', '٬', 'ٔ', '؛', '(', ')', '،', '«', '»',
    ';', ':', '”', '’‘', '%', '‘‘', '=', '–', '…', '_', '‘‘', '‘', '„', 'ā', 'š'
]

# Convert the list to a set: duplicates collapse and membership tests are cheap.
# The cleaning loops test one character at a time, so only single-character
# entries can ever match; multi-character entries in the list have no effect.
chars_to_ignore = set(chars_to_ignore)
# hazm normalizer, applied to every transcript before the character mapping.
normalizer = Normalizer()

In [None]:
import re
from collections import defaultdict


In [None]:
# Character -> occurrence count, accumulated over the cleaned train and test transcripts.
unique_chars = defaultdict(int)

In [None]:
def _clean_train_sentence(sentence):
    """Return the cleaned transcript as a ``{"sentence": ...}`` column update.

    Applies hazm normalization, then the character mapping, then removes every
    character listed in ``chars_to_ignore``.
    """
    sentence = normalizer.normalize(sentence)
    for src_char, dst_char in chars_to_mapping.items():
        sentence = sentence.replace(src_char, dst_char)
    return {"sentence": ''.join(char for char in sentence if char not in chars_to_ignore)}


# `common_voice_train[i]` builds a fresh dict on every access, so assigning to
# `common_voice_train[i]['sentence']` never reaches the dataset. `map` writes the
# cleaned text back; `input_columns` passes only the transcript to the function,
# so the audio column is not handed to it.
common_voice_train = common_voice_train.map(_clean_train_sentence, input_columns=["sentence"])

# Count characters from the stored (cleaned) transcripts so the vocabulary
# matches exactly what the tokenizer will later see.
for text in common_voice_train["sentence"]:
    for char in text:
        unique_chars[char] += 1

In [None]:
def _clean_test_sentence(sentence):
    """Return the cleaned transcript as a ``{"sentence": ...}`` column update.

    Applies hazm normalization, then the character mapping, then removes every
    character listed in ``chars_to_ignore``.
    """
    sentence = normalizer.normalize(sentence)
    for src_char, dst_char in chars_to_mapping.items():
        sentence = sentence.replace(src_char, dst_char)
    return {"sentence": ''.join(char for char in sentence if char not in chars_to_ignore)}


# `common_voice_test[i]` builds a fresh dict on every access, so assigning to
# `common_voice_test[i]['sentence']` never reaches the dataset. `map` writes the
# cleaned text back; `input_columns` passes only the transcript to the function,
# so the audio column is not handed to it.
common_voice_test = common_voice_test.map(_clean_test_sentence, input_columns=["sentence"])

# Count characters from the stored (cleaned) transcripts so the vocabulary
# also covers characters that only occur in the test split.
for text in common_voice_test["sentence"]:
    for char in text:
        unique_chars[char] += 1

In [None]:
# Register the special tokens alongside the observed characters, then freeze
# the vocabulary as a sorted list of symbols.
unique_chars.update({'<s>': 1, '</s>': 1, '|': 1, '<unk>': 1})
vocab = sorted(unique_chars)
print("تعداد حروف متمایز:", len(vocab))
print(vocab)


تعداد حروف متمایز: 64
[' ', '"', '&', '</s>', '<s>', '<unk>', 'A', 'B', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'M', 'Q', 'S', 'T', 'U', 'e', 'm', 'n', 'o', 'u', 'y', '|', 'ء', 'ئ', 'ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ل', 'م', 'ن', 'ه', 'و', 'پ', 'چ', 'ژ', 'ک', 'گ', 'ی', '\u200c', 'ﯾ', 'ﯿ']


In [None]:
!pip install torchaudio librosa transformers



In [None]:
import torchaudio
import librosa
import transformers
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor
import os

## Making a dictionary

In [None]:
import json
# Give every symbol its position in the sorted vocabulary as its id, then
# persist the mapping as human-readable UTF-8 JSON for the tokenizer.
vocab_dict = dict(zip(vocab, range(len(vocab))))
with open('vocab.json', 'w', encoding='utf-8') as vocab_file:
    vocab_file.write(json.dumps(vocab_dict, ensure_ascii=False, indent=4))

## Resampling

In [None]:
def resample_audio(example, target_sr=16000):
    """Resample one example's waveform to ``target_sr`` Hz.

    Args:
        example: Mapping with ``example["audio"]["array"]`` (the waveform) and
            ``example["audio"]["sampling_rate"]`` (its rate in Hz).
        target_sr: Desired sampling rate in Hz. Defaults to 16000.

    Returns:
        The same ``example``. Its ``audio`` entry is replaced by a copy holding
        the resampled waveform and the new rate; when the clip is already at
        ``target_sr`` the example is returned untouched.
    """
    audio = example["audio"]
    # Use the clip's own rate instead of assuming every file is 48 kHz.
    source_sr = audio["sampling_rate"]
    if source_sr != target_sr:
        example["audio"] = {
            **audio,
            "array": librosa.resample(audio["array"], orig_sr=source_sr, target_sr=target_sr),
            "sampling_rate": target_sr,
        }
    return example

from datasets import Audio

# Decoding follows the sampling rate declared on the audio column. Audio that is
# resampled inside `map` is re-encoded under the old declaration and would be
# decoded at that rate again (presumably 48 kHz for Common Voice -- confirm with
# `common_voice_train.features`). Casting the column makes 16 kHz the declared
# rate, so every later access yields 16 kHz audio.
common_voice_train = common_voice_train.cast_column("audio", Audio(sampling_rate=16000))
common_voice_test = common_voice_test.cast_column("audio", Audio(sampling_rate=16000))


Map:   0%|          | 0/2217 [00:00<?, ? examples/s]

Map:   0%|          | 0/5212 [00:00<?, ? examples/s]

## Tokenization and feature extraction

In [None]:
# Directory that receives the tokenizer, feature extractor and processor files.
tokenizer_save_path = "./wav2vec2-base-mine/"

In [None]:
# Character-level CTC tokenizer built from vocab.json. "|" is the word delimiter;
# "<pad>" is not a key of vocab.json (see the cell that builds `vocab`), so the
# tokenizer has to register it itself.
tokenizer = Wav2Vec2CTCTokenizer("vocab.json", unk_token="<unk>", pad_token="<pad>", word_delimiter_token="|")
tokenizer.save_pretrained(tokenizer_save_path)

('./wav2vec2-base-mine/tokenizer_config.json',
 './wav2vec2-base-mine/special_tokens_map.json',
 './wav2vec2-base-mine/vocab.json',
 './wav2vec2-base-mine/added_tokens.json')

In [None]:
# Feature extractor for mono 16 kHz waveforms: pads with 0.0, normalizes the
# input and returns an attention mask alongside the input values.
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
feature_extractor.save_pretrained(tokenizer_save_path)

['./wav2vec2-base-mine/preprocessor_config.json']

In [None]:
# Bundle the feature extractor and tokenizer into one processor and save it
# next to them so the directory can be reloaded as a unit.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.save_pretrained(tokenizer_save_path)

[]

In [None]:
# Show the training split's columns right before feature extraction.
print(common_voice_train)

Dataset({
    features: ['path', 'audio', 'sentence'],
    num_rows: 2217
})


## Preprocess examples and create the data collator

In [None]:
def speech_file_to_array_fn(batch):
    """Turn one dataset example into model inputs and CTC label ids.

    Reads ``batch["audio"]`` and ``batch["sentence"]`` and adds ``input_values``,
    ``attention_mask`` and ``labels`` using the notebook-level ``processor``.

    Args:
        batch: A single example with ``audio`` (``array`` and ``sampling_rate``)
            and ``sentence`` entries.

    Returns:
        The same ``batch`` with the three new entries added.

    Raises:
        Exception: Any processing error is logged with the example's path and
            re-raised, so a failing example cannot slip through half-processed.
    """
    import logging

    audio = batch["audio"]
    target_sr = processor.feature_extractor.sampling_rate
    try:
        waveform = audio["array"]
        # Make sure the waveform really is at the rate the feature extractor
        # expects instead of merely labelling it as such.
        if audio["sampling_rate"] != target_sr:
            waveform = librosa.resample(waveform, orig_sr=audio["sampling_rate"], target_sr=target_sr)
        inputs = processor(waveform, sampling_rate=target_sr, return_tensors="pt", padding=True)
        batch["input_values"] = inputs.input_values[0]
        batch["attention_mask"] = inputs.attention_mask[0]
        batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids
    except Exception:
        # Log only the path: the full example contains the whole waveform.
        logging.getLogger(__name__).exception("Error processing example %s", batch.get("path"))
        raise
    return batch

In [None]:
# Run feature extraction and label encoding over both splits; the raw audio and
# transcript columns are dropped from the results.
common_voice_train_processed = common_voice_train.map(
    speech_file_to_array_fn,
    remove_columns=["audio", "sentence"],
)
common_voice_test_processed = common_voice_test.map(
    speech_file_to_array_fn,
    remove_columns=["audio", "sentence"],
)

Map:   0%|          | 0/2217 [00:00<?, ? examples/s]

Map:   0%|          | 0/5212 [00:00<?, ? examples/s]

In [None]:
import torch
from transformers import Wav2Vec2Processor
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """Pad a list of processed examples into one CTC training batch.

    Audio inputs and label ids are padded separately because they have
    different lengths and padding rules. Padded label positions are set to
    -100 (see ``__call__``).
    """

    # Processor whose feature extractor pads the audio and whose tokenizer pads the labels.
    processor: Wav2Vec2Processor
    # Padding strategy forwarded to both pad calls.
    padding: Union[bool, str] = True
    # Optional maximum length for the audio inputs.
    max_length: Optional[int] = None
    # Optional maximum length for the label sequences.
    max_length_labels: Optional[int] = None
    # Optional multiple to pad the audio inputs to.
    pad_to_multiple_of: Optional[int] = None
    # Optional multiple to pad the label sequences to.
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` into padded tensors.

        Args:
            features: Examples carrying ``input_values`` and ``labels``.

        Returns:
            The padded input batch with an added ``labels`` tensor in which
            padded positions are replaced by -100.
        """
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Pad the labels with the tokenizer directly rather than through the
        # deprecated `as_target_processor()` context manager.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # Positions that exist only because of padding get -100.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels

        return batch


In [None]:
# Collator used by the Trainer to pad each batch of processed examples.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
!pip install jiwer



In [None]:
from datasets import load_metric

# The "wer" metric ships as a loading script. Without this flag the call stops
# on an interactive y/N prompt, which blocks an unattended "Run All".
# NOTE(review): this executes code downloaded from the Hub -- keep only if that is acceptable.
wer_metric = load_metric("wer", trust_remote_code=True)

  wer_metric = load_metric("wer")


Downloading builder script:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

The repository for wer contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/wer.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


## Compute metrics

In [None]:
import numpy as np
def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Args:
        pred: Object with ``predictions`` (logits; argmax is taken over the last
            axis) and ``label_ids`` (label ids in which -100 marks padding).

    Returns:
        ``{"wer": <value returned by wer_metric.compute>}``.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Swap the -100 padding marker for the pad token id on a copy, leaving the
    # caller's `label_ids` array untouched.
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    # References are plain label sequences, not CTC output: grouping repeated
    # tokens would wrongly merge genuine double letters in the ground truth.
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

## Define and prepare the Wav2Vec2ForCTC model

In [None]:
from transformers import Wav2Vec2ForCTC

# Load the pretrained base checkpoint with a CTC head sized to this notebook's
# vocabulary; special-token ids are taken from the tokenizer so model and
# tokenizer agree.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    gradient_checkpointing=True,
    ctc_loss_reduction="mean",
    attention_dropout=0.1,
    bos_token_id=processor.tokenizer.bos_token_id,
    eos_token_id=processor.tokenizer.eos_token_id,
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer.get_vocab())
)
# `freeze_feature_extractor()` is deprecated; `freeze_feature_encoder()` is its
# replacement for freezing the feature encoder's weights.
model.freeze_feature_encoder()


config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!pip install accelerate -U



In [None]:
!pip install transformers[torch] -U



In [None]:
!pip show accelerate
!pip show transformers

Name: accelerate
Version: 0.31.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 
Name: transformers
Version: 4.42.3
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 


In [None]:
from transformers import TrainingArguments

# Training configuration. The recorded run finished at global_step=740, so the
# previous fixed 1000-step warmup never completed and the learning rate never
# reached `learning_rate`. A warmup ratio scales with the actual run length.
training_args = TrainingArguments(
    output_dir="./wav2vec2-large-xlsr-53-mine/",
    group_by_length=True,
    per_device_train_batch_size=6,
    evaluation_strategy="steps",
    num_train_epochs=2,
    fp16=True,
    save_steps=250,
    eval_steps=500,
    logging_steps=100,
    learning_rate=1e-4,
    weight_decay=0.005,
    warmup_ratio=0.1,
    save_total_limit=2,
)




In [None]:
from transformers import TrainerCallback, TrainerState, TrainerControl
class SaveModelCallback(TrainerCallback):
    """Trainer callback that writes an extra checkpoint at the end of every epoch."""

    def on_epoch_end(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        """Save the ``model`` and ``tokenizer`` keyword arguments into a per-epoch directory.

        The directory is ``<args.output_dir>/checkpoint-<state.epoch>``.
        """
        epoch = state.epoch
        output_dir = f"{args.output_dir}/checkpoint-{epoch}"
        # Model first, then tokenizer, both into the same directory.
        for component_key in ("model", "tokenizer"):
            kwargs[component_key].save_pretrained(output_dir)
        print(f"Model saved at epoch {epoch} to {output_dir}")


In [None]:
from transformers import Trainer

# Wire model, collator, metrics and datasets into the Trainer.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=common_voice_train_processed,
    eval_dataset=common_voice_test_processed,
    # The feature extractor, not the text tokenizer, is passed as `tokenizer`;
    # SaveModelCallback saves whatever it receives under that keyword.
    tokenizer=processor.feature_extractor,
    # The callback class itself is passed here, not an instance.
    callbacks=[SaveModelCallback]
)

In [None]:
from accelerate import Accelerator

# Create an Accelerator, clear its shared state through a private method, then
# create a fresh one. `trainer` was already constructed above and this
# `accelerator` is not referenced again in the visible cells.
# NOTE(review): presumably a workaround for stale accelerate state in a
# long-lived Colab session -- confirm it is still needed.
accelerator = Accelerator()

accelerator.state._reset_state()

accelerator = Accelerator()

## Train model

In [None]:
# Start fine-tuning with the Trainer configured above; as the last expression
# of the cell, its return value is displayed as the cell output.
trainer.train()



Step,Training Loss,Validation Loss,Wer
500,13.3408,21.938522,0.999006




Model saved at epoch 1.0 to ./wav2vec2-large-xlsr-53-mine//checkpoint-1.0




Model saved at epoch 2.0 to ./wav2vec2-large-xlsr-53-mine//checkpoint-2.0


TrainOutput(global_step=740, training_loss=21.450221953520902, metrics={'train_runtime': 3023.8501, 'train_samples_per_second': 1.466, 'train_steps_per_second': 0.245, 'total_flos': 5.832002736862111e+17, 'train_loss': 21.450221953520902, 'epoch': 2.0})

In [None]:
# Count all model weights, then only those that still receive gradients.
total_params = sum(map(lambda weight: weight.numel(), model.parameters()))
trainable_params = sum(
    weight.numel()
    for weight in filter(lambda weight: weight.requires_grad, model.parameters())
)

print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")

Total parameters: 94421697
Trainable parameters: 90221249
