In [1]:
import pandas as pd
from datasets import Dataset, Audio

file_path = r"C:\Users\ASUS\Desktop\DARIJA_SPEECH_RECOGNITION\Data Preprocessing\data_organization\train.txt" ##PATH TO YOUR TRAINING DATASET
train_data = pd.read_csv(file_path, sep="|", header=0)  

# Rename columns to match the expected names
train_data.rename(columns={'path': 'audio', 'transcript': 'text'}, inplace=True)

# Take only the first two columns
train_data = train_data.iloc[:, :2]

train_data.rename(columns={'path': 'audio', 'transcript': 'text'}, inplace=True)


##CONVERT PANDAS DATAFRAME TO HUGGING FACE DATASET
train_data_hf = Dataset.from_pandas(train_data)


train_data_hf = train_data_hf.cast_column("audio", Audio())

print(train_data_hf.column_names)
print(train_data_hf.features)

  from .autonotebook import tqdm as notebook_tqdm


['audio', 'text']
{'audio': Audio(sampling_rate=None, mono=True, decode=True, id=None), 'text': Value(dtype='string', id=None)}


In [3]:

file_path = R"C:\Users\ASUS\Desktop\DARIJA_SPEECH_RECOGNITION\Data Preprocessing\data_organization\test.txt" ##PATH TO YOUR TRAINING DATASET
test_data = pd.read_csv(file_path, sep="|", header=0)  


test_data.rename(columns={'path': 'audio', 'transcript': 'text'}, inplace=True)

test_data_hf = Dataset.from_pandas(test_data)
test_data_hf = test_data_hf.cast_column("audio", Audio())
print(test_data.head())

                                               audio  \
0  C:\Users\ASUS\Desktop\dataset\dataset_0\audio_...   
1  C:\Users\ASUS\Desktop\dataset\dataset_0\audio_...   
2  C:\Users\ASUS\Desktop\dataset\dataset_0\audio_...   
3  C:\Users\ASUS\Desktop\dataset\dataset_0\audio_...   
4  C:\Users\ASUS\Desktop\dataset\dataset_0\audio_...   

                                      text  
0     الله مرحبا بها ذاكشي اللي بغينا احنا  
1     غاين كفاش بديتي ريسون كون كيفاش بديت  
2  فيديوز كنضحك واح النهار عجبني راسي لابس  
3  الفوقيه نهار الجمعه صورت فيديو بدا طالع  
4                            فشد النار كعس  


In [4]:
def extract_all_chars(batch):
  all_text = " ".join(batch["text"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

In [5]:
vocab_train = train_data_hf.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=train_data_hf.column_names)
vocab_test = test_data_hf.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=test_data_hf.column_names)

Map: 100%|██████████| 754/754 [00:00<00:00, 124970.57 examples/s]
Map: 100%|██████████| 320/320 [00:00<00:00, 89759.73 examples/s]


In [6]:
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))

In [7]:
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{'ث': 0,
 'ء': 1,
 'و': 2,
 'ز': 3,
 '7': 4,
 'ئ': 5,
 'ى': 6,
 'ش': 7,
 'ت': 8,
 '8': 9,
 '2': 10,
 'ب': 11,
 ' ': 12,
 'ط': 13,
 'س': 14,
 'ا': 15,
 'ظ': 16,
 '0': 17,
 'ح': 18,
 'ع': 19,
 '3': 20,
 '9': 21,
 'ذ': 22,
 'د': 23,
 'ج': 24,
 'ر': 25,
 'ض': 26,
 '1': 27,
 'م': 28,
 'ي': 29,
 'ن': 30,
 'خ': 31,
 'ه': 32,
 'ؤ': 33,
 'ص': 34,
 '4': 35,
 'ق': 36,
 '5': 37,
 'ف': 38,
 'ل': 39,
 'غ': 40,
 'ك': 41}

In [8]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)
vocab_dict

{'ث': 0,
 'ء': 1,
 'و': 2,
 'ز': 3,
 '7': 4,
 'ئ': 5,
 'ى': 6,
 'ش': 7,
 'ت': 8,
 '8': 9,
 '2': 10,
 'ب': 11,
 ' ': 12,
 'ط': 13,
 'س': 14,
 'ا': 15,
 'ظ': 16,
 '0': 17,
 'ح': 18,
 'ع': 19,
 '3': 20,
 '9': 21,
 'ذ': 22,
 'د': 23,
 'ج': 24,
 'ر': 25,
 'ض': 26,
 '1': 27,
 'م': 28,
 'ي': 29,
 'ن': 30,
 'خ': 31,
 'ه': 32,
 'ؤ': 33,
 'ص': 34,
 '4': 35,
 'ق': 36,
 '5': 37,
 'ف': 38,
 'ل': 39,
 'غ': 40,
 'ك': 41,
 '[UNK]': 42,
 '[PAD]': 43}

In [9]:
import json
with open('vocab.json', 'w', encoding='utf-8') as vocab_file:
    json.dump(vocab_dict, vocab_file, ensure_ascii=False, indent=4)

In [10]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [11]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

In [12]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [13]:
train_data_hf[0]["audio"]

{'path': 'C:\\Users\\ASUS\\Desktop\\dataset\\dataset_0\\audio_chunk_0.wav',
 'array': array([1.06811523e-04, 4.57763672e-05, 7.62939453e-05, ...,
        3.08227539e-02, 2.61688232e-02, 2.08587646e-02]),
 'sampling_rate': 16000}

In [14]:
import IPython.display as ipd
import numpy as np
import random

rand_int = random.randint(0, len(train_data_hf)-1)

ipd.Audio(data=train_data_hf[rand_int]["audio"]["array"], autoplay=True, rate=16000)

In [15]:
# Function to extract 'path', 'audio', and 'text' from each example
def process_example(example):
    # Extract 'path' and 'audio' from the dictionary in the 'audio_data' column
    audio_data = example['audio']
    example['path'] = audio_data['path']  # Extract the path
    example['audio'] = audio_data['array']  # Extract the audio waveform (array)
    example['text'] = example['text']  # Ensure the transcription is in the 'text' column
    return example

# Apply the function to each row of the dataset
train_data_hf = train_data_hf.map(process_example)



Map: 100%|██████████| 754/754 [00:04<00:00, 175.30 examples/s]


In [16]:
test_data_hf = test_data_hf.map(process_example)

Map: 100%|██████████| 320/320 [00:01<00:00, 178.92 examples/s]


In [17]:
test_data_hf[0]["path"]

'C:\\Users\\ASUS\\Desktop\\dataset\\dataset_0\\audio_chunk_1500.wav'

In [18]:
if 'processor' in locals() or 'processor' in globals():
    print("Processor is defined!")
else:
    print("Processor is NOT defined!")

Processor is defined!


In [19]:
if 'processor' in locals() or 'processor' in globals():
    print("Processor is defined!")
else:
    print("Processor is NOT defined!")
# Define the function
def prepare_dataset(batch, processor = processor):

    # Get the audio and text from the batch
    audio = batch["audio"]
    text = batch["text"]

    # Process the audio to input values
    batch["input_values"] = processor(audio, sampling_rate=16000).input_values[0]
    
    # Process the text to labels (transcriptions)
    with processor.as_target_processor():
        batch["labels"] = processor(text).input_ids
    
    return batch

# Use map to apply the function on the dataset
train_data_hf = train_data_hf.map(prepare_dataset, remove_columns=train_data_hf.column_names,num_proc=4)
test_data_hf = test_data_hf.map(prepare_dataset, remove_columns=test_data_hf.column_names,  num_proc=4)

Processor is defined!


Map (num_proc=4): 100%|██████████| 754/754 [00:25<00:00, 29.05 examples/s]
Map (num_proc=4): 100%|██████████| 320/320 [00:15<00:00, 20.89 examples/s]


In [20]:
print(train_data_hf[0])

{'input_values': [0.0010577060747891665, 0.00039432611083611846, 0.000726016063708812, -0.0001032088985084556, -0.0007665888988412917, -0.000600743864197284, -0.0009324339334852993, -0.0009324339334852993, -0.0017616588156670332, -0.002425038954243064, -0.0032542638946324587, -0.0034201089292764664, -0.0034201089292764664, -0.0037517989985644817, -0.002922574058175087, -0.0030884190928190947, -0.0022591939195990562, -0.0019275038503110409, -0.0015958138974383473, -0.000600743864197284, 0.00039432611083611846, 0.0017210860969498754, 0.0023844661191105843, 0.003877070965245366, 0.0048721409402787685, 0.006364746019244194, 0.007359815761446953, 0.007691505830734968, 0.008023195900022984, 0.008189041167497635, 0.008023195900022984, 0.007857350632548332, 0.007525661028921604, 0.006364746019244194, 0.005701365880668163, 0.005369675811380148, 0.004540450870990753, 0.004374606069177389, 0.003877070965245366, 0.0035453808959573507, 0.0033795360941439867, 0.0030478460248559713, 0.002716155955567

In [21]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [22]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [23]:
from evaluate import load
wer_metric = load("wer")

In [24]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [25]:
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53", 
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
model.freeze_feature_extractor()



In [27]:
model.gradient_checkpointing_enable()

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  # output_dir="/content/gdrive/MyDrive/wav2vec2-large-xlsr-turkish-demo",
  output_dir = r"C:\Users\ASUS\Desktop\finetuning",
  group_by_length=True,
  per_device_train_batch_size=32,
  gradient_accumulation_steps=4,
  evaluation_strategy="steps",
  num_train_epochs=30,
  fp16=True,
  save_steps=100,
  eval_steps=100,
  logging_steps=10,
  learning_rate=3e-4,
  warmup_steps=500,
  save_total_limit=2,
)



In [29]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data_hf,
    eval_dataset=test_data_hf,
    tokenizer=processor.feature_extractor,
)

  trainer = Trainer(


In [30]:
trainer.train()

  1%|          | 6/720 [09:27<18:56:44, 95.52s/it] 

KeyboardInterrupt: 

In [31]:
import torch
print(torch.cuda.is_available())


False
