In [4]:
import os
import torch
import torchaudio
from datasets import Dataset, Audio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, TrainingArguments, Trainer
from jiwer import wer


In [18]:
def load_data(data_dir="sample_data"):
    data = []
    for i in range(1, 6):
        wav_path = os.path.join(data_dir, f"recording{i}.wav")
        txt_path = os.path.join(data_dir, f"recording{i}.txt")
        with open(txt_path, "r", encoding="utf-8") as f:
            transcript = f.read().strip().lower()
        data.append({"path": wav_path, "text": transcript})
    return Dataset.from_list(data)
dataset = load_data()
dataset = dataset.cast_column("path", Audio(sampling_rate=16000))


In [6]:
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [29]:
def prepare_example(batch):
    if batch["text"].strip() == "":
        return None
    speech_array = batch["path"]["array"]
    sampling_rate = batch["path"]["sampling_rate"]
    inputs = processor.feature_extractor(speech_array, sampling_rate=sampling_rate)
    batch["input_values"] = inputs["input_values"][0]
    batch["labels"] = processor.tokenizer(batch["text"]).input_ids

    return batch


In [30]:
dataset = dataset.map(
    prepare_example,
    batch_size=1,
    num_proc=1,
    load_from_cache_file=False
)
dataset = dataset.filter(lambda x: x is not None)


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4 [00:00<?, ? examples/s]

In [22]:
print(dataset.column_names)


['path', 'text']


In [7]:
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base-960h",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
class DataCollatorCTCWithPadding:
    def __init__(self, processor, padding=True):
        self.processor = processor
        self.padding = padding
    def __call__(self, features):
        input_features = [{"input_values": f["input_values"]} for f in features]
        batch = self.processor.feature_extractor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt"
        )
        with self.processor.as_target_processor():
            label_features = [{"input_ids": f["labels"]} for f in features]
            labels_batch = self.processor.tokenizer.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt"
            )
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels
        return batch
data_collator = DataCollatorCTCWithPadding(processor=processor)

In [10]:
from jiwer import wer
def compute_metrics(pred):
    pred_ids = torch.argmax(torch.tensor(pred.predictions), dim=-1)
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    return {"wer": wer(label_str, pred_str)}


In [27]:
!pip install --upgrade transformers


Collecting transformers
  Downloading transformers-4.52.0-py3-none-any.whl.metadata (38 kB)
Downloading transformers-4.52.0-py3-none-any.whl (10.5 MB)
   ---------------------------------------- 0.0/10.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/10.5 MB 991.0 kB/s eta 0:00:11
   ---------------------------------------- 0.1/10.5 MB 1.2 MB/s eta 0:00:09
    --------------------------------------- 0.2/10.5 MB 1.5 MB/s eta 0:00:07
   - -------------------------------------- 0.5/10.5 MB 2.5 MB/s eta 0:00:04
   -- ------------------------------------- 0.7/10.5 MB 3.1 MB/s eta 0:00:04
   --- ------------------------------------ 1.0/10.5 MB 3.7 MB/s eta 0:00:03
   ---- ----------------------------------- 1.2/10.5 MB 3.9 MB/s eta 0:00:03
   ---- ----------------------------------- 1.2/10.5 MB 3.9 MB/s eta 0:00:03
   ------ --------------------------------- 1.7/10.5 MB 4.1 MB/s eta 0:00:03
   -------- ------------------------------- 2.2/10.5 MB 5.0 MB/s eta 0:00:02
   ----

In [31]:
import transformers
print(transformers.__version__)

4.51.3


In [36]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./wav2vec2-ft",
    per_device_train_batch_size=1,     
    num_train_epochs=10,               
    logging_steps=1,
    learning_rate=1e-4,
    warmup_steps=2,
    save_total_limit=1,
    remove_unused_columns=False,       
    dataloader_num_workers=0,          
    gradient_accumulation_steps=1,     
    
)


In [37]:
model.gradient_checkpointing_enable()


In [38]:
import torch

class CustomTrainer(Trainer):
    def training_step(self, model, inputs, num_items_in_batch):
        torch.cuda.empty_cache()
        return super().training_step(model, inputs, num_items_in_batch)


trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    tokenizer=processor.feature_extractor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = CustomTrainer(


In [39]:
trainer.train()


Step,Training Loss
1,7.8642
2,8.0806
3,5.8356
4,4.3485
5,4.4466
6,13.0406
7,3.9782
8,17.9393
9,14.3496
10,7.1384


TrainOutput(global_step=40, training_loss=5.548129811882973, metrics={'train_runtime': 1761.5929, 'train_samples_per_second': 0.023, 'train_steps_per_second': 0.023, 'total_flos': 3.58564176089568e+16, 'train_loss': 5.548129811882973, 'epoch': 10.0})

In [None]:
##TESTING

In [65]:
from evaluate import load
wer_metric = load("wer")  


In [76]:
import torchaudio
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
model_name = "facebook/wav2vec2-base-960h"
model=Wav2Vec2ForCTC.from_pretrained(model_name).to("cuda")
processor=Wav2Vec2Processor.from_pretrained(model_name)
speech, sr=torchaudio.load("test_data/mytest.wav")
if speech.shape[0] > 1:
    speech = speech.mean(dim=0)
if sr != 16000:
    resampler=torchaudio.transforms.Resample(sr, 16000)
    speech=resampler(speech)
inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
print("Input shape to model:", inputs["input_values"].shape)
inputs = {k: v.to("cuda") for k, v in inputs.items()}
model.eval()
with torch.no_grad():
    logits = model(**inputs).logits
    pred_ids = torch.argmax(logits, dim=-1)
text = processor.batch_decode(pred_ids)[0]
print("Transcript:", text)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input shape to model: torch.Size([1, 245419])
Transcript: HA I AM MY PAR I AM TWENTY THREE YEARS OLD I AM AN A A SUTONTE TA YOR TEPEUNIVERSITY THIS IS OUR SPEECI TO TAXE PROJECT FOR MISHION EARNING FOR AT GAR COURSE LET'S SEE HOW I GRET THE MOTHER IS


In [77]:
with torch.no_grad():
    logits = model(**inputs).logits

pred_ids = torch.argmax(logits, dim=-1)


In [83]:

hyp=processor.batch_decode(pred_ids, skip_special_tokens=True)[0]
print("Predicted text:", hyp)


Predicted text: HA I AM MY PAR I AM TWENTY THRE YEARS OLD I AM AN A A SUTONTE TA YOR TEPEUNIVERSITY THIS IS OUR SPECI TO TAXE PROJECT FOR MISHION EARNING FOR AT GAR COURSE LET'S SE HOW I GRET THE MOTHER IS


In [85]:


ref = "hi i am alper i am 23 years old i am an ai student at hacettepe university this is our speech to text project for machine learning for healthcare course lets see how accurate the model is"
ref=ref.upper()
wer_metric=load("wer")
wer_score = wer_metric.compute(predictions=[hyp], references=[ref])
print("Reference         :", ref)
print("Hypothesis        :", hyp)
#print("WER               :", wer(ref, hyp))
print(f"WER               : {wer_score:.3f}")


Reference         : HI I AM ALPER I AM 23 YEARS OLD I AM AN AI STUDENT AT HACETTEPE UNIVERSITY THIS IS OUR SPEECH TO TEXT PROJECT FOR MACHINE LEARNING FOR HEALTHCARE COURSE LETS SEE HOW ACCURATE THE MODEL IS
Hypothesis        : HA I AM MY PAR I AM TWENTY THRE YEARS OLD I AM AN A A SUTONTE TA YOR TEPEUNIVERSITY THIS IS OUR SPECI TO TAXE PROJECT FOR MISHION EARNING FOR AT GAR COURSE LET'S SE HOW I GRET THE MOTHER IS
WER               : 0.595
