In [1]:
import torch
print(torch.cuda.is_available())  # Should print True if GPU + CUDA are available
print(torch.cuda.device_count())  # Should print number of GPUs detected
print(torch.cuda.current_device())  # Current device index
print(torch.cuda.get_device_name(0))  # Name of the GPU device 0


True
1
0
NVIDIA GeForce RTX 3050 Laptop GPU


In [1]:
!pip install datasets==2.13.1 transformers==4.35.0 torchaudio==2.1.0 librosa==0.10.0 jiwer evaluate soundfile





[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "e:\speecRecognition\venv\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "e:\speecRecognition\venv\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "e:\speecRecognition\venv\Lib\site

In [1]:
!pip install accelerate -U

Collecting accelerate
  Using cached accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Using cached accelerate-1.7.0-py3-none-any.whl (362 kB)
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.28.0
    Uninstalling accelerate-0.28.0:
      Successfully uninstalled accelerate-0.28.0
Successfully installed accelerate-1.7.0



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:

import os
import random
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, Audio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, TrainingArguments, Trainer
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Union
import re
import evaluate


  from .autonotebook import tqdm as notebook_tqdm





In [4]:

# seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Loading the csv file that maps audios to their transcriptions.
csv_file = "mapping.csv"  # path to mapping.csv
df = pd.read_csv(csv_file)

audio_dir = os.path.dirname(csv_file)
if "path" in df.columns:
    df["audio_path"] = df["path"]
elif "filename" in df.columns:
    df["audio_path"] = df["filename"].apply(lambda x: os.path.join(audio_dir, x))
elif "file" in df.columns:
    df["audio_path"] = df["file"].apply(lambda x: os.path.join(audio_dir, x))
elif "audio" in df.columns:
    df["audio_path"] = df["audio"]
else:
    raise ValueError("No audio file path column found in CSV.")


if "text" in df.columns:
    pass
elif "transcript" in df.columns:
    df = df.rename(columns={"transcript": "text"})
elif "sentence" in df.columns:
    df = df.rename(columns={"sentence": "text"})
elif "transcription" in df.columns:
    df = df.rename(columns={"transcription": "text"})
else:
    
    df = df.rename(columns={df.columns[-1]: "text"})


# Removing any rows with missing values
df = df.dropna(subset=["audio_path","text"]).reset_index(drop=True)

# Create a HuggingFace Dataset from the pandas DataFrame so to fit wav2vec
dataset = Dataset.from_pandas(df[["audio_path", "text"]])

dataset = dataset.rename_column("audio_path", "audio")

dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))


# Split into training and validation sets
dataset = dataset.train_test_split(test_size=0.1)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# a text cleaning function
chars_to_ignore_regex = r"[\,\?\.\!\-\;\:\"]"
def preprocess_text(batch):
    text = batch["text"]
    text = text.lower()
    text = re.sub(chars_to_ignore_regex, "", text)
    text = text.strip()
    batch["text"] = text
    return batch

train_dataset = train_dataset.map(preprocess_text)
eval_dataset = eval_dataset.map(preprocess_text)

# Load pre-trained processor and model for Kazakh (Trained in voice of young Kazakhs)
model_name = "aismlv/wav2vec2-large-xlsr-kazakh"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(
    model_name,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id
)
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# bactching audios and adding paddings to insure equal length + CTC loss -
# function to address timing issues in audio.
@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": f["input_values"]} for f in features]
        label_features = [{"input_ids": f["labels"]} for f in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt"
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt"
            )

        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels
        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

# Preprocess dataset (feature extraction + adding noise)
def prepare_dataset(batch):
    audio = batch["audio"]
    speech = audio["array"]
    # Data augmentation: add random noise
    if random.random() < 0.3:
        noise = np.random.randn(len(speech))
        speech = speech + 0.005 * noise
        speech = np.clip(speech, -1, 1)
    batch["input_values"] = processor(speech, sampling_rate=audio["sampling_rate"]).input_values[0]
    #Converting the text transcriptions into list of token IDs so to knwo the prediction to the correct answer)
    with processor.as_target_processor():
        batch["labels"] = processor(batch["text"]).input_ids
    return batch

train_dataset = train_dataset.map(prepare_dataset, remove_columns=["audio", "text"])
eval_dataset = eval_dataset.map(prepare_dataset, remove_columns=["audio", "text"])

# Load WER metric for evaluation (WER = Word Error Rate)
wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(label_ids, group_tokens=False)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

# Define TrainingArguments
training_args = TrainingArguments(
    output_dir="./wav2vec2_kazakh_child",
    group_by_length=True,
    per_device_train_batch_size=2,
    num_train_epochs=10,
    learning_rate=1e-4,
    warmup_steps=500,  # Helps stabilize early training
    weight_decay=0.01,  # Slight increase for regularization
    logging_steps=50,
    logging_dir="./logs",
    fp16=True
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor.feature_extractor
)

# Train the model
trainer.train()


Using the latest cached version of the module from C:\Users\True\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--wer\85bee9e4216a78bb09b2d0d500f6af5c23da58f9210e661add540f5df6630fcd (last modified on Mon May  5 12:58:53 2025) since it couldn't be found locally at evaluate-metric--wer, or remotely on the Hugging Face Hub.
  trainer = Trainer(


Step,Training Loss
50,2.8931
100,1.9278
150,1.774
200,1.7273
250,1.4907
300,1.4415
350,1.2668
400,1.2282
450,1.2054
500,1.3755




TrainOutput(global_step=3010, training_loss=0.6875608202626935, metrics={'train_runtime': 9283.1176, 'train_samples_per_second': 0.648, 'train_steps_per_second': 0.324, 'total_flos': 3.416416177825872e+17, 'train_loss': 0.6875608202626935, 'epoch': 10.0})

In [13]:
# Inference & Evaluation in a Single Jupyter Cell

# 1) Imports
import os
import re
import random
import numpy as np
import pandas as pd
import torch
import soundfile as sf
from datasets import Dataset, Audio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Trainer, TrainingArguments
from dataclasses import dataclass
from typing import List, Dict, Any, Union, Optional
import evaluate

# 2) Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# 3) Load CSV & rebuild eval_dataset exactly as during training
csv_file = "mapping.csv"
df = pd.read_csv(csv_file)

# Create full audio_path column
audio_dir = os.path.dirname(csv_file)
if "path" in df.columns:
    df["audio_path"] = df["path"]
elif "filename" in df.columns:
    df["audio_path"] = df["filename"].apply(lambda x: os.path.join(audio_dir, x))
elif "file" in df.columns:
    df["audio_path"] = df["file"].apply(lambda x: os.path.join(audio_dir, x))
elif "audio" in df.columns:
    df["audio_path"] = df["audio"]
else:
    raise ValueError("No audio file path column found in CSV.")

# Rename transcription to 'text'
if "text" not in df.columns:
    if "transcript" in df.columns:
        df = df.rename(columns={"transcript": "text"})
    elif "sentence" in df.columns:
        df = df.rename(columns={"sentence": "text"})
    elif "transcription" in df.columns:
        df = df.rename(columns={"transcription": "text"})
    else:
        df = df.rename(columns={df.columns[-1]: "text"})

# Drop missing
df = df.dropna(subset=["audio_path", "text"]).reset_index(drop=True)

# Create Hugging Face Dataset
dataset = Dataset.from_pandas(df[["audio_path", "text"]])
dataset = dataset.rename_column("audio_path", "audio")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
dataset = dataset.train_test_split(test_size=0.1)
eval_dataset = dataset["test"]

# Text preprocessing
chars_to_ignore_regex = r"[\,\?\.\!\-\;\:\"]"
def preprocess_text(batch):
    text = batch["text"].lower()
    text = re.sub(chars_to_ignore_regex, "", text).strip()
    batch["text"] = text
    return batch

eval_dataset = eval_dataset.map(preprocess_text)

# 4) Load processor & model
ckpt_dir         = "./wav2vec2_kazakh_child/checkpoint-3010"  # <-- your checkpoint folder
base_model_name  = "aismlv/wav2vec2-large-xlsr-kazakh"

# 4a) Processor must come from the original pretrained model
processor = Wav2Vec2Processor.from_pretrained(base_model_name)

# 4b) Model weights come from the checkpoint
model = Wav2Vec2ForCTC.from_pretrained(ckpt_dir).to(device)

# 5) Prepare eval_dataset (input_values + labels)
def prepare_dataset(batch):
    speech = batch["audio"]["array"]
    batch["input_values"] = processor(speech, sampling_rate=16000).input_values[0]
    with processor.as_target_processor():
        batch["labels"] = processor(batch["text"]).input_ids
    return batch

eval_dataset = eval_dataset.map(prepare_dataset, remove_columns=["audio", "text"])

# 6) Data collator (same as training)
@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": f["input_values"]} for f in features]
        label_features = [{"input_ids": f["labels"]} for f in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt"
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt"
            )

        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels
        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

# 7) WER metric
wer_metric = evaluate.load("wer")
def compute_metrics(pred):
    pred_ids = np.argmax(pred.predictions, axis=-1)
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(label_ids, group_tokens=False)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

# 8) Run evaluation on the validation set
training_args = TrainingArguments(
    output_dir="./wav2vec2_kazakh_eval",
    per_device_eval_batch_size=2,
    fp16=True if torch.cuda.is_available() else False
)

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    tokenizer=processor.feature_extractor
)

metrics = trainer.evaluate()
print("Validation metrics:", metrics)


Using device: cuda


Map:   0%|          | 0/67 [00:00<?, ? examples/s]

Using the latest cached version of the module from C:\Users\True\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--wer\85bee9e4216a78bb09b2d0d500f6af5c23da58f9210e661add540f5df6630fcd (last modified on Mon May  5 12:58:53 2025) since it couldn't be found locally at evaluate-metric--wer, or remotely on the Hugging Face Hub.
  trainer = Trainer(


Validation metrics: {'eval_loss': 0.5918092131614685, 'eval_model_preparation_time': 0.003, 'eval_wer': 0.2247191011235955, 'eval_runtime': 2.6121, 'eval_samples_per_second': 25.649, 'eval_steps_per_second': 13.016}


In [20]:

# 1) Specify the path to your test WAV
wav_path = "audio/some_test.wav"  # <-- replace with your file

# 2) Run inference through the fine‐tuned model
speech, sr = sf.read(wav_path)
if sr != 16000:
    import torchaudio
    speech = torchaudio.functional.resample(torch.tensor(speech), sr, 16000).numpy()

inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True).input_values.to(device)
with torch.no_grad():
    logits = model(inputs).logits

pred_ids      = torch.argmax(logits, dim=-1).squeeze().tolist()
pred_text     = processor.batch_decode([pred_ids])[0]

# 3) Look up the “actual” transcription in your DataFrame
#    We assume df["audio_path"] holds the same relative path string as wav_path.
row = df[df["audio_path"] == wav_path]
if len(row) == 1:
    actual_text = row["text"].values[0]
else:
    actual_text = "<not found in mapping.csv>"

# 4) Print both
print(f"Predicted : {pred_text}")
print(f"Actual    : {actual_text}")


Predicted : қарағанды
Actual    : қарағанды
