In [33]:
# Assuming you upload your 'requirements.txt' file to the current working directory

! pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118


Looking in indexes: https://download.pytorch.org/whl/cu118, https://pypi.ngc.nvidia.com
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp312-cp312-win_amd64.whl (2695.4 MB)
     ---------------------------------------- 0.0/2.7 GB ? eta -:--:--
     ---------------------------------------- 0.0/2.7 GB ? eta -:--:--
     ---------------------------------------- 0.0/2.7 GB 2.0 MB/s eta 0:22:48
     ---------------------------------------- 0.0/2.7 GB 2.2 MB/s eta 0:20:05
     ---------------------------------------- 0.0/2.7 GB 2.5 MB/s eta 0:18:18
     ---------------------------------------- 0.0/2.7 GB 2.9 MB/s eta 0:15:16
     ---------------------------------------- 0.0/2.7 GB 3.4 MB/s eta 0:13:23
     ---------------------------------------- 0.0/2.7 GB 4.0 MB/s eta 0:11:06
     ---------------------------------------- 0.0/2.7 GB 4.4 MB/s eta 0:10:16
     ---------------------------------------- 0.0/2.7 GB 4.5 MB/s eta 0:09:57
     -------------



In [15]:
import torch
print(torch.cuda.is_available())  # Should return True if GPU is detected
print(torch.cuda.device_count())  # Number of GPUs available
print(torch.cuda.get_device_name(0))  # Name of the first available GPU

True
1
NVIDIA GeForce RTX 2070 Super


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
from khmernltk import word_tokenize
from transformers import WhisperForConditionalGeneration, WhisperProcessor, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Audio, DatasetDict, concatenate_datasets, load_dataset
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import torch
import evaluate
from dotenv import load_dotenv
import os

In [4]:
load_dotenv()

# Retrieve Hugging Face token from environment variable
HF_API_TOKEN = os.getenv("HF_API_TOKEN")

In [5]:
OUTPUT_DIR = "./outputs/whisper-tiny-khmer"
MODEL_NAME = "Whisper Fine-tuned for Khmer Speech - VIRA RITHY"
MODEL_ID = "openai/whisper-tiny"
MODEL_LANGUAGE = "khmer"

AUDIO_COLUMN_NAME = "audio"
TEXT_COLUMN_NAME = "sentence"

In [6]:
metric = evaluate.load("wer")
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
processor = WhisperProcessor.from_pretrained(MODEL_ID, language=MODEL_LANGUAGE, task="transcribe")
tokenizer = processor.tokenizer

preprocessing_num_workers = 4

max_input_length = 30
min_input_length = 0

model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
model.config.use_cache = False

In [7]:


# Initialize the processor (model + feature extractor + tokenizer)
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")

# New Section

In [19]:
def transform_khmer_sentence(ds):
    transcription = word_tokenize(ds['transcription'], return_tokens=False, separator=" ")
    return {"transcription": transcription }

def normalize_dataset(ds, audio_column_name=None, text_column_name=None):
    if audio_column_name is not None and audio_column_name != AUDIO_COLUMN_NAME:
        ds = ds.rename_column(audio_column_name, AUDIO_COLUMN_NAME)
    if text_column_name is not None and text_column_name != TEXT_COLUMN_NAME:
        ds = ds.rename_column(text_column_name, TEXT_COLUMN_NAME)
    # resample to the same sampling rate
    ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
    # normalize columns to ["audio", "sentence"]
    ds = ds.remove_columns(set(ds.features.keys()) - set([AUDIO_COLUMN_NAME, TEXT_COLUMN_NAME]))
    return ds

In [20]:
# Load datasets without 'use_auth_token' for public datasets
google_fleurs_train_ds = load_dataset("google/fleurs", "km_kh", split="train+validation", trust_remote_code=True)
google_fleurs_train_ds = google_fleurs_train_ds.map(transform_khmer_sentence)

google_fleurs_test_ds = load_dataset("google/fleurs", "km_kh", split="test", trust_remote_code=True)
google_fleurs_test_ds = google_fleurs_test_ds.map(transform_khmer_sentence)

# Remove 'use_auth_token' for openslr since it's a public dataset
openslr_train_ds = load_dataset("openslr", "SLR42", split="train", trust_remote_code=True)

# Keep 'use_auth_token=True' for private datasets like 'seanghay/km-speech-corpus'
kmcs_train_ds = load_dataset("seanghay/km-speech-corpus", split="train", trust_remote_code=True)

kmcs_train_ds = kmcs_train_ds.map(transform_khmer_sentence)

raw_datasets = DatasetDict()
raw_datasets['train'] = concatenate_datasets([
  normalize_dataset(google_fleurs_train_ds, audio_column_name="audio", text_column_name="transcription"),
  normalize_dataset(openslr_train_ds, audio_column_name="audio", text_column_name="sentence"),
  normalize_dataset(kmcs_train_ds, audio_column_name="audio", text_column_name="transcription")
])

raw_datasets['train'] = raw_datasets['train'].shuffle(seed=10)
raw_datasets['eval'] = normalize_dataset(google_fleurs_test_ds, audio_column_name="audio", text_column_name="transcription")

print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 19850
    })
    eval: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 771
    })
})


In [22]:
from transformers import WhisperProcessor
import functools

# Constants
OUTPUT_DIR = "./outputs/whisper-tiny-khmer"
MODEL_NAME = "Whisper Fine-tuned for Khmer Speech - VIRA RITHY"
MODEL_ID = "openai/whisper-tiny"
MODEL_LANGUAGE = "khmer"

AUDIO_COLUMN_NAME = "audio"
TEXT_COLUMN_NAME = "sentence"

# Preprocessing workers and input length limits
preprocessing_num_workers = 1  # Set to 1 to disable multiprocessing in Jupyter notebooks
max_input_length = 30  # max input length in seconds
min_input_length = 0  # minimum input length in seconds

# Initialize processor once
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")

# Function to prepare dataset
def prepare_dataset(batch, audio_column_name, text_column_name):
    # Commented out the print statement to remove unnecessary output
    # print("Preparing dataset")  
    
    # Load audio from the batch using the specified column name
    audio = batch[audio_column_name]
    
    # Compute log-Mel input features from input audio array
    batch["input_features"] = processor.feature_extractor(
    audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_features[0] 
    
    # Compute input length of the audio sample in seconds
    batch["input_length"] = len(audio["array"]) / audio["sampling_rate"]

    # Process targets (transcriptions)
    input_str = batch[text_column_name]
    
    # Encode target text to label ids
    batch["labels"] = processor.tokenizer(input_str).input_ids

    return batch

# Use functools.partial to pre-fill the audio_column_name and text_column_name
prepare_dataset_fn = functools.partial(prepare_dataset, audio_column_name=AUDIO_COLUMN_NAME, text_column_name=TEXT_COLUMN_NAME)

# Preprocess the dataset by applying the prepare_dataset function
vectorized_datasets = raw_datasets.map(
    prepare_dataset_fn,
    num_proc=1,  # Single process mode for Jupyter notebooks
    remove_columns=next(iter(raw_datasets.values())).column_names,
    desc="preprocess dataset",
)



In [23]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    max_input_length: int = 80  # You can set a default max length for inputs
    max_label_length: int = 448  # You can set a default max length for labels

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Extract input features from the batch
        input_features = [{"input_features": feature["input_features"]} for feature in features]

        # Convert input features to tensors and pad them to the max length
        batch = self.processor.feature_extractor.pad(
            input_features, 
            return_tensors="pt",  # Convert to PyTorch tensors
            padding="max_length",  # Pad to max length
            max_length=self.max_input_length  # Set the max input length
        )

        # Process labels (targets) - truncate and pad separately
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        
        # Truncate the labels before padding
        for label in label_features:
            label["input_ids"] = label["input_ids"][:self.max_label_length]  # Apply truncation manually
        
        # Pad label ids to the max length
        labels_batch = self.processor.tokenizer.pad(
            label_features, 
            return_tensors="pt",  # Convert to PyTorch tensors
            padding="max_length",  # Pad to max length
            max_length=self.max_label_length  # Ensure max length for labels
        )

        # Replace padding with -100 to ignore padding in the loss calculation
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # Remove the BOS token from labels if necessary
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

# Initialize the data collator with processor and max lengths
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor
)


In [24]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    wer = metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [25]:
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    warmup_steps=800,
    max_steps=8000,
    learning_rate=6.25e-6,
    weight_decay=0.01,
    gradient_checkpointing=True,
    fp16=True,
    predict_with_generate=True,
    generation_max_length=225,
    logging_steps=25,
    report_to=["tensorboard"],
    eval_strategy="steps",  # Updated here
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True
)


In [27]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=vectorized_datasets["train"],
    eval_dataset=vectorized_datasets["eval"],
    tokenizer=processor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

max_steps is given, it will override any value given in num_train_epochs

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                        

 12%|█▎        | 1000/8000 [1:30:10<11:53:51,  6.12s/it]
[A

{'loss': 1.1958, 'grad_norm': 11.152442932128906, 'learning_rate': 1.7968750000000003e-07, 'epoch': 0.02}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                        

 12%|█▎        | 1000/8000 [1:32:17<11:53:51,  6.12s/it]
[A

{'loss': 1.1971, 'grad_norm': 12.81757640838623, 'learning_rate': 3.75e-07, 'epoch': 0.04}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                        

 12%|█▎        | 1000/8000 [1:34:21<11:53:51,  6.12s/it]
[A

{'loss': 1.197, 'grad_norm': 13.109082221984863, 'learning_rate': 5.703125e-07, 'epoch': 0.06}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                        

 12%|█▎        | 1000/8000 [1:35:28<11:53:51,  6.12s/it]
[A

{'loss': 1.1957, 'grad_norm': 14.273059844970703, 'learning_rate': 7.65625e-07, 'epoch': 0.08}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                        

 12%|█▎        | 1000/8000 [1:36:35<11:53:51,  6.12s/it]
[A

{'loss': 1.1912, 'grad_norm': 11.275403022766113, 'learning_rate': 9.609375e-07, 'epoch': 0.1}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                        

 12%|█▎        | 1000/8000 [1:37:45<11:53:51,  6.12s/it]
[A

{'loss': 1.2006, 'grad_norm': 13.821718215942383, 'learning_rate': 1.15625e-06, 'epoch': 0.12}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                        

 12%|█▎        | 1000/8000 [1:38:57<11:53:51,  6.12s/it]
[A

{'loss': 1.1761, 'grad_norm': 16.015647888183594, 'learning_rate': 1.3515625e-06, 'epoch': 0.14}



[A
[A
[A
[A
[A
[A

KeyboardInterrupt: 