In [1]:
!cd

d:\LingoMalay\Models_Transcribe


In [None]:
from datasets import load_dataset, Audio
from transformers import (
    WhisperProcessor, WhisperForConditionalGeneration,
    Seq2SeqTrainer, Seq2SeqTrainingArguments
)
import torch

# === 1. Load Dataset ===
dataset = load_dataset(
    "json",
    data_files=r"D:\LingoMalay\Models_Transcribe\Dataset\Kelantan\#Main(clean)\##Transcripts2.jsonl",
    split="train"
)
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

# === 2. Load Processor (tokenizer + feature extractor) ===
# r"D:\LingoMalay\Models_Transcribe\Model\Kelantan\whisper-kelantanv1v2\checkpoint-50"
processor = WhisperProcessor.from_pretrained("mesolitica/malaysian-whisper-small-v3")
processor.tokenizer.add_special_tokens({'additional_special_tokens': ['<|kedah|>', '<|kelantan|>']})

tokenizer = processor.tokenizer

# === 3. Load Model and Resize Token Embeddings ===
# D:\LingoMalay\Models_Transcribe\Model\Kelantan\whisper-kelantan
model = WhisperForConditionalGeneration.from_pretrained(r"mesolitica/malaysian-whisper-small-v3")
model.resize_token_embeddings(len(tokenizer))

# === 4. Preprocessing Function ===
def preprocess(batch):
    audio = batch["audio"]

    # Audio features
    batch["input_features"] = processor.feature_extractor(
        audio["array"], sampling_rate=16000
    ).input_features[0]

    # Prompt-based target text
    prompt = "<|startoftranscript|><|ms|><|kelantan|><|transcribe|>"
    full_text = prompt + batch["text"].strip() + " <|endoftext|>"
    # full_text = batch["text"].strip()
    batch["labels"] = tokenizer(full_text).input_ids
    return batch

dataset = dataset.map(preprocess, remove_columns=dataset.column_names)
dataset.set_format(type="torch")

# === 5. Data Collator ===
def data_collator(batch):
    input_features = torch.stack([torch.tensor(item["input_features"]) for item in batch])

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        processor.tokenizer.pad_token = tokenizer.eos_token

    labels = [torch.tensor(item["labels"], dtype=torch.long) for item in batch]
    labels = torch.nn.utils.rnn.pad_sequence(
        labels, batch_first=True, padding_value=tokenizer.pad_token_id
    )
    labels[labels == tokenizer.pad_token_id] = -100  # Mask pad tokens from loss

    return {"input_features": input_features, "labels": labels}

# Freeze the encoder layers
# for param in model.model.encoder.parameters():
#     param.requires_grad = False

# === 6. Training Arguments ===
training_args = Seq2SeqTrainingArguments(
    output_dir="./Model/Kelantan/whisper-kelantanv2",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    learning_rate=1e-5, #-5 early rate, lower when fine-tuning
    warmup_steps=100,
    max_steps=1000,
    save_steps=200,
    logging_steps=200,
    fp16=True,  # Set to False if GPU doesn't support fp16
    # save_total_limit=2,
    report_to="none"
)

# === 7. Trainer Setup ===
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=processor.tokenizer,
    data_collator=data_collator
)

# === 8. Start Training ===
trainer.train()

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  input_features = torch.stack([torch.tensor(item["input_features"]) for item in batch])
  labels = [torch.tensor(item["labels"], dtype=torch.long) for item in batch]
  attn_output = torch.nn.functional.scaled_dot_product_attention(
Non-default generation parameters: {'max_length': 448}


{'loss': 3.1727, 'grad_norm': 21.93227767944336, 'learning_rate': 8.966666666666667e-06, 'epoch': 2.47}


  input_features = torch.stack([torch.tensor(item["input_features"]) for item in batch])
  labels = [torch.tensor(item["labels"], dtype=torch.long) for item in batch]
Non-default generation parameters: {'max_length': 448}


{'loss': 0.2292, 'grad_norm': 8.874920845031738, 'learning_rate': 6.744444444444444e-06, 'epoch': 4.94}


  input_features = torch.stack([torch.tensor(item["input_features"]) for item in batch])
  labels = [torch.tensor(item["labels"], dtype=torch.long) for item in batch]
Non-default generation parameters: {'max_length': 448}


{'loss': 0.0245, 'grad_norm': 0.05791367590427399, 'learning_rate': 4.5222222222222225e-06, 'epoch': 7.41}


  input_features = torch.stack([torch.tensor(item["input_features"]) for item in batch])
  labels = [torch.tensor(item["labels"], dtype=torch.long) for item in batch]
Non-default generation parameters: {'max_length': 448}


{'loss': 0.0031, 'grad_norm': 0.012531350366771221, 'learning_rate': 2.3000000000000004e-06, 'epoch': 9.88}


  input_features = torch.stack([torch.tensor(item["input_features"]) for item in batch])
  labels = [torch.tensor(item["labels"], dtype=torch.long) for item in batch]
Non-default generation parameters: {'max_length': 448}


{'loss': 0.0012, 'grad_norm': 0.020885955542325974, 'learning_rate': 7.777777777777778e-08, 'epoch': 12.35}


100%|██████████| 1000/1000 [10:30<00:00,  1.59it/s]

{'train_runtime': 630.5531, 'train_samples_per_second': 1.586, 'train_steps_per_second': 1.586, 'train_loss': 0.6861594058573246, 'epoch': 12.35}





TrainOutput(global_step=1000, training_loss=0.6861594058573246, metrics={'train_runtime': 630.5531, 'train_samples_per_second': 1.586, 'train_steps_per_second': 1.586, 'train_loss': 0.6861594058573246, 'epoch': 12.35})

codes below are not used

In [1]:
# === 2. Preprocessing class ===
class Preprocessor:
    def __init__(self, processor):
        self.processor = processor
        self.tokenizer = processor.tokenizer

    def __call__(self, example):
        audio = example["audio"]
        dialect = example.get("dialect", "").lower()
        dialect_token = f"<{dialect}>" 
        full_text = f"<|ms|> {dialect_token} {example['text']}"

        inputs = self.processor(audio["array"], sampling_rate=16000, return_tensors="pt")
        input_features = inputs["input_features"][0]

        labels = self.tokenizer(full_text).input_ids

        return {
            "input_features": input_features,
            "labels": labels
        }

In [None]:
from transformers import WhisperForConditionalGeneration, WhisperTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, WhisperProcessor, DataCollatorForSeq2Seq
from transformers.data.data_collator import default_data_collator
from datasets import load_dataset, Audio
import torch

# === 1. Load Dataset ===
dataset = load_dataset("json", data_files=r"D:\LingoMalay\Models_Transcribe\Dataset\Kelantan\#Main(clean)\##Transcripts2.jsonl", split="train")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

# === 2. Load Tokenizer and Add Dialect Tokens ===
tokenizer = WhisperTokenizer.from_pretrained("mesolitica/Malaysian-whisper-large-v3-turbo-v3")
tokenizer.add_special_tokens({'additional_special_tokens': ['<kedah>', '<kelantan>']})

# === 3. Load Model and Resize Token Embeddings ===
model = WhisperForConditionalGeneration.from_pretrained("mesolitica/Malaysian-whisper-large-v3-turbo-v3")
model.resize_token_embeddings(len(tokenizer))

from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("mesolitica/Malaysian-whisper-large-v3-turbo-v3")

def preprocess(batch):
    audio = batch["audio"]
    batch["input_features"] = processor.feature_extractor(audio["array"], sampling_rate=16000).input_features
    # batch["labels"] = tokenizer("<kedah> " + batch["text"]).input_ids
    batch["labels"] = tokenizer(batch["text"], padding="longest").input_ids
    return batch

dataset = dataset.map(preprocess)

# === 4. Preprocess Function ===
# def preprocess(example):
#     audio = example["audio"]
#     input_features = tokenizer.feature_extractor(audio["array"], sampling_rate=16000).input_values[0]
#     labels = tokenizer(example["text"]).input_ids
#     return {"input_features": input_features, "labels": labels}

# def preprocess(example):
#     audio = example["audio"]
#     dialect = example.get("dialect", "").lower()
    
#     # Prepare dialect token like <kedah>, <kelantan>, etc.
#     dialect_token = f"<{dialect}>" 

#     # Construct full text with language + dialect tokens
#     full_text = f"<|ms|> {dialect_token} {example['text']}"

#     # Tokenize audio and text
#     input_features = tokenizer.feature_extractor(audio["array"], sampling_rate=16000).input_values[0]
#     labels = tokenizer(full_text).input_ids

#     return {
#         "input_features": input_features,
#         "labels": labels
#     }

# dataset = dataset.map(preprocess, remove_columns=dataset.column_names, num_proc=4)

# processor = WhisperProcessor.from_pretrained("mesolitica/Malaysian-whisper-large-v3-turbo-v3")
# preprocessor = Preprocessor(processor)
# dataset = dataset.map(preprocessor, remove_columns=dataset.column_names, num_proc=1)

# === 5. Training Arguments ===
training_args = Seq2SeqTrainingArguments(
    output_dir="/Model/Kelantan",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    warmup_steps=500,
    num_train_epochs=5,
    logging_steps=100,
    eval_strategy="no",
    save_steps=500,
    eval_steps=500,
    save_total_limit=2,
    fp16=True,
    predict_with_generate=True,
    report_to="none"
)

# # === 6. Data Collator ===
# data_collator = DataCollatorForSeq2Seq(
#     # processor=processor,
#     tokenizer=tokenizer
#     padding=True,
#     return_tensors="pt"
# )

# === 6. Trainer ===
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=processor.feature_extractor,
    # data_collator=data_collator
)

# === 7. Train ===
trainer.train()

Map:   0%|          | 0/71 [00:00<?, ? examples/s]


ValueError: True is not a valid PaddingStrategy, please select one of ['longest', 'max_length', 'do_not_pad']

In [None]:
from datasets import load_dataset, Audio


from transformers import (
    WhisperProcessor, WhisperTokenizer,
    WhisperForConditionalGeneration,
    Seq2SeqTrainer, Seq2SeqTrainingArguments
)
import torch
# from accelerate import init_empty_weights, infer_auto_device_map

# === 1. Load Dataset ===
dataset = load_dataset(
    "json",
    data_files=r"D:\LingoMalay\Models_Transcribe\Dataset\Kelantan\#Main(clean)\##Transcripts2.jsonl",
    split="train"
)
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

# === 2. Load Tokenizer, Add Dialect Tokens ===
# tokenizer = WhisperTokenizer.from_pretrained("mesolitica/malaysian-whisper-small-v3")
# tokenizer.add_special_tokens({'additional_special_tokens': ['<|kedah|>', '<|kelantan|>']})

processor = WhisperProcessor.from_pretrained("mesolitica/malaysian-whisper-small-v3")
processor.tokenizer.add_special_tokens({'additional_special_tokens': ['<|kedah|>', '<|kelantan|>']})
tokenizer = processor.tokenizer

# === 3. Load Processor & Model ===
model = WhisperForConditionalGeneration.from_pretrained("mesolitica/malaysian-whisper-small-v3")
model.resize_token_embeddings(len(processor.tokenizer))

# === 4. Preprocess Function ===
def preprocess(batch):
    audio = batch["audio"]
    batch["input_features"] = processor.feature_extractor(
        audio["array"]
    ).input_features

    prompt = "<|startoftranscript|><|ms|><|kedah|><|transcribe|> "
    text = batch["text"].strip() + " <|endoftext|>"
    batch["labels"] = processor.tokenizer(prompt + text).input_ids
    # batch["labels"] = processor.tokenizer(prompt + batch["text"]).input_ids

    return batch

dataset = dataset.map(preprocess, remove_columns=dataset.column_names)
dataset.set_format(type="torch")

# === 5. Data Collator for Padding ===
def data_collator(batch):
    # input_features = [torch.tensor(item["input_features"][0]) for item in batch]
    input_features = torch.stack([torch.tensor(item["input_features"][0]) for item in batch])

    labels = [torch.tensor(item["labels"], dtype=torch.long) for item in batch]
    # input_features = torch.stack(input_features)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        processor.tokenizer.pad_token = processor.tokenizer.eos_token

    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=tokenizer.pad_token_id)
    labels[labels == tokenizer.pad_token_id] = -100  # for loss masking

    return {"input_features": input_features, "labels": labels}

# === 6. Training Arguments ===
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-kelantan",
    per_device_train_batch_size=2,  # adjust to fit your GPU
    gradient_accumulation_steps=2,
    # group_by_length=True,
    learning_rate=1e-5,
    warmup_steps=100,
    max_steps=1000,
    save_steps=200,
    num_train_epochs=3,
    logging_strategy="steps",
    logging_steps=100,
    fp16=True,  # set False if not using compatible GPU
    save_total_limit=2,
    # eval_strategy="no",
    report_to="none"
)

# === 7. Trainer Setup ===
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=processor.tokenizer,
    data_collator=data_collator
)

# === 8. Train ===
trainer.train()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can

KeyboardInterrupt: 

this

In [2]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No CUDA GPU")

True
NVIDIA GeForce RTX 3060 Laptop GPU


In [17]:
sample = dataset[0]
print(sample.keys())  # Should show ['audio', 'text']
print(sample["input_features"])

dict_keys(['input_features', 'labels'])
[[[-0.6374825239181519, -0.6374825239181519, -0.6374825239181519, -0.6374825239181519, -0.6374825239181519, -0.6374825239181519, -0.3595653772354126, -0.18845736980438232, -0.1966555118560791, -0.07035338878631592, 0.12380892038345337, 0.20074981451034546, 0.23445063829421997, 0.31581151485443115, 0.2081720232963562, 0.27026867866516113, 0.2827172875404358, 0.14494526386260986, 0.2285560965538025, 0.2540830969810486, 0.25199300050735474, 0.30084937810897827, 0.044057250022888184, 0.11263251304626465, 0.04213529825210571, -0.19470655918121338, 0.03644394874572754, -0.22815406322479248, -0.4230771064758301, 0.04379326105117798, 0.12363541126251221, -0.3073263168334961, 0.10704290866851807, 0.10367530584335327, -0.17627310752868652, 0.11039549112319946, 0.12284249067306519, -0.04550659656524658, 0.09955424070358276, 0.01035696268081665, 0.050215184688568115, -0.09947848320007324, -0.09497618675231934, -0.055925846099853516, -0.43594372272491455, -0.

In [None]:
trainer.save_model("D:/LingoMalay/Models_Transcribe/whisper-kedah-finalv3")

In [None]:
processor.tokenizer.save_pretrained('./whisper-kedahv2-tokenizer')

In [15]:
from transformers import WhisperTokenizer, WhisperProcessor

save_path = r"D:\LingoMalay\Models_Transcribe\Model"

tokenizer = WhisperTokenizer.from_pretrained("mesolitica/Malaysian-whisper-small-v3")
tokenizer.add_special_tokens({'additional_special_tokens': ['<|kedah|>', '<|kelantan|>']})
tokenizer.save_pretrained(save_path)

processor = WhisperProcessor.from_pretrained("mesolitica/Malaysian-whisper-small-v3")
processor.save_pretrained(save_path)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[]