In [1]:
!pip install -q --upgrade pip

!pip install -q protobuf==5.29.5

!pip install -q transformers==4.56.2 
!pip install -q datasets==2.19.1 
!pip install -q fsspec==2024.3.1 
!pip install -q gcsfs==2024.3.1 
!pip install -q accelerate==1.11.0
!pip install -q evaluate 
!pip install -q jiwer
!pip install -q bitsandbytes

!pip install -q git+https://github.com/huggingface/peft.git@main

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
google-cloud-translate 3.12.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 5.29.5 which is incompatible.
ray 2.51.1 requires click!=8.3.0,>=7.0, but you have click 8.3.0 which is incompatible.
bigframes 2.12.0 requires rich<14,>=12.4.4, but you have rich 14.2.0 which is incompatible.
pydrive2 1.21.3 requires cryptography<44, but you have cryptography 46.0.3 which is incompatible.
pydrive2 1.21.3 requires pyOpenSSL<=24.2.1,>=19.1.0, but you have pyopenssl 25.3.0 which is incompatible.
gcsfs 20

## Import Library

In [2]:
from scipy.io.wavfile import write as write_wav
from huggingface_hub import notebook_login
from datasets import load_dataset, DatasetDict, Audio, concatenate_datasets, Dataset, Value, Features
import torch
from peft import PeftModel
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback, pipeline
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
from tqdm.auto import tqdm
from transformers.pipelines.pt_utils import KeyDataset
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from sklearn.model_selection import train_test_split
import evaluate
import re
import gc
import os
from IPython.display import FileLink

from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training

2025-11-29 09:23:22.940241: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764408203.128140      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764408203.180706      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
#notebook_login()

In [4]:
def get_dataset(lang=None, dataset=None, split_name=None, num_samples=None):
    print(f"Menyiapkan streaming {num_samples} data {lang}...")

    if dataset == 'edinburghcstr/edacc':
        ds_stream = load_dataset(
            dataset, 
            streaming=True, 
            trust_remote_code=True
        )
        ds_stream = ds_stream[split_name]
        ds_stream = ds_stream.filter(lambda x: x['accent'] == 'Indonesian')
    else:
        ds_stream = load_dataset(
            dataset, 
            lang, 
            split=split_name,
            streaming=True, 
            trust_remote_code=True
        )
   
    ds_head = ds_stream.shuffle(seed=42, buffer_size=1000)

    ds_head = ds_stream.take(num_samples)
    
    def generator():
        for sample in ds_head:
           
            if dataset == 'google/fleurs':
                text_col = sample.get("raw_transcription") or sample.get("transcription")
            elif dataset == 'edinburghcstr/edacc':
                text_col = sample.get("text") 
            else:
                text_col = sample.get("sentence") or sample.get("text") or sample.get("transcription")

            yield {
                "audio": sample["audio"],  
                "raw_transcription": text_col, 
                "language": "en_us" if lang == "en" else "id_id"
            }

   
    my_features = Features({
        "audio": Audio(sampling_rate=16000), 
        "raw_transcription": Value("string"),
        "language": Value("string")
    })

    dataset = Dataset.from_generator(generator, features=my_features)
    
    return dataset

In [5]:
fleur_train_en = get_dataset('en_us', 'google/fleurs', 'train', 2600)
fleur_test_en = get_dataset('en_us', 'google/fleurs', 'test', 320)

fleur_train_id = get_dataset('id_id', 'google/fleurs', 'train', 2550)
fleur_test_id = get_dataset('en_us', 'google/fleurs', 'test', 320)

Menyiapkan streaming 2600 data en_us...


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading readme: 0.00B [00:00, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Menyiapkan streaming 320 data en_us...


Generating train split: 0 examples [00:00, ? examples/s]

Menyiapkan streaming 2550 data id_id...


Generating train split: 0 examples [00:00, ? examples/s]

Menyiapkan streaming 320 data en_us...


In [6]:

edacc_dataset = get_dataset(dataset='edinburghcstr/edacc',num_samples=215, split_name='test')
edacc_dataset_split = edacc_dataset.train_test_split(test_size=0.2, seed=42)
edacc_train = edacc_dataset_split['train']
edacc_test = edacc_dataset_split['test']

Menyiapkan streaming 215 data None...


Downloading readme: 0.00B [00:00, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
edacc_test = edacc_dataset_split['test'].map(
    lambda x: {
        "audio": x["audio"], 
        "raw_transcription": x["raw_transcription"], 
        "language": "en_us"
    }
)

edacc_train = edacc_dataset_split['train'].map(
    lambda x: {
        "audio": x["audio"], 
        "raw_transcription": x["raw_transcription"], 
        "language": "en_us"
    }
)

Map:   0%|          | 0/43 [00:00<?, ? examples/s]

Map:   0%|          | 0/172 [00:00<?, ? examples/s]

In [8]:

cv_train_en = get_dataset('en', 'fsicoli/common_voice_22_0', 'train', 5200)
cv_test_en = get_dataset('en','fsicoli/common_voice_22_0', 'test', 600)

cv_train_id = get_dataset('id', 'fsicoli/common_voice_22_0', 'train', 4500)
cv_test_id = get_dataset('id', 'fsicoli/common_voice_22_0', 'test', 500)

Menyiapkan streaming 5200 data en...


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading readme: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]


Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 1it [00:00,  2.78it/s][A
Reading metadata...: 15902it [00:00, 45205.65it/s][A
Reading metadata...: 25256it [00:00, 32909.57it/s][A
Reading metadata...: 34398it [00:01, 30694.46it/s][A
Reading metadata...: 48864it [00:01, 48626.80it/s][A
Reading metadata...: 57187it [00:01, 40696.57it/s][A
Reading metadata...: 68604it [00:01, 38136.64it/s][A
Reading metadata...: 83166it [00:01, 53613.32it/s][A
Reading metadata...: 91530it [00:02, 42660.42it/s][A
Reading metadata...: 102554it [00:02, 30127.63it/s][A
Reading metadata...: 117092it [00:03, 42708.76it/s][A
Reading metadata...: 125083it [00:03, 31419.78it/s][A
Reading metadata...: 136083it [00:04, 20752.02it/s][A
Reading metadata...: 150318it [00:04, 30059.77it/s][A
Reading metadata...: 157815it [00:04, 29185.08it/s][A
Reading metadata...: 169362it [00:05, 31214.97it/s][A
Reading metadata...: 182647it [00:05, 42406.34it/s][A
Reading metadata...: 190361it [00:05,

Menyiapkan streaming 600 data en...


Generating train split: 0 examples [00:00, ? examples/s]


Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 1it [00:00,  6.23it/s][A
Reading metadata...: 16401it [00:00, 60754.59it/s][A


Menyiapkan streaming 4500 data id...


Generating train split: 0 examples [00:00, ? examples/s]


Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 4973it [00:00, 29141.72it/s]


Menyiapkan streaming 500 data id...


Generating train split: 0 examples [00:00, ? examples/s]


Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 3690it [00:00, 23991.71it/s]


In [9]:
print(f'Data Train English: {fleur_train_en.shape[0] + cv_train_en.shape[0] + edacc_train.shape[0]}')
print(f'Data Test English: {fleur_test_en.shape[0] + cv_test_en.shape[0] + edacc_test.shape[0]}')

print(f'Data Train Indonesian: {fleur_train_id.shape[0] + cv_train_id.shape[0]}')
print(f'Data Test Indonesian: {fleur_test_id.shape[0] + cv_test_id.shape[0]}')

Data Train English: 7972
Data Test English: 963
Data Train Indonesian: 7050
Data Test Indonesian: 820


In [10]:
train_dataset = concatenate_datasets([fleur_train_en, fleur_train_id, cv_train_en, cv_train_id, edacc_train])
test_dataset = concatenate_datasets([fleur_test_en, fleur_test_id, cv_test_en, cv_test_id, edacc_test])

train_dataset = train_dataset.shuffle(seed=42)
test_dataset = test_dataset.shuffle(seed=42)

dataset = DatasetDict({
    'train':train_dataset,
    'test':test_dataset
})

print(f'full dataset train: {train_dataset.shape}')
print(f'full dataset test: {test_dataset.shape}\n')
dataset

full dataset train: (15022, 3)
full dataset test: (1783, 3)



DatasetDict({
    train: Dataset({
        features: ['audio', 'raw_transcription', 'language'],
        num_rows: 15022
    })
    test: Dataset({
        features: ['audio', 'raw_transcription', 'language'],
        num_rows: 1783
    })
})

## Initialization Model

In [11]:
MODEL_NAME = 'openai/whisper-large-v3-turbo'
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, dtype=None, low_cpu_mem_usage=True)

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

In [12]:
model.config.use_cache = False
model.gradient_checkpointing_enable()

model.enable_input_require_grads()

lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=[
        "q_proj", "v_proj", "k_proj", "out_proj",  
        "fc1", "fc2"      
    ],
    lora_dropout=0.1,
    bias="none",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 27,852,800 || all params: 836,730,880 || trainable%: 3.3288


## Preprocessing Data

In [13]:
dataset = dataset.cast_column('audio', Audio(sampling_rate=16000))

def preprocessing_data(batch):
    audio = batch['audio']
    batch['input_features'] = processor(audio['array'], sampling_rate=audio['sampling_rate'], return_tensor='pt').input_features[0]

    language = batch['language']
    transcription = batch['raw_transcription']

    if language == 'en_us':
        lang = 'english'
    elif language == 'id_id':
        lang = 'indonesian'
    else:
        lang = language

    processor.tokenizer.set_prefix_tokens(language=lang, task='transcribe')
    batch['labels'] = processor.tokenizer(transcription).input_ids
    return batch

In [14]:
processed_dataset = dataset.map(preprocessing_data, remove_columns=dataset.column_names['train'], num_proc=1)

Map:   0%|          | 0/15022 [00:00<?, ? examples/s]

Map:   0%|          | 0/1783 [00:00<?, ? examples/s]

## Training Model

In [15]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]
        batch["labels"] = labels
        return batch
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

wer_metric = evaluate.load("wer")


def normalize_text(text):
    text = text.lower()
    
    text = re.sub(r"[^a-z0-9\s]", "", text)
    return text.strip()

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    if isinstance(pred_ids, torch.Tensor):
        pred_ids = pred_ids.detach().cpu().numpy()
    if isinstance(label_ids, torch.Tensor):
        label_ids = label_ids.detach().cpu().numpy()

    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True, normalize=False)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True, normalize=False)

    pred_str_norm = [normalize_text(s) for s in pred_str]
    label_str_norm = [normalize_text(s) for s in label_str]

    wer = 100 * wer_metric.compute(predictions=pred_str_norm, references=label_str_norm)
    
    return {"wer": wer}

Downloading builder script: 0.00B [00:00, ?B/s]

In [16]:
early_stopping = EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01)

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-turbo-finetune",
    
    # --- OPTIMIZER ---
    optim="adamw_torch_fused",  
    
    # --- MEMORY & BATCH SIZE ---
    per_device_train_batch_size=8,      
    gradient_accumulation_steps=4,      
    per_device_eval_batch_size=8,       
    
    # --- TRAINING DURATION ---
    num_train_epochs=2,                 
    warmup_ratio=0.1,                   
    
    # --- PERFORMANCE ---
    dataloader_pin_memory=True,
    dataloader_num_workers=2,
    dataloader_prefetch_factor=2,       
    
    # --- LEARNING RATE ---
    learning_rate=5e-5,                 
    lr_scheduler_type='cosine',
    weight_decay=0.01,                  
    
    # --- PRECISION ---
    fp16=True,
    fp16_full_eval=True,                
    
    # --- GRADIENT ---
    gradient_checkpointing=True,
    max_grad_norm=1.0,                  
    
    # --- EVALUATION ---
    eval_strategy="epoch",              
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    
    # --- GENERATION ---
    predict_with_generate=True,
    generation_max_length=225,
    generation_num_beams=1,             
    
    # --- LOGGING ---
    logging_steps=50,                   
    logging_first_step=True,
    report_to='tensorboard',
    
    # --- MISC ---
    remove_unused_columns=False,
    push_to_hub=False,
    seed=42,
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=processor,
    callbacks=[early_stopping]
)

In [17]:
import torch, gc
torch.cuda.empty_cache()
gc.collect()

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to

Epoch,Training Loss,Validation Loss,Wer
1,0.2351,0.271706,7.420081
2,0.2292,0.26382,7.334534


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to

TrainOutput(global_step=940, training_loss=0.32264424575136064, metrics={'train_runtime': 17723.9968, 'train_samples_per_second': 1.695, 'train_steps_per_second': 0.053, 'total_flos': 5.315156157136896e+19, 'train_loss': 0.32264424575136064, 'epoch': 2.0})

In [18]:
LORA_ADAPTER_PATH = './whisper-turbo-fine-tuning-multilingual-fleurs'
trainer.save_model(LORA_ADAPTER_PATH)
processor.save_pretrained(LORA_ADAPTER_PATH)

[]

In [19]:
MODEL_NAME = 'openai/whisper-large-v3-turbo'
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, low_cpu_mem_usage=True)


peft_model_id = './whisper-turbo-fine-tuning-multilingual-fleurs'
model = PeftModel.from_pretrained(model, peft_model_id)

model = model.merge_and_unload()

`torch_dtype` is deprecated! Use `dtype` instead!


In [20]:
model.save_pretrained('./whisper-turbo-multilingual-fleurs')
processor = WhisperProcessor.from_pretrained(peft_model_id)
processor.save_pretrained('./whisper-turbo-multilingual-fleurs')

[]

In [21]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
repo_id = "Dafisns/whisper-turbo-multilingual-fleurs"

model.push_to_hub(repo_id)
processor.push_to_hub(repo_id)

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/Dafisns/whisper-turbo-multilingual-fleurs/commit/b7efd0e4ce8755052830d484ca58501418b8bcb3', commit_message='Upload processor', commit_description='', oid='b7efd0e4ce8755052830d484ca58501418b8bcb3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Dafisns/whisper-turbo-multilingual-fleurs', endpoint='https://huggingface.co', repo_type='model', repo_id='Dafisns/whisper-turbo-multilingual-fleurs'), pr_revision=None, pr_num=None)

In [24]:
MODEL_NAME = 'Dafisns/whisper-turbo-multilingual-fleurs'
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, dtype=None, low_cpu_mem_usage=True)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

In [29]:
MODEL_ID = "Dafisns/whisper-turbo-multilingual-fleurs"  
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 6 

asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model=MODEL_ID,
    device=DEVICE,
    torch_dtype=torch.float32,
    chunk_length_s=30
)

Device set to use cuda


In [26]:
dataset_full_en = concatenate_datasets([fleur_test_en, cv_test_en, edacc_test])

dataset_full_id = concatenate_datasets([fleur_test_id, cv_test_id])

print(f"Total Data Test English   : {len(dataset_full_en)}")
print(f"Total Data Test Indonesian: {len(dataset_full_id)}")

Total Data Test English   : 963
Total Data Test Indonesian: 820 samples


In [30]:
normalizer = BasicTextNormalizer()
wer_metric = evaluate.load("wer")

def evaluate_model(dataset, lang_code, dataset_name="Dataset"):
    print(f"\nMemulai evaluasi untuk: {dataset_name}...")
    
    predictions = []
    references = []
    
    if lang_code == "en":
        gen_kwargs = {"language": "english", "task": "transcribe"}
    elif lang_code == "id":
        gen_kwargs = {"language": "indonesian", "task": "transcribe"}
    
    for i, out in enumerate(tqdm(asr_pipeline(KeyDataset(dataset, "audio"), 
                                              batch_size=BATCH_SIZE, 
                                              generate_kwargs=gen_kwargs), 
                                 total=len(dataset))):
        
        pred_text = out["text"]
        
        if "raw_transcription" in dataset.column_names:
            ref_text = dataset[i]["raw_transcription"]
        elif "transcription" in dataset.column_names:
            ref_text = dataset[i]["transcription"]
        else:
            ref_text = dataset[i]["text"] 

        predictions.append(normalizer(pred_text))
        references.append(normalizer(ref_text))
        
    wer_score = wer_metric.compute(predictions=predictions, references=references)
    print(f"Hasil WER untuk {dataset_name}: {wer_score * 100:.2f}%")
    return wer_score

In [31]:
wer_en = evaluate_model(dataset_full_en, lang_code="en", dataset_name="Full English (Fleurs+CV+EdAcc)")

wer_id = evaluate_model(dataset_full_id, lang_code="id", dataset_name="Full Indonesian (Fleurs+CV)")

print("\n" + "="*40)
print("          REKAP HASIL EVALUASI          ")
print("="*40)
print(f"Model: {MODEL_ID}")
print(f"WER English    : {wer_en * 100:.2f}%")
print(f"WER Indonesian : {wer_id * 100:.2f}%")
print("="*40)


Memulai evaluasi untuk: Full English (Fleurs+CV+EdAcc)...


  0%|          | 0/963 [00:00<?, ?it/s]

Hasil WER untuk Full English (Fleurs+CV+EdAcc): 9.09%

Memulai evaluasi untuk: Full Indonesian (Fleurs+CV)...


  0%|          | 0/820 [00:00<?, ?it/s]

Hasil WER untuk Full Indonesian (Fleurs+CV): 6.97%

          REKAP HASIL EVALUASI          
Model: Dafisns/whisper-turbo-multilingual-fleurs
WER English    : 9.09%
WER Indonesian : 6.97%
