In [1]:
import warnings
warnings.filterwarnings('ignore')

# Test whisper models

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
print(f"device: {device}")

def initialize_pipe(model, model_path):
    
    # model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="flash_attention_2") # use_flash_attention_2=True) # 
    # model.to(device)

    if hasattr(model.generation_config, "no_timestamps_token_id"):
        return_timestamps = True
    else:
        return_timestamps = False
        
    processor = AutoProcessor.from_pretrained(model_path)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=16,
        return_timestamps=return_timestamps,
        torch_dtype=torch_dtype,
        device=device,
    )
    
    return pipe

device: cuda:0


# Test and compare different versions

## Common voice 17.0, zh-HK

In [3]:
# import os
# import pandas as pd

# dataset_name = "common_voice"

# test_data_dir = "data/cv-corpus-17.0-2024-03-15/zh-HK/clips"
# test_data_path = "data/cv-corpus-17.0-2024-03-15/zh-HK/test.tsv"

# df = pd.read_csv(test_data_path, sep='\t')
# print(df.shape)
# df.head()

# num_test = 1000

# df.rename(columns={"path": "file_id", "sentence": "true"}, inplace=True)
# df = df[:num_test]
# df.head()

## midea data

In [4]:
# import pandas as pd
# dataset_name = 'midea_500'
# # test_data_dir = "data/midea_data_500/clips"
# # test_data_dir = "data/midea_data_500/amrs"
# test_data_dir = "data/midea_data_500/wavs"

# df = pd.read_csv("./data/midea_data_500/transcripts.csv")
# df.rename(columns={"sentence": "true"}, inplace=True)
# # df['file_id'] = df['file_id'].apply(lambda x: x.replace('.mp3', '.amr'))
# df.head()

# custom_data_v0

In [6]:
import pandas as pd

dataset_name = 'custom_v0'
test_data_dir = "data/custom_data_v0/test"

with open(os.path.join(test_data_dir, 'audio_paths')) as fo:
    audio_paths = fo.readlines()
    
with open(os.path.join(test_data_dir, 'text')) as fo:
    texts = fo.readlines()
    
print(audio_paths[:5])
print(texts[:5])

start_id = 0
df = pd.DataFrame([[text.strip().split()[0], " ".join(text.strip().split()[1:])] for text in texts[start_id:]], columns=['idx', 'true'])
df.head()

df_path = pd.DataFrame([path.strip().split() for path in audio_paths[start_id:]], columns=['idx', 'file_id'])
df_path.head()

df = pd.merge(df, df_path, on='idx')
df.head()

['0d191ea9-047c-49eb-bf3f-e465e60ba215 /home/ec2-user/SageMaker/efs/Projects/whisper/data/MDCC/audio/447_1711162014_18779_1039.48_1042.69.wav\n', '68e9711f-1e1d-42b2-ad5f-a0d46a233179 /home/ec2-user/SageMaker/efs/Projects/whisper/data/MDCC/audio/447_1803291607_38267_616.4_617.61.wav\n', '1c912c48-6ed7-48b0-ada2-9b5be703ef55 /home/ec2-user/SageMaker/efs/Projects/whisper/data/MDCC/audio/447_2102251812_16686_3350.08_3352.22.wav\n', '5142143f-d0e5-4c9c-b523-6777fa4eaf69 /home/ec2-user/SageMaker/efs/Projects/whisper/data/MDCC/audio/447_1711171106_61214_431.74_433.39.wav\n', '5c033fbe-d20c-4ae1-b935-125d327d5247 /home/ec2-user/SageMaker/efs/Projects/whisper/data/MDCC/audio/447_1711162014_23687_350.5_355.14.wav\n']
['0d191ea9-047c-49eb-bf3f-e465e60ba215 由交湖可以坐車上去少女峰\n', '68e9711f-1e1d-42b2-ad5f-a0d46a233179 無由擺脫\n', '1c912c48-6ed7-48b0-ada2-9b5be703ef55 呢啲遠徵軍都有一句話\n', '5142143f-d0e5-4c9c-b523-6777fa4eaf69 大多如浮游\n', '5c033fbe-d20c-4ae1-b935-125d327d5247 但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話\n']


Unnamed: 0,idx,true,file_id
0,0d191ea9-047c-49eb-bf3f-e465e60ba215,由交湖可以坐車上去少女峰,/home/ec2-user/SageMaker/efs/Projects/whisper/...
1,68e9711f-1e1d-42b2-ad5f-a0d46a233179,無由擺脫,/home/ec2-user/SageMaker/efs/Projects/whisper/...
2,1c912c48-6ed7-48b0-ada2-9b5be703ef55,呢啲遠徵軍都有一句話,/home/ec2-user/SageMaker/efs/Projects/whisper/...
3,5142143f-d0e5-4c9c-b523-6777fa4eaf69,大多如浮游,/home/ec2-user/SageMaker/efs/Projects/whisper/...
4,5c033fbe-d20c-4ae1-b935-125d327d5247,但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話,/home/ec2-user/SageMaker/efs/Projects/whisper/...


## clear GPU

In [7]:
import gc

# Clear the model from GPU memory
def clear_model_from_gpu(model):
    # Delete the model object
    del model
    # Force garbage collection to free any unreferenced memory
    gc.collect()
    # Clear any existing CUDA cache
    torch.cuda.empty_cache()
    # Optionally, you can reinitialize the CUDA context by calling CUDA operations
    torch.cuda.synchronize()
    
    print("Model cleared from GPU memory.")

In [8]:
import os
import json
import time
from tqdm import tqdm
from faster_whisper import WhisperModel
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Here, change the checkpoint path and the whisper model path

model_ids = [
    {"label": "v0-60", "model_path": "/home/ec2-user/SageMaker/efs/Projects/whisper/checkpoint/checkpoint-v0/checkpoint-60"},
    # {"label": "faster-v0-60", "model_path": "/home/ec2-user/SageMaker/efs/Projects/whisper/checkpoint/checkpoint-v0-bs64/checkpoint-60-faster"},  
    {"label": "large-v3", "model_path": "/home/ec2-user/SageMaker/efs/Models/whisper-large-v3"},
    # {"label": "faster-large-v3", "model_path": "/home/ec2-user/SageMaker/efs/Models/faster-whisper-large-v3"}, 
]

sample = os.path.join(test_data_dir, df.loc[0, 'file_id'])

results = []

for model_id in model_ids:
    label = model_id["label"]
    model_path = model_id["model_path"]
    
    if 'faster' in label: 
        model = WhisperModel(model_path)
        # model.to(device)
        segments, info = model.transcribe(sample, language="yue") # 
        pred = ''.join([segment.text for segment in segments])
        print(f"model label: {label}, result: {pred}")
    else:
        model = AutoModelForSpeechSeq2Seq.from_pretrained(model_path, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="flash_attention_2") 
        model.to(device)
        
        pipe = initialize_pipe(model, model_path)
        if label == "large-v3" or label.startswith("ckpt"):
            result = pipe(sample, generate_kwargs={"language": "cantonese"})
        else:
            result = pipe(sample)
            
        print(f"model label: {label}, result: {result['text']}")
    
    for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
        filename = row['file_id']
        filepath = os.path.join(test_data_dir, filename)
        
        if not os.path.exists(filepath):
            continue            
        
        start_time = time.time()
        if label == "large-v3" or label.startswith("ckpt"):
            result = pipe(filepath, generate_kwargs={"language": "cantonese"})
        elif label == "faster-large-v3":
            segments, info = model.transcribe(filepath, language="yue") # 
            # for segment in segments:
                # print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
            result = {'text': ''.join([segment.text for segment in segments])}
        else:
            result = pipe(filepath)
        
        time_cost = time.time() - start_time
        
        df.loc[idx, f"pred-{label}"] = result['text']
        df.loc[idx, f"time-{label}"] = round(time_cost, 2)
        df.loc[idx, f"chunks-{label}"] = json.dumps(result.get("chunks", ""))
    
    clear_model_from_gpu(model)

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.48it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


model label: v6-40, result: 由嘉湖可以坐車上去笑女風


  1%|          | 9/1435 [00:04<13:03,  1.82it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 1435/1435 [13:22<00:00,  1.79it/s]


Model cleared from GPU memory.


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model label: v7-60, result: 由嘉湖可以坐車上去少女峰


100%|██████████| 1435/1435 [13:19<00:00,  1.80it/s]


Model cleared from GPU memory.
model label: faster-v7-60, result: 由嘉湖可以坐車上去少女峰


100%|██████████| 1435/1435 [13:19<00:00,  1.79it/s]


Model cleared from GPU memory.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You have passed language=cantonese, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of language=cantonese.


model label: large-v3, result:  由家湖可以坐車上去少女峰


 74%|███████▍  | 1066/1435 [09:49<02:17,  2.68it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 78%|███████▊  | 1113/1435 [10:08<01:47,  2.99it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 85%|████████▍ | 1215/1435 [10:46<01:05,  3.37it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
100%|██████████| 1435/1435 [12:03<00:00,  1.98it/s]


Model cleared from GPU memory.
model label: faster-large-v3, result:  游家湖可以坐車上去少女峰


100%|██████████| 1435/1435 [13:03<00:00,  1.83it/s]

Model cleared from GPU memory.





In [9]:
df.head()

Unnamed: 0,idx,true,file_id,pred-v6-40,time-v6-40,chunks-v6-40,pred-v7-60,time-v7-60,chunks-v7-60,pred-faster-v7-60,time-faster-v7-60,chunks-faster-v7-60,pred-large-v3,time-large-v3,chunks-large-v3,pred-faster-large-v3,time-faster-large-v3,chunks-faster-large-v3
0,0d191ea9-047c-49eb-bf3f-e465e60ba215,由交湖可以坐車上去少女峰,/home/ec2-user/SageMaker/efs/Projects/whisper/...,由嘉湖可以坐車上去笑女風,0.54,"[{""timestamp"": [0.0, 3.2], ""text"": ""\u7531\u56...",由嘉湖可以坐車上去少女峰,0.57,"[{""timestamp"": [0.0, 3.22], ""text"": ""\u7531\u5...",由嘉湖可以坐車上去少女峰,0.59,"[{""timestamp"": [0.0, 3.22], ""text"": ""\u7531\u5...",由家湖可以坐車上去少女峰,0.54,"[{""timestamp"": [0.0, 3.22], ""text"": "" \u7531\u...",游家湖可以坐車上去少女峰,0.52,""""""
1,68e9711f-1e1d-42b2-ad5f-a0d46a233179,無由擺脫,/home/ec2-user/SageMaker/efs/Projects/whisper/...,無由擺脫,0.41,"[{""timestamp"": [0.0, 1.22], ""text"": ""\u7121\u7...",無由擺脫,0.41,"[{""timestamp"": [0.0, 1.22], ""text"": ""\u7121\u7...",無由擺脫,0.4,"[{""timestamp"": [0.0, 1.22], ""text"": ""\u7121\u7...",無由擺脫,0.35,"[{""timestamp"": [0.0, 1.22], ""text"": "" \u7121\u...",無由擺脫,0.45,""""""
2,1c912c48-6ed7-48b0-ada2-9b5be703ef55,呢啲遠徵軍都有一句話,/home/ec2-user/SageMaker/efs/Projects/whisper/...,呢啲遠征軍都有一句話,0.52,"[{""timestamp"": [0.0, 2.14], ""text"": ""\u5462\u5...",呢啲遠征軍都有一句話,0.52,"[{""timestamp"": [0.0, 2.14], ""text"": ""\u5462\u5...",呢啲遠征軍都有一句話,0.54,"[{""timestamp"": [0.0, 2.14], ""text"": ""\u5462\u5...",這些遠征軍都有一句話,0.43,"[{""timestamp"": [0.0, 2.14], ""text"": "" \u9019\u...",這些遠征軍都有一句話,0.46,""""""
3,5142143f-d0e5-4c9c-b523-6777fa4eaf69,大多如浮游,/home/ec2-user/SageMaker/efs/Projects/whisper/...,大多如浮游,0.43,"[{""timestamp"": [0.0, 1.66], ""text"": ""\u5927\u5...",大多如浮游,0.43,"[{""timestamp"": [0.0, 1.64], ""text"": ""\u5927\u5...",大多如浮游,0.43,"[{""timestamp"": [0.0, 1.64], ""text"": ""\u5927\u5...",大多如浮游,0.36,"[{""timestamp"": [0.0, 1.66], ""text"": "" \u5927\u...",大多如浮游。,0.45,""""""
4,5c033fbe-d20c-4ae1-b935-125d327d5247,但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話,/home/ec2-user/SageMaker/efs/Projects/whisper/...,但係我哋邊個亦都冇諗住去欣賞只係喺度喊住話,0.94,"[{""timestamp"": [0.0, 4.64], ""text"": ""\u4f46\u4...",但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話,1.01,"[{""timestamp"": [0.0, 2.98], ""text"": ""\u4f46\u4...",但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話,0.99,"[{""timestamp"": [0.0, 2.98], ""text"": ""\u4f46\u4...",但我哋邊個亦都冇諗住去欣賞 只係喺度嗌住話,0.96,"[{""timestamp"": [0.0, 2.92], ""text"": "" \u4f46\u...",但我哋邊個亦都冇諗住去欣賞 只係喺度喊住話,0.71,""""""


## CER calculation

In [10]:
import re
import jiwer
from whisper.normalizers import BasicTextNormalizer, EnglishTextNormalizer

class ChineseTextNormalizer:
    def __init__(self, split_letters: bool = False):

        self.split_letters = split_letters

    def __call__(self, s: str):
        s = s.lower()
        # s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
        # s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
        # s = self.clean(s).lower()

        # if self.split_letters:
        #     s = " ".join(regex.findall(r"\X", s, regex.U))

        s = re.sub(
            r"\s+", "", s
        )  # remove any successive whitespace characters
        
        return s
    
# normalizer = EnglishTextNormalizer()
# normalizer = BasicTextNormalizer()
normalizer = ChineseTextNormalizer()

In [11]:
df.head()

Unnamed: 0,idx,true,file_id,pred-v6-40,time-v6-40,chunks-v6-40,pred-v7-60,time-v7-60,chunks-v7-60,pred-faster-v7-60,time-faster-v7-60,chunks-faster-v7-60,pred-large-v3,time-large-v3,chunks-large-v3,pred-faster-large-v3,time-faster-large-v3,chunks-faster-large-v3
0,0d191ea9-047c-49eb-bf3f-e465e60ba215,由交湖可以坐車上去少女峰,/home/ec2-user/SageMaker/efs/Projects/whisper/...,由嘉湖可以坐車上去笑女風,0.54,"[{""timestamp"": [0.0, 3.2], ""text"": ""\u7531\u56...",由嘉湖可以坐車上去少女峰,0.57,"[{""timestamp"": [0.0, 3.22], ""text"": ""\u7531\u5...",由嘉湖可以坐車上去少女峰,0.59,"[{""timestamp"": [0.0, 3.22], ""text"": ""\u7531\u5...",由家湖可以坐車上去少女峰,0.54,"[{""timestamp"": [0.0, 3.22], ""text"": "" \u7531\u...",游家湖可以坐車上去少女峰,0.52,""""""
1,68e9711f-1e1d-42b2-ad5f-a0d46a233179,無由擺脫,/home/ec2-user/SageMaker/efs/Projects/whisper/...,無由擺脫,0.41,"[{""timestamp"": [0.0, 1.22], ""text"": ""\u7121\u7...",無由擺脫,0.41,"[{""timestamp"": [0.0, 1.22], ""text"": ""\u7121\u7...",無由擺脫,0.4,"[{""timestamp"": [0.0, 1.22], ""text"": ""\u7121\u7...",無由擺脫,0.35,"[{""timestamp"": [0.0, 1.22], ""text"": "" \u7121\u...",無由擺脫,0.45,""""""
2,1c912c48-6ed7-48b0-ada2-9b5be703ef55,呢啲遠徵軍都有一句話,/home/ec2-user/SageMaker/efs/Projects/whisper/...,呢啲遠征軍都有一句話,0.52,"[{""timestamp"": [0.0, 2.14], ""text"": ""\u5462\u5...",呢啲遠征軍都有一句話,0.52,"[{""timestamp"": [0.0, 2.14], ""text"": ""\u5462\u5...",呢啲遠征軍都有一句話,0.54,"[{""timestamp"": [0.0, 2.14], ""text"": ""\u5462\u5...",這些遠征軍都有一句話,0.43,"[{""timestamp"": [0.0, 2.14], ""text"": "" \u9019\u...",這些遠征軍都有一句話,0.46,""""""
3,5142143f-d0e5-4c9c-b523-6777fa4eaf69,大多如浮游,/home/ec2-user/SageMaker/efs/Projects/whisper/...,大多如浮游,0.43,"[{""timestamp"": [0.0, 1.66], ""text"": ""\u5927\u5...",大多如浮游,0.43,"[{""timestamp"": [0.0, 1.64], ""text"": ""\u5927\u5...",大多如浮游,0.43,"[{""timestamp"": [0.0, 1.64], ""text"": ""\u5927\u5...",大多如浮游,0.36,"[{""timestamp"": [0.0, 1.66], ""text"": "" \u5927\u...",大多如浮游。,0.45,""""""
4,5c033fbe-d20c-4ae1-b935-125d327d5247,但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話,/home/ec2-user/SageMaker/efs/Projects/whisper/...,但係我哋邊個亦都冇諗住去欣賞只係喺度喊住話,0.94,"[{""timestamp"": [0.0, 4.64], ""text"": ""\u4f46\u4...",但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話,1.01,"[{""timestamp"": [0.0, 2.98], ""text"": ""\u4f46\u4...",但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話,0.99,"[{""timestamp"": [0.0, 2.98], ""text"": ""\u4f46\u4...",但我哋邊個亦都冇諗住去欣賞 只係喺度嗌住話,0.96,"[{""timestamp"": [0.0, 2.92], ""text"": "" \u4f46\u...",但我哋邊個亦都冇諗住去欣賞 只係喺度喊住話,0.71,""""""


In [12]:
new_columns = ['true', 'true-clean']
pred_cols, cer_cols, time_cols = [], [], []
chunk_cols = []
for model_id in model_ids:
    label = model_id['label']
    pred_cols.extend([f'pred-{label}',f'pred-{label}-clean'])
    cer_cols.append(f'cer-{label}')
    if f'chunks-{label}' in df.columns:
        chunk_cols.append(f'chunks-{label}')
        
    time_cols.append(f'time-{label}')

new_columns.extend(pred_cols+cer_cols+time_cols+chunk_cols)
# new_columns.extend(['file_id', 'user_id', 'category_name'])
# new_columns.extend(['file_id', 'user_id', 'category_name'])
new_columns.extend(['file_id'])


for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    
    true_clean = normalizer(row['true'])
    df.loc[i, 'true-clean'] = true_clean
    
    for model_id in model_ids:
        label = model_id['label']
        pred_clean = normalizer(row[f'pred-{label}'])
        df.loc[i, f'pred-{label}-clean'] = pred_clean
        
        cer = jiwer.cer(true_clean, pred_clean)
        df.loc[i, f'cer-{label}'] = cer
        
df = df[new_columns]
df = df.fillna("")
df.to_csv(f"outputs/transcripts_{dataset_name}.csv", index=False)
df.head(10)

100%|██████████| 1435/1435 [00:02<00:00, 501.57it/s]


Unnamed: 0,true,true-clean,pred-v6-40,pred-v6-40-clean,pred-v7-60,pred-v7-60-clean,pred-faster-v7-60,pred-faster-v7-60-clean,pred-large-v3,pred-large-v3-clean,...,time-v7-60,time-faster-v7-60,time-large-v3,time-faster-large-v3,chunks-v6-40,chunks-v7-60,chunks-faster-v7-60,chunks-large-v3,chunks-faster-large-v3,file_id
0,由交湖可以坐車上去少女峰,由交湖可以坐車上去少女峰,由嘉湖可以坐車上去笑女風,由嘉湖可以坐車上去笑女風,由嘉湖可以坐車上去少女峰,由嘉湖可以坐車上去少女峰,由嘉湖可以坐車上去少女峰,由嘉湖可以坐車上去少女峰,由家湖可以坐車上去少女峰,由家湖可以坐車上去少女峰,...,0.57,0.59,0.54,0.52,"[{""timestamp"": [0.0, 3.2], ""text"": ""\u7531\u56...","[{""timestamp"": [0.0, 3.22], ""text"": ""\u7531\u5...","[{""timestamp"": [0.0, 3.22], ""text"": ""\u7531\u5...","[{""timestamp"": [0.0, 3.22], ""text"": "" \u7531\u...","""""",/home/ec2-user/SageMaker/efs/Projects/whisper/...
1,無由擺脫,無由擺脫,無由擺脫,無由擺脫,無由擺脫,無由擺脫,無由擺脫,無由擺脫,無由擺脫,無由擺脫,...,0.41,0.4,0.35,0.45,"[{""timestamp"": [0.0, 1.22], ""text"": ""\u7121\u7...","[{""timestamp"": [0.0, 1.22], ""text"": ""\u7121\u7...","[{""timestamp"": [0.0, 1.22], ""text"": ""\u7121\u7...","[{""timestamp"": [0.0, 1.22], ""text"": "" \u7121\u...","""""",/home/ec2-user/SageMaker/efs/Projects/whisper/...
2,呢啲遠徵軍都有一句話,呢啲遠徵軍都有一句話,呢啲遠征軍都有一句話,呢啲遠征軍都有一句話,呢啲遠征軍都有一句話,呢啲遠征軍都有一句話,呢啲遠征軍都有一句話,呢啲遠征軍都有一句話,這些遠征軍都有一句話,這些遠征軍都有一句話,...,0.52,0.54,0.43,0.46,"[{""timestamp"": [0.0, 2.14], ""text"": ""\u5462\u5...","[{""timestamp"": [0.0, 2.14], ""text"": ""\u5462\u5...","[{""timestamp"": [0.0, 2.14], ""text"": ""\u5462\u5...","[{""timestamp"": [0.0, 2.14], ""text"": "" \u9019\u...","""""",/home/ec2-user/SageMaker/efs/Projects/whisper/...
3,大多如浮游,大多如浮游,大多如浮游,大多如浮游,大多如浮游,大多如浮游,大多如浮游,大多如浮游,大多如浮游,大多如浮游,...,0.43,0.43,0.36,0.45,"[{""timestamp"": [0.0, 1.66], ""text"": ""\u5927\u5...","[{""timestamp"": [0.0, 1.64], ""text"": ""\u5927\u5...","[{""timestamp"": [0.0, 1.64], ""text"": ""\u5927\u5...","[{""timestamp"": [0.0, 1.66], ""text"": "" \u5927\u...","""""",/home/ec2-user/SageMaker/efs/Projects/whisper/...
4,但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話,但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話,但係我哋邊個亦都冇諗住去欣賞只係喺度喊住話,但係我哋邊個亦都冇諗住去欣賞只係喺度喊住話,但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話,但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話,但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話,但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話,但我哋邊個亦都冇諗住去欣賞 只係喺度嗌住話,但我哋邊個亦都冇諗住去欣賞只係喺度嗌住話,...,1.01,0.99,0.96,0.71,"[{""timestamp"": [0.0, 4.64], ""text"": ""\u4f46\u4...","[{""timestamp"": [0.0, 2.98], ""text"": ""\u4f46\u4...","[{""timestamp"": [0.0, 2.98], ""text"": ""\u4f46\u4...","[{""timestamp"": [0.0, 2.92], ""text"": "" \u4f46\u...","""""",/home/ec2-user/SageMaker/efs/Projects/whisper/...
5,香港電臺製作,香港電臺製作,香港電臺製作,香港電臺製作,香港電臺製作,香港電臺製作,香港電臺製作,香港電臺製作,香港電台製作,香港電台製作,...,0.43,0.44,0.37,0.44,"[{""timestamp"": [0.0, 1.32], ""text"": ""\u9999\u6...","[{""timestamp"": [0.0, 1.32], ""text"": ""\u9999\u6...","[{""timestamp"": [0.0, 1.32], ""text"": ""\u9999\u6...","[{""timestamp"": [0.0, 1.32], ""text"": "" \u9999\u...","""""",/home/ec2-user/SageMaker/efs/Projects/whisper/...
6,隨住形勢變化,隨住形勢變化,隨著形勢變化,隨著形勢變化,隨住形勢變化,隨住形勢變化,隨住形勢變化,隨住形勢變化,隨著形勢變化,隨著形勢變化,...,0.45,0.43,0.4,0.47,"[{""timestamp"": [0.0, 1.34], ""text"": ""\u96a8\u8...","[{""timestamp"": [0.0, 1.34], ""text"": ""\u96a8\u4...","[{""timestamp"": [0.0, 1.34], ""text"": ""\u96a8\u4...","[{""timestamp"": [0.0, 1.34], ""text"": "" \u96a8\u...","""""",/home/ec2-user/SageMaker/efs/Projects/whisper/...
7,話我立正姿勢不正確,話我立正姿勢不正確,話我立正姿勢不正確,話我立正姿勢不正確,話我立正姿勢不正確,話我立正姿勢不正確,話我立正姿勢不正確,話我立正姿勢不正確,話我立正姿勢不正確,話我立正姿勢不正確,...,0.52,0.55,0.48,0.5,"[{""timestamp"": [0.0, 2.72], ""text"": ""\u8a71\u6...","[{""timestamp"": [0.0, 2.72], ""text"": ""\u8a71\u6...","[{""timestamp"": [0.0, 2.72], ""text"": ""\u8a71\u6...","[{""timestamp"": [0.0, 2.72], ""text"": "" \u8a71\u...","""""",/home/ec2-user/SageMaker/efs/Projects/whisper/...
8,江嬸喺廚房摸住出嚟,江嬸喺廚房摸住出嚟,江嬸喺廚房摸住出嚟,江嬸喺廚房摸住出嚟,江嬸喺廚房摸住出嚟,江嬸喺廚房摸住出嚟,江嬸喺廚房摸住出嚟,江嬸喺廚房摸住出嚟,江嬸在廚房摸著出來,江嬸在廚房摸著出來,...,0.62,0.62,0.48,0.51,"[{""timestamp"": [0.0, 1.9], ""text"": ""\u6c5f\u5b...","[{""timestamp"": [0.0, 1.9], ""text"": ""\u6c5f\u5b...","[{""timestamp"": [0.0, 1.9], ""text"": ""\u6c5f\u5b...","[{""timestamp"": [0.0, 1.9], ""text"": "" \u6c5f\u5...","""""",/home/ec2-user/SageMaker/efs/Projects/whisper/...
9,既然搵唔到一處只屬二人嘅世界,既然搵唔到一處只屬二人嘅世界,既然搵唔到一處只屬異人嘅世界,既然搵唔到一處只屬異人嘅世界,既然搵唔到一處只屬二人嘅世界,既然搵唔到一處只屬二人嘅世界,既然搵唔到一處只屬二人嘅世界,既然搵唔到一處只屬二人嘅世界,既然搵唔到一處只屬二人嘅世界,既然搵唔到一處只屬二人嘅世界,...,0.68,0.71,0.65,0.57,"[{""timestamp"": [0.0, 3.6], ""text"": ""\u65e2\u71...","[{""timestamp"": [0.0, 3.6], ""text"": ""\u65e2\u71...","[{""timestamp"": [0.0, 3.6], ""text"": ""\u65e2\u71...","[{""timestamp"": [0.0, 3.6], ""text"": "" \u65e2\u7...","""""",/home/ec2-user/SageMaker/efs/Projects/whisper/...


In [13]:
df = df.fillna("")
cers = []
for model_id in model_ids:
    label = model_id['label']
    df = df[df[f"pred-{label}-clean"]!='']
    cer = jiwer.cer(list(df["true-clean"]), list(df[f"pred-{label}-clean"]))
    
    cers.append({'model': label, 'cer': cer})

df_cers = pd.DataFrame(cers)
df_cers.to_csv(f"outputs/cers_{dataset_name}.csv", index=False)
df_cers.head()

Unnamed: 0,model,cer
0,v6-40,0.140076
1,v7-60,0.132638
2,faster-v7-60,0.132638
3,large-v3,0.23601
4,faster-large-v3,0.220459


In [14]:
df_cer_stats = df[cer_cols].describe(percentiles=[0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99])
df_cer_stats.to_csv(f"outputs/cer_stats_{dataset_name}.csv")
df_cer_stats

Unnamed: 0,cer-v6-40,cer-v7-60,cer-faster-v7-60,cer-large-v3,cer-faster-large-v3
count,1429.0,1429.0,1429.0,1429.0,1429.0
mean,0.251505,0.197313,0.197313,0.349555,0.368245
std,0.440028,0.296528,0.296528,0.470055,0.699844
min,0.0,0.0,0.0,0.0,0.0
50%,0.111111,0.095238,0.095238,0.214286,0.2
60%,0.166667,0.153846,0.153846,0.266667,0.25
70%,0.25,0.222222,0.222222,0.357143,0.333333
80%,0.333333,0.3,0.3,0.558333,0.5
90%,1.0,0.5,0.5,1.0,1.0
95%,1.0,1.0,1.0,1.0,1.0


In [15]:
df_time_stats = df[time_cols].describe(percentiles=[0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99])
df_time_stats.to_csv(f"outputs/time_stats_{dataset_name}.csv")
df_time_stats

Unnamed: 0,time-v6-40,time-v7-60,time-faster-v7-60,time-large-v3,time-faster-large-v3
count,1429.0,1429.0,1429.0,1429.0,1429.0
mean,0.556473,0.55557,0.556011,0.500903,0.543786
std,0.215388,0.217685,0.217576,0.242361,0.338934
min,0.29,0.29,0.29,0.22,0.37
50%,0.5,0.51,0.5,0.44,0.49
60%,0.54,0.54,0.54,0.48,0.51
70%,0.62,0.62,0.61,0.556,0.55
80%,0.73,0.73,0.73,0.68,0.61
90%,0.88,0.86,0.86,0.8,0.68
95%,0.97,0.97,0.976,0.91,0.836


# Calculate the statistics of dataset

In [16]:
# import pandas as pd

# path = "data/cv-corpus-17.0-2024-03-15/zh-HK/clip_durations.tsv"

# df = pd.read_csv(path, sep='\t')
# df.head()

In [17]:
# df['duration[ms]'].describe()