In [1]:
import warnings
warnings.filterwarnings('ignore')

# Test Whisper models

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Use the first GPU in half precision when CUDA is available; otherwise fall
# back to the CPU in single precision.
device, torch_dtype = (
    ("cuda:0", torch.float16) if torch.cuda.is_available() else ("cpu", torch.float32)
)
print(f"device: {device}")

def initialize_pipe(model, model_path):
    """Wrap an already-loaded model in an automatic-speech-recognition pipeline.

    Args:
        model: Loaded speech seq2seq model. Its ``generation_config`` is
            inspected to decide whether timestamps are requested.
        model_path: Location passed to ``AutoProcessor.from_pretrained`` to
            obtain the tokenizer and feature extractor.

    Returns:
        The pipeline, configured with the module-level ``device`` and
        ``torch_dtype``.
    """
    # Timestamps are requested only when the generation config exposes a
    # `no_timestamps_token_id` attribute.
    return_timestamps = hasattr(model.generation_config, "no_timestamps_token_id")

    processor = AutoProcessor.from_pretrained(model_path)

    pipe_kwargs = dict(
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=16,
        return_timestamps=return_timestamps,
        torch_dtype=torch_dtype,
        device=device,
    )
    return pipeline("automatic-speech-recognition", **pipe_kwargs)

device: cuda:0


# Test and compare different versions

## Common Voice 17.0, zh-HK

In [3]:
# import os
# import pandas as pd

# dataset_name = "common_voice"

# test_data_dir = "data/cv-corpus-17.0-2024-03-15/zh-HK/clips"
# test_data_path = "data/cv-corpus-17.0-2024-03-15/zh-HK/test.tsv"

# df = pd.read_csv(test_data_path, sep='\t')
# print(df.shape)
# df.head()

# num_test = 1000

# df.rename(columns={"path": "file_id", "sentence": "true"}, inplace=True)
# df = df[:num_test]
# df.head()

## Midea data

In [4]:
# import pandas as pd
# dataset_name = 'midea_500'
# # test_data_dir = "data/midea_data_500/clips"
# # test_data_dir = "data/midea_data_500/amrs"
# test_data_dir = "data/midea_data_500/wavs"

# df = pd.read_csv("./data/midea_data_500/transcripts.csv")
# df.rename(columns={"sentence": "true"}, inplace=True)
# # df['file_id'] = df['file_id'].apply(lambda x: x.replace('.mp3', '.amr'))
# df.head()

## custom_data_v1

In [5]:
import pandas as pd

dataset_name = 'custom_v1'
test_data_dir = "data/custom_data_v1/test"

# Both files hold one "<utterance-id> <value>" record per line. The transcripts
# are Chinese text, so the encoding is pinned instead of relying on the
# platform default.
with open(os.path.join(test_data_dir, 'audio_paths'), encoding='utf-8') as fo:
    audio_paths = fo.readlines()

with open(os.path.join(test_data_dir, 'text'), encoding='utf-8') as fo:
    texts = fo.readlines()

print(audio_paths[:5])
print(texts[:5])

# Transcripts: the first field is the utterance id, the remaining fields are
# re-joined with single spaces. Blank lines are skipped (they used to raise an
# IndexError on `split()[0]`).
text_rows = []
for line in texts:
    fields = line.split()
    if not fields:
        continue
    text_rows.append([fields[0], " ".join(fields[1:])])
df = pd.DataFrame(text_rows, columns=['idx', 'true'])

# Audio paths: split on the first whitespace run only, so a path that itself
# contains spaces stays in one column.
path_rows = [line.strip().split(maxsplit=1) for line in audio_paths if line.strip()]
df_path = pd.DataFrame(path_rows, columns=['idx', 'file_id'])

# Inner join on the utterance id; report when the row count changes so that
# transcripts without a matching audio path are not lost silently.
n_transcripts = len(df)
df = pd.merge(df, df_path, on='idx')
if len(df) != n_transcripts:
    print(f"transcripts: {n_transcripts}, rows after merging with audio paths: {len(df)}")
df.head()

['66d00545-775b-4a0d-9be0-ee3329db6ca0 /home/ec2-user/SageMaker/efs/Projects/whisper/data/MDCC/audio/447_1711162014_18779_1039.48_1042.69.wav\n', '9cec76cb-0141-4581-ab28-983d31d35481 /home/ec2-user/SageMaker/efs/Projects/whisper/data/MDCC/audio/447_1803291607_38267_616.4_617.61.wav\n', '01c3ee7e-bca6-4b04-a7ec-008f8284bcf8 /home/ec2-user/SageMaker/efs/Projects/whisper/data/MDCC/audio/447_2102251812_16686_3350.08_3352.22.wav\n', 'f6b46a41-bf40-43f4-be8f-b09007ed9cb9 /home/ec2-user/SageMaker/efs/Projects/whisper/data/MDCC/audio/447_1711171106_61214_431.74_433.39.wav\n', 'b9d579fa-3261-45ab-8516-458d7df1484d /home/ec2-user/SageMaker/efs/Projects/whisper/data/MDCC/audio/447_1711162014_23687_350.5_355.14.wav\n']
['66d00545-775b-4a0d-9be0-ee3329db6ca0 由交湖可以坐車上去少女峰\n', '9cec76cb-0141-4581-ab28-983d31d35481 無由擺脫\n', '01c3ee7e-bca6-4b04-a7ec-008f8284bcf8 呢啲遠徵軍都有一句話\n', 'f6b46a41-bf40-43f4-be8f-b09007ed9cb9 大多如浮游\n', 'b9d579fa-3261-45ab-8516-458d7df1484d 但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話\n']


Unnamed: 0,idx,true,file_id
0,66d00545-775b-4a0d-9be0-ee3329db6ca0,由交湖可以坐車上去少女峰,/home/ec2-user/SageMaker/efs/Projects/whisper/...
1,9cec76cb-0141-4581-ab28-983d31d35481,無由擺脫,/home/ec2-user/SageMaker/efs/Projects/whisper/...
2,01c3ee7e-bca6-4b04-a7ec-008f8284bcf8,呢啲遠徵軍都有一句話,/home/ec2-user/SageMaker/efs/Projects/whisper/...
3,f6b46a41-bf40-43f4-be8f-b09007ed9cb9,大多如浮游,/home/ec2-user/SageMaker/efs/Projects/whisper/...
4,b9d579fa-3261-45ab-8516-458d7df1484d,但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話,/home/ec2-user/SageMaker/efs/Projects/whisper/...


## Clear GPU memory

In [6]:
import gc

# Clear the model from GPU memory
def clear_model_from_gpu(model):
    """Release the GPU memory held by ``model`` and print a confirmation.

    Args:
        model: The model to release. ``torch.nn.Module`` instances are moved
            to the CPU first; any other object is only dereferenced locally.

    Returns:
        None.

    Note:
        After this call a ``torch.nn.Module`` passed in lives on the CPU, so
        any pipeline that still wraps it should be discarded by the caller.
    """
    cuda_available = torch.cuda.is_available()

    # `del model` below removes only this function's reference. If the caller
    # (or a pipeline) still holds the module, its CUDA tensors would stay
    # allocated. `Module.to` relocates the parameters of the module in place,
    # so the GPU copies become unreferenced regardless of who holds the module.
    if cuda_available and isinstance(model, torch.nn.Module):
        model.to("cpu")

    # Delete the local reference to the model object
    del model
    # Force garbage collection to free any unreferenced memory
    gc.collect()

    # The CUDA calls are skipped on CPU-only machines, where
    # `torch.cuda.synchronize()` would raise.
    if cuda_available:
        # Return cached, now-unused blocks to the driver
        torch.cuda.empty_cache()
        # Wait for outstanding CUDA work to finish
        torch.cuda.synchronize()

    print("Model cleared from GPU memory.")

In [7]:
import os
import json
import time
from tqdm import tqdm
from faster_whisper import WhisperModel
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Models to benchmark, in evaluation order. Each entry pairs a short `label`
# (used to name the per-model result columns) with the location of the weights.
#
# Currently disabled entries, kept for reference (label -> model_path):
#   small-c         -> /home/ec2-user/SageMaker/efs/Models/whisper-small-cantonese
#   medium          -> /home/ec2-user/SageMaker/efs/Models/whisper-medium
#   faster-large-v3 -> /home/ec2-user/SageMaker/efs/Models/faster-whisper-large-v3
#   ckpt-270        -> /home/ec2-user/SageMaker/efs/Projects/whisper/checkpoint/checkpoint-270
#   ckpt-675        -> /home/ec2-user/SageMaker/efs/Projects/whisper/checkpoint/checkpoint-675
#   ckpt-2500       -> /home/ec2-user/SageMaker/efs/Projects/whisper/checkpoint/checkpoint-2500
#   ckpt-3500       -> /home/ec2-user/SageMaker/efs/Projects/whisper/checkpoint/checkpoint-3500
#   ckpt-68         -> /home/ec2-user/SageMaker/efs/Projects/whisper/checkpoint/checkpoint-68
#   ckpt-68         -> /home/ec2-user/SageMaker/efs/Projects/whisper/checkpoint/checkpoint-v1-5e6/checkpoint-68
#   ckpt-680        -> /home/ec2-user/SageMaker/efs/Projects/whisper/checkpoint/checkpoint-680
#   ckpt-1000       -> /home/ec2-user/SageMaker/efs/Projects/whisper/checkpoint-v3/checkpoint-1000
#   ckpt-3000       -> /home/ec2-user/SageMaker/efs/Projects/whisper/checkpoint-v3/checkpoint-3000
#   ckpt-4500       -> /home/ec2-user/SageMaker/efs/Projects/whisper/checkpoint/checkpoint-v3/checkpoint-4500
model_ids = [
    dict(
        label="v5-50",
        model_path="/home/ec2-user/SageMaker/efs/Projects/whisper/checkpoint/checkpoint-v5/checkpoint-50",
    ),
    dict(
        label="v5-300",
        model_path="/home/ec2-user/SageMaker/efs/Projects/whisper/checkpoint/checkpoint-v5/checkpoint-300",
    ),
    dict(
        label="v6-40",
        model_path="/home/ec2-user/SageMaker/efs/Projects/whisper/checkpoint/checkpoint-v6/checkpoint-40",
    ),
    dict(
        label="v6-200",
        model_path="/home/ec2-user/SageMaker/efs/Projects/whisper/checkpoint/checkpoint-v6/checkpoint-200",
    ),
    dict(
        label="large-v3",
        model_path="/home/ec2-user/SageMaker/efs/Models/whisper-large-v3",
    ),
]

# df = df[:20]
sample = os.path.join(test_data_dir, df.loc[0, 'file_id'])

results = []


def _transcribe(filepath, label, model, pipe):
    """Transcribe one audio file with the backend that matches ``label``.

    Args:
        filepath: Path of the audio file.
        label: Model label; selects the backend and the language handling.
        model: The loaded model (used directly only for "faster" labels).
        pipe: The ASR pipeline (used for every non-"faster" label).

    Returns:
        A dict with a ``'text'`` key; pipeline results may carry more keys.
    """
    # One predicate decides the backend everywhere. The loop used to test
    # `label == "faster-large-v3"` while loading tested `'faster' in label`,
    # so any other "faster-*" label ended up calling a stale/undefined `pipe`.
    if 'faster' in label:
        segments, _info = model.transcribe(filepath, language="yue")
        return {'text': ''.join(segment.text for segment in segments)}

    # NOTE(review): only "large-v3" and "ckpt*" labels are forced to Cantonese.
    # Labels such as "v5-50" get no language argument here — confirm that this
    # asymmetry between the compared models is intended.
    if label == "large-v3" or label.startswith("ckpt"):
        return pipe(filepath, generate_kwargs={"language": "cantonese"})
    return pipe(filepath)


for model_id in model_ids:
    label = model_id["label"]
    model_path = model_id["model_path"]

    # Reset so that no pipeline from the previous model can be reused.
    pipe = None

    if 'faster' in label:
        model = WhisperModel(model_path)
    else:
        load_kwargs = dict(torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True)
        # Flash Attention 2 needs a CUDA device, so it is requested only on
        # GPU; on the CPU fallback the library default is used.
        if device.startswith("cuda"):
            load_kwargs["attn_implementation"] = "flash_attention_2"
        model = AutoModelForSpeechSeq2Seq.from_pretrained(model_path, **load_kwargs)
        model.to(device)

        pipe = initialize_pipe(model, model_path)

    # Smoke test on the first utterance before the full run.
    result = _transcribe(sample, label, model, pipe)
    print(f"model label: {label}, result: {result['text']}")

    skipped = 0
    for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
        filepath = os.path.join(test_data_dir, row['file_id'])

        if not os.path.exists(filepath):
            # The row keeps NaN in this model's columns.
            skipped += 1
            continue

        # perf_counter is monotonic and meant for measuring durations.
        start_time = time.perf_counter()
        result = _transcribe(filepath, label, model, pipe)
        time_cost = time.perf_counter() - start_time

        df.loc[idx, f"pred-{label}"] = result['text']
        df.loc[idx, f"time-{label}"] = round(time_cost, 2)
        df.loc[idx, f"chunks-{label}"] = json.dumps(result.get("chunks", ""))

    if skipped:
        print(f"model label: {label}, skipped {skipped} missing audio file(s)")

    # Drop the pipeline first: it holds its own reference to the model, which
    # would otherwise keep the weights allocated during the cleanup.
    pipe = None
    clear_model_from_gpu(model)
    # Release the last reference so the weights can be reclaimed before the
    # next model is loaded.
    model = None

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:15<00:00,  7.86s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


model label: v5-50, result: 由家湖可以坐車上去笑雷鋒


  2%|▏         | 9/507 [00:05<04:48,  1.73it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
 49%|████▊     | 246/507 [02:39<02:38,  1.64it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
100%|██████████| 507/507 [05:18<00:00,  1.59it/s]


Model cleared from GPU memory.


Loading checkpoint shards: 100%|██████████| 2/2 [00:16<00:00,  8.29s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model label: v5-300, result: 由嘉湖可以坐車上去紹雷峰


  1%|          | 6/507 [00:04<05:47,  1.44it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
  2%|▏         | 11/507 [00:09<06:44,  1.23it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
  3%|▎         | 16/507 [00:15<06:38,  1.23it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
  4%|▍         | 21/507 [00:21<06:31,  1.24it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
  5%|▌         | 26/507 [00:27<07:17,  1.10it/s]Whisper did not predict an ending

Model cleared from GPU memory.


Loading checkpoint shards: 100%|██████████| 2/2 [00:16<00:00,  8.07s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model label: v6-40, result: 由嘉湖可以坐車上去笑女風


100%|██████████| 507/507 [05:14<00:00,  1.61it/s]


Model cleared from GPU memory.


Loading checkpoint shards: 100%|██████████| 2/2 [00:16<00:00,  8.05s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model label: v6-200, result: 由嘉湖可以坐車上去少女峰


  4%|▍         | 22/507 [00:13<04:47,  1.69it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 26%|██▌       | 133/507 [01:24<03:24,  1.83it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 27%|██▋       | 139/507 [01:31<04:49,  1.27it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 28%|██▊       | 140/507 [01:34<09:26,  1.54s/it]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 81%|████████  | 409/507 [04:34<00:50,  1.95it/s]Whisper did not predict an e

Model cleared from GPU memory.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You have passed language=cantonese, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of language=cantonese.


model label: large-v3, result:  由家湖可以坐車上去少女峰


100%|██████████| 507/507 [04:40<00:00,  1.81it/s]


Model cleared from GPU memory.


In [8]:
# Preview the first five rows, now including each model's pred-/time-/chunks- columns.
df.head(n=5)

Unnamed: 0,idx,true,file_id,pred-v5-50,time-v5-50,chunks-v5-50,pred-v5-300,time-v5-300,chunks-v5-300,pred-v6-40,time-v6-40,chunks-v6-40,pred-v6-200,time-v6-200,chunks-v6-200,pred-large-v3,time-large-v3,chunks-large-v3
0,66d00545-775b-4a0d-9be0-ee3329db6ca0,由交湖可以坐車上去少女峰,/home/ec2-user/SageMaker/efs/Projects/whisper/...,由家湖可以坐車上去笑雷鋒,0.63,"[{""timestamp"": [0.0, 3.22], ""text"": ""\u7531\u5...",由嘉湖可以坐車上去紹雷峰,0.68,"[{""timestamp"": [0.0, 3.22], ""text"": ""\u7531\u5...",由嘉湖可以坐車上去笑女風,0.59,"[{""timestamp"": [0.0, 3.2], ""text"": ""\u7531\u56...",由嘉湖可以坐車上去少女峰,0.6,"[{""timestamp"": [0.0, 3.22], ""text"": ""\u7531\u5...",由家湖可以坐車上去少女峰,0.55,"[{""timestamp"": [0.0, 3.22], ""text"": "" \u7531\u..."
1,9cec76cb-0141-4581-ab28-983d31d35481,無由擺脫,/home/ec2-user/SageMaker/efs/Projects/whisper/...,無有擺脫,0.45,"[{""timestamp"": [0.0, 1.22], ""text"": ""\u7121\u6...",無憂擺脫字幕組,0.63,"[{""timestamp"": [0.0, 1.22], ""text"": ""\u7121\u6...",無由擺脫,0.44,"[{""timestamp"": [0.0, 1.22], ""text"": ""\u7121\u7...",無由擺脫,0.44,"[{""timestamp"": [0.0, 1.22], ""text"": ""\u7121\u7...",無由擺脫,0.39,"[{""timestamp"": [0.0, 1.22], ""text"": "" \u7121\u..."
2,01c3ee7e-bca6-4b04-a7ec-008f8284bcf8,呢啲遠徵軍都有一句話,/home/ec2-user/SageMaker/efs/Projects/whisper/...,呢啲遠征軍都有一句話,0.54,"[{""timestamp"": [0.0, 2.14], ""text"": ""\u5462\u5...",呢啲軟精軍都有一句話,0.58,"[{""timestamp"": [0.0, 2.14], ""text"": ""\u5462\u5...",呢啲遠征軍都有一句話,0.56,"[{""timestamp"": [0.0, 2.14], ""text"": ""\u5462\u5...",呢啲遠征軍都有一句話,0.56,"[{""timestamp"": [0.0, 2.14], ""text"": ""\u5462\u5...",這些遠征軍都有一句話,0.44,"[{""timestamp"": [0.0, 2.14], ""text"": "" \u9019\u..."
3,f6b46a41-bf40-43f4-be8f-b09007ed9cb9,大多如浮游,/home/ec2-user/SageMaker/efs/Projects/whisper/...,大多如浮游,0.47,"[{""timestamp"": [0.0, 1.66], ""text"": ""\u5927\u5...",大多如浮遊,0.46,"[{""timestamp"": [0.0, 1.66], ""text"": ""\u5927\u5...",大多如浮游,0.46,"[{""timestamp"": [0.0, 1.66], ""text"": ""\u5927\u5...",大多如浮游,0.45,"[{""timestamp"": [0.0, 1.66], ""text"": ""\u5927\u5...",大多如浮游,0.39,"[{""timestamp"": [0.0, 1.66], ""text"": "" \u5927\u..."
4,b9d579fa-3261-45ab-8516-458d7df1484d,但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話,/home/ec2-user/SageMaker/efs/Projects/whisper/...,但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話,1.0,"[{""timestamp"": [0.0, 2.94], ""text"": ""\u4f46\u4...",但我哋邊個亦都冇諗住去欣賞欣賞欣賞欣只係喺住話,1.22,"[{""timestamp"": [0.0, 3.0], ""text"": ""\u4f46\u62...",但係我哋邊個亦都冇諗住去欣賞只係喺度喊住話,1.0,"[{""timestamp"": [0.0, 4.64], ""text"": ""\u4f46\u4...",但我哋邊個亦都冇諗住去欣賞只係喺住話,0.87,"[{""timestamp"": [0.0, 4.64], ""text"": ""\u4f46\u6...",但我哋邊個亦都冇諗住去欣賞 只係喺度嗌住話,0.98,"[{""timestamp"": [0.0, 2.92], ""text"": "" \u4f46\u..."


## CER calculation

In [9]:
import re
import jiwer
from whisper.normalizers import BasicTextNormalizer, EnglishTextNormalizer

class ChineseTextNormalizer:
    """Minimal text normalizer applied before character-level scoring.

    Calling an instance lower-cases the input and removes every whitespace
    character. ``split_letters`` is stored on the instance but is not used by
    ``__call__``.
    """

    # Matches a run of one or more whitespace characters.
    _WHITESPACE_RUN = re.compile(r"\s+")

    def __init__(self, split_letters: bool = False):
        self.split_letters = split_letters

    def __call__(self, s: str):
        """Return ``s`` lower-cased with all whitespace removed."""
        lowered = s.lower()
        return self._WHITESPACE_RUN.sub("", lowered)


# normalizer = EnglishTextNormalizer()
# normalizer = BasicTextNormalizer()
normalizer = ChineseTextNormalizer()

In [10]:
# Show the first five rows again before the scoring columns are added.
df.head(n=5)

Unnamed: 0,idx,true,file_id,pred-v5-50,time-v5-50,chunks-v5-50,pred-v5-300,time-v5-300,chunks-v5-300,pred-v6-40,time-v6-40,chunks-v6-40,pred-v6-200,time-v6-200,chunks-v6-200,pred-large-v3,time-large-v3,chunks-large-v3
0,66d00545-775b-4a0d-9be0-ee3329db6ca0,由交湖可以坐車上去少女峰,/home/ec2-user/SageMaker/efs/Projects/whisper/...,由家湖可以坐車上去笑雷鋒,0.63,"[{""timestamp"": [0.0, 3.22], ""text"": ""\u7531\u5...",由嘉湖可以坐車上去紹雷峰,0.68,"[{""timestamp"": [0.0, 3.22], ""text"": ""\u7531\u5...",由嘉湖可以坐車上去笑女風,0.59,"[{""timestamp"": [0.0, 3.2], ""text"": ""\u7531\u56...",由嘉湖可以坐車上去少女峰,0.6,"[{""timestamp"": [0.0, 3.22], ""text"": ""\u7531\u5...",由家湖可以坐車上去少女峰,0.55,"[{""timestamp"": [0.0, 3.22], ""text"": "" \u7531\u..."
1,9cec76cb-0141-4581-ab28-983d31d35481,無由擺脫,/home/ec2-user/SageMaker/efs/Projects/whisper/...,無有擺脫,0.45,"[{""timestamp"": [0.0, 1.22], ""text"": ""\u7121\u6...",無憂擺脫字幕組,0.63,"[{""timestamp"": [0.0, 1.22], ""text"": ""\u7121\u6...",無由擺脫,0.44,"[{""timestamp"": [0.0, 1.22], ""text"": ""\u7121\u7...",無由擺脫,0.44,"[{""timestamp"": [0.0, 1.22], ""text"": ""\u7121\u7...",無由擺脫,0.39,"[{""timestamp"": [0.0, 1.22], ""text"": "" \u7121\u..."
2,01c3ee7e-bca6-4b04-a7ec-008f8284bcf8,呢啲遠徵軍都有一句話,/home/ec2-user/SageMaker/efs/Projects/whisper/...,呢啲遠征軍都有一句話,0.54,"[{""timestamp"": [0.0, 2.14], ""text"": ""\u5462\u5...",呢啲軟精軍都有一句話,0.58,"[{""timestamp"": [0.0, 2.14], ""text"": ""\u5462\u5...",呢啲遠征軍都有一句話,0.56,"[{""timestamp"": [0.0, 2.14], ""text"": ""\u5462\u5...",呢啲遠征軍都有一句話,0.56,"[{""timestamp"": [0.0, 2.14], ""text"": ""\u5462\u5...",這些遠征軍都有一句話,0.44,"[{""timestamp"": [0.0, 2.14], ""text"": "" \u9019\u..."
3,f6b46a41-bf40-43f4-be8f-b09007ed9cb9,大多如浮游,/home/ec2-user/SageMaker/efs/Projects/whisper/...,大多如浮游,0.47,"[{""timestamp"": [0.0, 1.66], ""text"": ""\u5927\u5...",大多如浮遊,0.46,"[{""timestamp"": [0.0, 1.66], ""text"": ""\u5927\u5...",大多如浮游,0.46,"[{""timestamp"": [0.0, 1.66], ""text"": ""\u5927\u5...",大多如浮游,0.45,"[{""timestamp"": [0.0, 1.66], ""text"": ""\u5927\u5...",大多如浮游,0.39,"[{""timestamp"": [0.0, 1.66], ""text"": "" \u5927\u..."
4,b9d579fa-3261-45ab-8516-458d7df1484d,但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話,/home/ec2-user/SageMaker/efs/Projects/whisper/...,但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話,1.0,"[{""timestamp"": [0.0, 2.94], ""text"": ""\u4f46\u4...",但我哋邊個亦都冇諗住去欣賞欣賞欣賞欣只係喺住話,1.22,"[{""timestamp"": [0.0, 3.0], ""text"": ""\u4f46\u62...",但係我哋邊個亦都冇諗住去欣賞只係喺度喊住話,1.0,"[{""timestamp"": [0.0, 4.64], ""text"": ""\u4f46\u4...",但我哋邊個亦都冇諗住去欣賞只係喺住話,0.87,"[{""timestamp"": [0.0, 4.64], ""text"": ""\u4f46\u6...",但我哋邊個亦都冇諗住去欣賞 只係喺度嗌住話,0.98,"[{""timestamp"": [0.0, 2.92], ""text"": "" \u4f46\u..."


In [11]:
# Column layout of the exported table: reference, then per-model prediction,
# CER, timing and (when present) chunk columns, then the audio path.
new_columns = ['true', 'true-clean']
pred_cols, cer_cols, time_cols = [], [], []
chunk_cols = []
for model_id in model_ids:
    label = model_id['label']
    pred_cols.extend([f'pred-{label}', f'pred-{label}-clean'])
    cer_cols.append(f'cer-{label}')
    if f'chunks-{label}' in df.columns:
        chunk_cols.append(f'chunks-{label}')

    time_cols.append(f'time-{label}')

new_columns.extend(pred_cols+cer_cols+time_cols+chunk_cols)
# new_columns.extend(['file_id', 'user_id', 'category_name'])
new_columns.extend(['file_id'])


def _clean_text(value):
    """Normalize one transcript cell; a missing value (NaN/None) becomes ''."""
    # Rows whose audio file was skipped hold NaN in the prediction columns;
    # passing that float to the normalizer would fail on `.lower()`.
    return "" if pd.isna(value) else normalizer(value)


def _char_error_rate(reference, hypothesis):
    """Return the CER of one pair, or NaN when the reference is empty."""
    # CER divides by the reference length, so it is undefined for an empty
    # reference; report NaN instead of calling jiwer with it.
    if not reference:
        return float("nan")
    return jiwer.cer(reference, hypothesis)


# Whole-column operations replace the former cell-by-cell `df.loc` writes.
df['true-clean'] = df['true'].map(_clean_text)

for model_id in model_ids:
    label = model_id['label']
    raw_pred = df[f'pred-{label}']
    df[f'pred-{label}-clean'] = raw_pred.map(_clean_text)
    # A missing prediction gets NaN rather than a made-up error rate.
    df[f'cer-{label}'] = [
        float("nan") if is_missing else _char_error_rate(reference, hypothesis)
        for reference, hypothesis, is_missing in zip(
            df['true-clean'], df[f'pred-{label}-clean'], raw_pred.isna()
        )
    ]

# Fill only the text columns: putting "" into the numeric CER/time columns
# would turn them into object dtype. `to_csv` writes NaN as an empty field by
# default, so the exported file looks the same either way.
numeric_cols = set(cer_cols + time_cols)
df = df[new_columns].fillna({col: "" for col in new_columns if col not in numeric_cols})

os.makedirs("outputs", exist_ok=True)  # the export fails if the folder is missing
df.to_csv(f"outputs/transcripts_{dataset_name}.csv", index=False)
df.head(10)

100%|██████████| 507/507 [00:01<00:00, 470.95it/s]


Unnamed: 0,true,true-clean,pred-v5-50,pred-v5-50-clean,pred-v5-300,pred-v5-300-clean,pred-v6-40,pred-v6-40-clean,pred-v6-200,pred-v6-200-clean,...,time-v5-300,time-v6-40,time-v6-200,time-large-v3,chunks-v5-50,chunks-v5-300,chunks-v6-40,chunks-v6-200,chunks-large-v3,file_id
0,由交湖可以坐車上去少女峰,由交湖可以坐車上去少女峰,由家湖可以坐車上去笑雷鋒,由家湖可以坐車上去笑雷鋒,由嘉湖可以坐車上去紹雷峰,由嘉湖可以坐車上去紹雷峰,由嘉湖可以坐車上去笑女風,由嘉湖可以坐車上去笑女風,由嘉湖可以坐車上去少女峰,由嘉湖可以坐車上去少女峰,...,0.68,0.59,0.6,0.55,"[{""timestamp"": [0.0, 3.22], ""text"": ""\u7531\u5...","[{""timestamp"": [0.0, 3.22], ""text"": ""\u7531\u5...","[{""timestamp"": [0.0, 3.2], ""text"": ""\u7531\u56...","[{""timestamp"": [0.0, 3.22], ""text"": ""\u7531\u5...","[{""timestamp"": [0.0, 3.22], ""text"": "" \u7531\u...",/home/ec2-user/SageMaker/efs/Projects/whisper/...
1,無由擺脫,無由擺脫,無有擺脫,無有擺脫,無憂擺脫字幕組,無憂擺脫字幕組,無由擺脫,無由擺脫,無由擺脫,無由擺脫,...,0.63,0.44,0.44,0.39,"[{""timestamp"": [0.0, 1.22], ""text"": ""\u7121\u6...","[{""timestamp"": [0.0, 1.22], ""text"": ""\u7121\u6...","[{""timestamp"": [0.0, 1.22], ""text"": ""\u7121\u7...","[{""timestamp"": [0.0, 1.22], ""text"": ""\u7121\u7...","[{""timestamp"": [0.0, 1.22], ""text"": "" \u7121\u...",/home/ec2-user/SageMaker/efs/Projects/whisper/...
2,呢啲遠徵軍都有一句話,呢啲遠徵軍都有一句話,呢啲遠征軍都有一句話,呢啲遠征軍都有一句話,呢啲軟精軍都有一句話,呢啲軟精軍都有一句話,呢啲遠征軍都有一句話,呢啲遠征軍都有一句話,呢啲遠征軍都有一句話,呢啲遠征軍都有一句話,...,0.58,0.56,0.56,0.44,"[{""timestamp"": [0.0, 2.14], ""text"": ""\u5462\u5...","[{""timestamp"": [0.0, 2.14], ""text"": ""\u5462\u5...","[{""timestamp"": [0.0, 2.14], ""text"": ""\u5462\u5...","[{""timestamp"": [0.0, 2.14], ""text"": ""\u5462\u5...","[{""timestamp"": [0.0, 2.14], ""text"": "" \u9019\u...",/home/ec2-user/SageMaker/efs/Projects/whisper/...
3,大多如浮游,大多如浮游,大多如浮游,大多如浮游,大多如浮遊,大多如浮遊,大多如浮游,大多如浮游,大多如浮游,大多如浮游,...,0.46,0.46,0.45,0.39,"[{""timestamp"": [0.0, 1.66], ""text"": ""\u5927\u5...","[{""timestamp"": [0.0, 1.66], ""text"": ""\u5927\u5...","[{""timestamp"": [0.0, 1.66], ""text"": ""\u5927\u5...","[{""timestamp"": [0.0, 1.66], ""text"": ""\u5927\u5...","[{""timestamp"": [0.0, 1.66], ""text"": "" \u5927\u...",/home/ec2-user/SageMaker/efs/Projects/whisper/...
4,但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話,但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話,但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話,但係我哋邊個亦都冇諗住去欣賞只係喺度嗌住話,但我哋邊個亦都冇諗住去欣賞欣賞欣賞欣只係喺住話,但我哋邊個亦都冇諗住去欣賞欣賞欣賞欣只係喺住話,但係我哋邊個亦都冇諗住去欣賞只係喺度喊住話,但係我哋邊個亦都冇諗住去欣賞只係喺度喊住話,但我哋邊個亦都冇諗住去欣賞只係喺住話,但我哋邊個亦都冇諗住去欣賞只係喺住話,...,1.22,1.0,0.87,0.98,"[{""timestamp"": [0.0, 2.94], ""text"": ""\u4f46\u4...","[{""timestamp"": [0.0, 3.0], ""text"": ""\u4f46\u62...","[{""timestamp"": [0.0, 4.64], ""text"": ""\u4f46\u4...","[{""timestamp"": [0.0, 4.64], ""text"": ""\u4f46\u6...","[{""timestamp"": [0.0, 2.92], ""text"": "" \u4f46\u...",/home/ec2-user/SageMaker/efs/Projects/whisper/...
5,香港電臺製作,香港電臺製作,香港電臺製作,香港電臺製作,香港電臺製作,香港電臺製作,香港電臺製作,香港電臺製作,香港電臺製作,香港電臺製作,...,0.49,0.47,0.46,0.38,"[{""timestamp"": [0.0, 1.32], ""text"": ""\u9999\u6...","[{""timestamp"": [0.0, 1.32], ""text"": ""\u9999\u6...","[{""timestamp"": [0.0, 1.32], ""text"": ""\u9999\u6...","[{""timestamp"": [0.0, 1.32], ""text"": ""\u9999\u6...","[{""timestamp"": [0.0, 1.32], ""text"": "" \u9999\u...",/home/ec2-user/SageMaker/efs/Projects/whisper/...
6,隨住形勢變化,隨住形勢變化,隨著形勢變化,隨著形勢變化,隨住形勢變化就開始變化變化變化變化變化變化變化變化變化變化變化變化變化變化變化變化變化變化變...,隨住形勢變化就開始變化變化變化變化變化變化變化變化變化變化變化變化變化變化變化變化變化變化變...,隨著形勢變化,隨著形勢變化,隨住形勢變化,隨住形勢變化,...,3.49,0.47,0.46,0.41,"[{""timestamp"": [0.0, 1.34], ""text"": ""\u96a8\u8...","[{""timestamp"": [0.0, 1.34], ""text"": ""\u96a8\u4...","[{""timestamp"": [0.0, 1.34], ""text"": ""\u96a8\u8...","[{""timestamp"": [0.0, 1.34], ""text"": ""\u96a8\u4...","[{""timestamp"": [0.0, 1.34], ""text"": "" \u96a8\u...",/home/ec2-user/SageMaker/efs/Projects/whisper/...
7,話我立正姿勢不正確,話我立正姿勢不正確,話我立正姿勢不正確,話我立正姿勢不正確,話我立正姿不正確,話我立正姿不正確,話我立正姿勢不正確,話我立正姿勢不正確,話我立正姿不正確,話我立正姿不正確,...,0.52,0.56,0.5,0.5,"[{""timestamp"": [0.0, 2.72], ""text"": ""\u8a71\u6...","[{""timestamp"": [0.0, 2.72], ""text"": ""\u8a71\u6...","[{""timestamp"": [0.0, 2.72], ""text"": ""\u8a71\u6...","[{""timestamp"": [0.0, 2.72], ""text"": ""\u8a71\u6...","[{""timestamp"": [0.0, 2.72], ""text"": "" \u8a71\u...",/home/ec2-user/SageMaker/efs/Projects/whisper/...
8,江嬸喺廚房摸住出嚟,江嬸喺廚房摸住出嚟,江嬸喺廚房摸住出嚟,江嬸喺廚房摸住出嚟,江嬸喺廚房摸住出嚟,江嬸喺廚房摸住出嚟,江嬸喺廚房摸住出嚟,江嬸喺廚房摸住出嚟,江嬸喺廚房摸住出嚟,江嬸喺廚房摸住出嚟,...,0.66,0.63,0.63,0.5,"[{""timestamp"": [0.0, 1.9], ""text"": ""\u6c5f\u5b...","[{""timestamp"": [0.0, 1.9], ""text"": ""\u6c5f\u5b...","[{""timestamp"": [0.0, 1.9], ""text"": ""\u6c5f\u5b...","[{""timestamp"": [0.0, 1.9], ""text"": ""\u6c5f\u5b...","[{""timestamp"": [0.0, 1.9], ""text"": "" \u6c5f\u5...",/home/ec2-user/SageMaker/efs/Projects/whisper/...
9,既然搵唔到一處只屬二人嘅世界,既然搵唔到一處只屬二人嘅世界,既然搵唔到一處只屬二人嘅世界,既然搵唔到一處只屬二人嘅世界,既然搵唔到一處只屬二人嘅世界,既然搵唔到一處只屬二人嘅世界,既然搵唔到一處只屬異人嘅世界,既然搵唔到一處只屬異人嘅世界,既然搵唔到一處只屬二人嘅世界,既然搵唔到一處只屬二人嘅世界,...,0.73,0.74,0.71,0.65,"[{""timestamp"": [0.0, 3.6], ""text"": ""\u65e2\u71...","[{""timestamp"": [0.0, 3.6], ""text"": ""\u65e2\u71...","[{""timestamp"": [0.0, 3.6], ""text"": ""\u65e2\u71...","[{""timestamp"": [0.0, 3.6], ""text"": ""\u65e2\u71...","[{""timestamp"": [0.0, 3.6], ""text"": "" \u65e2\u7...",/home/ec2-user/SageMaker/efs/Projects/whisper/...


In [12]:
# Score every model on the SAME utterances: keep only the rows for which all
# models produced a non-empty prediction. Previously `df` was re-filtered
# inside the per-model loop, so each model was scored on a different subset
# that depended on the order of `model_ids`.
# The fill is limited to the prediction columns; a whole-frame fillna("")
# would turn numeric columns containing NaN into object dtype.
clean_pred_cols = [f"pred-{model_id['label']}-clean" for model_id in model_ids]
has_all_predictions = df[clean_pred_cols].fillna("").ne("").all(axis=1)

# `df` stays filtered, as before, so the statistics cells that follow work on
# the same rows as the corpus-level scores.
df = df[has_all_predictions]

references = list(df["true-clean"])
cers = []
for model_id in model_ids:
    label = model_id['label']
    # Passing lists makes jiwer compute one corpus-level rate for the model.
    cer = jiwer.cer(references, list(df[f"pred-{label}-clean"]))
    cers.append({'model': label, 'cer': cer})

df_cers = pd.DataFrame(cers)
os.makedirs("outputs", exist_ok=True)  # the export fails if the folder is missing
df_cers.to_csv(f"outputs/cers_{dataset_name}.csv", index=False)
# Show the full summary; head() would hide models beyond the fifth.
df_cers

Unnamed: 0,model,cer
0,v5-50,0.176541
1,v5-300,0.951884
2,v6-40,0.134075
3,v6-200,0.2125
4,large-v3,0.213699


In [13]:
# Distribution of the per-utterance CER for each model, exported and displayed.
cer_percentiles = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]
df_cer_stats = df.loc[:, cer_cols].describe(percentiles=cer_percentiles)
df_cer_stats.to_csv(f"outputs/cer_stats_{dataset_name}.csv")
df_cer_stats

Unnamed: 0,cer-v5-50,cer-v5-300,cer-v6-40,cer-v6-200,cer-large-v3
count,507.0,507.0,507.0,507.0,507.0
mean,0.185161,0.930128,0.148618,0.203069,0.232897
std,0.256432,2.403695,0.190303,0.564116,0.24548
min,0.0,0.0,0.0,0.0,0.0
50%,0.105263,0.2,0.1,0.125,0.181818
60%,0.166667,0.285714,0.153846,0.166667,0.230769
70%,0.222222,0.5,0.182955,0.214286,0.272727
80%,0.307692,0.897778,0.25,0.25,0.360909
90%,0.5,2.67,0.333333,0.333333,0.5
95%,0.75,4.171429,0.5,0.5,0.739286


In [14]:
# Distribution of the per-utterance transcription time for each model,
# exported and displayed.
time_percentiles = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]
df_time_stats = df.loc[:, time_cols].describe(percentiles=time_percentiles)
df_time_stats.to_csv(f"outputs/time_stats_{dataset_name}.csv")
df_time_stats

Unnamed: 0,time-v5-50,time-v5-300,time-v6-40,time-v6-200,time-large-v3
count,507.0,507.0,507.0,507.0,507.0
mean,0.626036,0.92,0.617988,0.642189,0.550513
std,0.239035,0.89798,0.202201,0.328328,0.204968
min,0.32,0.34,0.31,0.32,0.27
50%,0.56,0.57,0.56,0.56,0.48
60%,0.6,0.61,0.59,0.596,0.53
70%,0.68,0.7,0.672,0.68,0.6
80%,0.77,0.838,0.77,0.78,0.7
90%,0.91,3.21,0.92,0.904,0.834
95%,1.01,3.27,1.037,1.03,0.947


# Calculate the statistics of the dataset

In [15]:
# import pandas as pd

# path = "data/cv-corpus-17.0-2024-03-15/zh-HK/clip_durations.tsv"

# df = pd.read_csv(path, sep='\t')
# df.head()

In [16]:
# df['duration[ms]'].describe()