# Some configurations

In [16]:
import warnings
warnings.filterwarnings('ignore')

# Load test data

## media's data

In [19]:
import os

test_data_dir = "data/test2/录音"

filenames = [filename for filename in os.listdir(test_data_dir) if filename.endswith('.amr')]
print(len(filenames))
print(filenames[:5])

531
['1c84d346-ef3f-4cd8-85eb-4e15e9f07d04.amr', '23ce50a0-36ea-47fa-bfd5-13283d071fbd.amr', '4b7aa1f7-890d-4d5f-892d-882c73107adf.amr', '4d17faad-85bf-4b3b-b152-3b9a788cfa3e.amr', 'bfd0f17f-c8cd-4b33-9406-2cd40e92a5fd.amr']


## Common voice 17.0

In [20]:
# import os
# import pandas as pd

# test_data_dir = "data/cv-corpus-17.0-2024-03-15/zh-HK/clips"
# test_data_path = "data/cv-corpus-17.0-2024-03-15/zh-HK/test.tsv"

# test_data = pd.read_csv(test_data_path, sep='\t')
# print(test_data.shape)
# test_data.head()

In [21]:
# num_test = 10000
# filenames = test_data['path'][:num_test]
# filenames[:5]

# Test whisper models

In [22]:
## old version
# import whisper

# model = whisper.load_model("base")

# audio_path = "data/test/f57c791f-074f-4706-94ca-a9b4bcab7ce4.amr"

# # load audio and pad/trim it to fit 30 seconds
# audio = whisper.load_audio(audio_path)
# audio = whisper.pad_or_trim(audio)

# # make log-Mel spectrogram and move to the same device as the model
# mel = whisper.log_mel_spectrogram(audio).to(model.device)

# # detect the spoken language
# _, probs = model.detect_language(mel)
# print(f"Detected language: {max(probs, key=probs.get)}")

# # decode the audio
# options = whisper.DecodingOptions()
# result = whisper.decode(model, mel, options)

# # print the recognized text
# print(result.text)

# def whisper_decode(path):
#     # load audio and pad/trim it to fit 30 seconds
#     audio = whisper.load_audio(path)
#     audio = whisper.pad_or_trim(audio)

#     # make log-Mel spectrogram and move to the same device as the model
#     mel = whisper.log_mel_spectrogram(audio).to(model.device)

#     # detect the spoken language
#     _, probs = model.detect_language(mel)
#     # print(f"Detected language: {max(probs, key=probs.get)}")

#     # decode the audio
#     options = whisper.DecodingOptions()
#     result = whisper.decode(model, mel, options)

#     # print the recognized text
#     # print(result.text)
    
#     return {'text': result.text, 'language': max(probs, key=probs.get)}

In [23]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
print(f"device: {device}")

def initialize_pipe(model_id):
    
    model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="flash_attention_2") # use_flash_attention_2=True) # 

    model.to(device)

    if hasattr(model.generation_config, "no_timestamps_token_id"):
        return_timestamps = True
    else:
        return_timestamps = False
        
    processor = AutoProcessor.from_pretrained(model_id)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=16,
        return_timestamps=return_timestamps,
        torch_dtype=torch_dtype,
        device=device,
    )
    
    return pipe

device: cuda:0


# Test and compare different versions

In [113]:
df = pd.read_csv("./data/test2/语料(1).csv")
df.rename(columns={"content": "true", "record_file_id": "file_id"}, inplace=True)
df.head()

Unnamed: 0,category_name,true,user_id,file_id
0,Model,CMSN 20 si,wuzy37,8deb96cd-1939-46d0-8f87-5bd798ade923.amr
1,Model,CMSRO 20di cr,wuzy37,09b311d8-c557-4bfb-bac5-f0d18a5da8e9.amr
2,Model,CMSN 20 si,ex_xietian2,6ae92f35-3139-417c-b0f4-d30ac25b3937.amr
3,Model,CMSRO 20di cr,ex_xietian2,3b65f69e-5aea-4211-acaf-b7bb2f651755.amr
4,Model,CMSRO 20di rd,ex_xietian2,c40afd4e-dafb-40a5-8c6e-ce9283a33278.amr


In [114]:
import json
from faster_whisper import WhisperModel

model_ids = [
    {"label": "small-c", "model_path": "/home/ec2-user/SageMaker/efs/Models/whisper-small-cantonese"},
    {"label": "medium", "model_path": "/home/ec2-user/SageMaker/efs/Models/whisper-medium"},
    {"label": "large-v3", "model_path": "/home/ec2-user/SageMaker/efs/Models/whisper-large-v3"},  
    {"label": "faster-large-v3", "model_path": "/home/ec2-user/SageMaker/efs/Models/faster-whisper-large-v3"},  
]

sample = os.path.join(test_data_dir, filenames[0])

results = []

for model_id in model_ids:
    label = model_id["label"]
    model_path = model_id["model_path"]
    
    if 'faster' in label: 
        model_faster = WhisperModel(model_path)
        segments, info = model_faster.transcribe(sample, language="yue") # 
        pred = ''.join([segment.text for segment in segments])
        print(f"model label: {label}, result: {pred}")
    else:
        pipe = initialize_pipe(model_path)
        result = pipe(sample)
        print(f"model label: {label}, result: {result['text']}")
    
    for idx, row in tqdm(df.iterrows(), total=len(filenames)):
        filename = row['file_id']
        filepath = os.path.join(test_data_dir, filename)
        
        if not os.path.exists(filepath):
            continue            
        
        start_time = time.time()
        if label == "large-v3":
            result = pipe(filepath, generate_kwargs={"language": "cantonese"})
        elif label == "faster-large-v3":
            segments, info = model_faster.transcribe(sample, language="yue") # 
            result = {'text': ''.join([segment.text for segment in segments])}
        else:
            result = pipe(filepath)
        
        time_cost = time.time() - start_time
        # result['filename'] = filename
        # result['time_cost'] = round(time_cost, 2)
        # results.append(result)
        
        df.loc[idx, f"pred-{label}"] = result['text']
        df.loc[idx, f"time-{label}"] = round(time_cost, 2)
        df.loc[idx, f"chunks-{label}"] = json.dumps(result.get("chunks", ""))
        

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model label: small-c, result: GR-L40H


100%|██████████| 531/531 [02:14<00:00,  3.93it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


model label: medium, result:  gr-l40h-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l-l


 13%|█▎        | 67/531 [00:26<03:15,  2.38it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 32%|███▏      | 168/531 [01:09<02:24,  2.51it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 37%|███▋      | 196/531 [01:23<02:18,  2.42it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 37%|███▋      | 199/531 [01:26<03:55,  1.41it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 45%|████▍     | 238/531 [01:44<01:51,  2.64it/s]Whisper did not predict an e

model label: large-v3, result: GR-L420H


 74%|███████▍  | 394/531 [02:57<00:51,  2.67it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 78%|███████▊  | 416/531 [03:09<00:52,  2.20it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
100%|██████████| 531/531 [04:00<00:00,  2.20it/s]


model label: faster-large-v3, result:  GR-L420H


100%|██████████| 531/531 [06:57<00:00,  1.27it/s]


In [115]:
df.head()

Unnamed: 0,category_name,true,user_id,file_id,pred-small-c,time-small-c,chunks-small-c,pred-medium,time-medium,chunks-medium,pred-large-v3,time-large-v3,chunks-large-v3,pred-faster-large-v3,time-faster-large-v3,chunks-faster-large-v3
0,Model,CMSN 20 si,wuzy37,8deb96cd-1939-46d0-8f87-5bd798ade923.amr,你嘅咪醒呀你先啦,0.24,"""""",CNS and audience.,0.33,"[{""timestamp"": [0.0, 3.08], ""text"": "" CNS and ...",新年快樂,0.35,"[{""timestamp"": [0.0, 3.1], ""text"": "" \u65b0\u5...",GR-L420H,0.79,""""""
1,Model,CMSRO 20di cr,wuzy37,09b311d8-c557-4bfb-bac5-f0d18a5da8e9.amr,I'm going to make a cake with a lot of strawb...,0.28,"""""",Thank you.,0.3,"[{""timestamp"": [0.0, 7.28], ""text"": "" Thank yo...",《天使的名字》,0.42,"[{""timestamp"": [0.0, 7.78], ""text"": "" \u300a\u...",GR-L420H,0.78,""""""
2,Model,CMSN 20 si,ex_xietian2,6ae92f35-3139-417c-b0f4-d30ac25b3937.amr,錯失,0.16,"""""",撤出,0.29,"[{""timestamp"": [0.0, 1.0], ""text"": ""\u64a4\u51...",測試,0.32,"[{""timestamp"": [0.0, 0.96], ""text"": "" \u6e2c\u...",GR-L420H,0.78,""""""
3,Model,CMSRO 20di cr,ex_xietian2,3b65f69e-5aea-4211-acaf-b7bb2f651755.amr,CNSR Office DICR,0.23,"""""",CNSRO-20DICR,0.39,"[{""timestamp"": [0.0, 5.0], ""text"": "" CNSRO-20D...",CNSRO20DICR,0.4,"[{""timestamp"": [0.0, 5.28], ""text"": "" CNSRO20D...",GR-L420H,0.78,""""""
4,Model,CMSRO 20di rd,ex_xietian2,c40afd4e-dafb-40a5-8c6e-ce9283a33278.amr,自動化車是先你手和你腳合地,0.27,"""""",自动化测试CNHR520DR2D,0.54,"[{""timestamp"": [0.0, 5.0], ""text"": ""\u81ea\u52...",自動化測試CNNX-R5-20DR2D,0.68,"[{""timestamp"": [0.0, 4.94], ""text"": "" \u81ea\u...",GR-L420H,0.78,""""""


In [116]:
new_columns = ['true', 'true-clean']
part1, part2 = [], []
for model_id in model_ids:
    label = model_id['label']
    part1.extend([f'pred-{label}',f'pred-{label}-clean'])
    part2.append(f'time-{label}')

new_columns.extend(part1)
new_columns.extend(part2)
new_columns.extend(['file_id', 'user_id', 'category_name'])

for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    
    true_clean = normalizer(row['true'])
    df.loc[i, 'true-clean'] = true_clean
    
    for model_id in model_ids:
        label = model_id['label']
        pred_clean = normalizer(row[f'pred-{label}'])
        df.loc[i, f'pred-{label}-clean'] = pred_clean
        
df = df[new_columns]
df = df.fillna("")
df.to_csv("outputs/transcripts.csv", index=False)
df.head(10)

100%|██████████| 531/531 [00:00<00:00, 1267.98it/s]


Unnamed: 0,true,true-clean,pred-small-c,pred-small-c-clean,pred-medium,pred-medium-clean,pred-large-v3,pred-large-v3-clean,pred-faster-large-v3,pred-faster-large-v3-clean,time-small-c,time-medium,time-large-v3,time-faster-large-v3,file_id,user_id,category_name
0,CMSN 20 si,cmsn20si,你嘅咪醒呀你先啦,你嘅咪醒呀你先啦,CNS and audience.,cnsandaudience.,新年快樂,新年快樂,GR-L420H,gr-l420h,0.24,0.33,0.35,0.79,8deb96cd-1939-46d0-8f87-5bd798ade923.amr,wuzy37,Model
1,CMSRO 20di cr,cmsro20dicr,I'm going to make a cake with a lot of strawb...,i'mgoingtomakeacakewithalotofstrawberries,Thank you.,thankyou.,《天使的名字》,《天使的名字》,GR-L420H,gr-l420h,0.28,0.3,0.42,0.78,09b311d8-c557-4bfb-bac5-f0d18a5da8e9.amr,wuzy37,Model
2,CMSN 20 si,cmsn20si,錯失,錯失,撤出,撤出,測試,測試,GR-L420H,gr-l420h,0.16,0.29,0.32,0.78,6ae92f35-3139-417c-b0f4-d30ac25b3937.amr,ex_xietian2,Model
3,CMSRO 20di cr,cmsro20dicr,CNSR Office DICR,cnsrofficedicr,CNSRO-20DICR,cnsro-20dicr,CNSRO20DICR,cnsro20dicr,GR-L420H,gr-l420h,0.23,0.39,0.4,0.78,3b65f69e-5aea-4211-acaf-b7bb2f651755.amr,ex_xietian2,Model
4,CMSRO 20di rd,cmsro20dird,自動化車是先你手和你腳合地,自動化車是先你手和你腳合地,自动化测试CNHR520DR2D,自动化测试cnhr520dr2d,自動化測試CNNX-R5-20DR2D,自動化測試cnnx-r5-20dr2d,GR-L420H,gr-l420h,0.27,0.54,0.68,0.78,c40afd4e-dafb-40a5-8c6e-ce9283a33278.amr,ex_xietian2,Model
5,AF-52CS1TRHK(H),af-52cs1trhk(h),HFWK我CSE TIRHK,hfwk我csetirhk,AF-52CSE TRHK,af-52csetrhk,AF-52CSE TRH Key,af-52csetrhkey,GR-L420H,gr-l420h,0.26,0.39,0.42,0.78,8ab13ab7-a573-4ac1-b1bf-0b229a19cca6.amr,ex_xietian2,Model
6,CMSN 20 si,cmsn20si,CMSN20SI,cmsn20si,CMSN20SI,cmsn20si,CMSN20SI,cmsn20si,GR-L420H,gr-l420h,0.19,0.33,0.32,0.79,2956c101-d82f-4f2c-8f1b-6119fa87694d.amr,yeungchimkuen,Model
7,CMSRO 20di cr,cmsro20dicr,CMSRO20DICR,cmsro20dicr,CMSRO20DICR,cmsro20dicr,CMSRO20DICR,cmsro20dicr,GR-L420H,gr-l420h,0.21,0.36,0.36,0.79,c124f9b4-38e1-4cee-8b41-49f57f069806.amr,yeungchimkuen,Model
8,CMSRO 20di rd,cmsro20dird,CMSRO20DIRD,cmsro20dird,CMSRO20DIRD,cmsro20dird,CMSRO20-DIRD,cmsro20-dird,GR-L420H,gr-l420h,0.21,0.36,0.39,0.78,17f3f07c-eca6-41d8-a3e4-9c0120595eb3.amr,yeungchimkuen,Model
9,AF-52CS1TRHK(H),af-52cs1trhk(h),ASHeven 與ECS-一TRHKQH,asheven與ecs-一trhkqh,AS-52CS-1TR-HKQ-H,as-52cs-1tr-hkq-h,AS-52CS1TRHKQH,as-52cs1trhkqh,GR-L420H,gr-l420h,0.28,0.48,0.47,0.78,5102309c-8fb3-4232-b257-2bd8ed6ef53e.amr,yeungchimkuen,Model


In [117]:
df = df.fillna("")
cers = []
time_columns = []
for model_id in model_ids:
    label = model_id['label']
    df = df[df[f"pred-{label}-clean"]!='']
    cer = jiwer.cer(list(df["true-clean"]), list(df[f"pred-{label}-clean"]))
    
    cers.append({'model': label, 'cer': cer})
    time_columns.append(f"time-{label}")

df_cers = pd.DataFrame(cers)
df_cers.to_csv("outputs/cers.csv", index=False)
df_cers.head()

Unnamed: 0,model,cer
0,small-c,0.678737
1,medium,0.524964
2,large-v3,0.339927
3,faster-large-v3,0.784176


In [118]:
df_time = df[time_columns].describe(percentiles=[0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99])
df_time.to_csv("outputs/time_stats.csv")
df_time.head()

Unnamed: 0,time-small-c,time-medium,time-large-v3,time-faster-large-v3
count,531.0,531.0,531.0,531.0
mean,0.251846,0.430245,0.451601,0.784652
std,0.034003,0.206499,0.178088,0.010169
min,0.16,0.27,0.29,0.77
50%,0.25,0.41,0.44,0.78


## whisper small

In [None]:
model_id = "/home/ec2-user/SageMaker/efs/Models/whisper-small-cantonese"

pipe = initialize_pipe(model_id)

sample = os.path.join(test_data_dir, filenames[0])

result = pipe(sample)
print(result["text"])

In [None]:
import pandas as pd
from tqdm import tqdm
import time

results = []

for filename in tqdm(filenames, total=len(filenames)):
    start_time = time.time()
    result = pipe(os.path.join(test_data_dir, filename)) # , generate_kwargs={"language": "chinese"})
    time_cost = time.time() - start_time
    result['filename'] = filename
    result['time_cost'] = round(time_cost, 2)
    results.append(result)

In [None]:
df_pred_small = pd.DataFrame(results)
df_pred_small.rename(columns={'text': 'pred_small', 'chunks': 'chunks_small', 'time_cost': 'time_cost_small'}, inplace=True)
df_pred_small.head()

## whisper medium 

In [61]:
# model_id = "/home/ec2-user/SageMaker/efs/Models/whisper-medium"
model_id = "/home/ec2-user/SageMaker/efs/Models/whisper-small-cantonese"

pipe = initialize_pipe(model_id)

sample = os.path.join(test_data_dir, filenames[0])

result = pipe(sample)
print(result["text"])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


GR-L40H


In [62]:
import pandas as pd
from tqdm import tqdm
import time

results = []

for filename in tqdm(filenames, total=len(filenames)):
    start_time = time.time()
    result = pipe(os.path.join(test_data_dir, filename)) # , generate_kwargs={"language": "chinese"})
    time_cost = time.time() - start_time
    result['filename'] = filename
    result['time_cost'] = round(time_cost, 2)
    results.append(result)

100%|██████████| 531/531 [02:14<00:00,  3.96it/s]


In [63]:
df_pred_medium = pd.DataFrame(results)
df_pred_medium.rename(columns={'text': 'pred_medium', 'chunks': 'chunks_medium', 'time_cost': 'time_cost_medium'}, inplace=True)
df_pred_medium.head()

Unnamed: 0,pred_medium,filename,time_cost_medium
0,GR-L40H,1c84d346-ef3f-4cd8-85eb-4e15e9f07d04.amr,0.19
1,ER海份SGS204HKG,23ce50a0-36ea-47fa-bfd5-13283d071fbd.amr,0.24
2,尼島區,4b7aa1f7-890d-4d5f-892d-882c73107adf.amr,0.18
3,BW哈神BH85SCH1酷WK,4d17faad-85bf-4b3b-b152-3b9a788cfa3e.amr,0.28
4,九龍城區,bfd0f17f-c8cd-4b33-9406-2cd40e92a5fd.amr,0.18


## whisper large

In [27]:
model_id = "/home/ec2-user/SageMaker/efs/Models/whisper-large-v3"

pipe = initialize_pipe(model_id)

sample = os.path.join(test_data_dir, filenames[0])

result = pipe(sample)
print(result["text"])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


GR-L420H


In [28]:
import pandas as pd
from tqdm import tqdm
import time

results_large = []

for filename in tqdm(filenames, total=len(filenames)):
    start_time = time.time()
    result = pipe(os.path.join(test_data_dir, filename), generate_kwargs={"language": "cantonese"})
    time_cost = time.time() - start_time
    result['filename'] = filename
    result['time_cost'] = round(time_cost, 2)
    results_large.append(result)
    
df_pred_large = pd.DataFrame(results_large)
df_pred_large.rename(columns={'text': 'pred_large', 'chunks': 'chunks_large', 'time_cost': 'time_cost_large'}, inplace=True)
df_pred_large.head()

 27%|██▋       | 141/531 [01:01<02:43,  2.39it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
 52%|█████▏    | 278/531 [02:04<02:00,  2.10it/s]Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
100%|██████████| 531/531 [03:58<00:00,  2.22it/s]


Unnamed: 0,pred_large,chunks_large,filename,time_cost_large
0,GR-L420H,"[{'timestamp': (0.0, 2.6), 'text': ' GR-L420H'}]",1c84d346-ef3f-4cd8-85eb-4e15e9f07d04.amr,0.41
1,er-sgs20-whkg,"[{'timestamp': (0.0, 4.16), 'text': ' er-sgs20...",23ce50a0-36ea-47fa-bfd5-13283d071fbd.amr,0.42
2,離島區,"[{'timestamp': (0.0, 1.76), 'text': ' 離島區'}]",4b7aa1f7-890d-4d5f-892d-882c73107adf.amr,0.33
3,字幕提供者 李慧琼,"[{'timestamp': (0.0, 4.48), 'text': ' 字幕提供者 李慧...",4d17faad-85bf-4b3b-b152-3b9a788cfa3e.amr,0.52
4,九龍城區,"[{'timestamp': (0.0, 1.92), 'text': ' 九龍城區'}]",bfd0f17f-c8cd-4b33-9406-2cd40e92a5fd.amr,0.34


In [29]:
## convert whisper-large-v3 to faster-whisper-large-v3

# !ct2-transformers-converter --model openai/whisper-large-v3 --output_dir faster-whisper-large-v3 \
#     --copy_files tokenizer.json preprocessor_config.json --quantization float16

## faster_whisper_v3

In [30]:
from faster_whisper import WhisperModel

model_id_faster = "/home/ec2-user/SageMaker/efs/Models/faster-whisper-large-v3"
model_faster = WhisperModel(model_id_faster)

segments, info = model_faster.transcribe(sample, language="yue") # 
print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
    print()

Detected language 'yue' with probability 1.000000
[0.00s -> 2.60s]  GR-L420H



In [31]:
import time

results = []

for filename in tqdm(filenames, total=len(filenames)):
    result = {}
    start_time = time.time()
    segments, info = model_faster.transcribe(os.path.join(test_data_dir, filename), language="yue")
    result['pred_faster'] = ''.join([segment.text for segment in segments])
    time_cost = time.time() - start_time
    result['filename'] = filename
    result['time_cost_faster'] = round(time_cost, 2)
    results.append(result)

df_pred_faster = pd.DataFrame(results)
df_pred_faster.head()

100%|██████████| 531/531 [04:25<00:00,  2.00it/s]


Unnamed: 0,pred_faster,filename,time_cost_faster
0,GR-L420H,1c84d346-ef3f-4cd8-85eb-4e15e9f07d04.amr,0.41
1,ER-SGS20庫WHKG,23ce50a0-36ea-47fa-bfd5-13283d071fbd.amr,0.48
2,離島區,4b7aa1f7-890d-4d5f-892d-882c73107adf.amr,0.39
3,WWHBH85S2H1QWK,4d17faad-85bf-4b3b-b152-3b9a788cfa3e.amr,0.49
4,九龍城區,bfd0f17f-c8cd-4b33-9406-2cd40e92a5fd.amr,0.4


## combine all true and pred results

In [64]:
df_true = pd.read_csv("./data/test2/语料(1).csv")
df_all = pd.merge(df_true, df_pred_medium, left_on='record_file_id', right_on='filename')

# df_true = pd.read_csv(test_data_path, sep='\t')
# df_true = df_true[:num_test]
# df_all = pd.merge(df_true, df_pred_medium, left_on='path', right_on='filename')

In [65]:
df_all.shape

(531, 7)

In [66]:
df_all.head(10)

Unnamed: 0,category_name,content,user_id,record_file_id,pred_medium,filename,time_cost_medium
0,Model,CMSN 20 si,wuzy37,8deb96cd-1939-46d0-8f87-5bd798ade923.amr,你嘅咪醒呀你先啦,8deb96cd-1939-46d0-8f87-5bd798ade923.amr,0.24
1,Model,CMSRO 20di cr,wuzy37,09b311d8-c557-4bfb-bac5-f0d18a5da8e9.amr,I'm going to make a cake with a lot of strawb...,09b311d8-c557-4bfb-bac5-f0d18a5da8e9.amr,0.28
2,Model,CMSN 20 si,ex_xietian2,6ae92f35-3139-417c-b0f4-d30ac25b3937.amr,錯失,6ae92f35-3139-417c-b0f4-d30ac25b3937.amr,0.16
3,Model,CMSRO 20di cr,ex_xietian2,3b65f69e-5aea-4211-acaf-b7bb2f651755.amr,CNSR Office DICR,3b65f69e-5aea-4211-acaf-b7bb2f651755.amr,0.22
4,Model,CMSRO 20di rd,ex_xietian2,c40afd4e-dafb-40a5-8c6e-ce9283a33278.amr,自動化車是先你手和你腳合地,c40afd4e-dafb-40a5-8c6e-ce9283a33278.amr,0.27
5,Model,AF-52CS1TRHK(H),ex_xietian2,8ab13ab7-a573-4ac1-b1bf-0b229a19cca6.amr,HFWK我CSE TIRHK,8ab13ab7-a573-4ac1-b1bf-0b229a19cca6.amr,0.25
6,Model,CMSN 20 si,yeungchimkuen,2956c101-d82f-4f2c-8f1b-6119fa87694d.amr,CMSN20SI,2956c101-d82f-4f2c-8f1b-6119fa87694d.amr,0.19
7,Model,CMSRO 20di cr,yeungchimkuen,c124f9b4-38e1-4cee-8b41-49f57f069806.amr,CMSRO20DICR,c124f9b4-38e1-4cee-8b41-49f57f069806.amr,0.2
8,Model,CMSRO 20di rd,yeungchimkuen,17f3f07c-eca6-41d8-a3e4-9c0120595eb3.amr,CMSRO20DIRD,17f3f07c-eca6-41d8-a3e4-9c0120595eb3.amr,0.22
9,Model,AF-52CS1TRHK(H),yeungchimkuen,5102309c-8fb3-4232-b257-2bd8ed6ef53e.amr,ASHeven 與ECS-一TRHKQH,5102309c-8fb3-4232-b257-2bd8ed6ef53e.amr,0.29


In [67]:
df_all = pd.merge(df_all, df_pred_large, left_on='filename', right_on='filename')
df_all.head()

Unnamed: 0,category_name,content,user_id,record_file_id,pred_medium,filename,time_cost_medium,pred_large,chunks_large,time_cost_large
0,Model,CMSN 20 si,wuzy37,8deb96cd-1939-46d0-8f87-5bd798ade923.amr,你嘅咪醒呀你先啦,8deb96cd-1939-46d0-8f87-5bd798ade923.amr,0.24,新年快樂,"[{'timestamp': (0.0, 3.1), 'text': ' 新年快樂'}]",0.34
1,Model,CMSRO 20di cr,wuzy37,09b311d8-c557-4bfb-bac5-f0d18a5da8e9.amr,I'm going to make a cake with a lot of strawb...,09b311d8-c557-4bfb-bac5-f0d18a5da8e9.amr,0.28,《天使的名字》,"[{'timestamp': (0.0, 7.78), 'text': ' 《天使的名字》'}]",0.41
2,Model,CMSN 20 si,ex_xietian2,6ae92f35-3139-417c-b0f4-d30ac25b3937.amr,錯失,6ae92f35-3139-417c-b0f4-d30ac25b3937.amr,0.16,測試,"[{'timestamp': (0.0, 0.96), 'text': ' 測試'}]",0.31
3,Model,CMSRO 20di cr,ex_xietian2,3b65f69e-5aea-4211-acaf-b7bb2f651755.amr,CNSR Office DICR,3b65f69e-5aea-4211-acaf-b7bb2f651755.amr,0.22,CNSRO20DICR,"[{'timestamp': (0.0, 5.28), 'text': ' CNSRO20D...",0.39
4,Model,CMSRO 20di rd,ex_xietian2,c40afd4e-dafb-40a5-8c6e-ce9283a33278.amr,自動化車是先你手和你腳合地,c40afd4e-dafb-40a5-8c6e-ce9283a33278.amr,0.27,自動化測試CNNX-R5-20DR2D,"[{'timestamp': (0.0, 4.94), 'text': ' 自動化測試CNN...",0.66


In [68]:
df_all = pd.merge(df_all, df_pred_faster, left_on='filename', right_on='filename')
df_all.head()

Unnamed: 0,category_name,content,user_id,record_file_id,pred_medium,filename,time_cost_medium,pred_large,chunks_large,time_cost_large,pred_faster,time_cost_faster
0,Model,CMSN 20 si,wuzy37,8deb96cd-1939-46d0-8f87-5bd798ade923.amr,你嘅咪醒呀你先啦,8deb96cd-1939-46d0-8f87-5bd798ade923.amr,0.24,新年快樂,"[{'timestamp': (0.0, 3.1), 'text': ' 新年快樂'}]",0.34,新年快樂 新年快樂,1.27
1,Model,CMSRO 20di cr,wuzy37,09b311d8-c557-4bfb-bac5-f0d18a5da8e9.amr,I'm going to make a cake with a lot of strawb...,09b311d8-c557-4bfb-bac5-f0d18a5da8e9.amr,0.28,《天使的名字》,"[{'timestamp': (0.0, 7.78), 'text': ' 《天使的名字》'}]",0.41,浅水湖 浅水湖 浅水湖 浅水湖 浅水湖,6.83
2,Model,CMSN 20 si,ex_xietian2,6ae92f35-3139-417c-b0f4-d30ac25b3937.amr,錯失,6ae92f35-3139-417c-b0f4-d30ac25b3937.amr,0.16,測試,"[{'timestamp': (0.0, 0.96), 'text': ' 測試'}]",0.31,測試,0.39
3,Model,CMSRO 20di cr,ex_xietian2,3b65f69e-5aea-4211-acaf-b7bb2f651755.amr,CNSR Office DICR,3b65f69e-5aea-4211-acaf-b7bb2f651755.amr,0.22,CNSRO20DICR,"[{'timestamp': (0.0, 5.28), 'text': ' CNSRO20D...",0.39,CNSRO20DICR,0.44
4,Model,CMSRO 20di rd,ex_xietian2,c40afd4e-dafb-40a5-8c6e-ce9283a33278.amr,自動化車是先你手和你腳合地,c40afd4e-dafb-40a5-8c6e-ce9283a33278.amr,0.27,自動化測試CNNX-R5-20DR2D,"[{'timestamp': (0.0, 4.94), 'text': ' 自動化測試CNN...",0.66,自動化測試CNNX-R5-20DR2D,0.55


# Calculate Character Error Rate (CER)

In [69]:
import re

class ChineseTextNormalizer:
    def __init__(self, split_letters: bool = False):

        self.split_letters = split_letters

    def __call__(self, s: str):
        s = s.lower()
        # s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
        # s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
        # s = self.clean(s).lower()

        # if self.split_letters:
        #     s = " ".join(regex.findall(r"\X", s, regex.U))

        s = re.sub(
            r"\s+", "", s
        )  # remove any successive whitespace characters
        
        return s

In [70]:
import jiwer
from whisper.normalizers import BasicTextNormalizer, EnglishTextNormalizer

# normalizer = EnglishTextNormalizer()
# normalizer = BasicTextNormalizer()
normalizer = ChineseTextNormalizer()

In [72]:
for i, row in tqdm(df_all.iterrows(), total=df_all.shape[0]):
    
    content_clean = normalizer(row['content'])
    # content_clean = normalizer(row['sentence'])
    pred_medium_clean = normalizer(row['pred_medium'])
    pred_large_clean = normalizer(row['pred_large'])
    pred_faster_clean = normalizer(row['pred_faster'])
    
    df_all.loc[i, 'content_clean'] = content_clean
    df_all.loc[i, 'pred_medium_clean'] = pred_medium_clean
    df_all.loc[i, 'pred_large_clean'] = pred_large_clean
    df_all.loc[i, 'pred_faster_clean'] = pred_faster_clean

    # 'chunks_medium',
new_columns = ['category_name', 'user_id', 'content', 'content_clean', 'pred_medium', 'pred_medium_clean', 'pred_large', 'pred_large_clean', 'pred_faster', 'pred_faster_clean', 'time_cost_medium', 'time_cost_large', 'time_cost_faster', 'chunks_large', 'record_file_id',  'filename']
# new_columns = ['sentence', 'content_clean', 'pred_medium', 'pred_medium_clean', 'pred_large', 'pred_large_clean', 'pred_faster', 'pred_faster_clean', 'time_cost_medium', 'time_cost_large', 'time_cost_faster', 'filename']

df_all = df_all[new_columns]
df_all.head(10)

100%|██████████| 531/531 [00:00<00:00, 1756.14it/s]


Unnamed: 0,category_name,user_id,content,content_clean,pred_medium,pred_medium_clean,pred_large,pred_large_clean,pred_faster,pred_faster_clean,time_cost_medium,time_cost_large,time_cost_faster,chunks_large,record_file_id,filename
0,Model,wuzy37,CMSN 20 si,cmsn20si,你嘅咪醒呀你先啦,你嘅咪醒呀你先啦,新年快樂,新年快樂,新年快樂 新年快樂,新年快樂新年快樂,0.24,0.34,1.27,"[{'timestamp': (0.0, 3.1), 'text': ' 新年快樂'}]",8deb96cd-1939-46d0-8f87-5bd798ade923.amr,8deb96cd-1939-46d0-8f87-5bd798ade923.amr
1,Model,wuzy37,CMSRO 20di cr,cmsro20dicr,I'm going to make a cake with a lot of strawb...,i'mgoingtomakeacakewithalotofstrawberries,《天使的名字》,《天使的名字》,浅水湖 浅水湖 浅水湖 浅水湖 浅水湖,浅水湖浅水湖浅水湖浅水湖浅水湖,0.28,0.41,6.83,"[{'timestamp': (0.0, 7.78), 'text': ' 《天使的名字》'}]",09b311d8-c557-4bfb-bac5-f0d18a5da8e9.amr,09b311d8-c557-4bfb-bac5-f0d18a5da8e9.amr
2,Model,ex_xietian2,CMSN 20 si,cmsn20si,錯失,錯失,測試,測試,測試,測試,0.16,0.31,0.39,"[{'timestamp': (0.0, 0.96), 'text': ' 測試'}]",6ae92f35-3139-417c-b0f4-d30ac25b3937.amr,6ae92f35-3139-417c-b0f4-d30ac25b3937.amr
3,Model,ex_xietian2,CMSRO 20di cr,cmsro20dicr,CNSR Office DICR,cnsrofficedicr,CNSRO20DICR,cnsro20dicr,CNSRO20DICR,cnsro20dicr,0.22,0.39,0.44,"[{'timestamp': (0.0, 5.28), 'text': ' CNSRO20D...",3b65f69e-5aea-4211-acaf-b7bb2f651755.amr,3b65f69e-5aea-4211-acaf-b7bb2f651755.amr
4,Model,ex_xietian2,CMSRO 20di rd,cmsro20dird,自動化車是先你手和你腳合地,自動化車是先你手和你腳合地,自動化測試CNNX-R5-20DR2D,自動化測試cnnx-r5-20dr2d,自動化測試CNNX-R5-20DR2D,自動化測試cnnx-r5-20dr2d,0.27,0.66,0.55,"[{'timestamp': (0.0, 4.94), 'text': ' 自動化測試CNN...",c40afd4e-dafb-40a5-8c6e-ce9283a33278.amr,c40afd4e-dafb-40a5-8c6e-ce9283a33278.amr
5,Model,ex_xietian2,AF-52CS1TRHK(H),af-52cs1trhk(h),HFWK我CSE TIRHK,hfwk我csetirhk,AF-52CSE TRH Key,af-52csetrhkey,AF-52CSE TRH Key,af-52csetrhkey,0.25,0.41,0.46,"[{'timestamp': (0.0, 5.66), 'text': ' AF-52CSE...",8ab13ab7-a573-4ac1-b1bf-0b229a19cca6.amr,8ab13ab7-a573-4ac1-b1bf-0b229a19cca6.amr
6,Model,yeungchimkuen,CMSN 20 si,cmsn20si,CMSN20SI,cmsn20si,CMSN20SI,cmsn20si,CMSN20SI,cmsn20si,0.19,0.32,0.4,"[{'timestamp': (0.0, 3.08), 'text': ' CMSN20SI'}]",2956c101-d82f-4f2c-8f1b-6119fa87694d.amr,2956c101-d82f-4f2c-8f1b-6119fa87694d.amr
7,Model,yeungchimkuen,CMSRO 20di cr,cmsro20dicr,CMSRO20DICR,cmsro20dicr,CMSRO20DICR,cmsro20dicr,CMSRO20DICR,cmsro20dicr,0.2,0.36,0.42,"[{'timestamp': (0.0, 4.08), 'text': ' CMSRO20D...",c124f9b4-38e1-4cee-8b41-49f57f069806.amr,c124f9b4-38e1-4cee-8b41-49f57f069806.amr
8,Model,yeungchimkuen,CMSRO 20di rd,cmsro20dird,CMSRO20DIRD,cmsro20dird,CMSRO20-DIRD,cmsro20-dird,CMSRO20 DIRD,cmsro20dird,0.22,0.38,0.43,"[{'timestamp': (0.0, 4.02), 'text': ' CMSRO20-...",17f3f07c-eca6-41d8-a3e4-9c0120595eb3.amr,17f3f07c-eca6-41d8-a3e4-9c0120595eb3.amr
9,Model,yeungchimkuen,AF-52CS1TRHK(H),af-52cs1trhk(h),ASHeven 與ECS-一TRHKQH,asheven與ecs-一trhkqh,AS-52CS1TRHKQH,as-52cs1trhkqh,AS-52CS1TRHKQH,as-52cs1trhkqh,0.29,0.47,0.47,"[{'timestamp': (0.0, 6.06), 'text': ' AS-52CS1...",5102309c-8fb3-4232-b257-2bd8ed6ef53e.amr,5102309c-8fb3-4232-b257-2bd8ed6ef53e.amr


In [73]:
df_all.fillna('', inplace=True)
df_all.to_csv("outputs/test2_transcriptions_small.csv", index=False)

In [74]:
cer_medium = jiwer.cer(list(df_all["content_clean"]), list(df_all["pred_medium_clean"]))
cer_large = jiwer.cer(list(df_all["content_clean"]), list(df_all["pred_large_clean"]))
cer_faster = jiwer.cer(list(df_all["content_clean"]), list(df_all["pred_faster_clean"]))

In [75]:
df_cer = pd.DataFrame({"CER_medium": [cer_medium], "CER_large": [cer_large], "CER_faster": [cer_faster]})
df_cer = df_cer *100
df_cer = df_cer.round(2)
df_cer.to_csv("outputs/test2_cer_small.csv", index=False)
df_cer

Unnamed: 0,CER_medium,CER_large,CER_faster
0,67.87,33.99,32.7


In [76]:
df_stats = df_all[['time_cost_medium', 'time_cost_large', 'time_cost_faster']].describe(percentiles=[0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99])
df_stats.to_csv("outputs/test2_time_stats_small.csv")
df_stats

Unnamed: 0,time_cost_medium,time_cost_large,time_cost_faster
count,531.0,531.0,531.0
mean,0.251676,0.449171,0.498569
std,0.033529,0.176234,0.523833
min,0.16,0.29,0.38
50%,0.25,0.44,0.46
60%,0.26,0.45,0.46
70%,0.27,0.46,0.47
80%,0.28,0.49,0.48
90%,0.29,0.51,0.49
95%,0.31,0.55,0.5


# clip_duration analysis

In [None]:
import pandas as pd

path = "data/cv-corpus-17.0-2024-03-15/zh-HK/clip_durations.tsv"

df = pd.read_csv(path, sep='\t')
df.head()

In [None]:
df['duration[ms]'].describe()

In [None]:
path = "data/cv-corpus-17.0-2024-03-15/zh-HK/train.tsv"

df = pd.read_csv(path, sep='\t')
df.head()

In [None]:
df['len'] = df['sentence'].apply(len)
df.head()

In [None]:
df['len'].describe()

In [None]:
23*8

In [47]:
df_all['len'] = df_all['content_clean'].apply(len)
df_all['len'].describe()

count    531.000000
mean      11.806026
std        2.977596
min        2.000000
25%       11.000000
50%       12.000000
75%       13.000000
max       23.000000
Name: len, dtype: float64