In [1]:
import warnings
warnings.filterwarnings('ignore')

# Test whisper models

In [2]:
import torch
from datasets import load_dataset

# Pick the execution target once: half precision on the first CUDA device when
# one is available, otherwise full precision on the CPU.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32
print(f"device: {device}")

def initialize_pipe(model, model_path):
    """Wrap an already-loaded model in an automatic-speech-recognition pipeline.

    Args:
        model: Loaded model object; its ``generation_config`` is inspected and
            the model is handed to ``pipeline`` unchanged.
        model_path: Location passed to ``AutoProcessor.from_pretrained`` to
            obtain the tokenizer and feature extractor.

    Returns:
        The object returned by ``pipeline``, configured with 30 s chunks,
        batch size 16 and at most 128 new tokens, using the module-level
        ``device`` and ``torch_dtype``.
    """
    # Timestamps are requested only when the generation config exposes
    # ``no_timestamps_token_id``.
    return_timestamps = hasattr(model.generation_config, "no_timestamps_token_id")

    processor = AutoProcessor.from_pretrained(model_path)

    pipe_kwargs = dict(
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=16,
        return_timestamps=return_timestamps,
        torch_dtype=torch_dtype,
        device=device,
    )
    return pipeline("automatic-speech-recognition", **pipe_kwargs)

device: cuda:0


# Test and compare different versions

## Common Voice 17.0, zh-HK

In [3]:
# import os
# import pandas as pd

# test_data_dir = "data/cv-corpus-17.0-2024-03-15/zh-HK/clips"
# test_data_path = "data/cv-corpus-17.0-2024-03-15/zh-HK/test.tsv"

# test_data = pd.read_csv(test_data_path, sep='\t')
# print(test_data.shape)
# test_data.head()

# num_test = 10000
# filenames = test_data['path'][:num_test]
# filenames[:5]

## midea data

In [5]:
import pandas as pd
# Directory holding the audio clips referenced by the `file_id` column.
test_data_dir = "data/midea_data_500/clips"

# Load the transcripts and expose the reference text under the name "true",
# which is the column the CER cells read.
df = pd.read_csv("./data/midea_data_500/transcripts.csv").rename(
    columns={"sentence": "true"}
)
df.head()

Unnamed: 0,category_name,true,user_id,file_id
0,Model,CMSN 20 si,wuzy37,8deb96cd-1939-46d0-8f87-5bd798ade923.mp3
1,Model,CMSRO 20di cr,wuzy37,09b311d8-c557-4bfb-bac5-f0d18a5da8e9.mp3
2,Model,CMSN 20 si,ex_xietian2,6ae92f35-3139-417c-b0f4-d30ac25b3937.mp3
3,Model,CMSRO 20di cr,ex_xietian2,3b65f69e-5aea-4211-acaf-b7bb2f651755.mp3
4,Model,CMSRO 20di rd,ex_xietian2,c40afd4e-dafb-40a5-8c6e-ce9283a33278.mp3


## Clear GPU memory

In [6]:
import gc

# Clear the model from GPU memory
def clear_model_from_gpu(model):
    """Release cached GPU memory after a model is no longer needed.

    Args:
        model: The model object to discard. Only this function's local
            reference is dropped; memory held by the model can be reclaimed
            only once the caller has released its own references as well.

    Returns:
        None. Prints a confirmation message when done.
    """
    # Drop the local reference to the model object.
    del model
    # Force garbage collection to free any unreferenced memory.
    gc.collect()
    # The notebook falls back to CPU when CUDA is unavailable; in that case
    # torch.cuda.synchronize() would raise, so only touch CUDA when present.
    if torch.cuda.is_available():
        # Return cached blocks held by the CUDA allocator.
        torch.cuda.empty_cache()
        # Wait for outstanding CUDA work to finish.
        torch.cuda.synchronize()

    print("Model cleared from GPU memory.")

In [15]:
import os
import json
import time
from tqdm import tqdm
from faster_whisper import WhisperModel
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Checkpoints to benchmark. `label` names the result columns
# (pred-/time-/chunks-<label>). Labels containing "faster" are loaded with
# faster-whisper; every other label goes through the transformers pipeline.
model_ids = [
    # {"label": "small-c", "model_path": "/home/ec2-user/SageMaker/efs/Models/whisper-small-cantonese"},
    # {"label": "medium", "model_path": "/home/ec2-user/SageMaker/efs/Models/whisper-medium"},
    # {"label": "faster-large-v3", "model_path": "/home/ec2-user/SageMaker/efs/Models/faster-whisper-large-v3"},
    # {"label": "ckpt-270", "model_path": "/home/ec2-user/SageMaker/efs/Projects/whisper/checkpoint/checkpoint-270"},
    # {"label": "ckpt-675", "model_path": "/home/ec2-user/SageMaker/efs/Projects/whisper/checkpoint/checkpoint-675"},
    {"label": "large-v3", "model_path": "/home/ec2-user/SageMaker/efs/Models/whisper-large-v3"},
]

# Benchmark the first 20 rows only. `.copy()` makes this an independent frame
# so the per-row `.loc` writes below do not target a slice of another frame.
df = df[:20].copy()
sample = os.path.join(test_data_dir, df.loc[0, 'file_id'])

# NOTE(review): `results` is not used anywhere in this cell.
results = []

for model_id in model_ids:
    label = model_id["label"]
    model_path = model_id["model_path"]
    # One flag drives both model loading and per-file transcription, so any
    # "faster-*" label is transcribed by the model that was loaded for it
    # rather than by a pipeline left over from a previous iteration.
    is_faster = 'faster' in label

    # Load the model and print the transcription of one sample as a smoke test.
    if is_faster:
        model = WhisperModel(model_path)
        segments, info = model.transcribe(sample, language="yue")
        pred = ''.join([segment.text for segment in segments])
        print(f"model label: {label}, result: {pred}")
    else:
        model = AutoModelForSpeechSeq2Seq.from_pretrained(model_path, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="flash_attention_2")
        model.to(device)

        pipe = initialize_pipe(model, model_path)
        result = pipe(sample)
        print(f"model label: {label}, result: {result['text']}")

    for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
        filename = row['file_id']
        filepath = os.path.join(test_data_dir, filename)

        # Rows whose audio file is missing are skipped; their result columns
        # keep missing values.
        if not os.path.exists(filepath):
            continue

        start_time = time.time()
        if is_faster:
            # `segments` is consumed inside the timed region by the join.
            segments, info = model.transcribe(filepath, language="yue")
            result = {'text': ''.join([segment.text for segment in segments])}
        elif label == "large-v3":
            # Only the stock large-v3 checkpoint is given an explicit language.
            result = pipe(filepath, generate_kwargs={"language": "cantonese"})
        else:
            result = pipe(filepath)

        time_cost = time.time() - start_time

        df.loc[idx, f"pred-{label}"] = result['text']
        df.loc[idx, f"time-{label}"] = round(time_cost, 2)
        # faster-whisper results carry no "chunks" key, so "" is stored for them.
        df.loc[idx, f"chunks-{label}"] = json.dumps(result.get("chunks", ""))

    # NOTE(review): `model` (and `pipe`, when set) stay bound in this cell
    # until the next iteration rebinds them.
    clear_model_from_gpu(model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model label: large-v3, result:  Thank you.


100%|██████████| 20/20 [00:08<00:00,  2.44it/s]

Model cleared from GPU memory.





In [18]:
# Sanity check: confirm that the sample audio file path exists on disk.
os.path.exists(sample)

True

In [16]:
# Preview the first rows of `df`, including the per-model pred-/time-/chunks- columns.
df.head()

Unnamed: 0,category_name,true,user_id,file_id,pred-large-v3,time-large-v3,chunks-large-v3,pred-ckpt-270,time-ckpt-270,chunks-ckpt-270,pred-ckpt-675,time-ckpt-675,chunks-ckpt-675
0,Model,CMSN 20 si,wuzy37,8deb96cd-1939-46d0-8f87-5bd798ade923.mp3,新年快樂,0.34,"[{""timestamp"": [0.0, 4.0], ""text"": "" \u65b0\u5...",,3.18,[],�,0.28,"[{""timestamp"": [0.0, null], ""text"": ""\ufffd""}]"
1,Model,CMSRO 20di cr,wuzy37,09b311d8-c557-4bfb-bac5-f0d18a5da8e9.mp3,字幕由 Amara.org 社群提供,0.57,"[{""timestamp"": [0.0, 7.88], ""text"": "" \u5b57\u...",,3.1,[],�,0.29,"[{""timestamp"": [0.0, null], ""text"": ""\ufffd""}]"
2,Model,CMSN 20 si,ex_xietian2,6ae92f35-3139-417c-b0f4-d30ac25b3937.mp3,收下,0.28,"[{""timestamp"": [0.0, 1.0], ""text"": "" \u6536\u4...",,3.14,[],�,0.28,"[{""timestamp"": [0.0, null], ""text"": ""\ufffd""}]"
3,Model,CMSRO 20di cr,ex_xietian2,3b65f69e-5aea-4211-acaf-b7bb2f651755.mp3,CNSRO20 DICI,0.38,"[{""timestamp"": [0.0, 5.32], ""text"": "" CNSRO20 ...",,3.15,[],�,0.29,"[{""timestamp"": [0.0, null], ""text"": ""\ufffd""}]"
4,Model,CMSRO 20di rd,ex_xietian2,c40afd4e-dafb-40a5-8c6e-ce9283a33278.mp3,自動化測試CMS好無好你驚好驚,0.62,"[{""timestamp"": [0.0, 5.0], ""text"": "" \u81ea\u5...",,3.12,[],�,0.27,"[{""timestamp"": [0.0, null], ""text"": ""\ufffd""}]"


## CER calculation

In [None]:
import re
import jiwer
from whisper.normalizers import BasicTextNormalizer, EnglishTextNormalizer

class ChineseTextNormalizer:
    """Minimal text normalizer used before computing character error rate.

    Calling an instance lower-cases the text and strips every whitespace
    character. Bracket/parenthesis removal and grapheme splitting are not
    applied; ``split_letters`` is stored but not consulted by ``__call__``.
    """

    # Matches any run of whitespace characters.
    _WHITESPACE = re.compile(r"\s+")

    def __init__(self, split_letters: bool = False):
        self.split_letters = split_letters

    def __call__(self, s: str):
        """Return ``s`` lower-cased with all whitespace removed."""
        lowered = s.lower()
        return self._WHITESPACE.sub("", lowered)


# Alternatives from whisper.normalizers (not used here):
# EnglishTextNormalizer(), BasicTextNormalizer()
normalizer = ChineseTextNormalizer()

In [None]:
# Preview `df` ahead of the CER computation below.
df.head()

In [None]:
# Column order of the exported table: reference text, then per-model
# predictions, CER and timing columns, then the clip metadata.
new_columns = ['true', 'true-clean']
pred_cols, cer_cols, time_cols = [], [], []
for model_id in model_ids:
    label = model_id['label']
    pred_cols.extend([f'pred-{label}', f'pred-{label}-clean'])
    cer_cols.append(f'cer-{label}')
    time_cols.append(f'time-{label}')

new_columns.extend(pred_cols + cer_cols + time_cols)
new_columns.extend(['file_id', 'user_id', 'category_name'])

for i, row in tqdm(df.iterrows(), total=df.shape[0]):

    true_clean = normalizer(row['true'])
    df.loc[i, 'true-clean'] = true_clean

    for model_id in model_ids:
        label = model_id['label']
        pred = row[f'pred-{label}']
        # Rows skipped by the benchmark loop (audio file missing) hold a
        # non-string missing value here; treat it as an empty prediction
        # instead of letting the normalizer fail on it.
        pred_clean = normalizer(pred) if isinstance(pred, str) else ""
        df.loc[i, f'pred-{label}-clean'] = pred_clean

        # Per-utterance character error rate of this model's prediction.
        cer = jiwer.cer(true_clean, pred_clean)
        df.loc[i, f'cer-{label}'] = cer

df = df[new_columns]
df = df.fillna("")
# Create the output directory so the export also works on a fresh checkout.
os.makedirs("outputs", exist_ok=True)
df.to_csv("outputs/transcripts.csv", index=False)
df.head(10)

In [None]:
df = df.fillna("")

# Keep only utterances for which every model produced a non-empty cleaned
# prediction. The filter is built once, up front, so all models are scored
# on the same rows and the result does not depend on the order of
# `model_ids`.
has_all_preds = pd.Series(True, index=df.index)
for model_id in model_ids:
    has_all_preds &= df[f"pred-{model_id['label']}-clean"] != ''
df = df[has_all_preds]

# Corpus-level CER per model: jiwer is given the full lists of references
# and predictions rather than per-row values.
references = list(df["true-clean"])
cers = []
for model_id in model_ids:
    label = model_id['label']
    cer = jiwer.cer(references, list(df[f"pred-{label}-clean"]))
    cers.append({'model': label, 'cer': cer})

df_cers = pd.DataFrame(cers)
# Create the output directory so the export also works on a fresh checkout.
os.makedirs("outputs", exist_ok=True)
df_cers.to_csv("outputs/cers.csv", index=False)
df_cers.head()

In [None]:
# Summary statistics of the per-utterance CER columns, with extra upper
# percentiles, saved alongside the other outputs.
df_cer_stats = df.loc[:, cer_cols].describe(
    percentiles=[0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99],
)
df_cer_stats.to_csv("outputs/cer_stats.csv")
df_cer_stats.head()

In [None]:
# Summary statistics of the per-utterance transcription time columns, with
# extra upper percentiles, saved alongside the other outputs.
df_time_stats = df.loc[:, time_cols].describe(
    percentiles=[0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99],
)
df_time_stats.to_csv("outputs/time_stats.csv")
df_time_stats.head()

# Calculate dataset statistics

In [None]:
# import pandas as pd

# path = "data/cv-corpus-17.0-2024-03-15/zh-HK/clip_durations.tsv"

# df = pd.read_csv(path, sep='\t')
# df.head()

In [None]:
# df['duration[ms]'].describe()