### faster whisper

In [1]:
from faster_whisper import WhisperModel

model_size_tiny = "tiny"
model_size_medium = "medium"
model_size_large = "large"
model_size_large_v2 = "large-v2"

# Run on GPU with FP16
# model = WhisperModel(model_size, device="cuda", compute_type="float16")
# or run on GPU with INT8
# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
# or run on CPU with INT8
model_1 = WhisperModel(model_size_tiny, device="cpu", compute_type="int8")
model_2 = WhisperModel(model_size_medium, device="cuda", compute_type="float16")
model_3 = WhisperModel(model_size_large, device="cuda", compute_type="float16")
model_4 = WhisperModel(model_size_large_v2, device="cuda", compute_type="float16")


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import string
from jiwer import wer

def reward_wer(reference, hypothesis):
    raw_wer = wer(reference, hypothesis)
    normalized = min(raw_wer, 1.0)
    return 1 - normalized

# original_transcription[386] = "words"
# original_transcription[474] = "milk"
# original_transcription[477] = "well"
# original_transcription[693] = "yes"
# original_transcription[1294] = "i can do that"

def test(asr_model, beam_size=1, num=0):
    # get all examples which name contains "example_generate_data_3" in directory "output/1124-2204", i=0 to 39
    # ground_truths = ["but i will be in a minute", "i dont know", "why not","goodbye", "look"]
    # name = f"example_generate_data_{num}"
    # wavs = [f"output/1124-2204/{name}_item_{i}.wav" for i in range(40)]
    name = "example_generate_eval_-1"
    index_map = {
        0: 386,
        1: 474,
        2: 477,
        3: 693,
        4: 1294
    }
    ground_truths = [
        "words",
        "milk",
        "well",
        "yes",
        "i can do that"
    ]
    wavs = [f"output/1124-2204/{name}_data_{index_map[num]}_item_{i}.wav" for i in range(10)]
    
    rewards = []
    for wav in wavs:
        segments, info = asr_model.transcribe(wav, beam_size=beam_size)
        segments = list(segments)
        modified_text = ''.join(char for char in segments[0].text if char not in string.punctuation).lower().strip()
        ground_truth = ground_truths[num]
        reward = reward_wer(modified_text, ground_truth)
        rewards.append(reward)
        
    return rewards

In [5]:
import time

def test_different_model_sizes():
    # for model in [model_1, model_2, model_3, model_4]:
    models = [model_1, model_2, model_3, model_4]
    for m in range(1,5):
        print(f"Model {m}")
        model = models[m-1]
        
        print(f"Beam size: 1")
        for num in range(5):
            start = time.time()
            rewards = test(model, num=num)
            print(f"Average reward for data {num}: {sum(rewards) / len(rewards):.2%}")
            print(f"Time: {time.time() - start:.2f}s")
            
        print(f"Beam size: 5")
        for num in range(5):
            start = time.time()
            rewards = test(model, beam_size=5, num=num)
            print(f"Average reward for data {num}: {sum(rewards) / len(rewards):.2%}")
            print(f"Time: {time.time() - start:.2f}s")
            
        print("--------------------")
            
test_different_model_sizes()

Model 1
Beam size: 1
Average reward for data 0: 60.00%
Time: 6.36s
Average reward for data 1: 90.00%
Time: 4.50s
Average reward for data 2: 10.00%
Time: 6.24s
Average reward for data 3: 100.00%
Time: 4.03s
Average reward for data 4: 46.50%
Time: 4.62s
Beam size: 5
Average reward for data 0: 60.00%
Time: 6.38s
Average reward for data 1: 90.00%
Time: 4.33s
Average reward for data 2: 10.00%
Time: 6.25s
Average reward for data 3: 100.00%
Time: 4.05s
Average reward for data 4: 56.50%
Time: 4.10s
--------------------
Model 2
Beam size: 1
Average reward for data 0: 10.00%
Time: 4.25s
Average reward for data 1: 100.00%
Time: 2.77s
Average reward for data 2: 0.00%
Time: 5.18s
Average reward for data 3: 90.00%
Time: 2.40s
Average reward for data 4: 40.83%
Time: 2.57s
Beam size: 5
Average reward for data 0: 10.00%
Time: 3.45s
Average reward for data 1: 100.00%
Time: 3.09s
Average reward for data 2: 0.00%
Time: 4.87s
Average reward for data 3: 80.00%
Time: 2.65s
Average reward for data 4: 47.50%
T

In [None]:
# beam_size = 5

# start = time.time()
# rewards = test(model, beam_size)
# print(f"Reward: {rewards}")
# print(f"Average reward: {sum(rewards) / len(rewards):.2%}")
# print(f"Time: {time.time() - start:.2f}s")

# start = time.time()
# rewards = test(model_2, beam_size)
# print(f"Reward: {rewards}")
# print(f"Average reward: {sum(rewards) / len(rewards):.2%}")
# print(f"Time: {time.time() - start:.2f}s")

# start = time.time()
# rewards = test(model_3, beam_size)
# print(f"Reward: {rewards}")
# print(f"Average reward: {sum(rewards) / len(rewards):.2%}")
# print(f"Time: {time.time() - start:.2f}s")

# start = time.time()
# rewards = test(model_4, beam_size)
# print(f"Reward: {rewards}")
# print(f"Average reward: {sum(rewards) / len(rewards):.2%}")
# print(f"Time: {time.time() - start:.2f}s")


In [None]:
# model_size_large = "medium"
# model_3 = WhisperModel(model_size_large, device="cuda", compute_type="float16")
# start = time.time()
# rewards = [test(model_3) for _ in range(10)]
# print(f"Average reward: {sum(rewards) / len(rewards):.2%}")
# print(f"Time: {time.time() - start:.2f}s")

In [None]:
from jiwer import wer

text1 = " Neighboring fields"
text2 = "neighboring fields"
error_rate = wer(text1, text2)
print(error_rate)

print(f"Word Error Rate: {error_rate:.2%}")


In [None]:
def reward_wer(reference, hypothesis):
    raw_wer = wer(reference, hypothesis)
    normalized = min(raw_wer, 1.0)
    return 1 - normalized

text1 = "make a ring for you"
text2 = "neighboring fields"

reward = reward_wer(text1, text2)
print(f"Reward: {reward:.2f}")

In [None]:
import string
text = "Neighboring fields." # 8
text = "But I will be, in a minute." # 16
text = "Will you?" # 65
text = "Do they?" # 100
text = "Don't you?" # 102

text = "I don't know" # 105
text = "Why is it?" # 112
text = "Why not?" # 132
text = "Good-bye" # 140
text = "The idea!" # 184


modified_text = ''.join(char for char in text if char not in string.punctuation).lower().strip()
print(modified_text)

In [None]:
import string

text_list = ["Neighboring fields.", "But I will be, in a minute.", "Will you?", "Do they?", "Don't you?", "I don't know", "Why is it?", "Why not?", "Good-bye", "The idea!"]
index_list = [8, 16, 65, 100, 102, 105, 112, 132, 140, 184]
for i, text in enumerate(text_list):
    modified_text = ''.join(char for char in text if char not in string.punctuation).lower().strip()
    print(f"{index_list[i]} {modified_text}")
    