Working from the Hugging Face OpenAI Whisper Large V3 page (https://huggingface.co/openai/whisper-large-v3) to practice using the transformer;

changed to Large V3 Turbo (https://huggingface.co/openai/whisper-large-v3-turbo).

- Uses environment AKStandard_AIML (Py 3.11.9)

# To Do

- use more functions; the print results should be summarized for example.
- docker to EC2 instance?
- connect notebook to EC2 instance?
- acceleration via NVIDIA GPU in EC2 instance?
- acceleration via Flash Attention/torch.compile???

# Imports

In [1]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset

import time
import warnings

# Learning to run on GPU

In [15]:
# Running MacBook Air with Apple M2 chip - need to tell Torch to use Metal Performance Shaders (MPS, https://developer.apple.com/metal/pytorch/)
# and confirm not running on CPU

# import torch

# define/assign device
device = torch.device("mps")  # Force Metal backend

# ensure that mps is active and behaving on GPU
# Create a tensor and move it to GPU
x = torch.ones(5, device=device)
print(x)

mps
tensor([1., 1., 1., 1., 1.], device='mps:0')


In [20]:
# Test CPU vs MPS via matrix multiplication - from ChatGPT - CPU faster

# import torch
# import time

# Set device
device_mps = torch.device("mps")
device_cpu = torch.device("cpu")

# Create a random tensor
size = (10000, 10000)
a_cpu = torch.randn(size, device="cpu")
b_cpu = torch.randn(size, device="cpu")

a_mps = torch.randn(size, device="mps")
b_mps = torch.randn(size, device="mps")

# CPU computation
start = time.time()
c_cpu = a_cpu @ b_cpu  # Matrix multiplication
torch.cuda.synchronize() if torch.cuda.is_available() else None
print("CPU Time:", time.time() - start)

# MPS computation
start = time.time()
c_mps = a_mps @ b_mps  # Matrix multiplication
torch.mps.synchronize()  # Ensure all ops are finished
print("MPS Time:", time.time() - start)

CPU Time: 2.241041898727417
MPS Time: 5.3711090087890625


In [12]:
# ChatGPT came up with this to test CPU vs MPS speed.
# Note: had to lengthen the text - just multiplied the sentence so it's longer;
# otherwise, CPU is faster with short input.

# import torch
# from transformers import AutoModel, AutoTokenizer
# import time

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

for device_str in ["cpu", "mps"]:  # Test both CPU and MPS
    device = torch.device(device_str)
    model = AutoModel.from_pretrained(model_name).to(device)

    text = "This is a test sentence."*32
    inputs = tokenizer(text, return_tensors="pt").to(device)

    # Run inference and time it
    torch.mps.empty_cache() if device_str == "mps" else None
    start_time = time.time()

    with torch.no_grad():
        output = model(**inputs)

    elapsed_time = time.time() - start_time
    print(f"Inference Time on {device_str.upper()}: {elapsed_time:.4f} seconds")

Inference Time on CPU: 0.7383 seconds
Inference Time on MPS: 0.5304 seconds


In [14]:
# using a much longer example dataset to test CPU vs MPS speed
# also from ChatGPT

# import torch
# import time
# from transformers import AutoModel, AutoTokenizer
# from datasets import load_dataset

# Choose dataset (SQuAD is large, IMDB is also an option)
dataset = load_dataset("squad", split="train")  # Stanford Q&A dataset

# Extract the first 512 examples (adjust as needed)
texts = [example["context"] for example in dataset.select(range(128))]

# Choose model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to benchmark
def benchmark(device_str):
    device = torch.device(device_str)
    model = AutoModel.from_pretrained(model_name).to(device)

    # Tokenize dataset
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(device)

    # Run inference
    torch.mps.empty_cache() if device_str == "mps" else None
    start_time = time.time()

    with torch.no_grad():
        output = model(**inputs)

    elapsed_time = time.time() - start_time
    print(f"Inference Time on {device_str.upper()}: {elapsed_time:.4f} seconds")

# Run benchmarks
benchmark("cpu")
benchmark("mps")

Inference Time on CPU: 14.8640 seconds
Inference Time on MPS: 8.3764 seconds


# File Conversion

In [11]:
# it is apparently not convenient to convert .m4a to .mp3 or .wav on mac
# so the following code is recommended

from pydub import AudioSegment

# Load the .m4a file
audio = AudioSegment.from_file("./data/2045 Spring 2025 Exam 2 MW.m4a", format="m4a")

# Export as .wav
audio.export("./data/2045 Spring 2025 Exam 2 MW.wav", format="wav")
print("Conversion complete!")

Conversion complete!


# Choose Data, Model

In [2]:
# get data

# real data options
sample = './data/2045 Spring 2025 Exam 2 Key.wav'
# sample = './data/2045 Spring 2025 Exam 2 MW.wav'

# Hugging Face test data

# dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
# sample = dataset[0]["audio"]

In [3]:
# define/get model, processor
# need to figure out what processor is?

model_id = "openai/whisper-large-v3-turbo"
# model_id = "openai/whisper-large-v3"

processor = AutoProcessor.from_pretrained(model_id)

# Tutorial Starts Here

# GPU Version

In [7]:
# set device, torch type, model parameters, move to 'device'

# I think I can just use (and test sometime with 'cpu' instead)
device = 'mps'

# unclear to me if M2 can use float16 (recommended for NVIDIA GPU per tutorial) so using float32
torch_dtype = torch.float32

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    # language = 'en',
    torch_dtype=torch_dtype,
    # low_cpu_mem_usage=True, # dropped because running on GPU
    use_safetensors=True) # this is apparently faster and more secure? https://huggingface.co/docs/diffusers/v0.28.1/using-diffusers/using_safetensors

# apparently moves model to GPU or specified device
model.to(device)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bia

In [8]:
# pipe to hold model

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    return_timestamps=True
)

Device set to use mps


In [9]:
# run model

# warnings are annoying me
warnings.filterwarnings("ignore")

# start time to see how long the model takes to run
start_time = time.time()

result = pipe(sample)
print(f"The output is : {result['text']}")

# stop time
print(f"The model took {round(time.time() - start_time, 2)} seconds.")

# turn warnings back on
warnings.simplefilter("default")

The output is :  Key. 8, 8, 8, 7, 3, 8, 4, 4, 8, 2, 2, 3, 2, 5, 2, 2, 2, 4, 4, 4, 2, 2, 3, 3, 2, 1, 1, 4, 4.
The model took 26.0 seconds.


# CPU Version

In [4]:
# set device, torch type, model parameters, move to 'device'

# I think I can just use (and test sometime with 'cpu' instead)
device = 'cpu'

# unclear to me if M2 can use float16 (recommended for NVIDIA GPU per tutorial) so using float32
torch_dtype = torch.float32

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    # language = 'en',
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True, # dropped because running on GPU
    use_safetensors=True) # this is apparently faster and more secure? https://huggingface.co/docs/diffusers/v0.28.1/using-diffusers/using_safetensors

# apparently moves model to GPU or specified device
model.to(device)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bia

In [5]:
# pipe to hold model

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    return_timestamps=True
)

Device set to use cpu


In [6]:
# run model

# warnings are annoying me
warnings.filterwarnings("ignore")

# start time to see how long the model takes to run
start_time = time.time()

result = pipe(sample)
print(f"The output is : {result['text']}")

# stop time
print(f"The model took {round(time.time() - start_time, 2)} seconds.")

# turn warnings back on
warnings.simplefilter("default")

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


The output is :  Key. 8, 8, 8, 7, 3, 8, 4, 4, 8, 2, 2, 3, 2, 5, 2, 2, 2, 4, 4, 4, 2, 2, 3, 3, 2, 1, 1, 4, 4.
The model took 15.85 seconds.


# Speed Notes

30 Mar 2025\
data = ./data/2045 Spring 2025 Exam 2 MW.wav\
model = whisper large v3
- GPU: 4394.433904886246 seconds
- CPU: 783.8126468658447 seconds

30 Mar 2025\
data = ./data/2045 Spring 2025 Exam 2 MW.wav\
model = whisper large v3 turbo
- GPU: 328.7707591056824 seconds
- CPU: 213.9266393184662 seconds

30 Mar 2025\
data = ./data/2045 Spring 2025 Exam 2 Key.wav\
model = whisper large v3 turbo
- GPU: 18.15 seconds
- CPU: 18.55 seconds

30 Mar 2025\
data = ./data/2045 Spring 2025 Exam 2 Key.mp3\
model = whisper large v3 turbo
- GPU: 16.13 seconds
- CPU: 17.29 seconds

30 Mar 2025 (repeated)\
data = ./data/2045 Spring 2025 Exam 2 Key.wav\
model = whisper large v3 turbo
- GPU: 26.0 seconds
- CPU: 15.85 seconds

# Modifying Output

In [10]:
result

{'text': ' Key. 8, 8, 8, 7, 3, 8, 4, 4, 8, 2, 2, 3, 2, 5, 2, 2, 2, 4, 4, 4, 2, 2, 3, 3, 2, 1, 1, 4, 4.',
 'chunks': [{'timestamp': (0.0, 29.0),
   'text': ' Key. 8, 8, 8, 7, 3, 8, 4, 4, 8, 2, 2, 3, 2, 5, 2, 2, 2, 4, 4, 4, 2, 2, 3, 3,'},
  {'timestamp': (0.0, 5.08), 'text': ' 2, 1, 1, 4, 4.'}]}

# raw from Large V3 page

In [None]:

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]

result = pipe(sample)
print(result["text"])
