In [1]:
# Print the version of whichever `python` executable the shell resolves
# (this may differ from the interpreter the kernel itself runs on).
!python --version

Python 3.10.14


In [2]:
# Quietly install OpenVINO (>= 2024.1.0) and NNCF (>= 2.10.0) into the kernel's environment.
%pip install -q "openvino>=2024.1.0" "nncf>=2.10.0"

Note: you may need to restart the kernel to use updated packages.


In [4]:
# Quietly install python-ffmpeg (<= 1.0.16), moviepy (unpinned) and onnx (any version except 1.16.2).
%pip install -q "python-ffmpeg<=1.0.16" moviepy "onnx!=1.16.2" 

Note: you may need to restart the kernel to use updated packages.


In [5]:
# Quietly install torch >= 2.1, adding the PyTorch CPU wheel index as an extra package index.
%pip install -q "torch>=2.1" --extra-index-url https://download.pytorch.org/whl/cpu

Note: you may need to restart the kernel to use updated packages.


In [6]:
# Quietly install optimum-intel directly from its GitHub repository. No branch, tag or
# commit is given, so the installed revision depends on when this cell is run.
%pip install -q "git+https://github.com/huggingface/optimum-intel.git"

Note: you may need to restart the kernel to use updated packages.


In [7]:
import torch

# Show which PyTorch version the kernel actually imports.
torch_version = torch.__version__
print(torch_version)

2.3.1


In [17]:
import numpy as np

# Show which NumPy version the kernel actually imports.
print(np.__version__)

1.26.4


In [8]:
# Model identifier handed to `optimum-cli export openvino -m`; the part after the
# last "/" is reused as the local export directory name.
model_id = "openai/whisper-small"

In [9]:
from pathlib import Path

model_dir = model_id.split("/")[-1]

if not Path(model_dir).exists():
    !optimum-cli export openvino -m {model_id} {model_dir} --weight-format fp16

In [10]:
import openvino as ov

# Runtime object used to query the devices OpenVINO can see.
core = ov.Core()

# Devices reported by the runtime, with the literal "AUTO" option appended last.
supported_devices = [*core.available_devices, "AUTO"]
print(supported_devices)

In [12]:
from optimum.intel.openvino import OVModelForSpeechSeq2Seq
from transformers import AutoProcessor, pipeline

device = 'CPU'

# Load the exported model from `model_dir` for the selected device, together with
# the processor stored in the same directory.
ov_model = OVModelForSpeechSeq2Seq.from_pretrained(model_dir, device=device)
processor = AutoProcessor.from_pretrained(model_dir)

# The Pipeline below is created without a `device` argument, so transformers logs:
#   "GPU is available in the environment, but no `device` argument is passed to
#    the `Pipeline` object. Model will be on CPU."
# According to the discussion linked here, ARM Macs get no GPU until Metal support is added:
# https://github.com/black-forest-labs/flux/issues/48#issuecomment-2273668173

# Speech-recognition pipeline around the OpenVINO model, processing audio in
# 30-second chunks with the processor's tokenizer and feature extractor.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=ov_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,
)

Compiling the encoder to CPU ...
Compiling the decoder to CPU ...
Compiling the decoder to CPU ...
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [16]:
import os

# Sanity check: is the sample recording present in the working directory?
sample_found = os.path.exists("jp-sample.wav")
print(sample_found)

True


In [27]:
import numpy as np
from scipy.io import wavfile

# The standard-library `wave` module cannot read 32-bit float WAV files, and
# Adobe Audition cannot export anything lower than 32-bit float, so the file
# is read with scipy.io.wavfile instead.
audio_path = "./jp-sample.wav"

# Read through `audio_path` so the file location is defined in exactly one place.
# wavfile.read returns (sample rate, samples); the samples keep the file's dtype.
sampling_rate, raw = wavfile.read(audio_path)

# Multi-channel files come back as a 2-D (n_samples, n_channels) array, but the
# speech-recognition pipeline accepts single-channel audio only, so average the
# channels. A mono file is already 1-D and is left untouched.
if raw.ndim > 1:
    raw = np.mean(raw, axis=1)

  sampling_rate, raw = wavfile.read('./jp-sample.wav')


In [32]:
task = 'transcribe' # transcribe / translate

# Hand the in-memory samples and their sample rate to the pipeline.
asr_input = {"raw": raw, "sampling_rate": sampling_rate}
asr_result = pipe(
    asr_input,
    return_timestamps=True,
    generate_kwargs={"task": task},
)

# Keep only the per-chunk entries of the result.
transcription = asr_result["chunks"]

In [29]:
# Show the "chunks" entries taken from the pipeline result.
print(transcription)

[{'timestamp': (0.0, 2.0), 'text': ' Neko wa kawaii desu.'}]
