In [1]:
# Let's see how to retrieve time steps for a model
from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC
from datasets import load_dataset
import datasets
import torch
import torchaudio

In [2]:
# import model, feature extractor, tokenizer
#
# The previous id, "pyannote/speaker-diarization", failed here with an OSError: it is
# not a checkpoint that `AutoModelForCTC` can load (pyannote ids refer to
# speaker-diarization pipelines, not CTC acoustic models). Word-level timestamps
# need a CTC model whose repo also ships a matching tokenizer and feature extractor.
#
# NOTE(review): the upper-case words in the recorded word-offset output suggest an
# upper-case vocabulary such as this checkpoint's — confirm this is the model that
# was originally intended.
MODEL_ID = "facebook/wav2vec2-base-960h"

# Load all three components from the same id so they cannot drift apart.
model = AutoModelForCTC.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_ID)

In [3]:
# load a sample
sample_path = "./table-eng.wav"

# `torchaudio.load` returns the waveform as (channels, frames) plus the file's
# native sample rate; the rate must be kept because the model expects a fixed one.
waveform, sample_rate = torchaudio.load(sample_path)

# Down-mix to a single channel. Without this, a stereo file would be interpreted
# by the model as a batch of two independent utterances.
waveform = waveform.mean(dim=0)

# Resample to the rate the feature extractor (and therefore the model) expects;
# the same rate is used later to convert frame offsets to seconds.
target_rate = feature_extractor.sampling_rate
if sample_rate != target_rate:
    waveform = torchaudio.functional.resample(waveform, sample_rate, target_rate)

# Let the feature extractor do the model-specific preprocessing and add the
# batch dimension, instead of feeding the raw waveform to the model.
input_values = feature_extractor(
    waveform.numpy(), sampling_rate=target_rate, return_tensors="pt"
).input_values

In [4]:
# forward sample through model
# Inference only: disable autograd so no computation graph is built or kept.
with torch.no_grad():
    # Index 0 selects the first sequence of the batch.
    logits = model(input_values).logits[0]

# Greedy decoding: pick the highest-scoring token id along the last (vocabulary) axis.
pred_ids = torch.argmax(logits, dim=-1)

In [7]:
# Decode the predicted ids and ask the tokenizer for per-word offsets
# (use `output_char_offsets` analogously for character-level offsets).
outputs = tokenizer.decode(pred_ids, output_word_offsets=True)

# Duration of one logit frame in seconds: the model's input-to-logits
# downsampling ratio divided by the audio sampling rate.
time_offset = model.config.inputs_to_logits_ratio / feature_extractor.sampling_rate

# Scale each word's frame offsets to timestamps in seconds (2 decimal places).
word_offsets = [
    dict(
        word=entry["word"],
        start_time=round(entry["start_offset"] * time_offset, 2),
        end_time=round(entry["end_offset"] * time_offset, 2),
    )
    for entry in outputs.word_offsets
]

# NOTE(review): an earlier comment here said to compare against the clip
# `common_voice_en_100038.mp3` on the dataset viewer
# (https://huggingface.co/datasets/common_voice/viewer/en/train), but this notebook
# loads a local file via `sample_path` — compare against that audio instead.
word_offsets

[{'word': 'ER', 'start_time': 0.2, 'end_time': 0.28},
 {'word': 'YA', 'start_time': 0.4, 'end_time': 0.5},
 {'word': 'SHE', 'start_time': 0.68, 'end_time': 0.98},
 {'word': 'KEE', 'start_time': 1.56, 'end_time': 1.86},
 {'word': 'MA', 'start_time': 2.28, 'end_time': 2.62}]