In [None]:
 from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
 from datasets import load_dataset
 import torch
 
 # Load the pretrained checkpoint: the processor prepares raw audio for the
 # model and decodes predicted ids; the model emits CTC logits.
 processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
 model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")

 # Load the dummy LibriSpeech validation split.
 ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")

 # Fetch the first sample once, then hand the processor both the waveform and
 # its sampling rate. Passing the rate explicitly lets the feature extractor
 # reject audio whose rate differs from the one it was configured for, instead
 # of silently producing features from mismatched audio.
 first_sample_audio = ds[0]["audio"]
 input_values = processor(
     first_sample_audio["array"],
     sampling_rate=first_sample_audio["sampling_rate"],
     return_tensors="pt",
 ).input_values

 # Forward pass without gradient tracking; only the logits are needed.
 with torch.no_grad():
     logits = model(input_values).logits

 # Softmax over the last dimension turns the logits into probabilities.
 # NOTE: despite its name, `predicted_ids_softmax` holds probabilities, not ids;
 # the name is kept because later cells read it.
 predicted_ids_softmax = torch.softmax(logits, dim=-1)
 # Greedy choice: index of the highest-scoring entry along the last dimension.
 predicted_ids = torch.argmax(logits, dim=-1)

 # Decode the predicted id sequences back into text.
 transcription = processor.batch_decode(predicted_ids)

In [None]:
import matplotlib.pyplot as plt
# Take the first entry along the leading dimension, keep only column 0 of the
# last dimension (as a width-1 slice), and move it to a detached CPU tensor
# so it can be handed to matplotlib.
emission = predicted_ids_softmax[0][:, :1].detach().cpu()
# Shape of a single row of the slice.
print(emission[0].shape)
def plot():
    """Draw the module-level ``emission`` tensor, transposed, as a heatmap.

    Takes no arguments and returns nothing; the data comes from the global
    ``emission`` and the figure is created with ``plt.subplots()``.
    """
    fig, ax = plt.subplots()
    # Transpose so that the first axis of ``emission`` runs along the x axis
    # (labelled "Time") and the second along the y axis (labelled "Labels").
    ax.imshow(emission.T)
    ax.set_title("Frame-wise class probability")
    ax.set_xlabel("Time")
    ax.set_ylabel("Labels")
    fig.tight_layout()


plot()

In [None]:
# Vocabulary reported by the tokenizer, plus the reversed lookup
# (values of the vocabulary become keys, keys become values).
covab = processor.tokenizer.get_vocab()
vocabInverse = {token_id: token for token, token_id in covab.items()}
print(predicted_ids[0])
# Bare expression so the notebook displays the vocabulary as the cell output.
covab
# Uncomment to map each predicted id of the first sequence through vocabInverse:
# [vocabInverse[x] for x in predicted_ids[0].tolist()]

In [None]:
def plot():
    """Show the transposed module-level ``emission`` as a heatmap with a colorbar.

    Takes no arguments and returns nothing. The colorbar is attached to the
    same axes, shrunk to 60% and placed at the bottom of the figure.
    """
    figure, axes = plt.subplots()
    heatmap = axes.imshow(emission.T)
    axes.set(
        title="Frame-wise class probability",
        xlabel="Time",
        ylabel="Labels",
    )
    figure.colorbar(heatmap, ax=axes, shrink=0.6, location="bottom")
    figure.tight_layout()


plot()