In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip install git+https://github.com/huggingface/datasets.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install jiwer
!pip install torchaudio
!pip install librosa
!pip install datasets

In [None]:
# Process-wide environment variables, set in their own cell ahead of the import
# cell so they are already in place when the libraries are first imported.

# Use a UTF-8 locale for the session.
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8
# Both cache variables point at the same directory.
# NOTE(review): presumably read by transformers / datasets to locate their
# download caches — confirm the names against the installed versions.
%env TRANSFORMERS_CACHE=/content/cache
%env HF_DATASETS_CACHE=/content/cache
# NOTE(review): looks like a debugging aid (synchronous CUDA kernel launches so
# errors surface at the offending call); it slows GPU work — confirm it is still wanted.
%env CUDA_LAUNCH_BLOCKING=1

In [None]:
import numpy as np
import pandas as pd
import os
import sys
import IPython
import matplotlib
import matplotlib.pyplot as plt
import requests
import torch
import torchaudio
from pathlib import Path
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [None]:
# Seed torch's default RNG with 0 so the run is repeatable.
torch.random.manual_seed(0)
# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Pick the pre-packaged wav2vec 2.0 ASR pipeline, build its model and place
# the model on the selected device.
bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
model = bundle.get_model()
model = model.to(device)

In [None]:
def speech_file_to_array_fn(path, target_sample_rate=None):
    """Load an audio file as a waveform tensor at the model's sample rate.

    The file is read with ``torchaudio.load`` and, when its native sampling
    rate differs from the target rate, resampled before being moved to
    ``device``. Feeding the model audio at any other rate silently degrades
    the transcription, so the rate reported by the loader must not be ignored.

    Args:
        path: Location of the audio file, forwarded to ``torchaudio.load``.
        target_sample_rate: Sampling rate in Hz to convert the audio to.
            ``None`` (the default) means ``bundle.sample_rate``, i.e. the rate
            declared by the pipeline bundle the model was built from.

    Returns:
        Tensor: The waveform in the layout produced by ``torchaudio.load``,
        at ``target_sample_rate`` and located on ``device``.
    """
    waveform, sampling_rate = torchaudio.load(path)

    if target_sample_rate is None:
        # Resolved at call time so the function follows whichever bundle is loaded.
        target_sample_rate = bundle.sample_rate

    if sampling_rate != target_sample_rate:
        waveform = torchaudio.functional.resample(
            waveform, sampling_rate, target_sample_rate
        )

    return waveform.to(device)

In [None]:
# Run the acoustic model on a single recording. The model returns a pair;
# only the first element (the emission) is kept.
sample_path = "/content/drive/MyDrive/audio_speech_actors_01-24/Actor_01/03-01-01-01-01-02-01.wav"
with torch.inference_mode():
    sample_waveform = speech_file_to_array_fn(sample_path)
    emission, _ = model(sample_waveform)

In [None]:
# Copy the emission to a NumPy array on the host and report its shape as a
# tensor, its shape as an array, and its total number of elements.
red = emission.detach().cpu().numpy()
print(emission.shape)
print(red.shape)
print(red.size)



torch.Size([1, 500, 32])
(1, 500, 32)
16000


In [None]:
# Same shape inspection as the previous cell. The saved outputs of the two
# cells disagree (500 vs 495 frames), so they were apparently run against
# different emissions.
# NOTE(review): consider dropping one of the two cells.
red = emission.cpu().detach().numpy()
for shape in (emission.shape, red.shape):
    print(shape)

torch.Size([1, 495, 32])
(1, 495, 32)


In [None]:
class GreedyCTCDecoder(torch.nn.Module):
    """Greedy (best-path) CTC decoder.

    For every frame the highest-scoring label is chosen, runs of identical
    consecutive labels are collapsed to one, and blank labels are dropped.

    Args:
        labels: Indexable collection mapping a label index to its string.
        blank: Index of the CTC blank label. Defaults to 0.
    """

    def __init__(self, labels, blank=0):
        super().__init__()
        self.labels = labels
        self.blank = blank

    def forward(self, emission: torch.Tensor) -> str:
        """Given a sequence emission over labels, get the best path string

        Args:
          emission (Tensor): Logit tensors. Shape `[num_seq, num_label]`.

        Returns:
          str: The resulting transcript

        Raises:
          ValueError: If `emission` is not 2-dimensional (for example when a
            batch dimension has not been indexed away).
        """
        if emission.dim() != 2:
            raise ValueError(
                "emission must have shape [num_seq, num_label], got "
                f"{tuple(emission.shape)}"
            )

        indices = torch.argmax(emission, dim=-1)  # [num_seq,]
        indices = torch.unique_consecutive(indices, dim=-1)
        # Convert to plain Python ints in one step: iterating the tensor
        # directly yields 0-d tensors, costing a tensor->bool conversion per
        # frame and indexing `labels` with tensor objects instead of ints.
        return "".join(
            self.labels[i] for i in indices.tolist() if i != self.blank
        )

In [None]:
# Build the decoder from the bundle's label set, then decode the first item
# of the emission.
ctc_labels = bundle.get_labels()
decoder = GreedyCTCDecoder(labels=ctc_labels)
first_emission = emission[0]
transcript = decoder(first_emission)

In [None]:
# Print the decoded text, then render an audio player as the cell's output.
print(transcript)
# NOTE(review): this plays simon.wav, whereas the inference cell transcribes a
# file under audio_speech_actors_01-24/ — confirm which recording is intended.
IPython.display.Audio("/content/drive/MyDrive/simon.wav")

USTUCHAES|NACHHAMPATAAPANSUCHAPAPULO|
