<a href="https://colab.research.google.com/github/AmeyaKI/multimodal-asr/blob/main/ml/b_asr/asr_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Imports first: standard library, then the Colab helpers.
import os
from pathlib import Path

from google.colab import drive, files

# Attach Google Drive at /content/drive; the recordings used below live under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%pip install jiwer
from jiwer import wer, cer
import torch
import numpy as np
from transformers import AutoProcessor, AutoModelForCTC
import librosa

Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-4.0.0 rapidfuzz-3.14.3


In [36]:
# TODO: move this setup into a class / functions.
# TODO: add error checks (e.g. Path.is_dir() / Path.exists()) before the file is used.


file_name = 'file00000' + '.wav' # file_name from mic_vad.py
# Join components with pathlib's `/` operator instead of formatting the working
# directory into a string: the result is the same path, but it stays correct
# even when the working directory is '/' (string formatting would yield '//drive/...').
recordings_path = Path.cwd() / 'drive' / 'MyDrive' / 'Colab Notebooks' / 'assistant' / 'vad_recordings'
file_path = recordings_path / file_name
file_path

PosixPath('/content/drive/MyDrive/Colab Notebooks/assistant/vad_recordings/file00000.wav')

In [33]:
# Query CUDA once and reuse the answer for both the report and the branch.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
if cuda_available:
    # With no argument this reports the name of the current CUDA device.
    print("GPU name:", torch.cuda.get_device_name())

CUDA available: True
GPU name: Tesla T4


In [38]:
class ASR():
  """Speech-to-text wrapper around a pretrained CTC model.

  The processor and model are loaded once in the constructor; `transcribe`
  then turns an audio file into text.

  Class attributes:
    DEFAULT_MODEL: checkpoint name used when no `custom_model` is passed.
    DEFAULT_DEVICE: preferred device used when no `device` is passed.
    TARGET_SAMPLE_RATE: sample rate in Hz that audio is resampled to.
  """
  DEFAULT_MODEL = 'nvidia/parakeet-ctc-1.1b' # TODO: train custom model
  DEFAULT_DEVICE = 'cuda'
  TARGET_SAMPLE_RATE = 16000 # resample to 16,000 hz


  @classmethod
  def _default_device(cls):
    """Return DEFAULT_DEVICE, or 'cpu' if it names a CUDA device and CUDA is unavailable."""
    if str(cls.DEFAULT_DEVICE).startswith('cuda') and not torch.cuda.is_available():
      return 'cpu'
    return cls.DEFAULT_DEVICE

  def __init__(self, custom_model=None, device=None):
    """Load the processor and model and put the model in eval mode.

    Args:
      custom_model: optional checkpoint name/path; falls back to DEFAULT_MODEL.
      device: optional device to place the model on. When omitted,
        DEFAULT_DEVICE is used, falling back to 'cpu' on a runtime without
        CUDA so construction does not fail there. An explicitly passed
        device is used as given.
    """
    self.model_name = custom_model or self.DEFAULT_MODEL
    self.device = device or self._default_device()

    self.processor = AutoProcessor.from_pretrained(self.model_name)
    self.model = AutoModelForCTC.from_pretrained(self.model_name, dtype='auto').to(self.device)
    self.model.eval()

  def process_audio(self, audio_path):
    """Load an audio file as a mono, peak-normalized waveform at TARGET_SAMPLE_RATE.

    Args:
      audio_path: location of the audio file, passed straight to `librosa.load`.

    Returns:
      The waveform array, resampled to TARGET_SAMPLE_RATE if the file used a
      different rate and scaled so its largest absolute sample is 1
      (all-zero audio is returned unscaled).

    Raises:
      ValueError: if `audio_path` is None or the file contains no samples.
    """
    if audio_path is None:
      raise ValueError('audio_path must not be None')

    # sr=None keeps the file's native rate so resampling happens only when needed.
    waveform, sample_rate = librosa.load(
        audio_path,
        sr=None,
        mono=True)

    # Fail with a clear message here; otherwise the peak computation below
    # raises an opaque ValueError about a zero-size reduction.
    if waveform.size == 0:
      raise ValueError(f'no audio samples found in {audio_path!r}')

    # resample if sample_rate is incorrect
    if sample_rate != self.TARGET_SAMPLE_RATE:
      waveform = librosa.resample(
          waveform,
          orig_sr = sample_rate,
          target_sr = self.TARGET_SAMPLE_RATE)

    # Peak-normalize the amplitude; skip silent audio to avoid dividing by zero.
    max_val = np.max(np.abs(waveform))
    if max_val > 0:
      waveform = waveform / max_val
    return waveform


  def transcribe(self, audio_path):
    """Transcribe one audio file and return the predicted text.

    Args:
      audio_path: location of the audio file (see `process_audio`).

    Returns:
      The decoded text for the file (first entry of the decoded batch).

    Raises:
      ValueError: propagated from `process_audio` for a None path or empty audio.
    """
    waveform = self.process_audio(audio_path)

    # calc model inputs
    model_inputs = self.processor(
        waveform,
        sampling_rate = self.TARGET_SAMPLE_RATE,
        return_tensors = 'pt'
        )

    # Move the inputs to the model's device and cast them to the model's own
    # dtype. Using self.model.dtype rather than a hard-coded dtype keeps inputs
    # and weights consistent whatever dtype='auto' resolved to at load time.
    model_inputs = model_inputs.to(
        device = self.model.device,
        dtype = self.model.dtype
        )

    with torch.no_grad(): # inference only, no gradients needed
      logits = self.model(**model_inputs).logits

    # Greedy decoding: take the highest-scoring token id per frame, then let
    # the processor turn the ids into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = self.processor.batch_decode(predicted_ids)
    predicted_text = transcription[0]

    return predicted_text

In [41]:
# Build the recognizer with its default model and device settings; the
# constructor loads the processor and model, so this cell can take a while.
asr_model = ASR()

In [40]:
# Transcribe the recording at file_path; as the cell's last expression, the
# returned text is shown as the cell output.
asr_model.transcribe(file_path)

'one two three one two three'