In [1]:
from pydantic import BaseSettings, Field

class BaseConfig(BaseSettings):
    """Settings for the speech-recognition service. Define any config here.

    See here for documentation:
    https://pydantic-docs.helpmanual.io/usage/settings/
    """

    # Resource hints. Within this notebook only ``num_gpus`` is read
    # (SpeechRecognizer uses it to decide between GPU and CPU).
    num_replicas: int = 1
    num_cpus: int = 4
    num_gpus: int = 1

    # KNative assigns a $PORT environment variable to the container
    port: int = Field(default=8080, env="PORT",description="App Server Port")
    # Location of the NeMo ASR checkpoint (.nemo archive); note the path is relative.
    asr_model_path: str = '../models/stt_en_conformer_ctc_medium.nemo'

# Build the settings object once; fields use the defaults declared on
# BaseConfig (``port`` is bound to the PORT environment variable).
config = BaseConfig()

In [2]:
import logging
from typing import List, Union

import torch
import pytorch_lightning as pl
from nemo.utils import model_utils
from nemo.collections.asr.models import ASRModel

import numpy as np

class SpeechRecognizer:
    """Wrap a NeMo ASR checkpoint and expose single-utterance transcription.

    The checkpoint at ``config.asr_model_path`` is restored once, at
    construction time. It is placed on CUDA device 0 when CUDA is available
    and ``config.num_gpus >= 1``; otherwise it stays on the CPU.
    """

    def __init__(self, config) -> None:
        """Restore the ASR model described by ``config``.

        Args:
            config: Settings object providing ``num_gpus`` and
                ``asr_model_path``.
        """
        use_gpu = torch.cuda.is_available() and config.num_gpus >= 1

        # ``devices`` argument for the Lightning trainer: a GPU index list,
        # or the integer 1 when running on CPU.
        self.device: Union[List[int], int] = [0] if use_gpu else 1
        self.accelerator: str = 'gpu' if use_gpu else 'cpu'
        self.map_location: Union[torch.device, str] = (
            torch.device(f'cuda:{self.device[0]}') if use_gpu else 'cpu'
        )

        # Load model: read only the config first to discover the concrete
        # model class stored in the checkpoint, then restore through that class.
        model_cfg = ASRModel.restore_from(restore_path=config.asr_model_path, return_config=True)
        classpath = model_cfg.target  # original class path
        imported_class = model_utils.import_class_by_path(classpath)  # type: ASRModel
        logging.info(f"Restoring model : {imported_class.__name__}")
        self.model = imported_class.restore_from(
            restore_path=config.asr_model_path, map_location=self.map_location,
        )

        trainer = pl.Trainer(devices=self.device, accelerator=self.accelerator)
        self.model.set_trainer(trainer)
        self.model = self.model.eval()

    def predict(self, audio_tensor: Union[np.ndarray, torch.Tensor]) -> str:
        """Main prediction function: transcribe one audio signal.

        Args:
            audio_tensor: Audio samples as a NumPy array or a torch tensor.
                Assumes a 1-D mono waveform at the model's sample rate
                (the length passed to the model is the input's full shape)
                -- TODO confirm against callers.

        Returns:
            The first decoded hypothesis for the input.

        Raises:
            TypeError: If ``audio_tensor`` is neither a NumPy array nor a
                torch tensor.
        """
        if isinstance(audio_tensor, np.ndarray):
            audio_tensor = torch.tensor(audio_tensor)
        elif not isinstance(audio_tensor, torch.Tensor):
            # NOTE: the check must target the class ``torch.Tensor``;
            # ``torch.tensor`` is a factory function, and comparing a type
            # against it rejects every real tensor.
            raise TypeError('Input is not an np array or tensor')

        # Capture the length before adding the batch dimension.
        audio_length_tensor = torch.tensor(audio_tensor.shape)
        audio_tensor = audio_tensor.unsqueeze(0)

        with torch.no_grad():
            logits, logits_len, _ = self.model.forward(
                input_signal=audio_tensor.to(self.map_location),
                input_signal_length=audio_length_tensor.to(self.map_location),
            )

            hypotheses, _ = self.model.decoding.ctc_decoder_predictions_tensor(
                logits, decoder_lengths=logits_len, return_hypotheses=False,
            )

        # Batch size is 1, so the only hypothesis is the transcription.
        return hypotheses[0]

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

[NeMo W 2023-05-27 09:35:44 optimizers:54] Apex was not found. Using the lamb or fused_adam optimizer will error out.
[NeMo W 2023-05-27 09:35:44 experimental:27] Module <class 'nemo.collections.asr.modules.audio_modules.SpectrogramToMultichannelFeatures'> is experimental, not ready for production and is not fully supported. Use at your own risk.


In [6]:
# Build the recognizer from the shared settings; this restores the ASR checkpoint.
recog = SpeechRecognizer(config=config)

[NeMo I 2023-05-27 09:36:40 mixins:170] Tokenizer SentencePieceTokenizer initialized with 1024 tokens


[NeMo W 2023-05-27 09:36:40 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /data/NeMo_ASR_SET/English/v2.0/train/tarred_audio_manifest.json
    sample_rate: 16000
    batch_size: 64
    shuffle: true
    num_workers: 8
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 20.0
    min_duration: 0.1
    shuffle_n: 2048
    is_tarred: true
    tarred_audio_filepaths: /data/NeMo_ASR_SET/English/v2.0/train/audio__OP_0..4095_CL_.tar
    
[NeMo W 2023-05-27 09:36:40 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath:
    - /data/ASR/LibriSpeech/librispeech_withs

[NeMo I 2023-05-27 09:36:40 features:287] PADDING: 0
[NeMo I 2023-05-27 09:36:41 audio_preprocessing:517] Numba CUDA SpecAugment kernel is being used
[NeMo I 2023-05-27 09:36:41 save_restore_connector:249] Model EncDecCTCModelBPE was successfully restored from /mnt/d/projects/ray-speech-test/models/stt_en_conformer_ctc_medium.nemo.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [3]:
import librosa
# The restored checkpoint's config reports `sample_rate: 16000`, so resample
# while loading instead of keeping the file's native rate (sr=None). Audio at
# any other rate would not match what the model was trained on.
TARGET_SAMPLE_RATE = 16000
audio, sr = librosa.load('sample.wav', sr=TARGET_SAMPLE_RATE)

In [7]:
# Transcribe the loaded waveform; the returned text is the cell's output.
recog.predict(audio_tensor=audio)

'he came for the singapore summit where his speech was warmly received'