In [1]:
# %pip (unlike !pip) installs into the environment of the running kernel.
# TODO: pin the librosa version this notebook was validated with.
%pip install librosa

[0m

In [2]:
import torch
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T

import librosa

print(torch.__version__)
print(torchaudio.__version__)

1.13.1
0.13.1+cu116


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os
import logging
from config import config, BaseConfig
from typing import Any, List, Optional, Union

import torch
from speechbrain.pretrained import EncoderClassifier

class TweakedEncoderClassifier(EncoderClassifier):
    """EncoderClassifier whose ``classify_batch`` returns only the class scores.

    Selecting the best class and decoding its text label is split out into
    ``postproc`` so the two steps can be called separately. Construction is
    inherited unchanged from ``EncoderClassifier``.
    """

    def classify_batch(self, wavs, wav_lens=None):
        """Performs classification on top of the encoded features.

        Only the class scores are returned; use ``postproc`` to obtain the
        best score, its index and the corresponding text label.

        Arguments
        ---------
        wavs : torch.tensor
            Batch of waveforms [batch, time, channels] or [batch, time]
            depending on the model. Make sure the sample rate is fs=16000 Hz.
        wav_lens : torch.tensor
            Lengths of the waveforms relative to the longest one in the
            batch, tensor of shape [batch]. The longest one should have
            relative length 1.0 and others len(waveform) / max_length.
            Used for ignoring padding.

        Returns
        -------
        out_prob
            The log posterior probabilities of each class ([batch, N_class])
        """
        emb = self.encode_batch(wavs, wav_lens)
        # Drop the singleton axis at position 1 of the classifier output.
        out_prob = self.mods.classifier(emb).squeeze(1)

        return out_prob

    def postproc(self, out_prob):
        """Picks the best class from the output of ``classify_batch``.

        Arguments
        ---------
        out_prob : torch.tensor
            Class scores as returned by ``classify_batch``.

        Returns
        -------
        score
            The maximum of ``out_prob`` along its last dimension.
        index
            The position of that maximum along the last dimension.
        text_lab
            The result of ``hparams.label_encoder.decode_torch(index)``
            (label encoder should be provided).
        """
        score, index = torch.max(out_prob, dim=-1)
        text_lab = self.hparams.label_encoder.decode_torch(index)
        return score, index, text_lab

# CPU/GPU configuration
if torch.cuda.is_available():
    DEVICE = [0]  # use 0th CUDA device
    ACCELERATOR = 'gpu'
else:
    # On CPU, DEVICE is a plain int (presumably a device count - TODO confirm),
    # so it must never be indexed on this branch.
    DEVICE = 1
    ACCELERATOR = 'cpu'

# A torch.device object (not a str). DEVICE[0] is only evaluated when
# ACCELERATOR == 'gpu', i.e. when DEVICE is the list assigned above.
MAP_LOCATION: torch.device = torch.device('cuda:{}'.format(DEVICE[0]) if ACCELERATOR == 'gpu' else 'cpu')


''' Helper functions '''
def initialize_lid_model(cfg: BaseConfig) -> EncoderClassifier:
    """Load the language-ID classifier described by ``cfg``.

    Builds a ``TweakedEncoderClassifier`` through ``from_hparams``, passing
    ``cfg.model_source`` as both the ``source`` and the ``savedir`` argument.
    """
    # NOTE(review): savedir reuses cfg.model_source; an earlier, commented-out
    # variant built a plain EncoderClassifier with savedir=cfg.model_dir.
    # Confirm which save directory is intended.
    hparams_kwargs = {
        "source": cfg.model_source,
        "savedir": cfg.model_source,
    }
    return TweakedEncoderClassifier.from_hparams(**hparams_kwargs)

# Initialize models
lid_model = initialize_lid_model(config)

# Main prediction function: not defined in this cell; a later cell imports
# `predict` from the `predict` module. (Kept as a comment so the cell no
# longer echoes a bare string as its output.)


  return torch._C._cuda_getDeviceCount() > 0


' Main prediction function '

In [4]:
# Random 1-D stand-in waveform: 20800 values drawn uniformly from [0, 1).
# At the 16 kHz rate that classify_batch's docstring asks for, this is 1.3 s.
example = torch.rand(20800)

In [5]:
example

tensor([0.6807, 0.2068, 0.5845,  ..., 0.6307, 0.5405, 0.2605])

In [6]:

prediction =  lid_model(example)


In [7]:
prediction.shape

torch.Size([1, 107])

In [8]:
lid_model(example)

tensor([[-7.7311, -5.6210, -6.5004, -5.5601, -6.3318, -5.7018, -6.3189, -6.8618,
         -5.6906, -6.6859, -4.5642, -6.2262, -3.8397, -5.0002, -7.9657, -5.6553,
         -1.3815, -6.5146, -7.1239, -6.1227, -3.6416, -7.9683, -5.4432, -6.2893,
         -6.4234, -5.9083, -6.2119, -4.4752, -5.2745, -4.5266, -9.2858, -6.0738,
         -7.3615, -5.2734, -6.4271, -5.2064, -3.6185, -4.7815, -5.5289, -6.2376,
         -7.8543, -6.4990, -4.6146, -5.7218, -4.6755, -5.7481, -3.4200, -6.2221,
         -5.9652, -5.6839, -7.2023, -6.9333, -3.7232, -6.4151, -6.2971, -5.2658,
         -5.0486, -6.4945, -4.5951, -2.9194, -4.9518, -7.1848, -4.9657, -5.7736,
         -3.2184, -5.9307, -7.2750, -8.4384, -6.1516, -2.4973, -3.3454, -5.1725,
         -7.1693, -5.9150, -7.0843, -4.5789, -5.4315, -5.4546, -8.5199, -5.8809,
         -2.5536, -7.1859, -5.2396, -4.2056, -5.2110, -5.0028, -4.9338, -5.9506,
         -5.2701, -6.5793, -6.2318, -8.2121, -6.6880, -6.4567, -7.1406, -6.1185,
         -5.1429, -6.3483, -

In [9]:
lid_model.classify_batch(example)

tensor([[-7.7311, -5.6210, -6.5004, -5.5601, -6.3318, -5.7018, -6.3189, -6.8618,
         -5.6906, -6.6859, -4.5642, -6.2262, -3.8397, -5.0002, -7.9657, -5.6553,
         -1.3815, -6.5146, -7.1239, -6.1227, -3.6416, -7.9683, -5.4432, -6.2893,
         -6.4234, -5.9083, -6.2119, -4.4752, -5.2745, -4.5266, -9.2858, -6.0738,
         -7.3615, -5.2734, -6.4271, -5.2064, -3.6185, -4.7815, -5.5289, -6.2376,
         -7.8543, -6.4990, -4.6146, -5.7218, -4.6755, -5.7481, -3.4200, -6.2221,
         -5.9652, -5.6839, -7.2023, -6.9333, -3.7232, -6.4151, -6.2971, -5.2658,
         -5.0486, -6.4945, -4.5951, -2.9194, -4.9518, -7.1848, -4.9657, -5.7736,
         -3.2184, -5.9307, -7.2750, -8.4384, -6.1516, -2.4973, -3.3454, -5.1725,
         -7.1693, -5.9150, -7.0843, -4.5789, -5.4315, -5.4546, -8.5199, -5.8809,
         -2.5536, -7.1859, -5.2396, -4.2056, -5.2110, -5.0028, -4.9338, -5.9506,
         -5.2701, -6.5793, -6.2318, -8.2121, -6.6880, -6.4567, -7.1406, -6.1185,
         -5.1429, -6.3483, -

In [10]:
from predict import predict

predict('../data/008605190016.wav')

torch.Size([20800])


[(['vi: Vietnamese'], 'Language')]

In [16]:
def trace(model, output_path, num_samples=20800):
    """Trace ``model`` with TorchScript on a random input and save the result.

    Arguments
    ---------
    model : callable accepted by ``torch.jit.trace``
        Called with a single 1-D float tensor of length ``num_samples``.
    output_path : str
        Where the traced module is written via ``traced_model.save``.
    num_samples : int
        Length of the random example input used for tracing. Defaults to
        20800, the length that was previously hard-coded.

    Returns
    -------
    The value returned by ``traced_model.save(output_path)``
    (None in current PyTorch releases).
    """
    example_input = torch.rand([num_samples])
    # Eager forward pass before tracing; its result is intentionally discarded.
    model(example_input)
    traced_model = torch.jit.trace(model, example_input)
    # Run the traced module once so an unusable trace fails before saving.
    traced_model(example_input)
    return traced_model.save(output_path)

In [17]:
out_status = trace(lid_model, 'test.pt')

tensor([[ -8.4211,  -5.7684,  -6.5353,  -5.9944,  -6.6675,  -6.2393,  -6.8956,
          -6.2319,  -5.3388,  -6.5591,  -5.3820,  -6.2806,  -4.5073,  -4.7015,
          -8.8633,  -6.1905,  -1.2191,  -5.8663,  -6.6861,  -5.8478,  -4.1391,
          -7.7261,  -6.2417,  -6.4681,  -7.2637,  -6.6452,  -6.2837,  -4.6377,
          -5.7326,  -4.8048, -10.1575,  -6.5117,  -8.3743,  -5.4764,  -7.0779,
          -4.9656,  -3.8062,  -5.2660,  -5.9786,  -7.1776,  -7.4311,  -6.6288,
          -4.4689,  -5.3252,  -5.1155,  -6.2268,  -3.3577,  -6.2834,  -6.2473,
          -6.5521,  -7.1720,  -7.5314,  -3.4090,  -6.3522,  -7.2313,  -5.5708,
          -5.5663,  -6.6187,  -4.3495,  -3.4982,  -5.2640,  -7.3352,  -5.8228,
          -4.6462,  -2.8901,  -5.5363,  -7.7914,  -8.5715,  -6.2589,  -2.4978,
          -3.1449,  -5.6118,  -6.5843,  -5.7991,  -7.0528,  -5.7123,  -5.8994,
          -5.1514,  -8.5538,  -6.8959,  -2.3273,  -7.0570,  -5.2221,  -4.3042,
          -5.8285,  -5.4263,  -5.3495,  -7.1256,  -4

!ls

In [18]:
!ls

008605190016-0-100.wav	__init__.py  flagged		       test.pt
008605190016.wav	app.py	     lid-preproc.py
DataLoader.ipynb	config.py    speechbrain-triton.ipynb
