In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# stored on Drive can be accessed by path.
from google.colab import drive
# force_remount=True requests a fresh mount even if Drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Data Preprocessing

Two important things to consider before working with your data are the file format and the sample rate.

The files must be in .wav format, and the sample rate (sr) must be 16000 Hz (16 kHz).

One can carry out this preprocessing step using the following: 

1. Audacity Software
2. Python Codes

If you have the __Audacity software__ installed on your system and are familiar with it, you can open your file, set the sample rate to 16000, and export the file as .wav. However, this does not work in batches.

Alternatively, you can resample your audio files with Python by following the code below.

`The code below does not cover converting files to .wav, but the *pyDub* library can be used for that.`


In [None]:
import soundfile as sf
import librosa
import os
from IPython.display import Audio

In [None]:
## Check the native sample rate of one audio file from the folder.
# Passing sr=None makes librosa.load return the rate stored in the file itself
# instead of resampling it while loading.
y, sr = librosa.load("filename.wav",sr=None)
# Bare expression: the notebook shows the sample rate as the cell output.
sr

16000

In [None]:
# Render an inline audio player so the recording can be listened to in the notebook.
Audio("filename.wav")

In [None]:
# Render the inline audio player again.
# NOTE(review): this cell repeats the previous one verbatim — confirm whether a
# different file was meant to be played here.
Audio("filename.wav")

__Changing the Sample rate for a single file__

In [None]:
# Load the file again, this time asking librosa for the target rate (16000 Hz)
# so the samples are resampled while loading.
y, sr = librosa.load("filename.wav", sr=16000)
# Show the sample rate returned by the load call.
sr

16000

In [None]:
# Write the resampled samples out as a .wav file at the new sample rate.
# Note: the file name is the same one that was loaded above, so that file is replaced.
sf.write("filename.wav", y, sr)

__Changing the Sample Rate for multiple audios in a folder__

In [None]:
# Path to the existing folder that contains the original audio files
upsampledAudioPath = "filepath"

# Path to the existing folder that will receive the audio files after resampling
downsampledAudioPath = "new_filepath"

In [None]:
# List every entry name in the source folder (no filtering by extension happens here).
allAudio = os.listdir(upsampledAudioPath)
# print(allAudio)  # uncomment to inspect the listing

In [None]:
# Resample every .wav file from the source folder to 16000 Hz and export it
# under the same file name in the destination folder.
os.makedirs(downsampledAudioPath, exist_ok=True)  # make sure the export folder exists

for wav in allAudio:
    # Match on the actual extension (case-insensitive). A substring test would
    # also pick up names such as "clip.wav.bak" or "notes.wavelet.txt".
    if not wav.lower().endswith(".wav"):
        continue
    y, sr = librosa.load(os.path.join(upsampledAudioPath, wav), sr=16000)
    sf.write(os.path.join(downsampledAudioPath, wav), y, sr)

In [None]:
# Count the entries now present in the export folder.
len(os.listdir(downsampledAudioPath))

In [None]:
# Sanity check: every .wav file in the source folder must have an exported
# counterpart with the same name. Only .wav files are resampled, so any other
# entries in either folder are deliberately ignored rather than counted.
source_wavs = {name for name in os.listdir(upsampledAudioPath) if name.lower().endswith(".wav")}
missing_wavs = sorted(source_wavs - set(os.listdir(downsampledAudioPath)))
assert not missing_wavs, f"{len(missing_wavs)} .wav file(s) were not exported, e.g. {missing_wavs[:5]}"

## Installing NeMo and its dependencies

Once the manifests are ready (i.e., for the train, validation and test sets), we can begin the development proper.

In [None]:
%%capture
!pip install frozendict
!pip install g2p_en
!pip install torch_stft
!pip install soundfile
!pip install kaldiio
!pip install pydub
!pip install pangu

# exit()

In [None]:
"""
You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.

Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
5. Restart the runtime (Runtime -> Restart Runtime) for any upgraded packages to take effect
"""
# If you're using Google Colab and not running locally, run this cell.

## Install dependencies
!pip install wget
!apt-get install sox libsndfile1 ffmpeg
!pip install unidecode
!pip install matplotlib>=3.3.2

## Install NeMo

BRANCH = 'r1.0.0rc1'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]

## Grab the config we'll use in this example
!mkdir configs
!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/config.yaml

"""
Remember to restart the runtime for the kernel to pick up any upgraded packages (e.g. matplotlib)!
Alternatively, you can uncomment the exit() below to crash and restart the kernel, in the case
that you want to use the "Run All Cells" (or similar) option.
"""
exit()

In [None]:
# NeMo's "core" package
import nemo
# NeMo's ASR collection - this collection contains complete ASR models and
# building blocks (modules) for ASR
import nemo.collections.asr as nemo_asr

In [None]:
import glob
import os
import subprocess
import tarfile
import wget

import librosa
import IPython.display as ipd

import json
from ruamel.yaml import YAML

from omegaconf import DictConfig
import copy
import pytorch_lightning as pl
import torch

from datetime import datetime

In [None]:
# Base directory for data files; '.' is the current working directory.
# NOTE(review): not referenced by any of the cells visible here — confirm it is still needed.
data_path = '.' 

In [None]:
# convert data to 16kHz


### Training an ASR model Using QuartzNet

In [None]:
# Location of the config file downloaded during setup.
config_path = '/content/configs/config.yaml'

# Parse the YAML file with ruamel's safe loader and print the resulting
# parameters so they can be inspected.
yaml = YAML(typ='safe')
with open(config_path) as config_file:
    params = yaml.load(config_file)
print(params)

{'name': 'QuartzNet15x5', 'sample_rate': 16000, 'repeat': 1, 'dropout': 0.0, 'separable': True, 'labels': [' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'"], 'model': {'train_ds': {'manifest_filepath': '???', 'sample_rate': 16000, 'labels': [' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'"], 'batch_size': 32, 'trim_silence': True, 'max_duration': 16.7, 'shuffle': True, 'is_tarred': False, 'tarred_audio_filepaths': None, 'tarred_shard_strategy': 'scatter'}, 'validation_ds': {'manifest_filepath': '???', 'sample_rate': 16000, 'labels': [' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'"], 'batch_size': 32, 'shuffle': False}, 'preprocessor': {'_target_': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor', 'normal

In [None]:
# Name of the pre-trained QuartzNet checkpoint to fetch.
model_name="QuartzNet15x5Base-En"

# This line will download the pre-trained model from NVIDIA's NGC cloud and instantiate it for you
Qmodel = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name=model_name)

[NeMo I 2021-09-09 04:10:16 cloud:56] Found existing object /root/.cache/torch/NeMo/NeMo_1.0.0rc1/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo.
[NeMo I 2021-09-09 04:10:16 cloud:62] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.0.0rc1/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo
[NeMo I 2021-09-09 04:10:16 common:654] Instantiating model from pre-trained checkpoint
[NeMo I 2021-09-09 04:10:17 features:240] PADDING: 16
[NeMo I 2021-09-09 04:10:17 features:256] STFT using torch
[NeMo I 2021-09-09 04:10:18 modelPT:376] Model EncDecCTCModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.0.0rc1/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo.


## Testing the model trained on 7.79 hours of audio data

In [None]:
# Load the model from the Lightning checkpoint of the run trained on 7.79 hours of audio.
# load_from_checkpoint is called on the pre-trained Qmodel; the result is bound to `Model`.
Model = Qmodel.load_from_checkpoint(checkpoint_path="./ASR MODEL_with_7.79h_training_data/newest_lightning_logs/version_1/checkpoints/epoch=34-step=6159.ckpt")

[NeMo W 2021-09-09 04:10:27 modelPT:133] Please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /content/drive/MyDrive/ASR for Learn at Home/speech-to-text using
      NVIDIA NeMo/newestTrainManifest.json
    sample_rate: 16000
    labels:
    - a
    - b
    - c
    - d
    - e
    - f
    - g
    - h
    - i
    - j
    - k
    - l
    - m
    - 'n'
    - o
    - p
    - q
    - r
    - s
    - t
    - u
    - v
    - w
    - x
    - 'y'
    - z
    - '0'
    - '1'
    - '2'
    - '3'
    - '4'
    - '5'
    - '6'
    - '7'
    - '8'
    - '9'
    - ''''
    - '['
    - ']'
    - ' '
    batch_size: 32
    trim_silence: true
    max_duration: 16.7
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: scatter
    
[NeMo W 2021-09-09 04:10:27 modelPT:140] Please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_val

[NeMo I 2021-09-09 04:10:27 features:240] PADDING: 16
[NeMo I 2021-09-09 04:10:27 features:256] STFT using torch


In [None]:
# Paths of the recordings to transcribe.
wav_files = ['filename.wav']

# Run the checkpoint-loaded model on those files; the cell output is whatever
# transcribe() returns.
Model.transcribe(paths2audio_files=wav_files)
                   