In [None]:
# Download NeMo and Config files
import os
if not os.path.exists('/kaggle/working/configs/config.yaml'):
    !pip3 install num2words
    
    # Install dependencies
    !pip install wget
    !apt-get install -y sox libsndfile1 ffmpeg
    !pip install unidecode
    !pip install matplotlib>=3.3.2

    ## Install NeMo
    BRANCH = 'r1.8.2'
    !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]

    ## Grab the config for CONFORMER-CTC-BPE
    !mkdir configs
    !wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/config.yaml
    !wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/conformer/conformer_ctc_bpe.yaml
    
    ## Grab Tokenizer
    !mkdir tokenizers
    !wget -P tokenizers/ https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/tokenizers/process_asr_text_tokenizer.py

In [None]:
# Utility
from num2words import num2words
import numpy as np
import pandas as pd 
import random
import json
import yaml

# Audio
import librosa
import IPython.display as ipd
from tqdm.notebook import tqdm

# Graphics 
%matplotlib inline
import librosa.display
import matplotlib.pyplot as plt

# Models
import torch
import nemo
import nemo.collections.asr as nemo_asr
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split

# Consts and Settings
import warnings
warnings.simplefilter('ignore')  # hide every warning category in cell output

SEED = 42

# Read-only competition data and the writable scratch directory.
PATH = '/kaggle/input/speech-to-text-russian/data/'
PATH_TRAIN = f'{PATH}train_wavs/'
PATH_TEST = f'{PATH}test_wavs/'
PATH_WORKING = '/kaggle/working/'

def seed_everything(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for repeatable results.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. `!python ...`
    # subprocesses); the hash seed of the running kernel is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU, not just the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes convolution algorithms per run, which can pick
    # different kernels between runs and defeats `deterministic = True`.
    torch.backends.cudnn.benchmark = False

# Fix all RNG seeds before any data splitting or model initialisation.
seed_everything(SEED)

# Ask PyTorch to release cached GPU memory it is not currently using.
torch.cuda.empty_cache()

# `device_name` ('gpu' / 'cpu') is the string later handed to
# `pl.Trainer(accelerator=...)`; `device` is the matching torch.device.
cuda = torch.cuda.is_available()
device_name = 'gpu' if cuda else 'cpu'
device = torch.device('cuda' if cuda else 'cpu')

# Last expression of the cell: displays which accelerator was selected.
device_name

In [None]:
# Load and listen to the audio file
example_file = PATH_TRAIN + '0.wav'
# No `sr` argument is given, so librosa's default target rate applies; the
# returned `sample_rate` is that rate, which may differ from the file's
# native rate — TODO confirm against the dataset.
audio, sample_rate = librosa.load(example_file)

# Embeds a player for the file on disk (not for the decoded `audio` array).
ipd.Audio(example_file, rate=sample_rate)

In [None]:
# Plot our example audio file's waveform
# NOTE: rcParams is global state — this figure size also applies to every
# figure created by later cells.
plt.rcParams['figure.figsize'] = (20,5)
plt.title('Waveform of Audio Example')
plt.ylabel('Amplitude')

# Assigning to `_` keeps the returned artist's repr out of the cell output.
_ = librosa.display.waveshow(audio)

In [None]:
# Get spectrogram using Librosa's Short-Time Fourier Transform (stft)
# `np.abs` keeps only the magnitude of the complex STFT output.
spec = np.abs(librosa.stft(audio))
# `ref=np.max` expresses every value relative to the loudest bin (peak = 0 dB).
spec_db = librosa.amplitude_to_db(spec, ref=np.max)  # Decibels

# Use log scale to view frequencies
librosa.display.specshow(spec_db, y_axis='log', x_axis='time')
plt.colorbar()
# Trailing `;` suppresses the Text object's repr in the cell output.
plt.title('Audio Spectrogram');

In [None]:
# Mel-scaled power spectrogram of the example clip, shown in decibels.
# The signal is passed as `y=`: newer librosa releases accept it by keyword
# only, and older ones emit a deprecation warning for the positional form.
mel_spec = librosa.feature.melspectrogram(y=audio, sr=sample_rate)
# `ref=np.max` expresses every value relative to the loudest bin (peak = 0 dB).
mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

# Pass the same sample rate used to compute the spectrogram so the axis
# ticks are derived from it rather than from specshow's default.
librosa.display.specshow(
    mel_spec_db, sr=sample_rate, x_axis='time', y_axis='mel')
plt.colorbar()
# Trailing `;` suppresses the Text object's repr in the cell output.
plt.title('Mel Spectrogram');

In [None]:
# Filter transcription
# Training labels; the cells below read the `Id` and `Expected` columns.
train_labels = pd.read_csv(PATH + 'train_labels.csv')
            

# Characters removed from every transcript: ASCII digits, underscore,
# lowercase Latin letters and three Ukrainian-specific Cyrillic letters.
# Built once instead of on every call.
_BAD_SYMBOLS = frozenset('0123456789_abcdefghijklmnopqrstuvwxyzіїґ')


def str_filter(utterance):
    """Normalise one transcript for training.

    Steps, in order:
      1. Split on whitespace; every all-digit token is spelled out in Russian
         with ``num2words`` (other tokens are kept as they are).
      2. Re-join the tokens with single spaces.
      3. Drop every character listed in ``_BAD_SYMBOLS``.

    Args:
        utterance: Raw transcript string.

    Returns:
        The filtered transcript. Removing characters in step 3 can leave
        consecutive spaces behind; they are not collapsed.
    """
    # The original called `utterance.strip('0123456789.- _')` here and threw the
    # result away (strings are immutable), so it never had any effect. It is
    # removed rather than applied: stripping edge digits would discard leading
    # and trailing numbers before step 1 could spell them out.
    tokens = (
        num2words(token, lang='ru') if token.isdigit() else token
        for token in utterance.split()
    )
    spelled_out = ' '.join(tokens)
    return ''.join(char for char in spelled_out if char not in _BAD_SYMBOLS)

# Overwrites the `Expected` column in place with the normalised transcripts.
train_labels['Expected'] = train_labels['Expected'].apply(str_filter)
train_labels.head()

In [None]:
# Submission template; later used to build the test manifest, which reads its
# `Id` and `Predicted` columns.
sample_sub = pd.read_csv(PATH + 'sample_submission.csv')
sample_sub.head()

In [None]:
# --- Building Manifest Files --- #
def build_manifest(path, train_labels, manifest_path, test_mode=False):
    """Write a manifest file with one JSON object per audio clip.

    Each line holds the keys ``audio_filepath``, ``duration`` and ``text``.
    The whole manifest is assembled in memory and written in one go, so an
    error while processing a row leaves no partial file behind.

    Args:
        path: Directory prefix of the wav files; ``"<Id>.wav"`` is appended
            directly, so it must already end with a path separator.
        train_labels: DataFrame with an ``Id`` column plus the transcript
            column (``Expected``, or ``Predicted`` when ``test_mode`` is set).
        manifest_path: Destination file; overwritten if it exists.
        test_mode: Read the transcript from ``Predicted`` instead of
            ``Expected``.
    """
    text_column = 'Predicted' if test_mode else 'Expected'
    n_rows = train_labels.shape[0]
    lines = []

    for _, record in tqdm(train_labels.iterrows(), total=n_rows):
        clip_id, text = record['Id'], record[text_column]
        wav_path = f"{path}{clip_id}.wav"
        clip_seconds = librosa.core.get_duration(filename=wav_path)

        # Key order is kept stable so the serialized lines stay comparable.
        entry = {
            "audio_filepath": wav_path,
            "duration": clip_seconds,
            "text": text,
        }
        lines.append(json.dumps(entry) + '\n')

    with open(manifest_path, 'w') as fout:
        fout.write(''.join(lines))

In [None]:
# Uncomment to force the manifests to be rebuilt:
# os.remove(PATH_WORKING + 'manifest_train.json')
# os.remove(PATH_WORKING + 'manifest_validation.json')
# os.remove(PATH_WORKING + 'manifest_test.json')

# Hold out a third of the labelled clips for validation (seeded split).
labels_train, labels_val = train_test_split(train_labels, test_size=0.33, random_state=SEED)

train_manifest = PATH_WORKING + 'manifest_train.json'
val_manifest = PATH_WORKING + 'manifest_validation.json'
test_manifest = PATH_WORKING + 'manifest_test.json'

# Each manifest is written to exactly the path whose existence is checked.
# Previously a bare relative file name was written, which only matched the
# checked path when the current working directory was PATH_WORKING.
if not os.path.exists(train_manifest):
    build_manifest(PATH_TRAIN, labels_train, train_manifest)

if not os.path.exists(val_manifest):
    build_manifest(PATH_TRAIN, labels_val, val_manifest)

if not os.path.exists(test_manifest):
    build_manifest(PATH_TEST, sample_sub, test_manifest, test_mode=True)

In [None]:
# Check building manifest
# Sanity check: decode and print the second record of the training manifest.
with open('/kaggle/working/manifest_train.json', 'r') as fin:
    manifest_lines = fin.read().split('\n')
    print(json.loads(manifest_lines[1]))

In [None]:
def parse_variables(yaml):
    """Resolve ``${dotted.key}`` references inside a nested config dict, in place.

    Every string value that consists of exactly one ``${a.b.c}`` reference is
    replaced by the value currently stored at ``yaml['a']['b']['c']``. Both the
    reference path and the location of the referencing value may be of any
    depth (the previous implementation only handled a few fixed depths and
    silently skipped or mis-resolved the rest).

    Args:
        yaml: Nested ``dict`` as loaded from a YAML file. It is mutated.

    Returns:
        The same ``yaml`` object, with the references substituted.

    Raises:
        KeyError: If a reference names a key that does not exist.

    Notes:
        * Values are copied as they are at substitution time: changing the
          referenced key afterwards does not update the places that pointed
          to it.
        * Strings that contain ``$`` without being a single whole-value
          reference are left untouched.
        * Only dicts are traversed; references inside lists are not resolved.
    """
    # The parameter keeps its historical name for callers; it is the config
    # dict, not a YAML module.
    config = yaml

    def _reference_keys(text):
        """Return the key path of a whole-value ``${...}`` reference, else None."""
        if text.startswith('${') and text.endswith('}') and text.count('$') == 1:
            return text[2:-1].split('.')
        return None

    pending = []  # (path to the referencing value, key path it points to)

    def _collect(prefix, node):
        """Depth-first walk recording every reference together with its location."""
        for key, value in node.items():
            location = prefix + [key]
            if isinstance(value, dict):
                _collect(location, value)
            elif isinstance(value, str) and '$' in value:
                keys = _reference_keys(value)
                if keys is not None:
                    pending.append((location, keys))

    if isinstance(config, dict):
        _collect([], config)

    for location, keys in pending:
        # Look the referenced value up from the root of the config.
        value = config
        try:
            for key in keys:
                value = value[key]
        except KeyError as exc:
            where = '.'.join(str(part) for part in location)
            target = '.'.join(keys)
            raise KeyError(f"cannot resolve '${{{target}}}' referenced at '{where}'") from exc

        # Walk to the dict holding the referencing entry and overwrite it.
        holder = config
        for key in location[:-1]:
            holder = holder[key]
        holder[location[-1]] = value

    return config



config_path = PATH_WORKING + 'configs/conformer_ctc_bpe.yaml'

# --- Config Information ---#
# The package is importable under two different module names depending on how
# it was installed, hence the fallback import.
try:
    from ruamel.yaml import YAML
except ModuleNotFoundError:
    from ruamel_yaml import YAML

# NOTE(review): this rebinds the name `yaml`, hiding the `yaml` module imported
# in the imports cell; from here on `yaml` is this loader instance.
yaml = YAML(typ='safe')
with open(config_path) as f:
    params = yaml.load(f)
    
# Replace `${...}` references with the values they point to at this moment;
# keys changed after this call are not propagated to former references.
params = parse_variables(params)

## Training with PyTorch Lightning

In [None]:
# Lightning trainer on the accelerator chosen earlier ('gpu' or 'cpu'),
# capped at 50 epochs.
trainer = pl.Trainer(accelerator=device_name, max_epochs=50)

In [None]:
# Building vocabulary and tokenizer model
# Runs the downloaded NeMo tokenizer script on the training manifest, but only
# when no tokenizer model exists yet. The checked directory
# (`tokenizer_spe_unigram_v128`) is the same one the model-configuration cell
# assigns to params['model']['tokenizer']['dir'].
if not os.path.exists('/kaggle/working/tokenizer_spe_unigram_v128/tokenizer.model'):
    !python /kaggle/working/tokenizers/process_asr_text_tokenizer.py \
     --manifest="/kaggle/working/manifest_train.json" \
     --data_root="/kaggle/working/" \
     --vocab_size=128 \
     --tokenizer="spe" \
     --spe_type="unigram" \
     --spe_character_coverage=1.0

In [None]:
# Configure Model
from omegaconf import DictConfig

model_cfg = params['model']

# Reduce batch_size because of memory
batch_size = 8

# Point every dataset section at its manifest and apply the reduced batch size.
for ds_key, ds_manifest in (
    ('train_ds', train_manifest),
    ('validation_ds', val_manifest),
    ('test_ds', test_manifest),
):
    model_cfg[ds_key]['manifest_filepath'] = ds_manifest
    model_cfg[ds_key]['batch_size'] = batch_size

# Set Tokenizer
model_cfg['tokenizer']['dir'] = PATH_WORKING + 'tokenizer_spe_unigram_v128/'

#                          Models Params
#  +--------------+---------+---------+----------+------------+-----+
#  | Model        | d_model | n_heads | n_layers | time_masks | lr  |
#  +==============+=========+=========+==========+============+=====+
#  | Small  (13M) |   176   |    4    |    16    |     5      | 5.0 |
#  +--------------+---------+---------+----------+------------+-----+
#  | Medium (30M) |   256   |    4    |    18    |     5      | 5.0 |
#  +--------------+---------+---------+----------+------------+-----+
#  | Large (121M) |   512   |    8    |    18    |     10     | 2.0 |
#  +--------------+---------+---------+----------+------------+-----+

# Using Small model
# NOTE(review): `${...}` references were already replaced by parse_variables
# when the config was loaded, so any entry that pointed at one of the keys
# below still carries the value from the YAML file, not these overrides —
# confirm whether the config contains such references.
model_cfg['encoder'].update(d_model=176, n_heads=4, n_layers=16)
model_cfg['spec_augment']['time_masks'] = 5
model_cfg['optim']['lr'] = 5.0

In [None]:
# Build the BPE CTC model from the `model` subtree of the config only (the
# rest of `params` is not passed) and attach it to the trainer.
first_asr_model = nemo_asr.models.EncDecCTCModelBPE(cfg=DictConfig(params['model']), trainer=trainer)

In [None]:
# Train the model; no dataloaders are passed here, only the datasets
# configured on the model itself.
trainer.fit(first_asr_model)

In [None]:
# Relative path: the checkpoint is written into the kernel's current working
# directory. NOTE(review): the name has no file extension — confirm this is
# what the code restoring the model expects.
first_asr_model.save_to('conformer-v1')