In [9]:
import nemo
import nemo.collections.asr as nemo_asr
import pytorch_lightning as pl
from omegaconf import DictConfig
import pathlib
import nemo.collections.asr as nemo_asr
import pytorch_lightning as pl

quartznet = nemo_asr.models.EncDecCTCModel.from_pretrained(
    model_name="QuartzNet15x5Base-En"
)

[NeMo I 2021-10-26 17:30:19 cloud:56] Found existing object /home/boris/.cache/torch/NeMo/NeMo_1.4.0/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo.
[NeMo I 2021-10-26 17:30:19 cloud:62] Re-using file from: /home/boris/.cache/torch/NeMo/NeMo_1.4.0/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo
[NeMo I 2021-10-26 17:30:19 common:702] Instantiating model from pre-trained checkpoint
[NeMo I 2021-10-26 17:30:19 features:262] PADDING: 16
[NeMo I 2021-10-26 17:30:19 features:279] STFT using torch
[NeMo I 2021-10-26 17:30:20 save_restore_connector:143] Model EncDecCTCModel was successfully restored from /home/boris/.cache/torch/NeMo/NeMo_1.4.0/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo.


In [10]:
datasets_dir = '../../datasets/'

In [11]:
import pandas as pd
import numpy as np
import fastwer

def calculate_score(dataset, model, model_name='-', k=None, log=False):
    print(f'Calculating score for model {model_name} on {dataset}')
    if dataset == 'LJSpeech':
        metadata = pd.read_csv(datasets_dir + 'LJSpeech-1.1/metadata_test.csv')
        if k is not None:
            metadata = metadata[:k]
        files = metadata['file_name'].apply(lambda x: f'{datasets_dir}/LJSpeech-1.1/wavs/{x}.wav').values
        texts = metadata['transcript'].values
    elif dataset == 'AN4':
        metadata = pd.read_csv(f'{datasets_dir}/an4/metadata.csv')
        files = metadata['file_name'].values
        texts = metadata['transcript'].values
        
    
    wer = []
    cer = []
    predictions = model.transcribe(paths2audio_files=files)
    r = np.random.randint(1, 100)
    print(texts[r])
    print(predictions[r])
    for i in range(len(predictions)):
        wer.append(fastwer.score_sent(texts[i].lower(), predictions[i], char_level=False))
        cer.append(fastwer.score_sent(texts[i].lower(), predictions[i], char_level=True))

    wer = np.array(wer)
    cer = np.array(wer)
    
    wer = wer[wer != float('+inf')]
    cer = wer[wer != float('+inf')]
    
    print(np.mean(wer))
    wer = np.round(np.mean(wer), 2)
    cer = np.round(np.mean(cer), 2)
    if log:
        print(f'wer:{np.round(wer, 2)}; cer:{np.round(cer, 2)}')
    
    return wer, cer

In [17]:
calculate_score('LJSpeech', quartznet, 'quartznet_15x5', k=100)
calculate_score('AN4', quartznet, 'quartznet_15x5', k=100)

Calculating score for model quartznet_15x5 on LJSpeech


Transcribing:   0%|          | 0/25 [00:00<?, ?it/s]

would see anything in the newspaper about his defection, unless he engaged in activities similar to those
would see anything in the newspaper about his defection unless he engaged in activities similar to those
17.392450999999998


(17.39, 17.39)

In [13]:
from itertools import zip_longest
from typing import Any, Callable, Dict, List, Optional

from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.utilities import rank_zero_info

import copy

class PrintTableMetricsCallback(Callback):
    """Prints a table with the metrics in columns on every epoch end.
    Example::
        from pl_bolts.callbacks import PrintTableMetricsCallback
        callback = PrintTableMetricsCallback()
    Pass into trainer like so:
    .. code-block:: python
        trainer = pl.Trainer(callbacks=[callback])
        trainer.fit(...)
        # ------------------------------
        # at the end of every epoch it will print
        # ------------------------------
        # loss│train_loss│val_loss│epoch
        # ──────────────────────────────
        # 2.2541470527648926│2.2541470527648926│2.2158432006835938│0
    """

    def __init__(self) -> None:
        self.metrics: List = []

    def on_epoch_end(self, trainer: Trainer, pl_module: LightningModule) -> None:
        metrics_dict = copy.copy(trainer.callback_metrics)
        self.metrics.append(metrics_dict)
        rank_zero_info(dicts_to_table(self.metrics))
        
def dicts_to_table(
    dicts: List[Dict],
    keys: Optional[List[str]] = None,
    pads: Optional[List[str]] = None,
    fcodes: Optional[List[str]] = None,
    convert_headers: Optional[Dict[str, Callable]] = None,
    header_names: Optional[List[str]] = None,
    skip_none_lines: bool = False,
    replace_values: Optional[Dict[str, Any]] = None,
) -> str:
    """Generate ascii table from dictionary Taken from (https://stackoverflow.com/questions/40056747/print-a-list-
    of-dictionaries-in-table-form)
    Args:
        dicts: input dictionary list; empty lists make keys OR header_names mandatory
        keys: order list of keys to generate columns for; no key/dict-key should
            suffix with '____' else adjust code-suffix
        pads: indicate padding direction and size, eg <10 to right pad alias left-align
        fcodes: formating codes for respective column type, eg .3f
        convert_headers: apply converters(dict) on column keys k, eg timestamps
        header_names: supply for custom column headers instead of keys
        skip_none_lines: skip line if contains None
        replace_values: specify per column keys k a map from seen value to new value;
                        new value must comply with the columns fcode; CAUTION: modifies input (due speed)
    Example:
        >>> a = {'a': 1, 'b': 2}
        >>> b = {'a': 3, 'b': 4}
        >>> print(dicts_to_table([a, b]))
        a│b
        ───
        1│2
        3│4
    """
    # optional arg prelude
    if keys is None:
        if len(dicts) > 0:
            keys = dicts[0].keys()  # type: ignore[assignment]
        elif header_names is not None:
            keys = header_names
        else:
            raise ValueError("keys or header_names mandatory on empty input list")
    if pads is None:
        pads = [""] * len(keys)  # type: ignore[arg-type]
    elif len(pads) != len(keys):  # type: ignore[arg-type]
        raise ValueError(f"bad pad length {len(pads)}, expected: {len(keys)}")  # type: ignore[arg-type]
    if fcodes is None:
        fcodes = [""] * len(keys)  # type: ignore[arg-type]
    elif len(fcodes) != len(fcodes):
        raise ValueError(f"bad fcodes length {len(fcodes)}, expected: {len(keys)}")  # type: ignore[arg-type]
    if convert_headers is None:
        convert_headers = {}
    if header_names is None:
        header_names = keys
    if replace_values is None:
        replace_values = {}
    # build header
    headline = "│".join(f"{v:{pad}}" for v, pad in zip_longest(header_names, pads))  # type: ignore[arg-type]
    underline = "─" * len(headline)
    # suffix special keys to apply converters to later on
    marked_keys = [h + "____" if h in convert_headers else h for h in keys]  # type: ignore[union-attr]
    marked_values = {}
    s = "│".join(f"{{{h}:{pad}{fcode}}}" for h, pad, fcode in zip_longest(marked_keys, pads, fcodes))
    lines = [headline, underline]
    for d in dicts:
        none_keys = [k for k, v in d.items() if v is None]
        if skip_none_lines and none_keys:
            continue
        elif replace_values:
            for k in d.keys():
                if k in replace_values and d[k] in replace_values[k]:
                    d[k] = replace_values[k][d[k]]
                if d[k] is None:
                    raise ValueError(f"bad or no mapping for key '{k}' is None. Use skip or change replace mapping.")
        elif none_keys:
            raise ValueError(f"keys {none_keys} are None in {d}. Do skip or use replace mapping.")
        for h in convert_headers:
            if h in keys:  # type: ignore[operator]
                converter = convert_headers[h]
                marked_values[h + "____"] = converter(d)
        line = s.format(**d, **marked_values)
        lines.append(line)
    return "\n".join(lines)

In [14]:
import nemo
import nemo.collections.asr as nemo_asr
import pytorch_lightning as pl
from omegaconf import DictConfig
import pathlib
import nemo.collections.asr as nemo_asr
import pytorch_lightning as pl

quartznet = nemo_asr.models.EncDecCTCModel.from_pretrained(
    model_name="QuartzNet15x5Base-En"
)

train_manifest_file = "../../datasets/LJSpeech-1.1/train_manifest.json"
val_manifest_file = "../../datasets/LJSpeech-1.1/test_manifest.json"

callback = PrintTableMetricsCallback()
trainer = pl.Trainer(gpus=1, max_epochs=10, callbacks=[callback])


new_opt = {
    "betas": [0.8, 0.25],
    "lr": 0.0001,
    "name": "novograd",
    "sched": {
        "last_epoch": -1,
        "min_lr": 0.0,
        "monitor": "val_loss",
        "name": "CosineAnnealing",
        "reduce_on_plateau": False,
        "warmup_ratio": 0.12,
        "warmup_steps": None,
    },
    "weight_decay": 0.001,
}



train_ds_config = {
    "batch_size": 4,
    "is_tarred": False,
    "num_workers": 12,
    "pin_memory": True,
    "labels": [
        " ",
        "a",
        "b",
        "c",
        "d",
        "e",
        "f",
        "g",
        "h",
        "i",
        "j",
        "k",
        "l",
        "m",
        "n",
        "o",
        "p",
        "q",
        "r",
        "s",
        "t",
        "u",
        "v",
        "w",
        "x",
        "y",
        "z",
        "'",
    ],
    "manifest_filepath": str(train_manifest_file),
    "max_duration": 16.7,
    "sample_rate": 16000,
    "shuffle": True,
    "tarred_audio_filepaths": None,
    "trim_silence": True,
}

train_ds_config = DictConfig(train_ds_config)

val_ds_config = {
    "batch_size": 4,
    "num_workers": 12,
    "pin_memory": True,
    "labels": [
        " ",
        "a",
        "b",
        "c",
        "d",
        "e",
        "f",
        "g",
        "h",
        "i",
        "j",
        "k",
        "l",
        "m",
        "n",
        "o",
        "p",
        "q",
        "r",
        "s",
        "t",
        "u",
        "v",
        "w",
        "x",
        "y",
        "z",
        "'",
    ],
    "manifest_filepath": str(val_manifest_file),
    "sample_rate": 16000,
    "shuffle": False,
}

val_ds_config = DictConfig(val_ds_config)


quartznet.setup_training_data(train_data_config=train_ds_config)
quartznet.setup_validation_data(val_data_config=val_ds_config)
quartznet.set_trainer(trainer)
quartznet.setup_optimization(optim_config=DictConfig(new_opt))

trainer.fit(quartznet)
quartznet.save_to("test.nemo")

[NeMo I 2021-10-26 17:30:39 cloud:56] Found existing object /home/boris/.cache/torch/NeMo/NeMo_1.4.0/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo.
[NeMo I 2021-10-26 17:30:39 cloud:62] Re-using file from: /home/boris/.cache/torch/NeMo/NeMo_1.4.0/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo
[NeMo I 2021-10-26 17:30:39 common:702] Instantiating model from pre-trained checkpoint
[NeMo I 2021-10-26 17:30:40 features:262] PADDING: 16
[NeMo I 2021-10-26 17:30:40 features:279] STFT using torch
[NeMo I 2021-10-26 17:30:40 save_restore_connector:143] Model EncDecCTCModel was successfully restored from /home/boris/.cache/torch/NeMo/NeMo_1.4.0/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo.


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


[NeMo I 2021-10-26 17:30:40 audio_to_text_dataset:37] Model level config does not container `sample_rate`, please explicitly provide `sample_rate` to the dataloaders.
[NeMo I 2021-10-26 17:30:40 audio_to_text_dataset:37] Model level config does not container `labels`, please explicitly provide `labels` to the dataloaders.
[NeMo I 2021-10-26 17:30:40 collections:173] Dataset loaded with 10480 files totalling 19.11 hours
[NeMo I 2021-10-26 17:30:40 collections:174] 0 files were filtered totalling 0.00 hours
[NeMo I 2021-10-26 17:30:40 audio_to_text_dataset:37] Model level config does not container `sample_rate`, please explicitly provide `sample_rate` to the dataloaders.
[NeMo I 2021-10-26 17:30:40 audio_to_text_dataset:37] Model level config does not container `labels`, please explicitly provide `labels` to the dataloaders.
[NeMo I 2021-10-26 17:30:41 collections:173] Dataset loaded with 2620 files totalling 4.81 hours
[NeMo I 2021-10-26 17:30:41 collections:174] 0 files were filtered t

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[NeMo I 2021-10-26 17:30:41 modelPT:544] Optimizer config = Novograd (
    Parameter Group 0
        amsgrad: False
        betas: [0.95, 0.25]
        eps: 1e-08
        grad_averaging: False
        lr: 0.0001
        weight_decay: 0.001
    )
[NeMo I 2021-10-26 17:30:41 lr_scheduler:625] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x7fe029946410>" 
    will be used during training (effective maximum steps = 26200) - 
    Parameters : 
    (last_epoch: -1
    min_lr: 0.0
    warmup_ratio: 0.12
    warmup_steps: null
    max_steps: 26200
    )



  | Name              | Type                              | Params
------------------------------------------------------------------------
0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0     
1 | encoder           | ConvASREncoder                    | 18.9 M
2 | decoder           | ConvASRDecoder                    | 29.7 K
3 | loss              | CTCLoss                           | 0     
4 | spec_augmentation | SpectrogramAugmentation           | 0     
5 | _wer              | WER                               | 0     
------------------------------------------------------------------------
18.9 M    Trainable params
0         Non-trainable params
18.9 M    Total params
75.698    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

val_loss│val_wer
────────────────
103.06062316894531│0.125


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

val_loss│val_wer
────────────────
103.06062316894531│0.125
5.842962265014648│0.03738697990775108
val_loss│val_wer
────────────────
103.06062316894531│0.125
5.842962265014648│0.03738697990775108
5.842962265014648│0.03738697990775108
      rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
    


In [16]:
calculate_score('LJSpeech', quartznet, 'quartznet_15x5', k=100)
calculate_score('AN4', quartznet, 'quartznet_15x5', k=100)

Calculating score for model quartznet_15x5 on LJSpeech


Transcribing:   0%|          | 0/25 [00:00<?, ?it/s]

but there were no serious accidents, beyond those caused by the goring of a maddened, over-driven ox which forced its way through the crowd.
but there were no serious accidents beyond those caused by the goring of a maddened overdriven ox which forced its way through the crowd
17.392450999999998
Calculating score for model quartznet_15x5 on AN4


Transcribing:   0%|          | 0/33 [00:00<?, ?it/s]

enter nine one six nine
enter 
225.67828350515464


(225.68, 225.68)

In [None]:
quartznet.save_to("test.nemo")

In [6]:
data_dir = '../../'
datasets_dir = '../../datasets/'

In [7]:
model = quartznet