In [None]:
# Install dependencies
!pip install wget
!apt-get install -y sox libsndfile1 ffmpeg
!pip install text-unidecode
!pip install matplotlib>=3.3.2
## Install NeMo
BRANCH = 'r2.0.0rc0'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH  #egg=nemo_toolkit[all] 
## Grab the config we'll use in this example
!mkdir configs
!git clone https://github.com/Alkholy53/ASR-Squad

In [None]:
# Extra Python packages installed one by one with unpinned versions.
# NOTE(review): lhotse is requested twice (on its own line and in the combined
# numpy/soundfile/... line); the second request is redundant.
! pip install editdistance
!pip install webdataset
!pip install pyannote.metrics
!pip install einops
! pip install pyannote.core
! pip install inflect
! pip install hydra.core
! pip install lhotse
!pip install numpy soundfile joblib omegaconf lhotse
! pip install jiwer
!pip install  gdown
!pip install --upgrade boto3

In [None]:
!gdown --id 13gKcDfU0N1VuXtRM2dCFYBMEgYPXKeLf 

In [None]:
# Clone the NeMo repository into ./NeMo; later cells overwrite and run scripts inside it.
# NOTE(review): this checks out the repository's default branch, while the first cell
# pip-installs NeMo from BRANCH — confirm the scripts match the installed package version.
!git clone https://github.com/NVIDIA/NeMo.git

In [None]:
%%writefile /kaggle/working/NeMo/scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh
#!/usr/bin/env bash
# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Use this script to install KenLM, OpenSeq2Seq decoder, Flashlight decoder.
# Usage: install_beamsearch_decoders.sh [NEMO_PATH]
# Aliases are used for the privileged commands, so alias expansion must be enabled
# explicitly (it is off by default in non-interactive shells).
shopt -s expand_aliases

NEMO_PATH=/kaggle/working/NeMo  # Path to NeMo folder: /workspace/nemo if you use NeMo/Dockerfile
if [ "$#" -eq 1 ]; then
  NEMO_PATH=$1
fi
KENLM_MAX_ORDER=10 # Maximum order of KenLM model, also specified in the setup_os2s_decoders.py

if [ -d "$NEMO_PATH" ]; then
  echo "The folder '$NEMO_PATH' exists."
else
  echo "Error: The folder '$NEMO_PATH' does not exist. Specify it as a first command line positional argument!"
  exit 1
fi
# All path expansions are quoted so a NEMO_PATH containing spaces does not get word-split.
cd "$NEMO_PATH" || exit 1

# Run every apt-get call and the Boost install through sudo when not root.
if [ "$(id -u)" -eq 0 ]; then
  alias aptget='apt-get'
  alias b2install='./b2'
else
  alias aptget='sudo apt-get'
  alias b2install='sudo ./b2'
fi

aptget update && aptget upgrade -y && aptget install -y swig liblzma-dev && rm -rf /var/lib/apt/lists/* # liblzma needed for flashlight decoder

# install Boost package for KenLM
# Boost release archives are served from archives.boost.io; the former boostorg.jfrog.io
# location is no longer available. Certificate verification is left enabled because the
# downloaded sources are built and installed with elevated privileges.
wget https://archives.boost.io/release/1.80.0/source/boost_1_80_0.tar.bz2 && tar --bzip2 -xf "$NEMO_PATH/boost_1_80_0.tar.bz2" && cd boost_1_80_0 && ./bootstrap.sh && b2install --layout=tagged link=static,shared threading=multi,single install -j4 && cd .. || echo FAILURE
export BOOST_ROOT="$NEMO_PATH/boost_1_80_0"

# Fetch the OpenSeq2Seq CTC decoders and keep only the `decoders` folder inside NeMo.
git clone https://github.com/NVIDIA/OpenSeq2Seq
cd OpenSeq2Seq
git checkout ctc-decoders
cd ..
mv OpenSeq2Seq/decoders "$NEMO_PATH/"
rm -rf OpenSeq2Seq
cd "$NEMO_PATH/decoders"
cp "$NEMO_PATH/scripts/installers/setup_os2s_decoders.py" ./setup.py
./setup.sh

# install KenLM
cd "$NEMO_PATH/decoders/kenlm/build" && cmake -DKENLM_MAX_ORDER=$KENLM_MAX_ORDER .. && make -j2
cd "$NEMO_PATH/decoders/kenlm"
python setup.py install --max_order=$KENLM_MAX_ORDER
export KENLM_LIB="$NEMO_PATH/decoders/kenlm/build/bin"
export KENLM_ROOT="$NEMO_PATH/decoders/kenlm"
cd ..

# install Flashlight
git clone https://github.com/flashlight/text && cd text
python setup.py bdist_wheel
pip install dist/*.whl
cd ..


In [None]:
!./kaggle/working/NeMo/scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh

In [None]:
%%writefile /kaggle/working/NeMo/scripts/asr_language_modeling/ngram_lm/train_kenlm.py

import logging
import os
os.environ['HYDRA_FULL_ERROR'] = '1'
import subprocess
import sys
from dataclasses import dataclass, field
from glob import glob
from typing import List

from omegaconf import MISSING

# Update the Python path to include the scripts directory
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')))

from scripts.asr_language_modeling.ngram_lm import kenlm_utils

from nemo.core.config import hydra_runner
from nemo.utils import logging

"""
NeMo's beam search decoders only support char-level encodings. In order to make it work with BPE-level encodings, we
use a trick to encode the sub-word tokens of the training data as unicode characters and train a char-level KenLM. 
"""


@dataclass
class TrainKenlmConfig:
    """
    Configuration schema for training an N-gram KenLM language model that is later
    used by the beam search decoder of ASR models.
    """

    # List of training files or folders. Files can be a plain text file or ".json"
    # manifest or ".json.gz". Example: [/path/to/manifest/file,/path/to/folder]
    train_paths: List[str] = MISSING

    # The path to '.nemo' file of the ASR model, or name of a pretrained NeMo model
    nemo_model_file: str = MISSING
    # The path to store the KenLM binary model file
    kenlm_model_file: str = MISSING
    # The order of N-gram LM
    ngram_length: int = MISSING
    # The path to the bin folder of KenLM.
    kenlm_bin_path: str = MISSING

    # Whether to preserve the intermediate ARPA file.
    preserve_arpa: bool = False
    # List of digits to prune Ngram. Example: [0,0,1].
    # See Pruning section on the https://kheafield.com/code/kenlm/estimation
    ngram_prune: List[int] = field(default_factory=lambda: [0])
    # Cache path to save tokenized files.
    cache_path: str = ""
    # Verbose level, default is 1.
    verbose: int = 1


@hydra_runner(config_path=None, config_name='TrainKenlmConfig', schema=TrainKenlmConfig)
def main(args: TrainKenlmConfig):
    """
    Train a KenLM N-gram model and convert it to a binary trie.

    Steps:
      1. Resolve the training files and load the tokenizer of the ASR model.
      2. Run ``lmplz`` on the encoded training text, either from files cached under
         ``args.cache_path`` or by streaming directly into its stdin.
      3. Run ``build_binary trie`` to turn the intermediate ARPA file into
         ``args.kenlm_model_file``.
      4. Remove the ARPA file unless ``args.preserve_arpa`` is set.

    Raises:
        RuntimeError: if ``lmplz`` or ``build_binary`` exits with a non-zero status.
    """
    train_paths = kenlm_utils.get_train_list(args.train_paths)

    if isinstance(args.ngram_prune, str):
        args.ngram_prune = [args.ngram_prune]

    tokenizer, encoding_level, is_aggregate_tokenizer = kenlm_utils.setup_tokenizer(args.nemo_model_file)

    arpa_file = f"{args.kenlm_model_file}.tmp.arpa"

    # LMPLZ ARGUMENT SETUP
    kenlm_args = [
        os.path.join(args.kenlm_bin_path, 'lmplz'),
        "-o",
        str(args.ngram_length),
        "--arpa",
        arpa_file,
    ]
    if encoding_level == "subword":
        # --discount_fallback is needed for training KenLM for BPE-based models.
        # For other encodings the flag is omitted entirely instead of passing an
        # empty string to lmplz as an extra argument.
        kenlm_args.append("--discount_fallback")
    kenlm_args += ["--prune"] + [str(n) for n in args.ngram_prune]

    if args.cache_path:
        if not os.path.exists(args.cache_path):
            os.makedirs(args.cache_path, exist_ok=True)

        # DATASET SETUP
        encoded_train_files = []
        for file_num, train_file in enumerate(train_paths):
            logging.info(f"Encoding the train file '{train_file}' number {file_num+1} out of {len(train_paths)} ...")

            cached_files = glob(os.path.join(args.cache_path, os.path.split(train_file)[1]) + "*")
            encoded_train_file = os.path.join(args.cache_path, os.path.split(train_file)[1] + f"_{file_num}.tmp.txt")
            if (
                cached_files and cached_files[0] != encoded_train_file
            ):  # cached_files exists but has another file name: f"_{file_num}.tmp.txt"
                os.rename(cached_files[0], encoded_train_file)
                # Build the message up front: passing the parts as extra positional
                # arguments makes logging treat them as %-format args and fail.
                logging.info(f"Rename {cached_files[0]} to {encoded_train_file}")

            encoded_train_files.append(encoded_train_file)

        kenlm_utils.iter_files(
            source_path=train_paths,
            dest_path=encoded_train_files,
            tokenizer=tokenizer,
            encoding_level=encoding_level,
            is_aggregate_tokenizer=is_aggregate_tokenizer,
            verbose=args.verbose,
        )

        first_process_args = ["cat"] + encoded_train_files
        logging.info(f"Running lmplz command \n\n{' '.join(kenlm_args)}\n\n")
        # The context manager closes the pipe and waits for `cat` even if lmplz fails to start.
        with subprocess.Popen(first_process_args, stdout=subprocess.PIPE, stderr=sys.stderr) as first_process:
            kenlm_p = subprocess.run(
                kenlm_args,
                stdin=first_process.stdout,
                capture_output=False,
                text=True,
                stdout=sys.stdout,
                stderr=sys.stderr,
            )

    else:
        logging.info(f"Running lmplz command \n\n{' '.join(kenlm_args)}\n\n")
        kenlm_p = subprocess.Popen(kenlm_args, stdout=sys.stdout, stdin=subprocess.PIPE, stderr=sys.stderr)

        # Stream the encoded text straight into lmplz without an intermediate file.
        kenlm_utils.iter_files(
            source_path=train_paths,
            dest_path=kenlm_p.stdin,
            tokenizer=tokenizer,
            encoding_level=encoding_level,
            is_aggregate_tokenizer=is_aggregate_tokenizer,
            verbose=args.verbose,
        )

        # communicate() closes stdin and waits for lmplz to finish.
        kenlm_p.communicate()

    if kenlm_p.returncode != 0:
        raise RuntimeError("Training KenLM was not successful!")

    # BINARY BUILD
    kenlm_args = [
        os.path.join(args.kenlm_bin_path, "build_binary"),
        "trie",
        arpa_file,
        args.kenlm_model_file,
    ]
    logging.info(f"Running binary_build command \n\n{' '.join(kenlm_args)}\n\n")
    ret = subprocess.run(kenlm_args, capture_output=False, text=True, stdout=sys.stdout, stderr=sys.stderr)

    if ret.returncode != 0:
        raise RuntimeError("Training KenLM was not successful!")

    if not args.preserve_arpa:
        os.remove(arpa_file)
        logging.info(f"Deleted the arpa file '{arpa_file}'.")


if __name__ == '__main__':
    main()


In [None]:
# Train a 6-gram KenLM model with the train_kenlm.py script written above.
# kenlm_bin_path points at the KenLM build folder that the installer script produces,
# and preserve_arpa=true keeps the intermediate "<kenlm_model_file>.tmp.arpa" file.
!python /kaggle/working/NeMo/scripts/asr_language_modeling/ngram_lm/train_kenlm.py nemo_model_file="/kaggle/working/final_asr_model.nemo" \
                          train_paths="[\"/kaggle/input/dataset-ja/final_train.json\"]" \
                          kenlm_bin_path="/kaggle/working/NeMo/decoders/kenlm/build/bin" \
                          kenlm_model_file="/kaggle/working/kenlm_model.binary" \
                          ngram_length=6 \
                          preserve_arpa=true


In [None]:
%load /kaggle/working/NeMo/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_ctc.py
import contextlib
import json
import os

os.environ['HYDRA_FULL_ERROR'] = '1'
import pickle
import tempfile
from dataclasses import dataclass, field, is_dataclass
from pathlib import Path
from typing import List, Optional

import editdistance
import numpy as np
import torch
from omegaconf import MISSING, OmegaConf
from sklearn.model_selection import ParameterGrid
from tqdm.auto import tqdm

import nemo.collections.asr as nemo_asr
from nemo.collections.asr.parts.submodules import rnnt_beam_decoding
from nemo.core.config import hydra_runner
from nemo.utils import logging

# fmt: off


@dataclass
class EvalBeamSearchNGramConfig:
    """Configuration schema for the beam-search evaluation entry point defined in this file."""

    # --- Inputs and outputs ---
    # '.nemo' checkpoint path; any other value is treated as a pretrained model name.
    nemo_model_file: str = MISSING
    # Manifest handed to the model's transcribe call.
    input_manifest: str = MISSING
    # Required on the command line; not read by the code visible in this file.
    decoding_mode: str = MISSING
    # Optional KenLM binary; copied into decoding.ngram_lm_model before decoding.
    kenlm_model_file: Optional[str] = None
    # Folder where per-configuration prediction files are written, if set.
    preds_output_folder: Optional[str] = None
    # Pickle file used to restore/store the computed probabilities, if set.
    probs_cache_file: Optional[str] = None

    # --- Runtime settings ---
    acoustic_batch_size: int = 128
    beam_batch_size: int = 128
    device: str = "cuda"
    use_amp: bool = False
    num_workers: int = 1

    # --- Decoding strategy and hyper-parameter lists ---
    decoding_strategy: str = "beam"
    beam_width: List[int] = field(default_factory=lambda: [128, 256])
    beam_alpha: List[float] = field(default_factory=lambda: [0.5, 1.0])
    maes_prefix_alpha: List[int] = field(default_factory=lambda: [2])
    maes_expansion_gamma: List[float] = field(default_factory=lambda: [2.3])
    hat_subtract_ilm: bool = False
    hat_ilm_weight: List[float] = field(default_factory=lambda: [0.0])

    # Beam decoding sub-config that is installed on the model in decoding_step.
    decoding: rnnt_beam_decoding.BeamRNNTInferConfig = field(
        default_factory=lambda: rnnt_beam_decoding.BeamRNNTInferConfig(beam_size=128)
    )


# fmt: on


def decoding_step(
    model: nemo_asr.models.ASRModel,
    cfg: EvalBeamSearchNGramConfig,
    all_probs: List[torch.Tensor],
    target_transcripts: List[str],
    preds_output_file: str = None,
    beam_batch_size: int = 128,
    progress_bar: bool = True,
):
    """
    Decode ``all_probs`` with the strategy in ``cfg`` and score the result against ``target_transcripts``.

    The model's decoding strategy is reconfigured from ``cfg.decoding`` first. Samples are
    then decoded in batches of ``beam_batch_size``; for each sample the edit distance of
    every candidate to the reference is computed at word and character level.

    Args:
        model: ASR model whose ``decoding`` object performs the search.
        cfg: evaluation config; ``cfg.decoding`` is mutated in place.
        all_probs: one tensor per sample; the packing code reads ``shape[0]`` as the
            feature size and ``shape[-1]`` as the sequence length.
        target_transcripts: reference texts, aligned by index with ``all_probs``.
        preds_output_file: if set, every candidate is written as ``text<TAB>score``.
        beam_batch_size: number of samples decoded per batch.
        progress_bar: whether to wrap the batch loop in a tqdm progress bar.

    Returns:
        Tuple ``(wer, cer)`` computed from the first candidate of every sample.
    """
    # Silence NeMo logging while the decoding strategy is swapped, then restore it.
    level = logging.getEffectiveLevel()
    logging.setLevel(logging.CRITICAL)
    model.change_decoding_strategy(None)

    # Multiplying by the boolean zeroes the ILM weight when subtraction is disabled.
    cfg.decoding.hat_ilm_weight = cfg.decoding.hat_ilm_weight * cfg.hat_subtract_ilm
    cfg.decoding.return_best_hypothesis = False
    cfg.decoding.ngram_lm_model = cfg.kenlm_model_file
    cfg.decoding.hat_subtract_ilm = cfg.hat_subtract_ilm

    model.cfg.decoding.strategy = cfg.decoding_strategy
    model.cfg.decoding.beam = cfg.decoding
    model.change_decoding_strategy(model.cfg.decoding)
    logging.setLevel(level)

    wer_dist_first = cer_dist_first = 0
    wer_dist_best = cer_dist_best = 0
    words_count = 0
    chars_count = 0
    sample_idx = 0

    num_batches = int(np.ceil(len(all_probs) / beam_batch_size))
    if progress_bar:
        if cfg.decoding_strategy == "greedy_batch":
            description = "Greedy_batch decoding.."
        else:
            description = f"{cfg.decoding_strategy} decoding with bw={cfg.decoding.beam_size}, ba={cfg.decoding.ngram_lm_alpha}, ma={cfg.decoding.maes_prefix_alpha}, mg={cfg.decoding.maes_expansion_gamma}, hat_ilmw={cfg.decoding.hat_ilm_weight}"
        it = tqdm(range(num_batches), desc=description, ncols=120)
    else:
        it = range(num_batches)

    out_file = open(preds_output_file, 'w', encoding='utf_8', newline='\n') if preds_output_file else None
    # try/finally guarantees the predictions file is closed on every exit path,
    # including the greedy_batch early return below and any decoding error.
    try:
        for batch_idx in it:
            probs_batch = all_probs[batch_idx * beam_batch_size : (batch_idx + 1) * beam_batch_size]
            probs_lens = torch.tensor([prob.shape[-1] for prob in probs_batch])
            with torch.no_grad():
                # Zero-pad every sample to the longest sequence in the batch.
                packed_batch = torch.zeros(len(probs_batch), probs_batch[0].shape[0], max(probs_lens), device='cpu')
                for prob_index in range(len(probs_batch)):
                    packed_batch[prob_index, :, : probs_lens[prob_index]] = torch.tensor(
                        probs_batch[prob_index].unsqueeze(0), device=packed_batch.device, dtype=packed_batch.dtype
                    )
                best_hyp_batch, beams_batch = model.decoding.rnnt_decoder_predictions_tensor(
                    packed_batch, probs_lens, return_hypotheses=True,
                )
            if cfg.decoding_strategy == "greedy_batch":
                # Greedy decoding yields one hypothesis per sample; wrap it as a one-item beam.
                beams_batch = [[x] for x in best_hyp_batch]

            for beams_idx, beams in enumerate(beams_batch):
                target = target_transcripts[sample_idx + beams_idx]
                target_split_w = target.split()
                target_split_c = list(target)
                words_count += len(target_split_w)
                chars_count += len(target_split_c)
                wer_dist_min = cer_dist_min = 10000
                for candidate_idx, candidate in enumerate(beams):
                    pred_text = candidate.text
                    pred_split_w = pred_text.split()
                    wer_dist = editdistance.eval(target_split_w, pred_split_w)
                    pred_split_c = list(pred_text)
                    cer_dist = editdistance.eval(target_split_c, pred_split_c)

                    # Track the best candidate in the beam for the oracle scores.
                    wer_dist_min = min(wer_dist_min, wer_dist)
                    cer_dist_min = min(cer_dist_min, cer_dist)

                    if candidate_idx == 0:
                        wer_dist_first += wer_dist
                        cer_dist_first += cer_dist

                    score = candidate.score
                    if out_file is not None:
                        out_file.write('{}\t{}\n'.format(pred_text, score))
                wer_dist_best += wer_dist_min
                cer_dist_best += cer_dist_min
            sample_idx += len(probs_batch)
    finally:
        if out_file is not None:
            out_file.close()

    if cfg.decoding_strategy == "greedy_batch":
        return wer_dist_first / words_count, cer_dist_first / chars_count

    if preds_output_file:
        logging.info(f"Stored the predictions of {cfg.decoding_strategy} decoding at '{preds_output_file}'.")

    if cfg.decoding.ngram_lm_model:
        logging.info(
            f"WER/CER with {cfg.decoding_strategy} decoding and N-gram model = {wer_dist_first / words_count:.2%}/{cer_dist_first / chars_count:.2%}"
        )
    else:
        logging.info(
            f"WER/CER with {cfg.decoding_strategy} decoding = {wer_dist_first / words_count:.2%}/{cer_dist_first / chars_count:.2%}"
        )
    logging.info(
        f"Oracle WER/CER in candidates with perfect LM= {wer_dist_best / words_count:.2%}/{cer_dist_best / chars_count:.2%}"
    )
    logging.info(f"=================================================================================")

    return wer_dist_first / words_count, cer_dist_first / chars_count


@hydra_runner(config_path=None, config_name='EvalBeamSearchNGramConfig', schema=EvalBeamSearchNGramConfig)
def main(cfg: EvalBeamSearchNGramConfig):
    """
    Evaluate an ASR model with the configured decoding strategy and log WER/CER.

    Steps: validate the strategy, load the model, validate the optional KenLM file,
    obtain per-sample probabilities (from the pickle cache or by transcribing the
    manifest), then call ``decoding_step`` once per hyper-parameter combination.

    Raises:
        ValueError: for an unknown decoding strategy, or a KenLM model combined with
            a strategy other than "maes".
        FileNotFoundError: if ``cfg.kenlm_model_file`` is set but does not exist.
    """
    if is_dataclass(cfg):
        cfg = OmegaConf.structured(cfg)  # type: EvalBeamSearchNGramConfig

    valid_decoding_strategis = ["greedy_batch", "beam", "tsd", "alsd", "maes"]
    if cfg.decoding_strategy not in valid_decoding_strategis:
        raise ValueError(
            f"Given decoding_strategy={cfg.decoding_strategy} is invalid. Available options are :\n"
            f"{valid_decoding_strategis}"
        )

    # A '.nemo' suffix means a local checkpoint; anything else is looked up as a pretrained model name.
    if cfg.nemo_model_file.endswith('.nemo'):
        asr_model = nemo_asr.models.ASRModel.restore_from(cfg.nemo_model_file, map_location=torch.device(cfg.device))
    else:
        logging.warning(
            "nemo_model_file does not end with .nemo, therefore trying to load a pretrained model with this name."
        )
        asr_model = nemo_asr.models.ASRModel.from_pretrained(
            cfg.nemo_model_file, map_location=torch.device(cfg.device)
        )

    if cfg.kenlm_model_file:
        if not os.path.exists(cfg.kenlm_model_file):
            raise FileNotFoundError(f"Could not find the KenLM model file '{cfg.kenlm_model_file}'.")
        if cfg.decoding_strategy != "maes":
            raise ValueError(f"Decoding with kenlm model is supported only for maes decoding algorithm.")
        lm_path = cfg.kenlm_model_file
    else:
        lm_path = None
        # Without an LM there is nothing to weight, so the alpha list collapses to a single 0.0.
        cfg.beam_alpha = [0.0]
    if cfg.hat_subtract_ilm:
        assert lm_path, "kenlm must be set for hat internal lm subtraction"

    # The maes-specific lists are reduced to one placeholder value for all other strategies.
    if cfg.decoding_strategy != "maes":
        cfg.maes_expansion_gamma = [1.0]
        cfg.maes_prefix_alpha = [1.0]

    if cfg.device == "cuda" and not torch.cuda.is_available():
        logging.warning("You have set device=cuda but no CUDA devices found. Setting device=cpu instead.")
        cfg.device = "cpu"
    elif cfg.device == "cpu" and torch.cuda.is_available():
        logging.warning("You have set device=cpu, but there are available CUDA devices. Using CPU for inference.")

    if cfg.probs_cache_file and os.path.exists(cfg.probs_cache_file):
        logging.info(f"Restoring the probs cache from '{cfg.probs_cache_file}'.")
        # NOTE(review): pickle.load executes arbitrary code from the file; only use cache files you created.
        with open(cfg.probs_cache_file, "rb") as cache_f:
            probs_dict = pickle.load(cache_f)
    else:
        logging.info(f"Computing and caching the probabilities of samples in '{cfg.input_manifest}'.")
        probs_dict = {}
        # NOTE(review): confirm that the installed NeMo version's transcribe() accepts these
        # keyword arguments and yields batches of objects exposing audio_file, feature_probs
        # and tokens — none of that can be verified from this file.
        for test_batch in asr_model.transcribe(
            paths2audio_files=cfg.input_manifest,
            batch_size=cfg.acoustic_batch_size,
            num_workers=cfg.num_workers,
            return_hypotheses=False,
            use_amp=cfg.use_amp,
            channel_selector=None,
        ):
            for test_pred in test_batch:
                probs_dict[test_pred.audio_file] = (test_pred.feature_probs.cpu(), test_pred.tokens.cpu())
        if cfg.probs_cache_file:
            with open(cfg.probs_cache_file, "wb") as cache_f:
                pickle.dump(probs_dict, cache_f)

    # NOTE(review): `dict_config` is not among the fields declared on EvalBeamSearchNGramConfig
    # in this file — confirm where this parameter grid is expected to come from.
    for grid_idx, params in enumerate(ParameterGrid(cfg.dict_config)):
        for k, v in params.items():
            OmegaConf.update(cfg, k, v, merge=True)

        preds_output_file = None
        if cfg.preds_output_folder:
            preds_output_file = os.path.join(
                cfg.preds_output_folder,
                f"beam_search_preds_bs={cfg.decoding.beam_size}_ba={cfg.decoding.ngram_lm_alpha}_ma={cfg.decoding.maes_prefix_alpha}_mg={cfg.decoding.maes_expansion_gamma}_ilmw={cfg.decoding.hat_ilm_weight}.txt",
            )
            os.makedirs(cfg.preds_output_folder, exist_ok=True)

        all_probs = []
        target_transcripts = []
        # NOTE(review): the dictionary keys (audio file identifiers when computed above) are
        # appended as the reference transcripts, and decoding_step scores predictions against
        # them as text — the manifest's reference text is probably what was intended; confirm.
        for file, (probs, _) in probs_dict.items():
            all_probs.append(probs)
            target_transcripts.append(file)

        wer, cer = decoding_step(
            asr_model,
            cfg,
            all_probs,
            target_transcripts,
            preds_output_file=preds_output_file,
            beam_batch_size=cfg.beam_batch_size,
        )
        logging.info(f"WER/CER = {wer:.2%}/{cer:.2%}")


if __name__ == "__main__":
    main()


In [None]:
# Run the eval_beamsearch_ngram_ctc.py file stored on disk with the trained KenLM model.
# NOTE(review): no visible cell writes the edited code from the %load cell back to this
# path, so this command presumably runs the script as cloned (e.g. `beam_beta` is passed
# here but is not a field of the config class shown in that cell) — confirm.
!python /kaggle/working/NeMo/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_ctc.py nemo_model_file="/kaggle/working/final_asr_model.nemo" \
       input_manifest="/kaggle/input/dataset-ja/banana.json" \
       kenlm_model_file="/kaggle/working/kenlm_model.binary" \
       beam_width="[128, 256]" \
       beam_alpha="[0.5, 1.0]" \
       beam_beta="[0.5, 1.0]" \
       preds_output_folder="/kaggle/working/predictions" \
       probs_cache_file=null \
       decoding_mode=beamsearch_ngram \
       decoding_strategy="beam"
