In [None]:
"""
You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.
Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
5. Restart the runtime (Runtime -> Restart Runtime) for any upgraded packages to take effect
NOTE: User is responsible for checking the content of datasets and the applicable licenses and determining if suitable for the intended use.
"""
# Install dependencies
!pip install wget
!apt-get install -y sox libsndfile1 ffmpeg
!pip install text-unidecode
!pip install matplotlib>=3.3.2
## Install NeMo
BRANCH = 'r2.0.0rc0'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH  #egg=nemo_toolkit[all] 
## Grab the config we'll use in this example
!mkdir configs
!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/citrinet/config_bpe.yaml
!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/conformer/conformer_ctc_bpe.yaml

"""
Remember to restart the runtime for the kernel to pick up any upgraded packages (e.g. matplotlib)!
Alternatively, you can uncomment the exit() below to crash and restart the kernel, in the case
that you want to use the "Run All Cells" (or similar) option.
"""
# exit()

Dependencies

In [None]:
# Clone the SphinxSpeech repository into the current working directory.
# A later transcription cell loads /kaggle/working/SphinxSpeech/Model/first_model.nemo,
# so this presumably has to run from /kaggle/working -- TODO confirm.
!git clone https://github.com/AmirKaseb/SphinxSpeech.git

In [None]:
# Clone the ASR-Squad repository into the current working directory.
# The training configs point their tokenizer at
# /kaggle/working/ASR-Squad/Tokenizers/tokenizer_spe_unigram_v64 from this checkout.
!git clone https://github.com/Alkholy53/ASR-Squad.git

In [None]:
! pip install editdistance
!pip install webdataset
!pip install pyannote.metrics
!pip install einops
! pip install pyannote.core
! pip install inflect
! pip install hydra.core
! pip install lhotse
!pip install numpy soundfile joblib omegaconf lhotse
! pip install jiwer
#pip install -r /kaggle/working/SphinxSpeech/requirements.txt

In [None]:
!conda install -y gdown

In [None]:
!gdown --id 1-s-kBiyEabNHIB5AKeSExsC8Xy7uA9TJ 

In [None]:
!gdown --id 1hbdyqUKfmInvssLCsWNK1gebcKF5GcZO

In [None]:
import os

# Define the path to your ZIP file
zip_file_path = '/kaggle/working/tokenizers_v2.zip'
unzip_dir = '/kaggle/working/tokenizers_v2/'

# Create the directory to unzip into
os.makedirs(unzip_dir, exist_ok=True)

# Unzip the file
!unzip -q {zip_file_path} -d {unzip_dir}

# Check the contents of the directory to ensure the files were unzipped
os.listdir(unzip_dir)


In [None]:
%%writefile /kaggle/working/configs/conformer_ctc_bpe.yaml
# Baseline Conformer-CTC (BPE) training configuration.
# NOTE: the next notebook cell writes a different configuration to this same path
# (/kaggle/working/configs/conformer_ctc_bpe.yaml). When the cells are run in order, that
# later version replaces this one before the training script loads it.
name: "Conformer-CTC-BPE"

model:
  sample_rate: 16000
  log_prediction: true # enables logging sample predictions in the output during training
  ctc_reduction: 'mean_batch'
  skip_nan_grad: false

  # Training set: JSON manifest plus data-loader settings.
  train_ds:
    manifest_filepath: "/kaggle/input/dataset-ja/final_train.json"
    sample_rate: ${model.sample_rate}
    batch_size: 16 # you may increase batch_size if your memory allows
    shuffle: true
    num_workers: 8
    pin_memory: true
    max_duration: 28 # this value was chosen for LibriSpeech; you may need to update it for your dataset
    min_duration: 0.384
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  # Validation set used during training (exp_manager below monitors "val_wer").
  validation_ds:
    manifest_filepath: "/kaggle/input/dataset-ja/banana.json"
    sample_rate: ${model.sample_rate}
    batch_size: 16 # you may increase batch_size if your memory allows
    shuffle: false
    use_start_end_token: false
    num_workers: 8
    pin_memory: true

  # Test set; the training script runs a test pass when this manifest path is not null.
  test_ds:
    manifest_filepath: "/kaggle/input/dataset-ja/final_test.json"
    sample_rate: ${model.sample_rate}
    batch_size: 16 # you may increase batch_size if your memory allows
    shuffle: false
    use_start_end_token: false
    num_workers: 8
    pin_memory: true

  # It is recommended to use an SPE Unigram tokenizer with a small vocab size of 128 or 256 when using 4x sub-sampling.
  # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py
  tokenizer:
    dir: "/kaggle/working/ASR-Squad/Tokenizers/tokenizer_spe_unigram_v64"  # path to directory which contains either tokenizer.model (bpe) or vocab.txt (wpe)
    type: bpe  # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer)

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: ${model.sample_rate}
    normalize: "per_feature"
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    features: 80
    n_fft: 512
    log: true
    frame_splicing: 1
    dither: 0.00001
    pad_to: 0
    pad_value: 0.0

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    freq_masks: 2 # set to zero to disable it
    # you may use lower time_masks for smaller models to have a faster convergence
    time_masks: 5 # set to zero to disable it
    freq_width: 27
    time_width: 0.05

  encoder:
    _target_: nemo.collections.asr.modules.ConformerEncoder
    feat_in: ${model.preprocessor.features}
    feat_out: -1 # you may set it if you need different output size other than the default d_model
    n_layers: 16
    d_model: 176

    # Sub-sampling params
    subsampling: striding # vggnet, striding, stacking or stacking_norm, dw_striding
    subsampling_factor: 4 # must be power of 2 for striding and vggnet
    subsampling_conv_channels: -1 # -1 sets it to d_model
    causal_downsampling: false

    # Feed forward module's params
    ff_expansion_factor: 4

    # Multi-headed Attention Module's params
    self_attention_model: rel_pos # rel_pos or abs_pos
    n_heads: 4 # may need to be lower for smaller d_models
    # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
    att_context_size: [-1, -1] # -1 means unlimited context
    att_context_style: regular # regular or chunked_limited
    xscaling: true # scales up the input embeddings by sqrt(d_model)
    untie_biases: true # unties the biases of the TransformerXL layers
    pos_emb_max_len: 5000

    # Convolution module's params
    conv_kernel_size: 31
    conv_norm_type: 'batch_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups)
    # conv_context_size can be "causal" or a list of two integers where conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size
    # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0]
    conv_context_size: null

    ### regularization
    dropout: 0.1 # The dropout used in most of the Conformer Modules
    dropout_pre_encoder: 0.1 # The dropout used before the encoder
    dropout_emb: 0.0 # The dropout used for embeddings
    dropout_att: 0.1 # The dropout for multi-headed attention modules

    # set to non-zero to enable stochastic depth
    stochastic_depth_drop_prob: 0.0
    stochastic_depth_mode: linear  # linear or uniform
    stochastic_depth_start_layer: 1

  decoder:
    _target_: nemo.collections.asr.modules.ConvASRDecoder
    feat_in: null
    num_classes: -1
    vocabulary: []

  # config for InterCTC loss: https://arxiv.org/abs/2102.03216
  # specify loss weights and which layers to use for InterCTC
  # e.g., to reproduce the paper results, set loss_weights: [0.3]
  # and apply_at_layers: [8] (the paper example assumes 18 layers; this config sets n_layers: 16).
  # Note that the final layer loss coefficient is automatically adjusted (to 0.7 in the above example)
  interctc:
    loss_weights: []
    apply_at_layers: []

  optim:
    name: adamw
    lr: 5.0
    # optimizer arguments
    betas: [0.9, 0.98]
    # less necessity for weight_decay as we already have large augmentations with SpecAug
    # you may need weight_decay for large models, stable AMP training, small datasets, or when lower augmentations are used
    # weight decay of 0.0 with lr of 2.0 also works fine
    weight_decay: 1e-3

    # scheduler setup
    sched:
      name: NoamAnnealing
      d_model: ${model.encoder.d_model}
      # scheduler config override
      warmup_steps: 10000
      warmup_ratio: null
      min_lr: 1e-6

# Settings expanded into pl.Trainer(**cfg.trainer) by the training script.
trainer:
  devices: -1 # number of GPUs, -1 would use all available GPUs
  num_nodes: 1
  max_epochs: 100
  max_steps: -1 # computed at runtime if not set
  val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
  accelerator: auto
  strategy: ddp
  accumulate_grad_batches: 1
  gradient_clip_val: 0.0
  precision: 32  # 16, 32, or bf16
  log_every_n_steps: 10  # Interval of logging.
  enable_progress_bar: True
  num_sanity_val_steps: 0 # number of validation steps run as a sanity check before training starts; 0 disables the check
  check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs
  sync_batchnorm: true
  enable_checkpointing: False  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  benchmark: false # needs to be false for models with variable-length speech input as it slows down training

# Settings passed to NeMo's exp_manager (logging and checkpointing) by the training script.
exp_manager:
  exp_dir: "/kaggle/working/results"
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    # in case of multiple validation sets, first one is used
    monitor: "val_wer"
    mode: "min"
    save_top_k: 5
    always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints

  # you need to set these two to True to continue the training
  resume_if_exists: false
  resume_ignore_no_checkpoint: false

  # You may use this section to create a W&B logger
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null


In [None]:
%%writefile /kaggle/working/configs/conformer_ctc_bpe.yaml
# Tuned Conformer-CTC (BPE) training configuration.
# This cell writes to the same path as the previous cell's config, so when both are run in
# order this is the version the training script loads. Relative to the previous cell it uses:
# batch_size 32 (was 16), num_workers 4 (was 8), lighter SpecAugment (1 freq / 2 time masks,
# was 2 / 5), a larger encoder (18 layers, d_model 256; was 16 / 176), 20 epochs (was 100),
# precision 16, gradient accumulation of 2, sync_batchnorm off and save_top_k 1 (was 5).
name: "Conformer-CTC-BPE"

model:
  sample_rate: 16000
  log_prediction: true
  ctc_reduction: 'mean_batch'
  skip_nan_grad: false

  # Training set: JSON manifest plus data-loader settings.
  train_ds:
    manifest_filepath: "/kaggle/input/dataset-ja/final_train.json"
    sample_rate: ${model.sample_rate}
    batch_size: 32  # Increased batch size
    shuffle: true
    num_workers: 4  # Reduced to suggested maximum workers
    pin_memory: true
    max_duration: 28
    min_duration: 0.384
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  # Validation set used during training (exp_manager below monitors "val_wer").
  validation_ds:
    manifest_filepath: "/kaggle/input/dataset-ja/banana.json"
    sample_rate: ${model.sample_rate}
    batch_size: 32  # Increased batch size
    shuffle: false
    use_start_end_token: false
    num_workers: 4  # Reduced to suggested maximum workers
    pin_memory: true

  # Test set; the training script runs a test pass when this manifest path is not null.
  test_ds:
    manifest_filepath: "/kaggle/input/dataset-ja/final_test.json"
    sample_rate: ${model.sample_rate}
    batch_size: 32  # Increased batch size
    shuffle: false
    use_start_end_token: false
    num_workers: 4  # Reduced to suggested maximum workers
    pin_memory: true

  # Tokenizer directory comes from the cloned ASR-Squad repository.
  tokenizer:
    dir: "/kaggle/working/ASR-Squad/Tokenizers/tokenizer_spe_unigram_v64"
    type: bpe

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: ${model.sample_rate}
    normalize: "per_feature"
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    features: 80
    n_fft: 512
    log: true
    frame_splicing: 1
    dither: 0.00001
    pad_to: 0
    pad_value: 0.0

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    freq_masks: 1  # Slight reduction
    time_masks: 2  # Slight reduction
    freq_width: 27
    time_width: 0.05

  encoder:
    _target_: nemo.collections.asr.modules.ConformerEncoder
    feat_in: ${model.preprocessor.features}
    feat_out: -1
    n_layers: 18
    d_model: 256
    subsampling: striding
    subsampling_factor: 4
    subsampling_conv_channels: -1
    causal_downsampling: false
    ff_expansion_factor: 4
    self_attention_model: rel_pos
    n_heads: 4
    att_context_size: [-1, -1]
    att_context_style: regular
    xscaling: true
    untie_biases: true
    pos_emb_max_len: 5000
    conv_kernel_size: 31
    conv_norm_type: 'batch_norm'
    conv_context_size: null
    dropout: 0.1
    dropout_pre_encoder: 0.1
    dropout_emb: 0.0
    dropout_att: 0.1
    stochastic_depth_drop_prob: 0.0
    stochastic_depth_mode: linear
    stochastic_depth_start_layer: 1

  decoder:
    _target_: nemo.collections.asr.modules.ConvASRDecoder
    feat_in: null
    num_classes: -1
    vocabulary: []

  # InterCTC is left disabled (empty weight and layer lists).
  interctc:
    loss_weights: []
    apply_at_layers: []

  optim:
    name: adamw
    lr: 5.0
    betas: [0.9, 0.98]
    weight_decay: 1e-3

    # The scheduler takes d_model from the encoder section above.
    sched:
      name: NoamAnnealing
      d_model: ${model.encoder.d_model}
      warmup_steps: 10000
      min_lr: 1e-6

# Settings expanded into pl.Trainer(**cfg.trainer) by the training script.
trainer:
  devices: -1
  num_nodes: 1
  max_epochs: 20
  max_steps: -1
  val_check_interval: 1.0
  accelerator: auto
  strategy: ddp
  accumulate_grad_batches: 2  # Gradient accumulation
  gradient_clip_val: 0.0
  precision: 16  # Mixed precision training
  log_every_n_steps: 50  # Reduced logging frequency
  enable_progress_bar: True
  num_sanity_val_steps: 0
  check_val_every_n_epoch: 1
  sync_batchnorm: false  # Disable Sync BatchNorm
  enable_checkpointing: False
  logger: false
  benchmark: false

# Settings passed to NeMo's exp_manager (logging and checkpointing) by the training script.
exp_manager:
  exp_dir: "/kaggle/working/results"
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: "val_wer"
    mode: "min"
    save_top_k: 1  # Save only the best checkpoint
    always_save_nemo: True

  resume_if_exists: false
  resume_ignore_no_checkpoint: false
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null


In [None]:
# Pre-create an empty training script in the current directory.
# NOTE(review): the following `%%writefile speech_to_text_ctc_bpe.py` cell writes this same
# relative path itself, so this step looks redundant -- confirm before removing.
!touch speech_to_text_ctc_bpe.py

In [None]:
%%writefile speech_to_text_ctc_bpe.py


import pytorch_lightning as pl
from omegaconf import OmegaConf

from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager


@hydra_runner(config_path="/kaggle/working/configs/", config_name="conformer_ctc_bpe")
def main(cfg):
    """
    Train a Conformer-CTC BPE model from the Hydra config, export it, then optionally test it.

    Args:
        cfg: Hydra/OmegaConf config loaded from
            /kaggle/working/configs/conformer_ctc_bpe.yaml. Its `trainer` section is expanded
            into `pl.Trainer`, `exp_manager` is passed to NeMo's `exp_manager`, and `model`
            configures `EncDecCTCModelBPE`.

    Side effects:
        Writes the trained model to /kaggle/working/final_asr_model.nemo.
    """
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    asr_model = EncDecCTCModelBPE(cfg=cfg.model, trainer=trainer)

    # Initialize the weights of the model from another model, if provided via config
    asr_model.maybe_init_from_pretrained_checkpoint(cfg)

    trainer.fit(asr_model)

    # Export the trained model BEFORE the optional test pass, so that a failure while
    # testing (e.g. a bad test manifest) cannot discard the result of the training run.
    final_model_path = '/kaggle/working/final_asr_model.nemo'
    asr_model.save_to(final_model_path)
    logging.info(f'Model saved at {final_model_path}')

    # Run the test set only when one is configured and the model reports it is ready.
    if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None:
        if asr_model.prepare_test(trainer):
            trainer.test(asr_model)

if __name__ == '__main__':
    main()

In [None]:
!mkdir results

In [None]:
# Launch training with the script written by the `%%writefile` cell above.
# NOTE(review): that cell writes to a relative path while this one uses an absolute path,
# so the notebook's working directory is assumed to be /kaggle/working -- confirm.
!python /kaggle/working/speech_to_text_ctc_bpe.py

In [None]:
# NeMo branch to download the example scripts from. It repeats the value used in the setup
# cell, presumably so this cell still works after the kernel restart that cell asks for.
BRANCH = 'r2.0.0rc0'

# Fetch NeMo's transcription and evaluation example scripts into configs/.
!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/transcribe_speech.py
!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/speech_to_text_eval.py

In [None]:
# Transcribe the test manifest and write the predictions to test_with_predictions.json.
# NOTE(review): this uses first_model.nemo from the cloned SphinxSpeech repository, not the
# /kaggle/working/final_asr_model.nemo produced by the training cell -- confirm this is intended.
!python /kaggle/working/configs/transcribe_speech.py \
  model_path="/kaggle/working/SphinxSpeech/Model/first_model.nemo" \
  dataset_manifest="/kaggle/input/dataset-ja/final_test.json" \
  output_filename="/kaggle/working/test_with_predictions.json" 


In [None]:
#!python /kaggle/working/configs/transcribe_speech.py \
  #model_path="/kaggle/working/final_asr_model.nemo" \
  #dataset_manifest="/kaggle/input/dataset-ja/final_test.json" \
  #output_filename="/kaggle/working/test_with_predictions.json" \
  #batch_size=8 \
 # cuda=1 \
  #amp=True

In [None]:

# Both commands score the manifest written by the transcription cell
# (/kaggle/working/test_with_predictions.json); they differ only in `use_cer`.
# `only_score_manifest=True` presumably scores the predictions already stored in the
# manifest instead of transcribing again -- TODO confirm against speech_to_text_eval.py.

# Calculate WER (word error rate)
!python /kaggle/working/configs/speech_to_text_eval.py \
  dataset_manifest="/kaggle/working/test_with_predictions.json" \
  use_cer=False \
  only_score_manifest=True

# Calculate CER (character error rate)
!python /kaggle/working/configs/speech_to_text_eval.py \
  dataset_manifest="/kaggle/working/test_with_predictions.json" \
  use_cer=True \
  only_score_manifest=True

In [None]:
# Clone the NeMo sources; later cells overwrite and run scripts under
# NeMo/scripts/asr_language_modeling/ngram_lm inside this checkout.
# NOTE(review): this clones the repository's default branch, while the installed package
# comes from BRANCH 'r2.0.0rc0' (setup cell) -- confirm the scripts match that version.
!git clone https://github.com/NVIDIA/NeMo.git

In [None]:
%%writefile /kaggle/working/NeMo/scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh
#!/usr/bin/env bash
# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Use this script to install KenLM, OpenSeq2Seq decoder, Flashlight decoder.
#
# Usage: install_beamsearch_decoders.sh [NEMO_PATH]
#   NEMO_PATH  optional path to the NeMo checkout (default: /kaggle/working/NeMo)

NEMO_PATH=/kaggle/working/NeMo  # Path to NeMo folder: /workspace/nemo if you use NeMo/Dockerfile
if [ "$#" -eq 1 ]; then
  NEMO_PATH=$1
fi
KENLM_MAX_ORDER=10 # Maximum order of KenLM model, also specified in the setup_os2s_decoders.py

if [ -d "$NEMO_PATH" ]; then
  echo "The folder '$NEMO_PATH' exists."
else
  echo "Error: The folder '$NEMO_PATH' does not exist. Specify it as a first command line positional argument!"
  exit 1
fi
cd "$NEMO_PATH" || exit 1

# Privileged commands run directly when already root and through sudo otherwise.
# $SUDO is intentionally left unquoted below so that it vanishes when empty.
if [ "$(id -u)" -eq 0 ]; then
  SUDO=""
else
  SUDO="sudo"
fi

# liblzma is needed for the flashlight decoder
$SUDO apt-get update && $SUDO apt-get upgrade -y && $SUDO apt-get install -y swig liblzma-dev && $SUDO rm -rf /var/lib/apt/lists/*

# install Boost package for KenLM
# The archive is fetched from archives.boost.io (the former boostorg.jfrog.io host has been
# retired) with TLS certificate verification left enabled.
wget https://archives.boost.io/release/1.80.0/source/boost_1_80_0.tar.bz2 && tar --bzip2 -xf "$NEMO_PATH/boost_1_80_0.tar.bz2" && cd boost_1_80_0 && ./bootstrap.sh && $SUDO ./b2 --layout=tagged link=static,shared threading=multi,single install -j4 && cd .. || echo FAILURE
export BOOST_ROOT="$NEMO_PATH/boost_1_80_0"

# Fetch the OpenSeq2Seq CTC decoders (branch ctc-decoders) and keep only the decoders folder.
git clone https://github.com/NVIDIA/OpenSeq2Seq
cd OpenSeq2Seq
git checkout ctc-decoders
cd ..
mv OpenSeq2Seq/decoders "$NEMO_PATH/"
rm -rf OpenSeq2Seq
cd "$NEMO_PATH/decoders"
cp "$NEMO_PATH/scripts/installers/setup_os2s_decoders.py" ./setup.py
./setup.sh

# install KenLM
cd "$NEMO_PATH/decoders/kenlm/build" && cmake -DKENLM_MAX_ORDER="$KENLM_MAX_ORDER" .. && make -j2
cd "$NEMO_PATH/decoders/kenlm"
python setup.py install --max_order="$KENLM_MAX_ORDER"
export KENLM_LIB="$NEMO_PATH/decoders/kenlm/build/bin"
export KENLM_ROOT="$NEMO_PATH/decoders/kenlm"
cd ..

# install Flashlight
git clone https://github.com/flashlight/text && cd text
python setup.py bdist_wheel
pip install dist/*.whl
cd ..


In [None]:
pwd

In [None]:
cd /kaggle/working/NeMo/scripts/asr_language_modeling/ngram_lm

In [None]:
!./install_beamsearch_decoders.sh

In [None]:
# Remove the old version of CMake
#!sudo apt-get  remove -y cmake

# Install a newer version of CMake (e.g., from Kitware)
#!sudo apt-get update -y
#!sudo apt-get install -y software-properties-common
#!sudo add-apt-repository ppa:kitware/release
#!sudo apt-get update -y
#!sudo apt-get install -y cmake


In [None]:
%%writefile /kaggle/working/NeMo/scripts/asr_language_modeling/ngram_lm/train_kenlm.py
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# This script would train an N-gram language model with KenLM library (https://github.com/kpu/kenlm) which can be used
# with the beam search decoders on top of the ASR models. This script supports both character level and BPE level
# encodings and models which is detected automatically from the type of the model.
# After the N-gram model is trained, and stored in the binary format, you may use
# 'scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_ctc.py' to evaluate it on an ASR model.
#
# You need to install the KenLM library and also the beam search decoders to use this feature. Please refer
# to 'scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh' on how to install them.
#
# USAGE: python train_kenlm.py nemo_model_file=<path to the .nemo file of the model> \
#                              train_paths=<list of paths to the training text or JSON manifest file> \
#                              kenlm_bin_path=<path to the bin folder of KenLM library> \
#                              kenlm_model_file=<path to store the binary KenLM model> \
#                              ngram_length=<order of N-gram model> \
#
# After training is done, the binary LM model is stored at the path specified by 'kenlm_model_file'.
# You may find more info on how to use this script at:
# https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/asr_language_modeling.html

import logging
import os
os.environ['HYDRA_FULL_ERROR'] = '1'
import subprocess
import sys
from dataclasses import dataclass, field
from glob import glob
from typing import List

from omegaconf import MISSING

# Update the Python path to include the scripts directory
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')))

from scripts.asr_language_modeling.ngram_lm import kenlm_utils

from nemo.core.config import hydra_runner
from nemo.utils import logging

"""
NeMo's beam search decoders only support char-level encodings. In order to make it work with BPE-level encodings, we
use a trick to encode the sub-word tokens of the training data as unicode characters and train a char-level KenLM. 
"""


@dataclass
class TrainKenlmConfig:
    """
    Options for training an N-gram KenLM language model that is later used by the
    beam search decoder of ASR models.
    """

    # --- required options -------------------------------------------------
    # List of training files or folders. Files can be a plain text file or ".json" manifest
    # or ".json.gz". Example: [/path/to/manifest/file,/path/to/folder]
    train_paths: List[str] = MISSING
    # The path to '.nemo' file of the ASR model, or name of a pretrained NeMo model
    nemo_model_file: str = MISSING
    # The path to store the KenLM binary model file
    kenlm_model_file: str = MISSING
    # The order of N-gram LM
    ngram_length: int = MISSING
    # The path to the bin folder of KenLM.
    kenlm_bin_path: str = MISSING

    # --- optional options -------------------------------------------------
    # Whether to preserve the intermediate ARPA file.
    preserve_arpa: bool = False
    # List of digits to prune Ngram. Example: [0,0,1].
    # See Pruning section on the https://kheafield.com/code/kenlm/estimation
    ngram_prune: List[int] = field(default_factory=lambda: [0])
    # Cache path to save tokenized files.
    cache_path: str = ""
    # Verbose level, default is 1.
    verbose: int = 1


@hydra_runner(config_path=None, config_name='TrainKenlmConfig', schema=TrainKenlmConfig)
def main(args: TrainKenlmConfig):
    """
    Train a KenLM N-gram model with `lmplz` and convert it to a binary trie with `build_binary`.

    The training text from `args.train_paths` is encoded through `kenlm_utils.iter_files` with the
    tokenizer obtained from `args.nemo_model_file`. When `args.cache_path` is set, the encoded text
    is written to files in that folder and piped into `lmplz` through `cat`; otherwise it is
    streamed straight into the stdin of `lmplz`. The intermediate ARPA file
    (`<kenlm_model_file>.tmp.arpa`) is deleted afterwards unless `args.preserve_arpa` is set.

    Args:
        args: the resolved `TrainKenlmConfig`.

    Raises:
        RuntimeError: if `lmplz` or `build_binary` exits with a non-zero return code.
    """
    train_paths = kenlm_utils.get_train_list(args.train_paths)

    # A single prune value may arrive as a plain string; normalize it to a list.
    if isinstance(args.ngram_prune, str):
        args.ngram_prune = [args.ngram_prune]

    tokenizer, encoding_level, is_aggregate_tokenizer = kenlm_utils.setup_tokenizer(args.nemo_model_file)

    arpa_file = f"{args.kenlm_model_file}.tmp.arpa"

    # LMPLZ ARGUMENT SETUP
    kenlm_args = [
        os.path.join(args.kenlm_bin_path, 'lmplz'),
        "-o",
        str(args.ngram_length),
        "--arpa",
        arpa_file,
    ]
    if encoding_level == "subword":
        # --discount_fallback is needed for training KenLM for BPE-based models.
        # The flag is only appended when needed, so no empty-string argument is ever
        # handed to lmplz for other encoding levels.
        kenlm_args.append("--discount_fallback")
    kenlm_args += ["--prune"] + [str(n) for n in args.ngram_prune]

    if args.cache_path:
        os.makedirs(args.cache_path, exist_ok=True)

        # DATASET SETUP
        encoded_train_files = []
        for file_num, train_file in enumerate(train_paths):
            logging.info(f"Encoding the train file '{train_file}' number {file_num+1} out of {len(train_paths)} ...")

            cached_files = glob(os.path.join(args.cache_path, os.path.split(train_file)[1]) + "*")
            encoded_train_file = os.path.join(args.cache_path, os.path.split(train_file)[1] + f"_{file_num}.tmp.txt")
            if (
                cached_files and cached_files[0] != encoded_train_file
            ):  # cached_files exists but has another file name: f"_{file_num}.tmp.txt"
                os.rename(cached_files[0], encoded_train_file)
                # A single formatted message: extra positional arguments would be treated as
                # %-format arguments by the logger and fail to render.
                logging.info(f"Rename {cached_files[0]} to {encoded_train_file}")

            encoded_train_files.append(encoded_train_file)

        kenlm_utils.iter_files(
            source_path=train_paths,
            dest_path=encoded_train_files,
            tokenizer=tokenizer,
            encoding_level=encoding_level,
            is_aggregate_tokenizer=is_aggregate_tokenizer,
            verbose=args.verbose,
        )

        # Feed all encoded files to lmplz through a `cat` pipeline.
        first_process_args = ["cat"] + encoded_train_files
        first_process = subprocess.Popen(first_process_args, stdout=subprocess.PIPE, stderr=sys.stderr)

        logging.info(f"Running lmplz command \n\n{' '.join(kenlm_args)}\n\n")
        kenlm_p = subprocess.run(
            kenlm_args,
            stdin=first_process.stdout,
            capture_output=False,
            text=True,
            stdout=sys.stdout,
            stderr=sys.stderr,
        )
        # Close our copy of the pipe's read end before waiting: if lmplz exited early, `cat`
        # then receives SIGPIPE instead of blocking forever on a pipe nobody reads.
        first_process.stdout.close()
        first_process.wait()

    else:
        logging.info(f"Running lmplz command \n\n{' '.join(kenlm_args)}\n\n")
        kenlm_p = subprocess.Popen(kenlm_args, stdout=sys.stdout, stdin=subprocess.PIPE, stderr=sys.stderr)

        # Stream the encoded text directly into the stdin of lmplz.
        kenlm_utils.iter_files(
            source_path=train_paths,
            dest_path=kenlm_p.stdin,
            tokenizer=tokenizer,
            encoding_level=encoding_level,
            is_aggregate_tokenizer=is_aggregate_tokenizer,
            verbose=args.verbose,
        )

        # Closes stdin and waits for lmplz to finish.
        kenlm_p.communicate()

    if kenlm_p.returncode != 0:
        raise RuntimeError("Training KenLM was not successful!")

    # BINARY BUILD
    kenlm_args = [
        os.path.join(args.kenlm_bin_path, "build_binary"),
        "trie",
        arpa_file,
        args.kenlm_model_file,
    ]
    logging.info(f"Running binary_build command \n\n{' '.join(kenlm_args)}\n\n")
    ret = subprocess.run(kenlm_args, capture_output=False, text=True, stdout=sys.stdout, stderr=sys.stderr)

    if ret.returncode != 0:
        raise RuntimeError("Training KenLM was not successful!")

    if not args.preserve_arpa:
        os.remove(arpa_file)
        logging.info(f"Deleted the arpa file '{arpa_file}'.")


if __name__ == '__main__':
    main()


In [None]:
cd /kaggle/working/NeMo

In [None]:
# Train a 6-gram KenLM model on the training manifest, using the tokenizer of the model saved
# by the training script (final_asr_model.nemo) and the KenLM binaries built by the installer.
# `preserve_arpa=true` keeps the intermediate `<kenlm_model_file>.tmp.arpa` file.
!python /kaggle/working/NeMo/scripts/asr_language_modeling/ngram_lm/train_kenlm.py nemo_model_file="/kaggle/working/final_asr_model.nemo" \
                          train_paths="[\"/kaggle/input/dataset-ja/final_train.json\"]" \
                          kenlm_bin_path="/kaggle/working/NeMo/decoders/kenlm/build/bin" \
                          kenlm_model_file="/kaggle/working/kenlm_model.binary" \
                          ngram_length=6 \
                          preserve_arpa=true


In [None]:
import os

# Set HYDRA_FULL_ERROR in this kernel's environment (presumably so that Hydra-based scripts
# launched from later cells print complete stack traces -- confirm against the Hydra docs).
os.environ.update(HYDRA_FULL_ERROR='1')

In [None]:
%%writefile /kaggle/working/NeMo/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_ctc.py
# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
# This script would evaluate an N-gram language model trained with KenLM library (https://github.com/kpu/kenlm) in
# fusion with beam search decoders on top of a trained ASR model with CTC decoder. To evaluate a model with 
# Transducer (RNN-T) decoder use another script 'scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py'. 
# NeMo's beam search decoders are capable of using the KenLM's N-gram models
# to find the best candidates. This script supports both character level and BPE level
# encodings and models which is detected automatically from the type of the model.
# You may train the LM model with 'scripts/asr_language_modeling/ngram_lm/train_kenlm.py'.

# Config Help

To discover all arguments of the script, please run :
python eval_beamsearch_ngram_ctc.py --help
python eval_beamsearch_ngram_ctc.py --cfg job

# USAGE

python eval_beamsearch_ngram_ctc.py nemo_model_file=<path to the .nemo file of the model> \
           input_manifest=<path to the evaluation JSON manifest file> \
           kenlm_model_file=<path to the binary KenLM model> \
           beam_width=[<list of the beam widths, separated with commas>] \
           beam_alpha=[<list of the beam alphas, separated with commas>] \
           beam_beta=[<list of the beam betas, separated with commas>] \
           preds_output_folder=<optional folder to store the predictions> \
           probs_cache_file=null \
           decoding_mode=beamsearch_ngram
           ...


# Grid Search for Hyper parameters

For grid search, you can provide a list of arguments as follows -

           beam_width=[4,8,16,....] \
           beam_alpha=[-2.0,-1.0,...,1.0,2.0] \
           beam_beta=[-1.0,-0.5,0.0,...,1.0] \

# You may find more info on how to use this script at:
# https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/asr_language_modeling.html

"""


import contextlib
import json
import os
import pickle
from dataclasses import dataclass, field, is_dataclass
from pathlib import Path
from typing import List, Optional

import editdistance
import numpy as np
import torch
from omegaconf import MISSING, OmegaConf
from sklearn.model_selection import ParameterGrid
from tqdm.auto import tqdm

import nemo.collections.asr as nemo_asr
from nemo.collections.asr.models import EncDecHybridRNNTCTCModel
from nemo.collections.asr.parts.submodules import ctc_beam_decoding
from nemo.collections.asr.parts.utils.transcribe_utils import PunctuationCapitalization, TextProcessingConfig
from nemo.core.config import hydra_runner
from nemo.utils import logging

# fmt: off


def _default_beam_decoding_config():
    """Build the default CTC beam-search decoder config (beam size 128)."""
    return ctc_beam_decoding.BeamCTCInferConfig(beam_size=128)


def _default_text_processing_config():
    """Build the default text-processing config, with every normalisation step switched off."""
    return TextProcessingConfig(
        punctuation_marks=".,?",
        separate_punctuation=False,
        do_lowercase=False,
        rm_punctuation=False,
    )


@dataclass
class EvalBeamSearchNGramConfig:
    """
    Settings for evaluating an ASR model with beam search decoding, optionally fused
    with an n-gram KenLM language model.
    """

    # Path of the '.nemo' file of the ASR model, or the name of a pretrained model (ngc / huggingface).
    nemo_model_file: str = MISSING

    # ---- File paths ----
    # Manifest file of the evaluation set.
    input_manifest: str = MISSING
    # Path of the KenLM binary model file.
    kenlm_model_file: Optional[str] = None
    # Optional folder where the predictions are stored.
    preds_output_folder: Optional[str] = None
    # Cache file for storing the logprobs of the model.
    probs_cache_file: Optional[str] = None

    # ---- Parameters for inference ----
    # Batch size used to calculate log probabilities.
    acoustic_batch_size: int = 16
    # Batch size used for beam search decoding.
    beam_batch_size: int = 128
    # Device to load the model onto to calculate log probabilities.
    device: str = "cuda"
    # Whether to use AMP, if available, to calculate log probabilities.
    use_amp: bool = False

    # ---- Beam search hyperparameters ----
    # Decoding scheme used for evaluation; one of ["greedy", "beamsearch", "beamsearch_ngram"].
    decoding_mode: str = "beamsearch_ngram"

    # Width, or list of widths, for the beam search decoding.
    beam_width: List[int] = field(default_factory=lambda: [128])
    # Alpha parameter, or list of alphas, for the beam search decoding.
    beam_alpha: List[float] = field(default_factory=lambda: [1.0])
    # Beta parameter, or list of betas, for the beam search decoding.
    beam_beta: List[float] = field(default_factory=lambda: [0.0])

    # Strategy name written into the model's decoding config before evaluation.
    decoding_strategy: str = "beam"
    # Beam decoder config; its beam size / alpha / beta are overwritten for every grid-search candidate.
    decoding: ctc_beam_decoding.BeamCTCInferConfig = field(default_factory=_default_beam_decoding_config)

    # Normalisation applied to both the reference transcripts and the predictions.
    text_processing: Optional[TextProcessingConfig] = field(default_factory=_default_text_processing_config)
# fmt: on


def beam_search_eval(
    model: nemo_asr.models.ASRModel,
    cfg: EvalBeamSearchNGramConfig,
    all_probs: List[torch.Tensor],
    target_transcripts: List[str],
    preds_output_file: str = None,
    lm_path: str = None,
    beam_alpha: float = 1.0,
    beam_beta: float = 0.0,
    beam_width: int = 128,
    beam_batch_size: int = 128,
    progress_bar: bool = True,
    punctuation_capitalization: PunctuationCapitalization = None,
):
    """Decode cached acoustic outputs with CTC beam search and score them against the references.

    The model's decoding strategy is reconfigured for the given candidate
    (``beam_width``, ``beam_alpha``, ``beam_beta``), every sample in ``all_probs`` is
    decoded in batches, and word/character edit distances are accumulated for the top
    candidate and for the best candidate in each beam (the "oracle").

    Args:
        model: ASR model whose decoding strategy is switched to beam search (mutated in place).
        cfg: Evaluation config. ``cfg.decoding`` is overwritten with the candidate's
            hyperparameters, and the KenLM path is always taken from ``cfg.kenlm_model_file``.
        all_probs: One per-sample array indexed as ``[time, vocab]``.
        target_transcripts: Reference texts, aligned with ``all_probs``.
        preds_output_file: Optional TSV path; one ``text<TAB>score`` line is written per candidate.
        lm_path: Only selects which summary message is logged; it does not configure the decoder.
        beam_alpha: Beam alpha written into ``cfg.decoding.beam_alpha``.
        beam_beta: Beam beta written into ``cfg.decoding.beam_beta``.
        beam_width: Beam size written into ``cfg.decoding.beam_size``.
        beam_batch_size: Number of samples decoded per beam-search call.
        progress_bar: Whether to wrap the batch loop in a tqdm progress bar.
        punctuation_capitalization: Text normaliser for the predictions. When omitted and a
            text-processing flag is enabled, one is built from ``cfg.text_processing.punctuation_marks``.

    Returns:
        Tuple ``(wer, cer)`` computed from the first (top-ranked) candidate of every sample.
    """
    # Silence the logging emitted while the decoder is re-configured. The previous level
    # is restored in `finally` so a failure here cannot leave logging muted for the caller.
    level = logging.getEffectiveLevel()
    logging.setLevel(logging.CRITICAL)
    try:
        # Reset config
        if isinstance(model, EncDecHybridRNNTCTCModel):
            model.change_decoding_strategy(decoding_cfg=None, decoder_type="ctc")
        else:
            model.change_decoding_strategy(None)

        # Override the beam search config with current search candidate configuration
        cfg.decoding.beam_size = beam_width
        cfg.decoding.beam_alpha = beam_alpha
        cfg.decoding.beam_beta = beam_beta
        # Keep every candidate of the beam: the oracle error rates below need all of them.
        cfg.decoding.return_best_hypothesis = False
        cfg.decoding.kenlm_path = cfg.kenlm_model_file

        # Update model's decoding strategy config
        model.cfg.decoding.strategy = cfg.decoding_strategy
        model.cfg.decoding.beam = cfg.decoding

        # Update model's decoding strategy
        if isinstance(model, EncDecHybridRNNTCTCModel):
            model.change_decoding_strategy(model.cfg.decoding, decoder_type='ctc')
            decoding = model.ctc_decoding
        else:
            model.change_decoding_strategy(model.cfg.decoding)
            decoding = model.decoding
    finally:
        logging.setLevel(level)

    # The normaliser defaults to None; build it on demand so enabling a text-processing
    # flag without passing one does not fail with an AttributeError on None.
    text_cfg = cfg.text_processing
    if punctuation_capitalization is None and (
        text_cfg.do_lowercase or text_cfg.rm_punctuation or text_cfg.separate_punctuation
    ):
        punctuation_capitalization = PunctuationCapitalization(text_cfg.punctuation_marks)

    wer_dist_first = cer_dist_first = 0
    wer_dist_best = cer_dist_best = 0
    words_count = 0
    chars_count = 0
    sample_idx = 0

    num_batches = int(np.ceil(len(all_probs) / beam_batch_size))
    if progress_bar:
        it = tqdm(
            range(num_batches),
            desc=f"Beam search decoding with width={beam_width}, alpha={beam_alpha}, beta={beam_beta}",
            ncols=120,
        )
    else:
        it = range(num_batches)

    # The context manager guarantees the predictions file is closed even if decoding fails
    # midway. nullcontext() yields None when no output file was requested.
    out_file_cm = (
        open(preds_output_file, 'w', encoding='utf_8', newline='\n')
        if preds_output_file
        else contextlib.nullcontext()
    )
    with out_file_cm as out_file:
        for batch_idx in it:
            # disabling type checking
            probs_batch = all_probs[batch_idx * beam_batch_size : (batch_idx + 1) * beam_batch_size]
            probs_lens = torch.tensor([prob.shape[0] for prob in probs_batch])
            with torch.no_grad():
                # Zero-pad every sample up to the longest one in the batch.
                packed_batch = torch.zeros(
                    len(probs_batch), int(probs_lens.max()), probs_batch[0].shape[-1], device='cpu'
                )

                for prob_index in range(len(probs_batch)):
                    packed_batch[prob_index, : probs_lens[prob_index], :] = torch.tensor(
                        probs_batch[prob_index], device=packed_batch.device, dtype=packed_batch.dtype
                    )

                _, beams_batch = decoding.ctc_decoder_predictions_tensor(
                    packed_batch, decoder_lengths=probs_lens, return_hypotheses=True,
                )

            for beams_idx, beams in enumerate(beams_batch):
                target = target_transcripts[sample_idx + beams_idx]
                target_split_w = target.split()
                target_split_c = list(target)
                words_count += len(target_split_w)
                chars_count += len(target_split_c)
                # Sentinel upper bound for the per-sample minimum over the candidates.
                wer_dist_min = cer_dist_min = 10000
                for candidate_idx, candidate in enumerate(beams):  # type: (int, ctc_beam_decoding.rnnt_utils.Hypothesis)
                    pred_text = candidate.text
                    if text_cfg.do_lowercase:
                        pred_text = punctuation_capitalization.do_lowercase([pred_text])[0]
                    if text_cfg.rm_punctuation:
                        pred_text = punctuation_capitalization.rm_punctuation([pred_text])[0]
                    if text_cfg.separate_punctuation:
                        pred_text = punctuation_capitalization.separate_punctuation([pred_text])[0]
                    pred_split_w = pred_text.split()
                    wer_dist = editdistance.eval(target_split_w, pred_split_w)
                    pred_split_c = list(pred_text)
                    cer_dist = editdistance.eval(target_split_c, pred_split_c)

                    wer_dist_min = min(wer_dist_min, wer_dist)
                    cer_dist_min = min(cer_dist_min, cer_dist)

                    if candidate_idx == 0:
                        # first candidate
                        wer_dist_first += wer_dist
                        cer_dist_first += cer_dist

                    score = candidate.score
                    if out_file is not None:
                        out_file.write('{}\t{}\n'.format(pred_text, score))
                wer_dist_best += wer_dist_min
                cer_dist_best += cer_dist_min
            sample_idx += len(probs_batch)

    if preds_output_file:
        logging.info(f"Stored the predictions of beam search decoding at '{preds_output_file}'.")

    if lm_path:
        logging.info(
            'WER/CER with beam search decoding and N-gram model = {:.2%}/{:.2%}'.format(
                wer_dist_first / words_count, cer_dist_first / chars_count
            )
        )
    else:
        logging.info(
            'WER/CER with beam search decoding = {:.2%}/{:.2%}'.format(
                wer_dist_first / words_count, cer_dist_first / chars_count
            )
        )
    logging.info(
        'Oracle WER/CER in candidates with perfect LM= {:.2%}/{:.2%}'.format(
            wer_dist_best / words_count, cer_dist_best / chars_count
        )
    )
    logging.info("=================================================================================")

    return wer_dist_first / words_count, cer_dist_first / chars_count


@hydra_runner(config_path=None, config_name='EvalBeamSearchNGramConfig', schema=EvalBeamSearchNGramConfig)
def main(cfg: EvalBeamSearchNGramConfig):
    """Evaluate an ASR model with greedy decoding and, optionally, a beam-search grid search.

    Steps:
      1. Load the model from a '.nemo' file or by pretrained name.
      2. Read the manifest (audio paths and reference texts) and normalise the references.
      3. Obtain the per-sample acoustic outputs, from the pickle cache when it exists,
         otherwise by transcribing the audio (and writing the cache if a path is configured).
      4. Report greedy WER/CER.
      5. For 'beamsearch' / 'beamsearch_ngram', evaluate every combination of
         ``beam_width`` x ``beam_alpha`` x ``beam_beta`` and log the best WER and CER candidates.

    Raises:
        ValueError: on an invalid ``decoding_mode``, an empty manifest, a cache whose size does
            not match the manifest, or missing beam hyperparameters.
        FileNotFoundError: when 'beamsearch_ngram' is requested and the KenLM file is unset or missing.
    """
    if is_dataclass(cfg):
        cfg = OmegaConf.structured(cfg)  # type: EvalBeamSearchNGramConfig

    valid_decoding_modes = ["greedy", "beamsearch", "beamsearch_ngram"]
    if cfg.decoding_mode not in valid_decoding_modes:
        raise ValueError(
            f"Given decoding_mode={cfg.decoding_mode} is invalid. Available options are :\n" f"{valid_decoding_modes}"
        )

    if cfg.nemo_model_file.endswith('.nemo'):
        asr_model = nemo_asr.models.ASRModel.restore_from(cfg.nemo_model_file, map_location=torch.device(cfg.device))
    else:
        logging.warning(
            "nemo_model_file does not end with .nemo, therefore trying to load a pretrained model with this name."
        )
        asr_model = nemo_asr.models.ASRModel.from_pretrained(
            cfg.nemo_model_file, map_location=torch.device(cfg.device)
        )

    target_transcripts = []
    audio_file_paths = []
    manifest_dir = Path(cfg.input_manifest).parent
    with open(cfg.input_manifest, 'r', encoding='utf_8') as manifest_file:
        for line in tqdm(manifest_file, desc=f"Reading Manifest {cfg.input_manifest} ...", ncols=120):
            if not line.strip():
                # Tolerate blank lines in the JSON-lines manifest instead of failing in json.loads.
                continue
            data = json.loads(line)
            audio_file = Path(data['audio_filepath'])
            # Relative paths that do not resolve from the working directory are taken
            # relative to the manifest's own folder.
            if not audio_file.is_file() and not audio_file.is_absolute():
                audio_file = manifest_dir / audio_file
            target_transcripts.append(data['text'])
            audio_file_paths.append(str(audio_file.absolute()))

    if not audio_file_paths:
        # Without this check the error-rate computation below ends in a ZeroDivisionError.
        raise ValueError(f"The manifest file '{cfg.input_manifest}' does not contain any samples.")

    punctuation_capitalization = PunctuationCapitalization(cfg.text_processing.punctuation_marks)
    if cfg.text_processing.do_lowercase:
        target_transcripts = punctuation_capitalization.do_lowercase(target_transcripts)
    if cfg.text_processing.rm_punctuation:
        target_transcripts = punctuation_capitalization.rm_punctuation(target_transcripts)
    if cfg.text_processing.separate_punctuation:
        target_transcripts = punctuation_capitalization.separate_punctuation(target_transcripts)

    if cfg.probs_cache_file and os.path.exists(cfg.probs_cache_file):
        logging.info(f"Found a pickle file of probabilities at '{cfg.probs_cache_file}'.")
        logging.info(f"Loading the cached pickle file of probabilities from '{cfg.probs_cache_file}' ...")
        # NOTE(security): pickle.load can execute arbitrary code. Only point
        # probs_cache_file at cache files written by this script.
        with open(cfg.probs_cache_file, 'rb') as probs_file:
            all_probs = pickle.load(probs_file)

        if len(all_probs) != len(audio_file_paths):
            raise ValueError(
                f"The number of samples in the probabilities file '{cfg.probs_cache_file}' does not "
                f"match the manifest file. You may need to delete the probabilities cached file."
            )
    else:
        amp_available = (
            torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast')
        )
        if cfg.use_amp and amp_available:
            logging.info("AMP is enabled!\n")
            autocast = torch.cuda.amp.autocast
        else:
            # No-op context manager when AMP is disabled or unavailable.
            autocast = contextlib.nullcontext

        with autocast():
            with torch.no_grad():
                if isinstance(asr_model, EncDecHybridRNNTCTCModel):
                    asr_model.cur_decoder = 'ctc'
                all_logits = asr_model.transcribe(audio_file_paths, batch_size=cfg.acoustic_batch_size)

        all_probs = all_logits
        if cfg.probs_cache_file:
            # A bare file name has no directory component, and os.makedirs('') raises.
            cache_dir = os.path.dirname(cfg.probs_cache_file)
            if cache_dir:
                os.makedirs(cache_dir, exist_ok=True)
            logging.info(f"Writing pickle files of probabilities at '{cfg.probs_cache_file}'...")
            with open(cfg.probs_cache_file, 'wb') as f_dump:
                pickle.dump(all_probs, f_dump)

    wer_dist_greedy = 0
    cer_dist_greedy = 0
    words_count = 0
    chars_count = 0
    for batch_idx, probs in enumerate(all_probs):
        # Each entry is indexed as [time, vocab] (beam_search_eval uses shape[0] as the
        # sequence length), so the greedy label of a frame is the argmax over the vocabulary axis.
        preds = np.argmax(probs, axis=1)
        preds_tensor = torch.tensor(preds, device='cpu').unsqueeze(0)

        # A single unpadded sample: its length is the full time dimension.
        decoder_lengths = torch.full(
            [preds_tensor.shape[0]], preds_tensor.shape[1], dtype=torch.long, device=preds_tensor.device
        )

        if isinstance(asr_model, EncDecHybridRNNTCTCModel):
            greedy_decoding = asr_model.ctc_decoding
        else:
            greedy_decoding = asr_model.wer.decoding
        pred_text = greedy_decoding.ctc_decoder_predictions_tensor(preds_tensor, decoder_lengths=decoder_lengths)[0][
            0
        ]

        if cfg.text_processing.do_lowercase:
            pred_text = punctuation_capitalization.do_lowercase([pred_text])[0]
        if cfg.text_processing.rm_punctuation:
            pred_text = punctuation_capitalization.rm_punctuation([pred_text])[0]
        if cfg.text_processing.separate_punctuation:
            pred_text = punctuation_capitalization.separate_punctuation([pred_text])[0]

        pred_split_w = pred_text.split()
        target_split_w = target_transcripts[batch_idx].split()
        pred_split_c = list(pred_text)
        target_split_c = list(target_transcripts[batch_idx])

        wer_dist = editdistance.eval(target_split_w, pred_split_w)
        cer_dist = editdistance.eval(target_split_c, pred_split_c)

        wer_dist_greedy += wer_dist
        cer_dist_greedy += cer_dist
        words_count += len(target_split_w)
        chars_count += len(target_split_c)

    logging.info('Greedy WER/CER = {:.2%}/{:.2%}'.format(wer_dist_greedy / words_count, cer_dist_greedy / chars_count))

    asr_model = asr_model.to('cpu')

    if cfg.decoding_mode == "beamsearch_ngram":
        # kenlm_model_file defaults to None, and os.path.exists(None) raises TypeError.
        if not cfg.kenlm_model_file or not os.path.exists(cfg.kenlm_model_file):
            raise FileNotFoundError(f"Could not find the KenLM model file '{cfg.kenlm_model_file}'.")
        lm_path = cfg.kenlm_model_file
    else:
        lm_path = None

    # 'greedy' decoding_mode would skip the beam search decoding
    if cfg.decoding_mode in ["beamsearch_ngram", "beamsearch"]:
        if cfg.beam_width is None or cfg.beam_alpha is None or cfg.beam_beta is None:
            raise ValueError("beam_width, beam_alpha and beam_beta are needed to perform beam search decoding.")
        params = {'beam_width': cfg.beam_width, 'beam_alpha': cfg.beam_alpha, 'beam_beta': cfg.beam_beta}
        hp_grid = list(ParameterGrid(params))

        best_wer_beam_size, best_cer_beam_size = None, None
        best_wer_alpha, best_cer_alpha = None, None
        best_wer_beta, best_cer_beta = None, None
        best_wer, best_cer = 1e6, 1e6

        logging.info("==============================Starting the beam search decoding===============================")
        logging.info(f"Grid search size: {len(hp_grid)}")
        logging.info("It may take some time...")
        logging.info("==============================================================================================")

        if cfg.preds_output_folder:
            # makedirs also creates missing parent folders, and exist_ok makes re-runs safe.
            os.makedirs(cfg.preds_output_folder, exist_ok=True)
        for hp in hp_grid:
            if cfg.preds_output_folder:
                preds_output_file = os.path.join(
                    cfg.preds_output_folder,
                    f"preds_out_width{hp['beam_width']}_alpha{hp['beam_alpha']}_beta{hp['beam_beta']}.tsv",
                )
            else:
                preds_output_file = None

            candidate_wer, candidate_cer = beam_search_eval(
                asr_model,
                cfg,
                all_probs=all_probs,
                target_transcripts=target_transcripts,
                preds_output_file=preds_output_file,
                lm_path=lm_path,
                beam_width=hp["beam_width"],
                beam_alpha=hp["beam_alpha"],
                beam_beta=hp["beam_beta"],
                beam_batch_size=cfg.beam_batch_size,
                progress_bar=True,
                punctuation_capitalization=punctuation_capitalization,
            )

            if candidate_cer < best_cer:
                best_cer_beam_size = hp["beam_width"]
                best_cer_alpha = hp["beam_alpha"]
                best_cer_beta = hp["beam_beta"]
                best_cer = candidate_cer

            if candidate_wer < best_wer:
                best_wer_beam_size = hp["beam_width"]
                best_wer_alpha = hp["beam_alpha"]
                best_wer_beta = hp["beam_beta"]
                best_wer = candidate_wer

        logging.info(
            f'Best WER Candidate = {best_wer:.2%} :: Beam size = {best_wer_beam_size}, '
            f'Beam alpha = {best_wer_alpha}, Beam beta = {best_wer_beta}'
        )

        logging.info(
            f'Best CER Candidate = {best_cer:.2%} :: Beam size = {best_cer_beam_size}, '
            f'Beam alpha = {best_cer_alpha}, Beam beta = {best_cer_beta}'
        )
        logging.info("=================================================================================")


# Script entry point: run main() only when this file is executed directly, not when it is imported.
if __name__ == '__main__':
    main()


In [None]:
import json

# Source manifest (one JSON object per line) and destination of the copy without "duration".
INPUT_MANIFEST_PATH = '/kaggle/input/dataset-ja/banana.json'
OUTPUT_MANIFEST_PATH = 'output_file.json'

# Load the JSON data from the file
with open(INPUT_MANIFEST_PATH, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Remove the "duration" field from each JSON object
modified_lines = []
for line in lines:
    if not line.strip():
        # Skip blank lines: json.loads would raise on them.
        continue
    item = json.loads(line)
    item.pop('duration', None)
    modified_lines.append(json.dumps(item, ensure_ascii=False))

# Save the modified JSON data to the output file
with open(OUTPUT_MANIFEST_PATH, 'w', encoding='utf-8') as file:
    for line in modified_lines:
        file.write(line + '\n')

print("The 'duration' field has been removed.")


In [None]:
%%writefile /opt/conda/lib/python3.10/site-packages/nemo/collections/asr/parts/submodules/ctc_greedy_decoding.py
# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass, field
from typing import List, Optional

import torch
from omegaconf import DictConfig, OmegaConf

from nemo.collections.asr.parts.utils import rnnt_utils
from nemo.collections.asr.parts.utils.asr_confidence_utils import ConfidenceMethodConfig, ConfidenceMethodMixin
from nemo.core.classes import Typing, typecheck
from nemo.core.neural_types import HypothesisType, LengthsType, LogprobsType, NeuralType
from nemo.utils import logging, logging_mode


def pack_hypotheses(
    hypotheses: List[rnnt_utils.Hypothesis],
    logitlen: torch.Tensor,
) -> List[rnnt_utils.Hypothesis]:
    """Finalise decoded hypotheses in place and return the same list.

    For every hypothesis, ``y_sequence`` is converted to a ``torch.long`` tensor, its
    ``length`` is filled from ``logitlen`` (when given), and a non-None ``dec_state`` is
    moved to the CPU.

    Args:
        hypotheses: Hypotheses to finalise; they are mutated.
        logitlen: Per-hypothesis lengths, indexable by position, or None to leave
            ``length`` untouched.

    Returns:
        The ``hypotheses`` list that was passed in.
    """
    # Move the lengths to the CPU once, up front, when they support it.
    lengths_on_cpu = None
    if logitlen is not None:
        lengths_on_cpu = logitlen.to('cpu') if hasattr(logitlen, 'cpu') else logitlen

    for position, hypothesis in enumerate(hypotheses):  # type: rnnt_utils.Hypothesis
        hypothesis.y_sequence = torch.tensor(hypothesis.y_sequence, dtype=torch.long)

        if lengths_on_cpu is not None:
            hypothesis.length = lengths_on_cpu[position]

        if hypothesis.dec_state is not None:
            hypothesis.dec_state = _states_to_device(hypothesis.dec_state)

    return hypotheses


def _states_to_device(dec_state, device='cpu'):
    """Recursively move a decoder state to ``device``.

    A tensor is moved with ``.to(device)``. A list or tuple is rebuilt as a tuple whose
    members are moved recursively. Any other value is returned unchanged.
    """
    if torch.is_tensor(dec_state):
        return dec_state.to(device)

    if isinstance(dec_state, (list, tuple)):
        return tuple(_states_to_device(member, device) for member in dec_state)

    return dec_state


# Warning text that GreedyCTCInfer.forward logs (once) when it is called with decoder_lengths=None.
_DECODER_LENGTHS_NONE_WARNING = "Passing in decoder_lengths=None for CTC decoding is likely to be an error, since it is unlikely that each element of your batch has exactly the same length. decoder_lengths will default to decoder_output.shape[0]."


class GreedyCTCInfer(Typing, ConfidenceMethodMixin):
    """A greedy CTC decoder.

    Provides a common abstraction for sample level and batch level greedy decoding.

    Args:
        blank_id: int index of the blank token. Can be 0 or len(vocabulary).
        preserve_alignments: Bool flag which preserves the history of logprobs generated during
            decoding (sample / batched). When set to true, the Hypothesis will contain
            the non-null value for `alignments` in it: a tuple of the (truncated) logprobs
            tensor and the argmax labels tensor.
        compute_timestamps: A bool flag, which determines whether to compute the character/subword, or
                word based timestamp mapping the output log-probabilities to discrete intervals of timestamps.
                The frame indices of the non-blank predictions will be available in the returned
                Hypothesis.timestep.
        preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores
            generated during decoding. When set to true, the Hypothesis will contain
            the non-null value for `frame_confidence` in it. Here, `frame_confidence` is a List of floats.
            Enabling it also enables `compute_timestamps`.
        confidence_method_cfg: A dict-like object which contains the method name and settings to compute per-frame
            confidence scores.

            name: The method name (str).
                Supported values:
                    - 'max_prob' for using the maximum token probability as a confidence.
                    - 'entropy' for using a normalized entropy of a log-likelihood vector.

            entropy_type: Which type of entropy to use (str). Used if confidence_method_cfg.name is set to `entropy`.
                Supported values:
                    - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided,
                        the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)).
                        Note that for this entropy, the alpha should comply the following inequality:
                        (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1)
                        where V is the model vocabulary size.
                    - 'tsallis' for the Tsallis entropy with the Boltzmann constant one.
                        Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)),
                        where α is a parameter. When α == 1, it works like the Gibbs entropy.
                        More: https://en.wikipedia.org/wiki/Tsallis_entropy
                    - 'renyi' for the Rényi entropy.
                        Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)),
                        where α is a parameter. When α == 1, it works like the Gibbs entropy.
                        More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy

            alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0.
                When the alpha equals one, scaling is not applied to 'max_prob',
                and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i))

            entropy_norm: A mapping of the entropy value to the interval [0,1].
                Supported values:
                    - 'lin' for using the linear mapping.
                    - 'exp' for using exponential mapping with linear shift.

    """

    @property
    def input_types(self):
        """Returns definitions of module input ports."""
        # Input can be of dimension -
        # ('B', 'T', 'D') [Log probs] or ('B', 'T') [Labels]

        return {
            "decoder_output": NeuralType(None, LogprobsType()),
            "decoder_lengths": NeuralType(tuple('B'), LengthsType()),
        }

    @property
    def output_types(self):
        """Returns definitions of module output ports."""
        return {"predictions": [NeuralType(elements_type=HypothesisType())]}

    def __init__(
        self,
        blank_id: int,
        preserve_alignments: bool = False,
        compute_timestamps: bool = False,
        preserve_frame_confidence: bool = False,
        confidence_method_cfg: Optional[DictConfig] = None,
    ):
        super().__init__()

        self.blank_id = blank_id
        self.preserve_alignments = preserve_alignments
        # we need timestamps to extract non-blank per-frame confidence
        self.compute_timestamps = compute_timestamps | preserve_frame_confidence
        self.preserve_frame_confidence = preserve_frame_confidence

        # set confidence calculation method
        self._init_confidence_method(confidence_method_cfg)

    @typecheck()
    def forward(
        self,
        decoder_output: torch.Tensor,
        decoder_lengths: Optional[torch.Tensor],
    ):
        """Greedily decode a batch, one sequence at a time, and return the packed hypotheses.

        Args:
            decoder_output: A tensor of size (batch, timesteps, features) holding log probabilities,
                or (batch, timesteps) where each timestep is already a label.
            decoder_lengths: Optional per-sequence lengths of shape (batch,). When None, a warning is
                logged and every sequence is decoded over all of its timesteps.

        Returns:
            A one-element tuple containing the packed list of Hypotheses, one per batch element.

        Raises:
            ValueError: if `decoder_output` is neither 2- nor 3-dimensional.
        """

        logging.warning(
            "CTC decoding strategy 'greedy' is slower than 'greedy_batch', which implements the same exact interface. Consider changing your strategy to 'greedy_batch' for a free performance improvement.",
            mode=logging_mode.ONCE,
        )

        if decoder_lengths is None:
            logging.warning(_DECODER_LENGTHS_NONE_WARNING, mode=logging_mode.ONCE)

        with torch.inference_mode():
            hypotheses = []
            # Process each sequence independently

            if decoder_output.is_cuda:
                # This two-liner is around twenty times faster than:
                # `prediction_cpu_tensor = decoder_output.cpu()`
                # cpu() does not use pinned memory, meaning that a slow pageable
                # copy must be done instead.
                prediction_cpu_tensor = torch.empty(
                    decoder_output.shape, dtype=decoder_output.dtype, device=torch.device("cpu"), pin_memory=True
                )
                prediction_cpu_tensor.copy_(decoder_output, non_blocking=True)
            else:
                prediction_cpu_tensor = decoder_output

            if decoder_lengths is not None and isinstance(decoder_lengths, torch.Tensor):
                # Before this change, self._greedy_decode_labels would copy
                # each scalar from GPU to CPU one at a time, in the line:
                # prediction = prediction[:out_len]
                # Doing one GPU to CPU copy ahead of time amortizes that overhead.
                decoder_lengths = decoder_lengths.cpu()

            if decoder_output.is_cuda:
                # The non_blocking device-to-host copy above may still be in flight. Nothing
                # else is guaranteed to wait for it (decoder_lengths may be None or already
                # on the CPU), so synchronize before the host reads the pinned buffer.
                torch.cuda.current_stream(decoder_output.device).synchronize()

            if prediction_cpu_tensor.ndim < 2 or prediction_cpu_tensor.ndim > 3:
                raise ValueError(
                    f"`decoder_output` must be a tensor of shape [B, T] (labels, int) or "
                    f"[B, T, V] (log probs, float). Provided shape = {prediction_cpu_tensor.shape}"
                )

            # determine type of input - logprobs or labels
            if prediction_cpu_tensor.ndim == 2:  # labels
                greedy_decode = self._greedy_decode_labels
            else:
                greedy_decode = self._greedy_decode_logprobs

            for ind in range(prediction_cpu_tensor.shape[0]):
                out_len = decoder_lengths[ind] if decoder_lengths is not None else None
                hypothesis = greedy_decode(prediction_cpu_tensor[ind], out_len)
                hypotheses.append(hypothesis)

            # Pack results into Hypotheses
            packed_result = pack_hypotheses(hypotheses, decoder_lengths)

        return (packed_result,)

    @torch.no_grad()
    def _greedy_decode_logprobs(self, x: torch.Tensor, out_len: Optional[torch.Tensor]):
        """Decode one sequence of log probabilities by taking the argmax label of every frame.

        The hypothesis stores the full per-frame label sequence (blanks included) and, as its
        score, the sum of the winning log probabilities over the non-blank frames.
        """
        # x: [T, D]
        # out_len: [seq_len]

        # Initialize blank state and empty label set in Hypothesis
        hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None)
        prediction = x.cpu()

        if out_len is not None:
            prediction = prediction[:out_len]

        prediction_logprobs, prediction_labels = prediction.max(dim=-1)

        non_blank_ids = prediction_labels != self.blank_id
        hypothesis.y_sequence = prediction_labels.tolist()
        hypothesis.score = (prediction_logprobs[non_blank_ids]).sum()

        if self.preserve_alignments:
            # Preserve the logprobs, as well as labels after argmax
            hypothesis.alignments = (prediction.clone(), prediction_labels.clone())

        if self.compute_timestamps:
            # Frame indices at which a non-blank label was predicted.
            hypothesis.timestep = torch.nonzero(non_blank_ids, as_tuple=False)[:, 0].tolist()

        if self.preserve_frame_confidence:
            hypothesis.frame_confidence = self._get_confidence(prediction)

        return hypothesis

    @torch.no_grad()
    def _greedy_decode_labels(self, x: torch.Tensor, out_len: Optional[torch.Tensor]):
        """Wrap one sequence of already-predicted labels in a Hypothesis.

        No log probabilities are available here, so the score is fixed at -1.0 and requests
        for alignments or per-frame confidence raise ValueError.
        """
        # x: [T]
        # out_len: [seq_len]

        # Initialize blank state and empty label set in Hypothesis
        hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None)
        prediction_labels = x.cpu()

        if out_len is not None:
            prediction_labels = prediction_labels[:out_len]

        non_blank_ids = prediction_labels != self.blank_id
        hypothesis.y_sequence = prediction_labels.tolist()
        hypothesis.score = -1.0

        if self.preserve_alignments:
            raise ValueError("Requested for alignments, but predictions provided were labels, not log probabilities.")

        if self.compute_timestamps:
            # Frame indices at which a non-blank label was predicted.
            hypothesis.timestep = torch.nonzero(non_blank_ids, as_tuple=False)[:, 0].tolist()

        if self.preserve_frame_confidence:
            raise ValueError(
                "Requested for per-frame confidence, but predictions provided were labels, not log probabilities."
            )

        return hypothesis

    def __call__(self, *args, **kwargs):
        """Delegate directly to `forward`."""
        return self.forward(*args, **kwargs)


class GreedyBatchedCTCInfer(Typing, ConfidenceMethodMixin):
    """A vectorized greedy CTC decoder.

    This is basically always faster than GreedyCTCInfer, and supports
    the same interface. See issue #8891 on github for what is wrong
    with GreedyCTCInfer. GreedyCTCInfer loops over each element in the
    batch, running kernels at batch size one. CPU overheads end up
    dominating. This implementation does appropriate masking to
    do the same operation in a batched manner.

    Args:
        blank_id: int index of the blank token. Can be 0 or len(vocabulary).
        preserve_alignments: Bool flag which preserves the log-probabilities seen during decoding.
            When set to true, each Hypothesis will contain a non-null value for `alignments`:
            a tuple of (log-probabilities, argmax labels) tensors, truncated to the sequence length.
            Not supported when the decoder output is labels rather than log-probabilities.
        compute_timestamps: A bool flag, which determines whether to record the frame indices at which
            a non-blank label was predicted. They are stored in `Hypothesis.timestep` as a list of ints.
            This flag is forced on when `preserve_frame_confidence` is set.
        preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores
            generated during decoding. When set to true, the Hypothesis will contain
            the non-null value for `frame_confidence` in it. Here, `frame_confidence` is a List of floats.
            Not supported when the decoder output is labels rather than log-probabilities.
        confidence_method_cfg: A dict-like object which contains the method name and settings to compute per-frame
            confidence scores.

            name: The method name (str).
                Supported values:
                    - 'max_prob' for using the maximum token probability as a confidence.
                    - 'entropy' for using a normalized entropy of a log-likelihood vector.

            entropy_type: Which type of entropy to use (str). Used if confidence_method_cfg.name is set to `entropy`.
                Supported values:
                    - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided,
                        the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)).
                        Note that for this entropy, the alpha should comply the following inequality:
                        (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1)
                        where V is the model vocabulary size.
                    - 'tsallis' for the Tsallis entropy with the Boltzmann constant one.
                        Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)),
                        where α is a parameter. When α == 1, it works like the Gibbs entropy.
                        More: https://en.wikipedia.org/wiki/Tsallis_entropy
                    - 'renyi' for the Rényi entropy.
                        Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)),
                        where α is a parameter. When α == 1, it works like the Gibbs entropy.
                        More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy

            alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0.
                When the alpha equals one, scaling is not applied to 'max_prob',
                and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i))

            entropy_norm: A mapping of the entropy value to the interval [0,1].
                Supported values:
                    - 'lin' for using the linear mapping.
                    - 'exp' for using exponential mapping with linear shift.

    """

    @property
    def input_types(self):
        """Returns definitions of module input ports."""
        # Input can be of dimension -
        # ('B', 'T', 'D') [Log probs] or ('B', 'T') [Labels]

        return {
            "decoder_output": NeuralType(None, LogprobsType()),
            "decoder_lengths": NeuralType(tuple('B'), LengthsType()),
        }

    @property
    def output_types(self):
        """Returns definitions of module output ports."""
        return {"predictions": [NeuralType(elements_type=HypothesisType())]}

    def __init__(
        self,
        blank_id: int,
        preserve_alignments: bool = False,
        compute_timestamps: bool = False,
        preserve_frame_confidence: bool = False,
        confidence_method_cfg: Optional[DictConfig] = None,
    ):
        super().__init__()

        self.blank_id = blank_id
        self.preserve_alignments = preserve_alignments
        # we need timestamps to extract non-blank per-frame confidence
        self.compute_timestamps = compute_timestamps | preserve_frame_confidence
        self.preserve_frame_confidence = preserve_frame_confidence

        # set confidence calculation method
        self._init_confidence_method(confidence_method_cfg)

    @typecheck()
    def forward(
        self,
        decoder_output: torch.Tensor,
        decoder_lengths: Optional[torch.Tensor],
    ):
        """Returns a list of hypotheses given an input batch of decoder outputs.
        Every frame is decoded independently of the others (greedy CTC decoding).

        Args:
            decoder_output: A tensor of size (batch, timesteps, features) or (batch, timesteps) (each timestep is a label).
            decoder_lengths: Optional tensor of size (batch) with the valid length of each sequence.
                When None, a warning is logged once and every sequence is treated as spanning all timesteps.

        Returns:
            packed list containing batch number of sentences (Hypotheses).
        """

        # Keep the caller's value (possibly None): it is what gets packed into the hypotheses.
        input_decoder_lengths = decoder_lengths

        if decoder_lengths is None:
            logging.warning(_DECODER_LENGTHS_NONE_WARNING, mode=logging_mode.ONCE)
            decoder_lengths = torch.tensor(
                [decoder_output.shape[1]], dtype=torch.long, device=decoder_output.device
            ).expand(decoder_output.shape[0])

        # GreedyCTCInfer::forward(), by accident, works with
        # decoder_lengths on either CPU or GPU when decoder_output is
        # on GPU. For the sake of backwards compatibility, we also
        # allow decoder_lengths to be on the CPU device. In this case,
        # we simply copy the decoder_lengths from CPU to GPU. If both
        # tensors are already on the same device, this is a no-op.
        decoder_lengths = decoder_lengths.to(decoder_output.device)

        if decoder_output.ndim == 2:
            hypotheses = self._greedy_decode_labels_batched(decoder_output, decoder_lengths)
        else:
            hypotheses = self._greedy_decode_logprobs_batched(decoder_output, decoder_lengths)
        packed_result = pack_hypotheses(hypotheses, input_decoder_lengths)
        return (packed_result,)

    @torch.no_grad()
    def _greedy_decode_logprobs_batched(self, x: torch.Tensor, out_len: torch.Tensor):
        """Greedy-decode a batch of log-probabilities into one Hypothesis per batch element."""
        # x: [B, T, D]
        # out_len: [B]

        batch_size = x.shape[0]
        max_time = x.shape[1]

        predictions = x
        # In CTC greedy decoding, each output maximum likelihood token
        # is calculated independent of the other tokens.
        predictions_logprobs, predictions_labels = predictions.max(dim=-1)
        # Loop-invariant sanity check, done once for the whole batch.
        assert predictions_labels.dtype == torch.int64

        # Since predictions_logprobs is a padded matrix in the time
        # dimension, we consider invalid timesteps to be "blank".
        time_steps = torch.arange(max_time, device=x.device).unsqueeze(0).expand(batch_size, max_time)
        non_blank_ids_mask = torch.logical_and(predictions_labels != self.blank_id, time_steps < out_len.unsqueeze(1))
        # Sum the non-blank labels to compute the score of the
        # transcription. This follows from Eq. (3) of "Connectionist
        # Temporal Classification: Labelling Unsegmented Sequence Data
        # with Recurrent Neural Networks".
        scores = torch.where(non_blank_ids_mask, predictions_logprobs, 0.0).sum(axis=1)

        scores = scores.cpu()
        predictions_labels = predictions_labels.cpu()
        out_len = out_len.cpu()

        if self.compute_timestamps:
            # Move the whole mask to the host once, instead of issuing one
            # device-to-host transfer per batch element inside the loop.
            non_blank_ids_mask = non_blank_ids_mask.cpu()

        if self.preserve_alignments or self.preserve_frame_confidence:
            predictions = predictions.cpu()

        hypotheses = []

        # This mimics the for loop in GreedyCTCInfer::forward.
        for i in range(batch_size):
            hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None)
            hypothesis.score = scores[i]

            # Drop the padded frames beyond this sequence's length.
            hypothesis.y_sequence = predictions_labels[i, : out_len[i]].tolist()

            if self.preserve_alignments:
                hypothesis.alignments = (
                    predictions[i, : out_len[i], :].clone(),
                    predictions_labels[i, : out_len[i]].clone(),
                )
            if self.compute_timestamps:
                # TODO: Could do this in a vectorized manner... Would
                # prefer to have nonzero_static, though, for sanity.
                # Or do a prefix sum on out_len
                hypothesis.timestep = torch.nonzero(non_blank_ids_mask[i], as_tuple=False)[:, 0].tolist()
            if self.preserve_frame_confidence:
                hypothesis.frame_confidence = self._get_confidence(predictions[i, : out_len[i], :])

            hypotheses.append(hypothesis)

        return hypotheses

    @torch.no_grad()
    def _greedy_decode_labels_batched(self, x: torch.Tensor, out_len: torch.Tensor):
        """
        This does greedy decoding in the case where you have already found the
        most likely token at each timestep.

        Raises:
            ValueError: for a non-empty batch, if alignments or per-frame confidence were requested;
                both require log-probabilities, which are not available here.
        """
        # x: [B, T]
        # out_len: [B]

        batch_size = x.shape[0]
        max_time = x.shape[1]

        predictions_labels = x
        time_steps = torch.arange(max_time, device=x.device).unsqueeze(0).expand(batch_size, max_time)
        non_blank_ids_mask = torch.logical_and(predictions_labels != self.blank_id, time_steps < out_len.unsqueeze(1))
        predictions_labels = predictions_labels.cpu()
        out_len = out_len.cpu()

        if self.compute_timestamps:
            # Single host transfer for the whole batch (see _greedy_decode_logprobs_batched).
            non_blank_ids_mask = non_blank_ids_mask.cpu()

        hypotheses = []

        for i in range(batch_size):
            hypothesis = rnnt_utils.Hypothesis(score=0.0, y_sequence=[], dec_state=None, timestep=[], last_token=None)
            hypothesis.y_sequence = predictions_labels[i, : out_len[i]].tolist()
            # No log-probabilities are available, so the score is a placeholder.
            hypothesis.score = -1.0

            if self.preserve_alignments:
                raise ValueError(
                    "Requested for alignments, but predictions provided were labels, not log probabilities."
                )
            if self.compute_timestamps:
                # TODO: Could do this in a vectorized manner... Would
                # prefer to have nonzero_static, though, for sanity.
                # Or do a prefix sum on out_len
                hypothesis.timestep = torch.nonzero(non_blank_ids_mask[i], as_tuple=False)[:, 0].tolist()
            if self.preserve_frame_confidence:
                raise ValueError(
                    "Requested for per-frame confidence, but predictions provided were labels, not log probabilities."
                )

            hypotheses.append(hypothesis)

        return hypotheses

    def __call__(self, *args, **kwargs):
        """Delegate to `forward`, passing all arguments through unchanged."""
        return self.forward(*args, **kwargs)


@dataclass
class GreedyCTCInferConfig:
    """Options for the greedy CTC decoders.

    After construction, `confidence_method_cfg` is always an OmegaConf structured
    config built from a `ConfidenceMethodConfig`.
    """

    preserve_alignments: bool = False
    compute_timestamps: bool = False
    preserve_frame_confidence: bool = False
    confidence_method_cfg: Optional[ConfidenceMethodConfig] = field(default_factory=lambda: ConfidenceMethodConfig())

    def __post_init__(self):
        # Anything that is not already a ConfidenceMethodConfig is unpacked as keyword
        # arguments into one first.
        method_cfg = self.confidence_method_cfg
        if not isinstance(method_cfg, ConfidenceMethodConfig):
            method_cfg = ConfidenceMethodConfig(**method_cfg)

        # OmegaConf.structured ensures that post_init check is always executed
        self.confidence_method_cfg = OmegaConf.structured(method_cfg)


In [None]:
# Run NeMo's eval_beamsearch_ngram_ctc.py on the banana.json manifest, using the
# final_asr_model.nemo checkpoint together with the kenlm_model.binary language model.
# beam_width / beam_alpha / beam_beta are passed as lists of candidate values
# (presumably every combination is evaluated -- confirm against the script).
# Predictions are written under /kaggle/working/predictions.
!python /kaggle/working/NeMo/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_ctc.py nemo_model_file="/kaggle/working/final_asr_model.nemo" \
       input_manifest="/kaggle/input/dataset-ja/banana.json" \
       kenlm_model_file="/kaggle/working/kenlm_model.binary" \
       beam_width=[64,128] \
       beam_alpha=[1.0] \
       beam_beta=[1.0,0.5]\
       preds_output_folder="/kaggle/working/predictions" \
       probs_cache_file=null \
       decoding_mode=beamsearch_ngram \
       decoding_strategy="beam"


In [None]:
# Same evaluation as the previous cell (same model, KenLM file and beam settings),
# but with /kaggle/working/output_file.json as the input manifest.
# NOTE(review): both cells write to the same preds_output_folder -- confirm that
# overwriting the earlier predictions is intended.
!python /kaggle/working/NeMo/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_ctc.py \
       nemo_model_file="/kaggle/working/final_asr_model.nemo" \
       input_manifest="/kaggle/working/output_file.json" \
       kenlm_model_file="/kaggle/working/kenlm_model.binary" \
       beam_width=[64,128] \
       beam_alpha=[1.0] \
       beam_beta=[1.0,0.5]\
       preds_output_folder="/kaggle/working/predictions" \
       probs_cache_file=null \
       decoding_mode=beamsearch_ngram \
       decoding_strategy="beam"


In [None]:
#cd /kaggle/working/NeMo

In [None]:
# Install Flashlight
#!git clone https://github.com/flashlight/text
#cd text
#!python setup.py bdist_wheel  # Build the wheel file
#!pip install dist/*.whl       # Install the built wheel
#cd ..


In [None]:
#cd /kaggle/working/NeMo/text

In [None]:
#!python setup.py bdist_wheel  # Build the wheel file

In [None]:
#!pip install dist/*.whl       # Install the built wheel


In [None]:
# Remove the apt-packaged cmake; CMake 3.24.0 is installed from the official release in the next cell.
!sudo apt-get remove -y cmake


In [None]:
# Download the prebuilt CMake 3.24.0 Linux x86_64 release, unpack it, and copy its
# contents into /usr/local.
!wget https://github.com/Kitware/CMake/releases/download/v3.24.0/cmake-3.24.0-linux-x86_64.tar.gz
!tar -xzvf cmake-3.24.0-linux-x86_64.tar.gz
!sudo cp -r cmake-3.24.0-linux-x86_64/* /usr/local/


In [None]:
# Print the version of the cmake that is now found on PATH.
!cmake --version


In [None]:
# Install Flashlight
git clone https://github.com/flashlight/text
cd text
python setup.py bdist_wheel  # Build the wheel file
pip install dist/*.whl       # Install the built wheel
cd ..


In [None]:
import os
# Ask Hydra to print complete stack traces instead of its shortened error summary.
os.environ['HYDRA_FULL_ERROR'] = '1'

In [None]:
%%writefile /kaggle/working/NeMo/scripts/asr_language_modeling/ngram_lm/train_kenlm.py
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# This script trains an N-gram language model with the KenLM library (https://github.com/kpu/kenlm) which can be used
# with the beam search decoders on top of the ASR models. This script supports both character-level and BPE-level
# encodings and models; the encoding level is detected automatically from the type of the model.
# After the N-gram model is trained and stored in the binary format, you may use
# 'scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_ctc.py' to evaluate it on an ASR model.
#
# You need to install the KenLM library and also the beam search decoders to use this feature. Please refer
# to 'scripts/ngram_lm/install_beamsearch_decoders.sh' on how to install them.
#
# USAGE: python train_kenlm.py nemo_model_file=<path to the .nemo file of the model> \
#                              train_paths=<list of paths to the training text or JSON manifest file> \
#                              kenlm_bin_path=<path to the bin folder of KenLM library> \
#                              kenlm_model_file=<path to store the binary KenLM model> \
#                              ngram_length=<order of N-gram model> \
#
# After training is done, the binary LM model is stored at the path given by the 'kenlm_model_file' argument.
# You may find more info on how to use this script at:
# https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/asr_language_modeling.html

import logging
import os
os.environ['HYDRA_FULL_ERROR'] = '1'
import subprocess
import sys
from dataclasses import dataclass, field
from glob import glob
from typing import List

from omegaconf import MISSING

# Update the Python path to include the scripts directory
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')))

from scripts.asr_language_modeling.ngram_lm import kenlm_utils

from nemo.core.config import hydra_runner
from nemo.utils import logging

"""
NeMo's beam search decoders only support char-level encodings. In order to make it work with BPE-level encodings, we
use a trick to encode the sub-word tokens of the training data as unicode characters and train a char-level KenLM. 
"""


@dataclass
class TrainKenlmConfig:
    """
    Options for training a KenLM N-gram language model used by the beam search decoder of ASR models.

    Fields whose default is MISSING have no default value and must be supplied.
    """

    # Training files or folders. A file can be plain text, a ".json" manifest or ".json.gz".
    # Example: [/path/to/manifest/file,/path/to/folder]
    train_paths: List[str] = MISSING

    # Path to the '.nemo' file of the ASR model, or the name of a pretrained NeMo model.
    nemo_model_file: str = MISSING
    # Path where the KenLM binary model file is stored.
    kenlm_model_file: str = MISSING
    # Order of the N-gram LM.
    ngram_length: int = MISSING
    # Path to the bin folder of KenLM.
    kenlm_bin_path: str = MISSING

    # Whether to preserve the intermediate ARPA file.
    preserve_arpa: bool = False
    # List of digits to prune Ngram. Example: [0,0,1].
    # See the Pruning section on https://kheafield.com/code/kenlm/estimation
    ngram_prune: List[int] = field(default_factory=lambda: [0])
    # Cache path to save tokenized files.
    cache_path: str = ""
    # Verbose level, default is 1.
    verbose: int = 1


@hydra_runner(config_path=None, config_name='TrainKenlmConfig', schema=TrainKenlmConfig)
def main(args: TrainKenlmConfig):
    """Train a KenLM N-gram model on tokenizer-encoded text and convert it to a binary model.

    Steps:
      1. Resolve the training files and set up the tokenizer of `args.nemo_model_file`.
      2. Run `lmplz` to produce an intermediate ARPA file. The encoded text is either
         cached on disk under `args.cache_path` and piped in through `cat`, or streamed
         directly into `lmplz`'s stdin when no cache path is given.
      3. Run `build_binary` to turn the ARPA file into `args.kenlm_model_file`.
      4. Delete the ARPA file unless `args.preserve_arpa` is set.

    Raises:
        RuntimeError: if `lmplz` or `build_binary` exits with a non-zero return code.
    """
    train_paths = kenlm_utils.get_train_list(args.train_paths)

    # A single prune value may arrive as a plain string; normalize it to a list.
    if isinstance(args.ngram_prune, str):
        args.ngram_prune = [args.ngram_prune]

    tokenizer, encoding_level, is_aggregate_tokenizer = kenlm_utils.setup_tokenizer(args.nemo_model_file)

    arpa_file = f"{args.kenlm_model_file}.tmp.arpa"
    """ LMPLZ ARGUMENT SETUP """
    kenlm_args = [
        os.path.join(args.kenlm_bin_path, 'lmplz'),
        "-o",
        str(args.ngram_length),
        "--arpa",
        arpa_file,
    ]
    if encoding_level == "subword":
        # --discount_fallback is needed for training KenLM for BPE-based models.
        # It is appended only when required so that no empty-string argument is
        # ever passed to lmplz.
        kenlm_args.append("--discount_fallback")
    kenlm_args += ["--prune"] + [str(n) for n in args.ngram_prune]

    if args.cache_path:
        os.makedirs(args.cache_path, exist_ok=True)

        """ DATASET SETUP """
        encoded_train_files = []
        for file_num, train_file in enumerate(train_paths):
            logging.info(f"Encoding the train file '{train_file}' number {file_num+1} out of {len(train_paths)} ...")

            train_file_name = os.path.split(train_file)[1]
            cached_files = glob(os.path.join(args.cache_path, train_file_name) + "*")
            encoded_train_file = os.path.join(args.cache_path, train_file_name + f"_{file_num}.tmp.txt")
            if (
                cached_files and cached_files[0] != encoded_train_file
            ):  # cached_files exists but has another file name: f"_{file_num}.tmp.txt"
                os.rename(cached_files[0], encoded_train_file)
                # Format the message up front: passing the pieces as extra positional
                # arguments would be treated as %-format args and break the log record.
                logging.info(f"Rename {cached_files[0]} to {encoded_train_file}")

            encoded_train_files.append(encoded_train_file)

        kenlm_utils.iter_files(
            source_path=train_paths,
            dest_path=encoded_train_files,
            tokenizer=tokenizer,
            encoding_level=encoding_level,
            is_aggregate_tokenizer=is_aggregate_tokenizer,
            verbose=args.verbose,
        )

        # Equivalent of `cat <encoded files> | lmplz ...`.
        first_process_args = ["cat"] + encoded_train_files
        first_process = subprocess.Popen(first_process_args, stdout=subprocess.PIPE, stderr=sys.stderr)

        logging.info(f"Running lmplz command \n\n{' '.join(kenlm_args)}\n\n")
        try:
            kenlm_p = subprocess.run(
                kenlm_args,
                stdin=first_process.stdout,
                capture_output=False,
                text=True,
                stdout=sys.stdout,
                stderr=sys.stderr,
            )
        finally:
            # Close our copy of the pipe's read end before waiting: if lmplz stopped
            # reading early, `cat` then receives SIGPIPE instead of blocking forever
            # on a full pipe (which would make wait() hang).
            first_process.stdout.close()
            first_process.wait()

    else:
        logging.info(f"Running lmplz command \n\n{' '.join(kenlm_args)}\n\n")
        kenlm_p = subprocess.Popen(kenlm_args, stdout=sys.stdout, stdin=subprocess.PIPE, stderr=sys.stderr)

        # Stream the encoded text straight into lmplz's stdin.
        kenlm_utils.iter_files(
            source_path=train_paths,
            dest_path=kenlm_p.stdin,
            tokenizer=tokenizer,
            encoding_level=encoding_level,
            is_aggregate_tokenizer=is_aggregate_tokenizer,
            verbose=args.verbose,
        )

        # Closes stdin and waits for lmplz to finish.
        kenlm_p.communicate()

    if kenlm_p.returncode != 0:
        raise RuntimeError("Training KenLM was not successful!")

    """ BINARY BUILD """

    kenlm_args = [
        os.path.join(args.kenlm_bin_path, "build_binary"),
        "trie",
        arpa_file,
        args.kenlm_model_file,
    ]
    logging.info(f"Running binary_build command \n\n{' '.join(kenlm_args)}\n\n")
    ret = subprocess.run(kenlm_args, capture_output=False, text=True, stdout=sys.stdout, stderr=sys.stderr)

    if ret.returncode != 0:
        raise RuntimeError("Training KenLM was not successful!")

    if not args.preserve_arpa:
        os.remove(arpa_file)
        logging.info(f"Deleted the arpa file '{arpa_file}'.")


# Run the Hydra-wrapped entry point only when this file is executed as a script.
if __name__ == '__main__':
    main()


In [None]:
import os

# Ask Hydra to print complete stack traces instead of its shortened error summary.
os.environ['HYDRA_FULL_ERROR'] = '1'

In [None]:
%load /kaggle/working/NeMo/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_ctc.py
import contextlib
import json
import os

os.environ['HYDRA_FULL_ERROR'] = '1'
import pickle
import tempfile
from dataclasses import dataclass, field, is_dataclass
from pathlib import Path
from typing import List, Optional

import editdistance
import numpy as np
import torch
from omegaconf import MISSING, OmegaConf
from sklearn.model_selection import ParameterGrid
from tqdm.auto import tqdm

import nemo.collections.asr as nemo_asr
from nemo.collections.asr.parts.submodules import rnnt_beam_decoding
from nemo.core.config import hydra_runner
from nemo.utils import logging

# fmt: off


@dataclass
class EvalBeamSearchNGramConfig:
    """Configuration schema for the beam-search / N-gram evaluation entry point.

    Fields whose default is MISSING have no default value and must be supplied.
    """

    # Path to a '.nemo' file; any other value is treated as the name of a pretrained model.
    nemo_model_file: str = MISSING
    # NOTE(review): presumably the manifest of samples to evaluate; its use is not visible here.
    input_manifest: str = MISSING
    # Required field; NOTE(review): not read by decoding_step or the visible part of main -- confirm it is needed.
    decoding_mode: str = MISSING
    # Optional KenLM model; main checks that the file exists and only accepts it with the "maes" strategy.
    kenlm_model_file: Optional[str] = None
    # NOTE(review): presumably the folder where prediction files are written; its use is not visible here.
    preds_output_folder: Optional[str] = None
    # Optional cache file; main checks whether it already exists.
    probs_cache_file: Optional[str] = None

    acoustic_batch_size: int = 128
    beam_batch_size: int = 128
    # Device used as map_location when loading the model; main falls back to "cpu" if CUDA is unavailable.
    device: str = "cuda"
    use_amp: bool = False
    num_workers: int = 1

    # Must be one of "greedy_batch", "beam", "tsd", "alsd", "maes" (validated in main).
    decoding_strategy: str = "beam"
    beam_width: List[int] = field(default_factory=lambda: [128, 256])
    # Replaced by [0.0] in main when no KenLM model is given.
    beam_alpha: List[float] = field(default_factory=lambda: [0.5, 1.0])
    # Both maes_* lists are replaced by [1.0] in main unless the strategy is "maes".
    maes_prefix_alpha: List[int] = field(default_factory=lambda: [2])
    maes_expansion_gamma: List[float] = field(default_factory=lambda: [2.3])
    # main asserts that a KenLM model is set when this is enabled.
    hat_subtract_ilm: bool = False
    hat_ilm_weight: List[float] = field(default_factory=lambda: [0.0])

    # Beam decoding settings; decoding_step installs them as model.cfg.decoding.beam.
    decoding: rnnt_beam_decoding.BeamRNNTInferConfig = field(default_factory=lambda: rnnt_beam_decoding.BeamRNNTInferConfig(beam_size=128))


# fmt: on


def decoding_step(
    model: nemo_asr.models.ASRModel,
    cfg: EvalBeamSearchNGramConfig,
    all_probs: List[torch.Tensor],
    target_transcripts: List[str],
    preds_output_file: Optional[str] = None,
    beam_batch_size: int = 128,
    progress_bar: bool = True,
):
    """Decode pre-computed model outputs with the configured strategy and score them.

    The model's decoding strategy is switched to `cfg.decoding_strategy` with `cfg.decoding`
    as its beam settings. `all_probs` is then decoded in batches of `beam_batch_size`, and word /
    character edit distances against `target_transcripts` are accumulated.

    Args:
        model: ASR model whose `decoding` object performs the decoding.
        cfg: evaluation config; `cfg.decoding` is updated in place.
        all_probs: one tensor per sample; the last dimension is used as the sequence length
            (assumed 2-D, features x time -- TODO confirm against the caller).
        target_transcripts: reference text for each sample, in the same order as `all_probs`.
        preds_output_file: optional path; when given, every candidate is written to it as
            "<text>\\t<score>", one per line.
        beam_batch_size: number of samples decoded per batch.
        progress_bar: whether to wrap the batch loop in a tqdm progress bar.

    Returns:
        Tuple (WER, CER) computed from the first candidate of every sample.
    """
    level = logging.getEffectiveLevel()
    logging.setLevel(logging.CRITICAL)
    try:
        model.change_decoding_strategy(None)

        cfg.decoding.hat_ilm_weight = cfg.decoding.hat_ilm_weight * cfg.hat_subtract_ilm
        cfg.decoding.return_best_hypothesis = False
        cfg.decoding.ngram_lm_model = cfg.kenlm_model_file
        cfg.decoding.hat_subtract_ilm = cfg.hat_subtract_ilm

        model.cfg.decoding.strategy = cfg.decoding_strategy
        model.cfg.decoding.beam = cfg.decoding
        model.change_decoding_strategy(model.cfg.decoding)
    finally:
        # Restore the log level even if changing the decoding strategy fails.
        logging.setLevel(level)

    wer_dist_first = cer_dist_first = 0
    wer_dist_best = cer_dist_best = 0
    words_count = 0
    chars_count = 0
    sample_idx = 0

    num_batches = int(np.ceil(len(all_probs) / beam_batch_size))
    if progress_bar:
        if cfg.decoding_strategy == "greedy_batch":
            description = "Greedy_batch decoding.."
        else:
            description = f"{cfg.decoding_strategy} decoding with bw={cfg.decoding.beam_size}, ba={cfg.decoding.ngram_lm_alpha}, ma={cfg.decoding.maes_prefix_alpha}, mg={cfg.decoding.maes_expansion_gamma}, hat_ilmw={cfg.decoding.hat_ilm_weight}"
        it = tqdm(range(num_batches), desc=description, ncols=120)
    else:
        it = range(num_batches)

    out_file = open(preds_output_file, 'w', encoding='utf_8', newline='\n') if preds_output_file else None
    try:
        for batch_idx in it:
            probs_batch = all_probs[batch_idx * beam_batch_size : (batch_idx + 1) * beam_batch_size]
            probs_lens = torch.tensor([prob.shape[-1] for prob in probs_batch])
            with torch.no_grad():
                # Zero-padded batch, padded along the last dimension to the longest sample.
                packed_batch = torch.zeros(
                    len(probs_batch), probs_batch[0].shape[0], int(probs_lens.max()), device='cpu'
                )
                for prob_index in range(len(probs_batch)):
                    # `.to()` instead of `torch.tensor(tensor)`: same values, without the
                    # copy-construct warning that torch.tensor emits for tensor inputs.
                    packed_batch[prob_index, :, : probs_lens[prob_index]] = probs_batch[prob_index].to(
                        device=packed_batch.device, dtype=packed_batch.dtype
                    )
                best_hyp_batch, beams_batch = model.decoding.rnnt_decoder_predictions_tensor(
                    packed_batch, probs_lens, return_hypotheses=True,
                )
            if cfg.decoding_strategy == "greedy_batch":
                # Greedy decoding yields a single hypothesis per sample; wrap it as a one-element beam.
                beams_batch = [[x] for x in best_hyp_batch]

            for beams_idx, beams in enumerate(beams_batch):
                target = target_transcripts[sample_idx + beams_idx]
                target_split_w = target.split()
                target_split_c = list(target)
                words_count += len(target_split_w)
                chars_count += len(target_split_c)
                wer_dist_min = cer_dist_min = 10000
                for candidate_idx, candidate in enumerate(beams):
                    pred_text = candidate.text
                    pred_split_w = pred_text.split()
                    wer_dist = editdistance.eval(target_split_w, pred_split_w)
                    pred_split_c = list(pred_text)
                    cer_dist = editdistance.eval(target_split_c, pred_split_c)

                    # Best candidate in the beam, used for the oracle figures.
                    wer_dist_min = min(wer_dist_min, wer_dist)
                    cer_dist_min = min(cer_dist_min, cer_dist)

                    if candidate_idx == 0:
                        wer_dist_first += wer_dist
                        cer_dist_first += cer_dist

                    score = candidate.score
                    if out_file is not None:
                        out_file.write('{}\t{}\n'.format(pred_text, score))
                wer_dist_best += wer_dist_min
                cer_dist_best += cer_dist_min
            sample_idx += len(probs_batch)
    finally:
        # Always release the predictions file, including on the greedy_batch early
        # return below and when decoding raises.
        if out_file is not None:
            out_file.close()

    if cfg.decoding_strategy == "greedy_batch":
        return wer_dist_first / words_count, cer_dist_first / chars_count

    if preds_output_file:
        logging.info(f"Stored the predictions of {cfg.decoding_strategy} decoding at '{preds_output_file}'.")

    if cfg.decoding.ngram_lm_model:
        logging.info(
            f"WER/CER with {cfg.decoding_strategy} decoding and N-gram model = {wer_dist_first / words_count:.2%}/{cer_dist_first / chars_count:.2%}"
        )
    else:
        logging.info(
            f"WER/CER with {cfg.decoding_strategy} decoding = {wer_dist_first / words_count:.2%}/{cer_dist_first / chars_count:.2%}"
        )
    logging.info(
        f"Oracle WER/CER in candidates with perfect LM= {wer_dist_best / words_count:.2%}/{cer_dist_best / chars_count:.2%}"
    )
    logging.info("=================================================================================")

    return wer_dist_first / words_count, cer_dist_first / chars_count


@hydra_runner(config_path=None, config_name='EvalBeamSearchNGramConfig', schema=EvalBeamSearchNGramConfig)
def main(cfg: EvalBeamSearchNGramConfig):
    """
    Evaluate an ASR model over a grid of decoding hyper-parameters and log WER/CER for each grid point.

    Steps: validate the decoding strategy, resolve the device, load the model, validate the optional
    KenLM file, obtain per-sample outputs (from the pickle cache or by running the model), then call
    ``decoding_step`` once per point of the parameter grid.

    Args:
        cfg: run configuration; a dataclass instance is converted to an OmegaConf structured config.

    Raises:
        ValueError: unknown ``decoding_strategy``, or a KenLM file is given for a strategy other than "maes".
        FileNotFoundError: ``kenlm_model_file`` is set but the file does not exist.
        AssertionError: ``hat_subtract_ilm`` is set without a KenLM model.
    """
    if is_dataclass(cfg):
        cfg = OmegaConf.structured(cfg)  # type: EvalBeamSearchNGramConfig

    valid_decoding_strategies = ["greedy_batch", "beam", "tsd", "alsd", "maes"]
    if cfg.decoding_strategy not in valid_decoding_strategies:
        raise ValueError(
            f"Given decoding_strategy={cfg.decoding_strategy} is invalid. Available options are :\n"
            f"{valid_decoding_strategies}"
        )

    # Resolve the device BEFORE loading the model: the model is restored with
    # map_location=torch.device(cfg.device), so the CPU fallback only helps if it happens first.
    if cfg.device == "cuda" and not torch.cuda.is_available():
        logging.warning("You have set device=cuda but no CUDA devices found. Setting device=cpu instead.")
        cfg.device = "cpu"
    elif cfg.device == "cpu" and torch.cuda.is_available():
        logging.warning("You have set device=cpu, but there are available CUDA devices. Using CPU for inference.")

    if cfg.nemo_model_file.endswith('.nemo'):
        asr_model = nemo_asr.models.ASRModel.restore_from(cfg.nemo_model_file, map_location=torch.device(cfg.device))
    else:
        logging.warning(
            "nemo_model_file does not end with .nemo, therefore trying to load a pretrained model with this name."
        )
        asr_model = nemo_asr.models.ASRModel.from_pretrained(
            cfg.nemo_model_file, map_location=torch.device(cfg.device)
        )

    if cfg.kenlm_model_file:
        if not os.path.exists(cfg.kenlm_model_file):
            raise FileNotFoundError(f"Could not find the KenLM model file '{cfg.kenlm_model_file}'.")
        if cfg.decoding_strategy != "maes":
            raise ValueError(f"Decoding with kenlm model is supported only for maes decoding algorithm.")
        lm_path = cfg.kenlm_model_file
    else:
        lm_path = None
        # Without an LM the alpha grid collapses to a single point.
        cfg.beam_alpha = [0.0]
    if cfg.hat_subtract_ilm:
        assert lm_path, "kenlm must be set for hat internal lm subtraction"

    # The maes-specific grids collapse to a single point for every other strategy.
    if cfg.decoding_strategy != "maes":
        cfg.maes_expansion_gamma = [1.0]
        cfg.maes_prefix_alpha = [1.0]

    if cfg.probs_cache_file and os.path.exists(cfg.probs_cache_file):
        logging.info(f"Restoring the probs cache from '{cfg.probs_cache_file}'.")
        # NOTE(security): pickle.load can execute arbitrary code; only load cache files you created yourself.
        with open(cfg.probs_cache_file, "rb") as cache_f:
            probs_dict = pickle.load(cache_f)
    else:
        logging.info(f"Computing and caching the probabilities of samples in '{cfg.input_manifest}'.")
        probs_dict = {}
        # NOTE(review): return_hypotheses=False is passed, yet the loop below reads .audio_file,
        # .feature_probs and .tokens from every item -- confirm transcribe() really yields such objects.
        for test_batch in asr_model.transcribe(
            paths2audio_files=cfg.input_manifest,
            batch_size=cfg.acoustic_batch_size,
            num_workers=cfg.num_workers,
            return_hypotheses=False,
            use_amp=cfg.use_amp,
            channel_selector=None,
        ):
            for test_pred in test_batch:
                probs_dict[test_pred.audio_file] = (test_pred.feature_probs.cpu(), test_pred.tokens.cpu())
        if cfg.probs_cache_file:
            with open(cfg.probs_cache_file, "wb") as cache_f:
                pickle.dump(probs_dict, cache_f)

    # NOTE(review): cfg.dict_config is defined outside this function -- presumably a mapping of
    # config keys to candidate value lists; verify against the config class.
    for grid_idx, params in enumerate(ParameterGrid(cfg.dict_config)):
        for k, v in params.items():
            OmegaConf.update(cfg, k, v, merge=True)

        preds_output_file = None
        if cfg.preds_output_folder:
            preds_output_file = os.path.join(
                cfg.preds_output_folder,
                f"beam_search_preds_bs={cfg.decoding.beam_size}_ba={cfg.decoding.ngram_lm_alpha}_ma={cfg.decoding.maes_prefix_alpha}_mg={cfg.decoding.maes_expansion_gamma}_ilmw={cfg.decoding.hat_ilm_weight}.txt",
            )
            os.makedirs(cfg.preds_output_folder, exist_ok=True)

        all_probs = []
        target_transcripts = []
        # NOTE(review): the dict keys are the `audio_file` values stored above, so the "targets" passed
        # to decoding_step are those keys rather than reference text, and the cached tokens are
        # discarded -- confirm this is intended before trusting the reported WER/CER.
        for file, (probs, _) in probs_dict.items():
            all_probs.append(probs)
            target_transcripts.append(file)

        wer, cer = decoding_step(
            asr_model,
            cfg,
            all_probs,
            target_transcripts,
            preds_output_file=preds_output_file,
            beam_batch_size=cfg.beam_batch_size,
        )
        logging.info(f"WER/CER = {wer:.2%}/{cer:.2%}")


# Script entry point: main() is called without arguments because the hydra_runner
# decorator on main supplies the cfg object.
if __name__ == "__main__":
    main()


In [None]:
%%writefile /kaggle/working/NeMo/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_ctc.py
# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
# This script evaluates an N-gram language model trained with the KenLM library (https://github.com/kpu/kenlm) in
# fusion with beam search decoders on top of a trained ASR model with a CTC decoder. To evaluate a model with a
# Transducer (RNN-T) decoder, use the script 'scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py'.
# NeMo's beam search decoders are capable of using KenLM's N-gram models
# to find the best candidates. This script supports both character-level and BPE-level
# encodings and models; the encoding type is detected automatically from the type of the model.
# You may train the LM model with 'scripts/asr_language_modeling/ngram_lm/train_kenlm.py'.

# Config Help

To discover all arguments of the script, please run :
python eval_beamsearch_ngram_ctc.py --help
python eval_beamsearch_ngram_ctc.py --cfg job

# USAGE

python eval_beamsearch_ngram_ctc.py nemo_model_file=<path to the .nemo file of the model> \
           input_manifest=<path to the evaluation JSON manifest file> \
           kenlm_model_file=<path to the binary KenLM model> \
           beam_width=[<list of the beam widths, separated with commas>] \
           beam_alpha=[<list of the beam alphas, separated with commas>] \
           beam_beta=[<list of the beam betas, separated with commas>] \
           preds_output_folder=<optional folder to store the predictions> \
           probs_cache_file=null \
           decoding_mode=beamsearch_ngram
           ...


# Grid Search for Hyper parameters

For grid search, you can provide a list of arguments as follows -

           beam_width=[4,8,16,....] \
           beam_alpha=[-2.0,-1.0,...,1.0,2.0] \
           beam_beta=[-1.0,-0.5,0.0,...,1.0] \

# You may find more info on how to use this script at:
# https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/asr_language_modeling.html

"""


import contextlib
import json
import os
import pickle
from dataclasses import dataclass, field, is_dataclass
from pathlib import Path
from typing import List, Optional

import editdistance
import numpy as np
import torch
from omegaconf import MISSING, OmegaConf
from sklearn.model_selection import ParameterGrid
from tqdm.auto import tqdm

import nemo.collections.asr as nemo_asr
from nemo.collections.asr.models import EncDecHybridRNNTCTCModel
from nemo.collections.asr.parts.submodules import ctc_beam_decoding
from nemo.collections.asr.parts.utils.transcribe_utils import PunctuationCapitalization, TextProcessingConfig
from nemo.core.config import hydra_runner
from nemo.utils import logging

# fmt: off


@dataclass
class EvalBeamSearchNGramConfig:
    """
    Settings for scoring an ASR model with beam search decoding, optionally fused with a KenLM n-gram LM.
    """

    # ---- Model ----
    # Path of the '.nemo' file of the ASR model, or the name of a pretrained model (ngc / huggingface).
    nemo_model_file: str = MISSING

    # ---- File paths ----
    # Manifest file of the evaluation set.
    input_manifest: str = MISSING
    # Path of the KenLM binary model file.
    kenlm_model_file: Optional[str] = None
    # Optional folder where the predictions are stored.
    preds_output_folder: Optional[str] = None
    # Cache file for storing the logprobs of the model.
    probs_cache_file: Optional[str] = None

    # ---- Inference ----
    # Batch size used to calculate log probabilities.
    acoustic_batch_size: int = 16
    # Batch size used for beam search decoding.
    beam_batch_size: int = 128
    # Device to load the model onto to calculate log probabilities.
    device: str = "cuda"
    # Whether to use AMP, if available, to calculate log probabilities.
    use_amp: bool = False

    # ---- Beam search hyperparameters ----
    # Decoding scheme used for evaluation; one of ["greedy", "beamsearch", "beamsearch_ngram"].
    decoding_mode: str = "beamsearch_ngram"

    # Each of the three lists below is one axis of the hyper-parameter grid.
    # Width (or list of widths) for the beam search decoding.
    beam_width: List[int] = field(default_factory=lambda: [128])
    # Alpha (or list of alphas) for the beam search decoding.
    beam_alpha: List[float] = field(default_factory=lambda: [1.0])
    # Beta (or list of betas) for the beam search decoding.
    beam_beta: List[float] = field(default_factory=lambda: [0.0])

    # Copied into model.cfg.decoding.strategy by beam_search_eval.
    decoding_strategy: str = "beam"
    # Beam decoding config; beam_search_eval overwrites its beam_size/alpha/beta for every grid point.
    decoding: ctc_beam_decoding.BeamCTCInferConfig = field(
        default_factory=lambda: ctc_beam_decoding.BeamCTCInferConfig(beam_size=128)
    )

    # Normalisation applied to both reference and predicted text before scoring.
    text_processing: Optional[TextProcessingConfig] = field(
        default_factory=lambda: TextProcessingConfig(
            punctuation_marks=".,?",
            separate_punctuation=False,
            do_lowercase=False,
            rm_punctuation=False,
        )
    )
# fmt: on


def beam_search_eval(
    model: nemo_asr.models.ASRModel,
    cfg: EvalBeamSearchNGramConfig,
    all_probs: List[torch.Tensor],
    target_transcripts: List[str],
    preds_output_file: str = None,
    lm_path: str = None,
    beam_alpha: float = 1.0,
    beam_beta: float = 0.0,
    beam_width: int = 128,
    beam_batch_size: int = 128,
    progress_bar: bool = True,
    punctuation_capitalization: PunctuationCapitalization = None,
):
    """
    Run beam search decoding for one hyper-parameter setting and score it against the references.

    The model's decoding strategy is reconfigured in place with the given beam width/alpha/beta, the
    per-sample outputs are decoded in batches of ``beam_batch_size``, and edit distances are accumulated
    for the first candidate of every sample and for the best candidate among all returned candidates.

    Args:
        model: ASR model whose decoding strategy is replaced by this call.
        cfg: run configuration; ``cfg.decoding`` is mutated with the current beam settings.
        all_probs: one 2-D array-like per sample; dimension 0 is the length passed to the decoder.
        target_transcripts: reference texts, index-aligned with ``all_probs``.
        preds_output_file: if set, every candidate is written to this file as ``text<TAB>score``.
        lm_path: only selects the wording of the summary log line; the LM actually used by the
            decoder is ``cfg.kenlm_model_file``.
        beam_alpha: value assigned to ``cfg.decoding.beam_alpha``.
        beam_beta: value assigned to ``cfg.decoding.beam_beta``.
        beam_width: value assigned to ``cfg.decoding.beam_size``.
        beam_batch_size: number of samples decoded per call to the decoder.
        progress_bar: show a tqdm progress bar over the batches.
        punctuation_capitalization: text normaliser; required when any ``cfg.text_processing`` flag is on.

    Returns:
        Tuple ``(wer, cer)`` computed from the first candidate of each sample.
    """
    # Silence logging while the decoding strategy is swapped; the previous level is restored
    # even if reconfiguring the model raises.
    level = logging.getEffectiveLevel()
    logging.setLevel(logging.CRITICAL)
    try:
        # Reset config
        if isinstance(model, EncDecHybridRNNTCTCModel):
            model.change_decoding_strategy(decoding_cfg=None, decoder_type="ctc")
        else:
            model.change_decoding_strategy(None)

        # Override the beam search config with current search candidate configuration
        cfg.decoding.beam_size = beam_width
        cfg.decoding.beam_alpha = beam_alpha
        cfg.decoding.beam_beta = beam_beta
        cfg.decoding.return_best_hypothesis = False
        cfg.decoding.kenlm_path = cfg.kenlm_model_file

        # Update model's decoding strategy config
        model.cfg.decoding.strategy = cfg.decoding_strategy
        model.cfg.decoding.beam = cfg.decoding

        # Update model's decoding strategy
        if isinstance(model, EncDecHybridRNNTCTCModel):
            model.change_decoding_strategy(model.cfg.decoding, decoder_type='ctc')
            decoding = model.ctc_decoding
        else:
            model.change_decoding_strategy(model.cfg.decoding)
            decoding = model.decoding
    finally:
        logging.setLevel(level)

    wer_dist_first = cer_dist_first = 0
    wer_dist_best = cer_dist_best = 0
    words_count = 0
    chars_count = 0
    sample_idx = 0
    num_batches = int(np.ceil(len(all_probs) / beam_batch_size))

    # The context manager guarantees the predictions file is closed even if decoding fails midway.
    output_ctx = (
        open(preds_output_file, 'w', encoding='utf_8', newline='\n')
        if preds_output_file
        else contextlib.nullcontext()
    )
    with output_ctx as out_file:
        if progress_bar:
            it = tqdm(
                range(num_batches),
                desc=f"Beam search decoding with width={beam_width}, alpha={beam_alpha}, beta={beam_beta}",
                ncols=120,
            )
        else:
            it = range(num_batches)

        for batch_idx in it:
            # disabling type checking
            probs_batch = all_probs[batch_idx * beam_batch_size : (batch_idx + 1) * beam_batch_size]
            probs_lens = torch.tensor([prob.shape[0] for prob in probs_batch])
            with torch.no_grad():
                # Zero-pad every sample to the longest one in the batch.
                packed_batch = torch.zeros(len(probs_batch), max(probs_lens), probs_batch[0].shape[-1], device='cpu')

                for prob_index in range(len(probs_batch)):
                    # as_tensor avoids the extra copy (and the warning) torch.tensor() incurs on tensor input;
                    # the slice assignment copies the data into the padded batch either way.
                    packed_batch[prob_index, : probs_lens[prob_index], :] = torch.as_tensor(
                        probs_batch[prob_index], device=packed_batch.device, dtype=packed_batch.dtype
                    )

                _, beams_batch = decoding.ctc_decoder_predictions_tensor(
                    packed_batch, decoder_lengths=probs_lens, return_hypotheses=True,
                )

            for beams_idx, beams in enumerate(beams_batch):
                target = target_transcripts[sample_idx + beams_idx]
                target_split_w = target.split()
                target_split_c = list(target)
                words_count += len(target_split_w)
                chars_count += len(target_split_c)
                wer_dist_min = cer_dist_min = 10000
                for candidate_idx, candidate in enumerate(beams):  # type: (int, ctc_beam_decoding.rnnt_utils.Hypothesis)
                    pred_text = candidate.text
                    if cfg.text_processing.do_lowercase:
                        pred_text = punctuation_capitalization.do_lowercase([pred_text])[0]
                    if cfg.text_processing.rm_punctuation:
                        pred_text = punctuation_capitalization.rm_punctuation([pred_text])[0]
                    if cfg.text_processing.separate_punctuation:
                        pred_text = punctuation_capitalization.separate_punctuation([pred_text])[0]
                    pred_split_w = pred_text.split()
                    wer_dist = editdistance.eval(target_split_w, pred_split_w)
                    pred_split_c = list(pred_text)
                    cer_dist = editdistance.eval(target_split_c, pred_split_c)

                    # Track the best candidate for the oracle numbers.
                    wer_dist_min = min(wer_dist_min, wer_dist)
                    cer_dist_min = min(cer_dist_min, cer_dist)

                    if candidate_idx == 0:
                        # first candidate
                        wer_dist_first += wer_dist
                        cer_dist_first += cer_dist

                    score = candidate.score
                    if out_file is not None:
                        out_file.write('{}\t{}\n'.format(pred_text, score))
                wer_dist_best += wer_dist_min
                cer_dist_best += cer_dist_min
            sample_idx += len(probs_batch)

    if preds_output_file:
        logging.info(f"Stored the predictions of beam search decoding at '{preds_output_file}'.")

    if lm_path:
        logging.info(
            'WER/CER with beam search decoding and N-gram model = {:.2%}/{:.2%}'.format(
                wer_dist_first / words_count, cer_dist_first / chars_count
            )
        )
    else:
        logging.info(
            'WER/CER with beam search decoding = {:.2%}/{:.2%}'.format(
                wer_dist_first / words_count, cer_dist_first / chars_count
            )
        )
    logging.info(
        'Oracle WER/CER in candidates with perfect LM= {:.2%}/{:.2%}'.format(
            wer_dist_best / words_count, cer_dist_best / chars_count
        )
    )
    logging.info(f"=================================================================================")

    return wer_dist_first / words_count, cer_dist_first / chars_count


@hydra_runner(config_path=None, config_name='EvalBeamSearchNGramConfig', schema=EvalBeamSearchNGramConfig)
def main(cfg: EvalBeamSearchNGramConfig):
    """
    Evaluate a CTC ASR model with greedy decoding and, optionally, a beam search hyper-parameter grid.

    Steps: validate the configuration, load the model, read the manifest, obtain per-sample model
    outputs (from the pickle cache or by transcribing), log the greedy WER/CER, then, unless
    ``decoding_mode`` is "greedy", run ``beam_search_eval`` for every combination of
    ``beam_width`` x ``beam_alpha`` x ``beam_beta`` and log the best WER and CER settings.

    Args:
        cfg: run configuration; a dataclass instance is converted to an OmegaConf structured config.

    Raises:
        ValueError: invalid ``decoding_mode``; ``kenlm_model_file`` unset for "beamsearch_ngram";
            cache size does not match the manifest; a beam grid parameter is None.
        FileNotFoundError: ``kenlm_model_file`` does not exist.
    """
    if is_dataclass(cfg):
        cfg = OmegaConf.structured(cfg)  # type: EvalBeamSearchNGramConfig

    valid_decoding_modes = ["greedy", "beamsearch", "beamsearch_ngram"]
    if cfg.decoding_mode not in valid_decoding_modes:
        raise ValueError(
            f"Given decoding_mode={cfg.decoding_mode} is invalid. Available options are :\n" f"{valid_decoding_modes}"
        )

    # Validate the LM path up front so a missing file fails before the expensive transcription,
    # and give a clear error when it was never set (os.path.exists(None) raises TypeError).
    if cfg.decoding_mode == "beamsearch_ngram":
        if cfg.kenlm_model_file is None:
            raise ValueError("kenlm_model_file must be set when decoding_mode=beamsearch_ngram.")
        if not os.path.exists(cfg.kenlm_model_file):
            raise FileNotFoundError(f"Could not find the KenLM model file '{cfg.kenlm_model_file}'.")
        lm_path = cfg.kenlm_model_file
    else:
        lm_path = None

    if cfg.nemo_model_file.endswith('.nemo'):
        asr_model = nemo_asr.models.ASRModel.restore_from(cfg.nemo_model_file, map_location=torch.device(cfg.device))
    else:
        logging.warning(
            "nemo_model_file does not end with .nemo, therefore trying to load a pretrained model with this name."
        )
        asr_model = nemo_asr.models.ASRModel.from_pretrained(
            cfg.nemo_model_file, map_location=torch.device(cfg.device)
        )

    # Read references and audio paths; relative audio paths that do not resolve from the current
    # directory are taken relative to the manifest's folder.
    target_transcripts = []
    manifest_dir = Path(cfg.input_manifest).parent
    with open(cfg.input_manifest, 'r', encoding='utf_8') as manifest_file:
        audio_file_paths = []
        for line in tqdm(manifest_file, desc=f"Reading Manifest {cfg.input_manifest} ...", ncols=120):
            data = json.loads(line)
            audio_file = Path(data['audio_filepath'])
            if not audio_file.is_file() and not audio_file.is_absolute():
                audio_file = manifest_dir / audio_file
            target_transcripts.append(data['text'])
            audio_file_paths.append(str(audio_file.absolute()))

    # Apply the same normalisation to the references that is later applied to the predictions.
    punctuation_capitalization = PunctuationCapitalization(cfg.text_processing.punctuation_marks)
    if cfg.text_processing.do_lowercase:
        target_transcripts = punctuation_capitalization.do_lowercase(target_transcripts)
    if cfg.text_processing.rm_punctuation:
        target_transcripts = punctuation_capitalization.rm_punctuation(target_transcripts)
    if cfg.text_processing.separate_punctuation:
        target_transcripts = punctuation_capitalization.separate_punctuation(target_transcripts)

    if cfg.probs_cache_file and os.path.exists(cfg.probs_cache_file):
        logging.info(f"Found a pickle file of probabilities at '{cfg.probs_cache_file}'.")
        logging.info(f"Loading the cached pickle file of probabilities from '{cfg.probs_cache_file}' ...")
        # NOTE(security): pickle.load can execute arbitrary code; only load cache files you created yourself.
        with open(cfg.probs_cache_file, 'rb') as probs_file:
            all_probs = pickle.load(probs_file)

        if len(all_probs) != len(audio_file_paths):
            raise ValueError(
                f"The number of samples in the probabilities file '{cfg.probs_cache_file}' does not "
                f"match the manifest file. You may need to delete the probabilities cached file."
            )
    else:
        # Use CUDA autocast only when AMP is requested and available; otherwise a no-op context.
        amp_available = (
            torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast')
        )
        if cfg.use_amp and amp_available:
            logging.info("AMP is enabled!\n")
            autocast = torch.cuda.amp.autocast
        else:
            autocast = contextlib.nullcontext

        with autocast():
            with torch.no_grad():
                if isinstance(asr_model, EncDecHybridRNNTCTCModel):
                    asr_model.cur_decoder = 'ctc'
                # NOTE(review): no option requesting per-frame log-probabilities is passed here, while
                # the greedy loop and beam_search_eval index each item via .shape / argmax -- confirm
                # what transcribe() returns in the installed NeMo version.
                all_logits = asr_model.transcribe(audio_file_paths, batch_size=cfg.acoustic_batch_size)

        all_probs = all_logits
        if cfg.probs_cache_file:
            # A bare file name has no directory component; os.makedirs('') would raise.
            cache_dir = os.path.dirname(cfg.probs_cache_file)
            if cache_dir:
                os.makedirs(cache_dir, exist_ok=True)
            logging.info(f"Writing pickle files of probabilities at '{cfg.probs_cache_file}'...")
            with open(cfg.probs_cache_file, 'wb') as f_dump:
                pickle.dump(all_probs, f_dump)

    wer_dist_greedy = 0
    cer_dist_greedy = 0
    words_count = 0
    chars_count = 0
    for batch_idx, probs in enumerate(all_probs):
        # NOTE(review): argmax is taken over axis 0; if each item is a [time, vocab] matrix the
        # per-frame best token would presumably need the vocab axis instead -- confirm.
        preds = np.argmax(probs, axis=0)
        preds_tensor = torch.tensor(preds, device='cpu').unsqueeze(0)
        if isinstance(asr_model, EncDecHybridRNNTCTCModel):
            pred_text = asr_model.ctc_decoding.ctc_decoder_predictions_tensor(preds_tensor)[0][0]
        else:
            pred_text = asr_model.wer.decoding.ctc_decoder_predictions_tensor(preds_tensor)[0][0]

        if cfg.text_processing.do_lowercase:
            pred_text = punctuation_capitalization.do_lowercase([pred_text])[0]
        if cfg.text_processing.rm_punctuation:
            pred_text = punctuation_capitalization.rm_punctuation([pred_text])[0]
        if cfg.text_processing.separate_punctuation:
            pred_text = punctuation_capitalization.separate_punctuation([pred_text])[0]

        pred_split_w = pred_text.split()
        target_split_w = target_transcripts[batch_idx].split()
        pred_split_c = list(pred_text)
        target_split_c = list(target_transcripts[batch_idx])

        wer_dist = editdistance.eval(target_split_w, pred_split_w)
        cer_dist = editdistance.eval(target_split_c, pred_split_c)

        wer_dist_greedy += wer_dist
        cer_dist_greedy += cer_dist
        words_count += len(target_split_w)
        chars_count += len(target_split_c)

    logging.info('Greedy WER/CER = {:.2%}/{:.2%}'.format(wer_dist_greedy / words_count, cer_dist_greedy / chars_count))

    # beam_search_eval builds its batches on the CPU, so the model is moved there as well.
    asr_model = asr_model.to('cpu')

    # 'greedy' decoding_mode would skip the beam search decoding
    if cfg.decoding_mode in ["beamsearch_ngram", "beamsearch"]:
        if cfg.beam_width is None or cfg.beam_alpha is None or cfg.beam_beta is None:
            raise ValueError("beam_width, beam_alpha and beam_beta are needed to perform beam search decoding.")
        params = {'beam_width': cfg.beam_width, 'beam_alpha': cfg.beam_alpha, 'beam_beta': cfg.beam_beta}
        hp_grid = ParameterGrid(params)
        hp_grid = list(hp_grid)

        best_wer_beam_size, best_cer_beam_size = None, None
        best_wer_alpha, best_cer_alpha = None, None
        best_wer_beta, best_cer_beta = None, None
        best_wer, best_cer = 1e6, 1e6

        logging.info(f"==============================Starting the beam search decoding===============================")
        logging.info(f"Grid search size: {len(hp_grid)}")
        logging.info(f"It may take some time...")
        logging.info(f"==============================================================================================")

        if cfg.preds_output_folder:
            # makedirs also creates missing parent folders and tolerates an existing one.
            os.makedirs(cfg.preds_output_folder, exist_ok=True)
        for hp in hp_grid:
            if cfg.preds_output_folder:
                preds_output_file = os.path.join(
                    cfg.preds_output_folder,
                    f"preds_out_width{hp['beam_width']}_alpha{hp['beam_alpha']}_beta{hp['beam_beta']}.tsv",
                )
            else:
                preds_output_file = None

            candidate_wer, candidate_cer = beam_search_eval(
                asr_model,
                cfg,
                all_probs=all_probs,
                target_transcripts=target_transcripts,
                preds_output_file=preds_output_file,
                lm_path=lm_path,
                beam_width=hp["beam_width"],
                beam_alpha=hp["beam_alpha"],
                beam_beta=hp["beam_beta"],
                beam_batch_size=cfg.beam_batch_size,
                progress_bar=True,
                punctuation_capitalization=punctuation_capitalization,
            )

            # Best CER and best WER are tracked independently; they may come from different settings.
            if candidate_cer < best_cer:
                best_cer_beam_size = hp["beam_width"]
                best_cer_alpha = hp["beam_alpha"]
                best_cer_beta = hp["beam_beta"]
                best_cer = candidate_cer

            if candidate_wer < best_wer:
                best_wer_beam_size = hp["beam_width"]
                best_wer_alpha = hp["beam_alpha"]
                best_wer_beta = hp["beam_beta"]
                best_wer = candidate_wer

        logging.info(
            f'Best WER Candidate = {best_wer:.2%} :: Beam size = {best_wer_beam_size}, '
            f'Beam alpha = {best_wer_alpha}, Beam beta = {best_wer_beta}'
        )

        logging.info(
            f'Best CER Candidate = {best_cer:.2%} :: Beam size = {best_cer_beam_size}, '
            f'Beam alpha = {best_cer_alpha}, Beam beta = {best_cer_beta}'
        )
        logging.info(f"=================================================================================")


# Script entry point: main() is called without arguments because the hydra_runner
# decorator on main supplies the cfg object.
if __name__ == '__main__':
    main()


In [None]:
import os

# Make Hydra print complete stack traces for the scripts launched from the cells below
# (child processes started with `!python` inherit this environment variable).
os.environ.update({'HYDRA_FULL_ERROR': '1'})

In [None]:
# Evaluate the trained model with beam search + KenLM fusion over a
# 2 (beam_width) x 2 (beam_alpha) x 2 (beam_beta) = 8-point grid, using the script written above.
# probs_cache_file=null disables the pickle cache, so the model outputs are recomputed on every run.
!python /kaggle/working/NeMo/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_ctc.py nemo_model_file="/kaggle/working/final_asr_model.nemo" \
       input_manifest="/kaggle/input/dataset-ja/banana.json" \
       kenlm_model_file="/kaggle/working/kenlm_model.binary" \
       beam_width="[128, 256]" \
       beam_alpha="[0.5, 1.0]" \
       beam_beta="[0.5, 1.0]" \
       preds_output_folder="/kaggle/working/predictions" \
       probs_cache_file=null \
       decoding_mode=beamsearch_ngram \
       decoding_strategy="beam"


In [None]:
# Same grid as the previous cell, but with decoding_strategy="flashlight" and two extra flashlight overrides.
# It writes to the same preds_output_folder with the same file names, so it overwrites the previous cell's
# prediction files.
# NOTE(review): in the script's config dataclass `decoding` is itself a BeamCTCInferConfig, so verify that
# the `decoding.beam.flashlight_cfg.*` key path below actually resolves (it may need to be
# `decoding.flashlight_cfg.*`) -- TODO confirm.
!python /kaggle/working/NeMo/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_ctc.py  \
       decoding_strategy="flashlight" \
        nemo_model_file="/kaggle/working/final_asr_model.nemo"\
       input_manifest="/kaggle/input/dataset-ja/banana.json" \
       kenlm_model_file="/kaggle/working/kenlm_model.binary" \
       beam_width="[128, 256]" \
       beam_alpha="[0.5, 1.0]" \
       beam_beta="[0.5, 1.0]" \
       preds_output_folder="/kaggle/working/predictions" \
       probs_cache_file=null \
       decoding_mode=beamsearch_ngram \
       +decoding.beam.flashlight_cfg.beam_size_token=32 \
       +decoding.beam.flashlight_cfg.beam_threshold=25.0


In [None]:
# Show the current working directory; relative paths used in other cells resolve against it.
pwd

In [None]:
%%writefile /kaggle/working/configs/conformer_ctc_bpe.yaml
name: "Conformer-CTC-BPE"

model:
  sample_rate: 16000
  log_prediction: true # enables logging sample predictions in the output during training
  ctc_reduction: 'mean_batch'
  skip_nan_grad: false

  train_ds:
    manifest_filepath: "/kaggle/input/dataset-ja/final_train.json"
    sample_rate: ${model.sample_rate}
    batch_size: 16 # you may increase batch_size if your memory allows
    shuffle: true
    num_workers: 8
    pin_memory: true
    max_duration: 28 # it is set for LibriSpeech, you may need to update it for your dataset
    min_duration: 0.384
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  validation_ds:
    manifest_filepath: "/kaggle/input/dataset-ja/banana.json"
    sample_rate: ${model.sample_rate}
    batch_size: 16 # you may increase batch_size if your memory allows
    shuffle: false
    use_start_end_token: false
    num_workers: 8
    pin_memory: true

  test_ds:
    manifest_filepath: "/kaggle/input/dataset-ja/final_test.json"
    sample_rate: ${model.sample_rate}
    batch_size: 16 # you may increase batch_size if your memory allows
    shuffle: false
    use_start_end_token: false
    num_workers: 8
    pin_memory: true

  # We recommend an SPE Unigram tokenizer with a small vocab size of 128 or 256 when using 4x sub-sampling.
  # You can find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py
  tokenizer:
    dir: "/kaggle/working/SphinxSpeech/tokenizer_spe_unigram_v64"  # path to directory which contains either tokenizer.model (bpe) or vocab.txt (wpe)
    type: bpe  # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer)

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: ${model.sample_rate}
    normalize: "per_feature"
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    features: 80
    n_fft: 512
    log: true
    frame_splicing: 1
    dither: 0.00001
    pad_to: 0
    pad_value: 0.0

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    freq_masks: 2 # set to zero to disable it
    # you may use lower time_masks for smaller models to have a faster convergence
    time_masks: 5 # set to zero to disable it
    freq_width: 27
    time_width: 0.05

  encoder:
    _target_: nemo.collections.asr.modules.ConformerEncoder
    feat_in: ${model.preprocessor.features}
    feat_out: -1 # you may set it if you need different output size other than the default d_model
    n_layers: 16
    d_model: 176

    # Sub-sampling params
    subsampling: striding # vggnet, striding, stacking or stacking_norm, dw_striding
    subsampling_factor: 4 # must be power of 2 for striding and vggnet
    subsampling_conv_channels: -1 # -1 sets it to d_model
    causal_downsampling: false

    # Feed forward module's params
    ff_expansion_factor: 4

    # Multi-headed Attention Module's params
    self_attention_model: rel_pos # rel_pos or abs_pos
    n_heads: 4 # may need to be lower for smaller d_models
    # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
    att_context_size: [-1, -1] # -1 means unlimited context
    att_context_style: regular # regular or chunked_limited
    xscaling: true # scales up the input embeddings by sqrt(d_model)
    untie_biases: true # unties the biases of the TransformerXL layers
    pos_emb_max_len: 5000

    # Convolution module's params
    conv_kernel_size: 31
    conv_norm_type: 'batch_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups)
    # conv_context_size can be "causal" or a list of two integers such that conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size
    # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0]
    conv_context_size: null

    ### regularization
    dropout: 0.1 # The dropout used in most of the Conformer Modules
    dropout_pre_encoder: 0.1 # The dropout used before the encoder
    dropout_emb: 0.0 # The dropout used for embeddings
    dropout_att: 0.1 # The dropout for multi-headed attention modules

    # set to non-zero to enable stochastic depth
    stochastic_depth_drop_prob: 0.0
    stochastic_depth_mode: linear  # linear or uniform
    stochastic_depth_start_layer: 1

  decoder:
    _target_: nemo.collections.asr.modules.ConvASRDecoder
    feat_in: null
    num_classes: -1
    vocabulary: []

  # config for InterCTC loss: https://arxiv.org/abs/2102.03216
  # specify loss weights and which layers to use for InterCTC
  # e.g., to reproduce the paper results, set loss_weights: [0.3]
  # and apply_at_layers: [8] (assuming 18 layers). Note that final
  # layer loss coefficient is automatically adjusted (to 0.7 in above example)
  interctc:
    loss_weights: []
    apply_at_layers: []

  optim:
    name: adamw
    lr: 5.0
    # optimizer arguments
    betas: [0.9, 0.98]
    # less necessity for weight_decay as we already have large augmentations with SpecAug
    # you may need weight_decay for large models, stable AMP training, small datasets, or when lower augmentations are used
    # weight decay of 0.0 with lr of 2.0 also works fine
    weight_decay: 1e-3

    # scheduler setup
    sched:
      name: NoamAnnealing
      d_model: ${model.encoder.d_model}
      # scheduler config override
      warmup_steps: 10000
      warmup_ratio: null
      min_lr: 1e-6

trainer:
  devices: -1 # number of GPUs, -1 would use all available GPUs
  num_nodes: 1
  max_epochs: 3
  max_steps: -1 # computed at runtime if not set
  val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
  accelerator: auto
  strategy: ddp
  accumulate_grad_batches: 1
  gradient_clip_val: 0.0
  precision: 32  # 16, 32, or bf16
  log_every_n_steps: 10  # Interval of logging.
  enable_progress_bar: True
  num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it
  check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs
  sync_batchnorm: true
  enable_checkpointing: False  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  benchmark: false # needs to be false for models with variable-length speech input as it slows down training

exp_manager:
  exp_dir: "/kaggle/working/results"
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    # in case of multiple validation sets, first one is used
    monitor: "val_wer"
    mode: "min"
    save_top_k: 5
    always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints

  # you need to set these two to True to continue the training
  resume_if_exists: false
  resume_ignore_no_checkpoint: false

  # You may use this section to create a W&B logger
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null


In [None]:
%%writefile /kaggle/working/ASR-Squad/Tokenizers/tokenizer_spe_unigram_v64/vocab.txt
▁
##ا
##ي
##ل
##ه
##ن
##ت
##م
##و
##ر
##د
##ع
##ك
##ب
##س
##ح
##ق
##ف
##ش
##ج
##ص
##ط
##خ
##أ
##ز
##ض
##ة
##غ
##ذ
##ى
##ث
##إ
##ء
##ظ
##ئ
##ؤ
##>
##l
##f
##i
##g
##h
##u
##e
##p
##r
##v
##o
##ً
##ڨ
##?
##÷
##،
##a
##<
##=
##[
##n
##آ
##ِ
##١
##ٱ
##چ


In [None]:
%%writefile /kaggle/working/ASR-Squad/Tokenizers/tokenizer_spe_unigram_v64/tokenizer.vocab
<unk>	0
▁	-2.2088
ا	-2.40748
ي	-2.65333
ل	-2.82795
ه	-2.85866
ن	-2.87771
ت	-3.146
م	-3.22545
و	-3.30548
ر	-3.33668
د	-3.46445
ع	-3.50065
ك	-3.65437
ب	-3.78068
س	-3.88658
ح	-3.91189
ق	-4.06987
ف	-4.09701
ش	-4.19604
ج	-4.32791
ص	-4.70701
ط	-4.83043
خ	-4.86126
أ	-4.89308
ز	-5.05488
ض	-5.18295
ة	-5.67968
غ	-5.85408
ذ	-5.98021
ى	-6.06764
ث	-6.09118
إ	-6.15143
ء	-6.39132
ظ	-6.44179
ئ	-6.5542
ؤ	-7.47499
>	-8.40933
l	-8.41086
f	-8.54166
i	-8.54166
g	-11.166
h	-11.166
u	-11.166
e	-11.2673
p	-11.2673
r	-11.2673
v	-11.2673
o	-11.2673
ً	-13.9689
ڨ	-13.9689
?	-14.4689
÷	-14.4689
،	-14.4689
a	-15.4687
<	-15.4688
=	-15.4689
[	-15.4689
n	-15.4689
آ	-15.4689
ِ	-15.4689
١	-15.4689
ٱ	-15.4689
چ	-15.4689


In [None]:
%%writefile speech_to_text_ctc_bpe.py


import pytorch_lightning as pl
from omegaconf import OmegaConf

from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager


@hydra_runner(config_path="/kaggle/working/configs/", config_name="conformer_ctc_bpe")
def main(cfg):
    """Train an EncDecCTCModelBPE from the Hydra config, optionally test it, then export it.

    Args:
        cfg: Hydra/OmegaConf config; the `trainer`, `model` and (optional)
            `exp_manager` sections are read here.
    """
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    # The trainer is built straight from the config and handed to exp_manager
    # before the model is constructed.
    pl_trainer = pl.Trainer(**cfg.trainer)
    exp_manager(pl_trainer, cfg.get("exp_manager", None))

    model = EncDecCTCModelBPE(cfg=cfg.model, trainer=pl_trainer)
    # Optionally start from pretrained weights when the config asks for it.
    model.maybe_init_from_pretrained_checkpoint(cfg)

    pl_trainer.fit(model)

    # Testing only happens when a test manifest is configured and the model
    # reports that the test setup succeeded.
    has_test_manifest = hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None
    if has_test_manifest and model.prepare_test(pl_trainer):
        pl_trainer.test(model)

    # Export the trained model to a fixed location under /kaggle/working.
    export_path = '/kaggle/working/final_asr_model2.nemo'
    model.save_to(export_path)
    logging.info(f'Model saved at {export_path}')


if __name__ == '__main__':
    main()

In [None]:
!python /kaggle/working/speech_to_text_ctc_bpe.py

In [None]:
# Transcribe the final test manifest with the exported .nemo model; the script
# is told to write its output to test_with_predictions2.json.
# NOTE(review): transcribe_speech.py must already exist under
# /kaggle/working/configs/ -- this part of the notebook does not download it;
# confirm an earlier cell does.
!python /kaggle/working/configs/transcribe_speech.py \
  model_path="/kaggle/working/final_asr_model2.nemo" \
  dataset_manifest="/kaggle/input/dataset-ja/final_test.json" \
  output_filename="/kaggle/working/test_with_predictions2.json" 


In [None]:

# Calculate WER: score the predictions manifest with use_cer=False.
# NOTE(review): a wget cell further down this notebook fetches
# speech_to_text_eval.py into configs/. Unless an earlier cell also downloads
# it, run that cell first -- on a fresh "Run All" this cell executes before it.
!python /kaggle/working/configs/speech_to_text_eval.py \
  dataset_manifest="/kaggle/working/test_with_predictions2.json" \
  use_cer=False \
  only_score_manifest=True

# Calculate CER: same script and manifest, with use_cer=True.
!python /kaggle/working/configs/speech_to_text_eval.py \
  dataset_manifest="/kaggle/working/test_with_predictions2.json" \
  use_cer=True \
  only_score_manifest=True

In [None]:
!pip install botocore

In [None]:
!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/speech_to_text_eval.py

In [None]:
!pip install boto3 --upgrade


In [None]:
!pip install --upgrade nemo_toolkit[all]


In [None]:
!pip install --upgrade botocore

In [None]:
import inspect
from nemo.collections.asr.models import aed_multitask_models

# Dump the module's members: first the functions, then the classes.
for member_filter in (inspect.isfunction, inspect.isclass):
    print(inspect.getmembers(aed_multitask_models, member_filter))


In [None]:
import os
import zipfile


def zip_directory(source_dir, archive_path):
    """Zip every file under ``source_dir`` into ``archive_path``.

    Entries are stored with paths relative to ``source_dir``. The archive file
    itself is skipped: when it lives inside ``source_dir``, os.walk would
    otherwise hand the half-written archive back to be added to itself.

    Args:
        source_dir: Directory whose files are archived (walked recursively).
        archive_path: Path of the zip file to create (overwritten if present).

    Returns:
        The number of files written to the archive.
    """
    archive_abs = os.path.abspath(archive_path)
    added = 0
    with zipfile.ZipFile(archive_path, 'w') as zipf:
        for root, _dirs, files in os.walk(source_dir):
            for file in files:
                file_path = os.path.join(root, file)
                # Never feed the output archive back into itself.
                if os.path.abspath(file_path) == archive_abs:
                    continue
                zipf.write(file_path, os.path.relpath(file_path, source_dir))
                added += 1
    return added


working_dir = "/kaggle/working"
zip_path = "/kaggle/working/working_directories.zip"

if os.path.isdir(working_dir):
    zip_directory(working_dir, zip_path)
    print(f"All directories zipped and saved to {zip_path}")
else:
    print(f"{working_dir} does not exist; nothing was zipped")

In [None]:
!pip install --upgrade nemo-toolkit[all]

In [None]:
%load /kaggle/working/test_with_predictions.json

In [None]:
!git clone https://github.com/Alkholy53/ASR-Squad.git


In [None]:
cd /kaggle/working/ASR-Squad

In [None]:
!touch test.txt

In [None]:
!git config credential.helper store
!git config --global user.email "a.alkholy53@student.aast.edu"
!git config --global user.name "Alkholy53"
!git add .
!git commit -m "Add file from Kaggle"
!git push origin main


In [None]:
from IPython.display import FileLink
FileLink(r'test.txt')

In [None]:
pwd

In [None]:
!touch test.nemo

In [None]:
import os
os.chdir(r'/kaggle/working')

!tar -czf Landscapess.tar.gz ASR-Squad

from IPython.display import FileLink

FileLink(r'Landscapess.tar.gz')


In [None]:
# Set your own project id here
# NOTE(review): 'your-google-cloud-project' is a placeholder and must be
# replaced before running. No later cell visible in this part of the notebook
# uses storage_client -- confirm this cell is still needed.
PROJECT_ID = 'your-google-cloud-project'
from google.cloud import storage
# Client bound to the project configured above.
storage_client = storage.Client(project=PROJECT_ID)

In [None]:
!git push https://github.com/Alkholy53/ASR-Squad.git main

In [None]:
import os

commit_message = "Add file from Kaggle"

# Git identity, staging and commit run in order; their exit codes are not checked.
for git_command in (
    'git config --global user.email "a.alkholy53@student.aast.edu"',
    'git config --global user.name "Alkholy53"',
    'git add .',
    f'git commit -m "{commit_message}"',
):
    os.system(git_command)

# Kept as the cell's final expression so the push exit status is displayed.
os.system('git push origin main')


In [None]:
import os

commit_message = "Add file from Kaggle"

# Stage everything in the current directory, then commit with the message above.
os.system('git add .')
os.system('git commit -m "%s"' % commit_message)

# Push to the main branch of origin; the exit status is the cell output.
os.system('git push origin main')


In [None]:
!ssh-keygen -t rsa -b 4096 -C "a.alkholy53@student.aast.edu"


In [None]:
!pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib


In [None]:
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow

# Scope passed to the OAuth flow below.
SCOPES = ['https://www.googleapis.com/auth/drive.file']

# Run the installed-app OAuth flow using the client secrets in credentials.json.
# NOTE(review): run_local_server expects a browser reachable from the machine
# running the kernel -- confirm this works in the hosted environment.
flow = InstalledAppFlow.from_client_secrets_file(
    'credentials.json', SCOPES)
credentials = flow.run_local_server(port=0)

# Persist the authorized-user credentials: the Drive upload cell loads them
# from token.json, which nothing else in this notebook section creates.
with open('token.json', 'w') as token_file:
    token_file.write(credentials.to_json())


In [None]:
pwd

In [None]:
from google.oauth2.credentials import Credentials
from google.auth.transport.requests import Request
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload

SCOPES = ['https://www.googleapis.com/auth/drive.file']

def upload_to_drive(file_path, file_name, token_path='token.json'):
    """Upload a local file to Google Drive and return the created file's ID.

    Args:
        file_path: Path of the local file to upload.
        file_name: Name given to the file in Drive.
        token_path: Authorized-user token file the credentials are loaded
            from. Defaults to 'token.json'.

    Returns:
        The ID of the created Drive file (it is also printed).
    """
    # Credentials come only from the authorized-user token file; the OAuth
    # client-secrets file (credentials.json) is not read here.
    creds = Credentials.from_authorized_user_file(token_path, SCOPES)

    # If credentials are expired, refresh them
    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())

    # Build Google Drive service
    service = build('drive', 'v3', credentials=creds)

    # File to be uploaded
    file_metadata = {'name': file_name}
    media = MediaFileUpload(file_path, resumable=True)

    # Upload the file, asking the API to return only the new file's 'id'.
    created = service.files().create(body=file_metadata, media_body=media, fields='id').execute()

    file_id = created.get("id")
    print(f'File ID: {file_id}')
    return file_id

# Example usage
if __name__ == '__main__':
    upload_to_drive('path_to_your_output_file.csv', 'output_file.csv')
