In [None]:
# Install system and Python dependencies
!pip install wget
!apt-get install -y sox libsndfile1 ffmpeg
!pip install text-unidecode
# The requirement must be quoted: an unquoted `>=3.3.2` is parsed by the shell
# as an output redirection into a file named "=3.3.2", which silently drops
# the version bound.
!pip install "matplotlib>=3.3.2"
!pip install pytorch_lightning
## Install NeMo
BRANCH = 'r2.0.0rc0'
# Installs the base NeMo package only (no `[all]` extras); the extra runtime
# dependencies are installed explicitly in the following cell.
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH

In [None]:
# Additional Python packages installed explicitly, because NeMo above is
# installed without its `[all]` extras. Each package is listed exactly once.
!pip install editdistance
!pip install webdataset
!pip install pyannote.metrics
!pip install einops
!pip install pyannote.core
!pip install inflect
# Canonical PyPI name of the Hydra distribution.
!pip install hydra-core
!pip install lhotse
!pip install numpy soundfile joblib omegaconf
!pip install jiwer
# gdown is used by the next cell to download the tokenizer archive.
!pip install gdown

In [None]:
# Download the tokenizer archive from Google Drive into the current directory.
# The file ID is passed positionally: the `--id` flag is deprecated and is
# rejected by recent gdown releases (gdown is installed unpinned above).
!gdown 13gKcDfU0N1VuXtRM2dCFYBMEgYPXKeLf

In [None]:
import os

# Archive to extract and the directory that receives its contents.
zip_file_path = '/kaggle/working/tokenizer_spe_bpe_v128.zip'
unzip_dir = '/kaggle/working/tokenizers_v2/'

# Make sure the destination exists (no error if it is already there).
os.makedirs(unzip_dir, exist_ok=True)

# -q: quiet; -o: overwrite existing files without prompting, so re-running the
# cell does not stop at an interactive "replace?" question. Paths are quoted
# so they survive shell word-splitting.
!unzip -q -o "{zip_file_path}" -d "{unzip_dir}"

# Display the extracted entries as the cell output to confirm the unzip worked.
os.listdir(unzip_dir)


In [None]:
# Create the directory the next cell writes the YAML config into.
# -p makes the cell safe to re-run; the absolute path matches the one used by
# the %%writefile cell that follows.
!mkdir -p /kaggle/working/configs

In [None]:
%%writefile /kaggle/working/configs/conformer_ctc_bpe.yaml
# Experiment name; exp_manager.name resolves to this value through ${name}.
name: "Conformer-CTC-BPE"

# Model definition. The training script passes this whole section as
# cfg.model to EncDecCTCModelBPE.
model:
  sample_rate: 16000
  log_prediction: true
  ctc_reduction: 'mean_batch'
  skip_nan_grad: false

  # Training data; sample_rate is interpolated from model.sample_rate.
  # NOTE(review): min_duration / max_duration are presumably in seconds — confirm.
  train_ds:
    manifest_filepath: "/kaggle/input/dataset-ja/final_train.json"
    sample_rate: ${model.sample_rate}
    batch_size: 32  # Increased batch size
    shuffle: true
    num_workers: 4  # Reduced to suggested maximum workers
    pin_memory: true
    max_duration: 28
    min_duration: 0.384
    # Tarred datasets are disabled, so no tar paths are given.
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  # Validation data.
  # NOTE(review): "banana.json" does not follow the final_train / final_test
  # naming used by the other splits — confirm this is the intended manifest.
  validation_ds:
    manifest_filepath: "/kaggle/input/dataset-ja/banana.json"
    sample_rate: ${model.sample_rate}
    batch_size: 32  # Increased batch size
    shuffle: false
    use_start_end_token: false
    num_workers: 4  # Reduced to suggested maximum workers
    pin_memory: true

  # Test data; the training script runs trainer.test only when this
  # manifest_filepath is not null.
  test_ds:
    manifest_filepath: "/kaggle/input/dataset-ja/final_test.json"
    sample_rate: ${model.sample_rate}
    batch_size: 32  # Increased batch size
    shuffle: false
    use_start_end_token: false
    num_workers: 4  # Reduced to suggested maximum workers
    pin_memory: true

  # Tokenizer directory; expected to come from the archive unzipped into
  # /kaggle/working/tokenizers_v2/ earlier in this notebook.
  tokenizer:
    dir: "/kaggle/working/tokenizers_v2/tokenizer_spe_bpe_v128"
    type: bpe

  # Feature extraction. `features` is reused by encoder.feat_in through
  # interpolation.
  # NOTE(review): window_size / window_stride are presumably in seconds — confirm.
  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: ${model.sample_rate}
    normalize: "per_feature"
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    features: 80
    n_fft: 512
    log: true
    frame_splicing: 1
    dither: 0.00001
    pad_to: 0
    pad_value: 0.0

  # Spectrogram augmentation (frequency and time masking).
  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    freq_masks: 1  # Slight reduction
    time_masks: 2  # Slight reduction
    freq_width: 27
    time_width: 0.05

  # Conformer encoder. feat_in is interpolated from the preprocessor's feature
  # count; d_model is reused by optim.sched.d_model.
  encoder:
    _target_: nemo.collections.asr.modules.ConformerEncoder
    feat_in: ${model.preprocessor.features}
    feat_out: -1
    n_layers: 16
    d_model: 176
    subsampling: striding
    subsampling_factor: 4
    subsampling_conv_channels: -1
    causal_downsampling: false
    ff_expansion_factor: 4
    self_attention_model: rel_pos
    n_heads: 4
    att_context_size: [-1, -1]
    att_context_style: regular
    xscaling: true
    untie_biases: true
    pos_emb_max_len: 5000
    conv_kernel_size: 31
    conv_norm_type: 'batch_norm'
    conv_context_size: null
    dropout: 0.1
    dropout_pre_encoder: 0.1
    dropout_emb: 0.0
    dropout_att: 0.1
    stochastic_depth_drop_prob: 0.0
    stochastic_depth_mode: linear
    stochastic_depth_start_layer: 1

  # CTC decoder head.
  # NOTE(review): feat_in: null, num_classes: -1 and vocabulary: [] look like
  # placeholders filled in at model construction — confirm against NeMo.
  decoder:
    _target_: nemo.collections.asr.modules.ConvASRDecoder
    feat_in: null
    num_classes: -1
    vocabulary: []

  # Both lists are empty — presumably no intermediate CTC losses are applied;
  # confirm against NeMo.
  interctc:
    loss_weights: []
    apply_at_layers: []

  # AdamW optimizer with a NoamAnnealing schedule; the schedule's d_model is
  # interpolated from the encoder.
  optim:
    name: adamw
    lr: 5.0
    betas: [0.9, 0.98]
    weight_decay: 1e-3

    sched:
      name: NoamAnnealing
      d_model: ${model.encoder.d_model}
      warmup_steps: 10000
      min_lr: 1e-6

# Keyword arguments for the Lightning trainer; the training script expands
# this section as pl.Trainer(**cfg.trainer).
trainer:
  devices: -1
  num_nodes: 1
  max_epochs: 5
  max_steps: -1
  val_check_interval: 1.0
  accelerator: auto
  strategy: ddp
  accumulate_grad_batches: 2  # Gradient accumulation
  gradient_clip_val: 0.0
  precision: 16  # Mixed precision training
  log_every_n_steps: 50  # Reduced logging frequency
  enable_progress_bar: True
  num_sanity_val_steps: 0
  check_val_every_n_epoch: 1
  sync_batchnorm: false  # Disable Sync BatchNorm
  # Checkpointing and logging are switched off on the trainer itself; the
  # exp_manager section enables create_checkpoint_callback and
  # create_tensorboard_logger instead.
  enable_checkpointing: False
  logger: false
  benchmark: false

# Experiment manager settings; the training script passes this section to
# exp_manager(trainer, cfg.get("exp_manager", None)).
exp_manager:
  exp_dir: "/kaggle/working/results"
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  # Checkpoint selection: track "val_wer" with mode "min".
  checkpoint_callback_params:
    monitor: "val_wer"
    mode: "min"
    save_top_k: 1  # Save only the best checkpoint
    always_save_nemo: True

  # Resuming from an earlier run is disabled.
  resume_if_exists: false
  resume_ignore_no_checkpoint: false
  # Weights & Biases logging is disabled, so its kwargs are left null.
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null

In [None]:
# Pre-create an empty training script in the current directory. The following
# %%writefile cell writes the real contents to the same file name.
!touch speech_to_text_ctc_bpe.py

In [None]:
%%writefile speech_to_text_ctc_bpe.py


import pytorch_lightning as pl
from omegaconf import OmegaConf

from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager


@hydra_runner(config_path="/kaggle/working/configs/", config_name="conformer_ctc_bpe")
def main(cfg):
    """Train, optionally test, and export the CTC-BPE ASR model.

    Steps, in order:
      1. Log the received config as YAML.
      2. Build a Lightning trainer from ``cfg.trainer`` and hand it to
         ``exp_manager`` together with the optional ``exp_manager`` section.
      3. Construct ``EncDecCTCModelBPE`` from ``cfg.model`` and let it
         initialise its weights from another model if the config asks for it.
      4. Fit the model.
      5. Run the test loop when a test manifest is configured and
         ``prepare_test`` succeeds.
      6. Save the model to ``/kaggle/working/final_asr_model.nemo``.

    Args:
        cfg: Config object supplied by the ``hydra_runner`` decorator.
    """
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    pl_trainer = pl.Trainer(**cfg.trainer)
    exp_manager(pl_trainer, cfg.get("exp_manager", None))

    model = EncDecCTCModelBPE(cfg=cfg.model, trainer=pl_trainer)
    # Weights may come from another model when the config provides one.
    model.maybe_init_from_pretrained_checkpoint(cfg)

    pl_trainer.fit(model)

    # Short-circuit evaluation keeps prepare_test from being called unless a
    # test manifest is actually configured.
    test_manifest_configured = (
        hasattr(cfg.model, 'test_ds')
        and cfg.model.test_ds.manifest_filepath is not None
    )
    if test_manifest_configured and model.prepare_test(pl_trainer):
        pl_trainer.test(model)

    # Export the model to a fixed location.
    final_model_path = '/kaggle/working/final_asr_model.nemo'
    model.save_to(final_model_path)
    logging.info(f'Model saved at {final_model_path}')


if __name__ == '__main__':
    main()

In [None]:
# Create the experiment output directory. -p makes the cell safe to re-run,
# and the absolute path matches exp_manager.exp_dir in the YAML config.
!mkdir -p /kaggle/working/results

In [None]:
import os

# Set HYDRA_FULL_ERROR in this process's environment before the training
# script is launched, so the child process started with `!python` sees it.
os.environ.update({'HYDRA_FULL_ERROR': '1'})

In [None]:
# Launch the training script written by the %%writefile cell above. Its
# @hydra_runner decorator loads conformer_ctc_bpe from /kaggle/working/configs/.
# NOTE(review): the script was written with a relative path, so this assumes
# the notebook's working directory is /kaggle/working — confirm.
!python /kaggle/working/speech_to_text_ctc_bpe.py

In [None]:
# NeMo branch to take the example scripts from (same value as the branch
# installed at the top of the notebook).
BRANCH = 'r2.0.0rc0'

# Fetch NeMo's transcription and evaluation example scripts.
# -nc skips the download when the file already exists, so re-running the cell
# does not leave "<name>.py.1" duplicates behind; the absolute target directory
# matches the paths the later cells use to run the scripts.
!wget -nc -P /kaggle/working/configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/transcribe_speech.py
!wget -nc -P /kaggle/working/configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/speech_to_text_eval.py

In [None]:
# Run the downloaded transcription script on the test manifest, using the
# .nemo file saved by the training script; results go to output_filename.
!python /kaggle/working/configs/transcribe_speech.py \
  model_path="/kaggle/working/final_asr_model.nemo" \
  dataset_manifest="/kaggle/input/dataset-ja/final_test.json" \
  output_filename="/kaggle/working/test_with_predictions.json" 


In [None]:

# Score the manifest produced by the transcription cell, twice: once with
# use_cer=False and once with use_cer=True.
# NOTE(review): only_score_manifest=True presumably scores the predictions
# already stored in the manifest instead of transcribing again — confirm
# against speech_to_text_eval.py.
# NOTE(review): the dataset path suggests Japanese ("dataset-ja"); if the
# transcripts are not whitespace-segmented, CER is likely the more meaningful
# number — confirm.

# Calculate WER
!python /kaggle/working/configs/speech_to_text_eval.py \
  dataset_manifest="/kaggle/working/test_with_predictions.json" \
  use_cer=False \
  only_score_manifest=True

# Calculate CER
!python /kaggle/working/configs/speech_to_text_eval.py \
  dataset_manifest="/kaggle/working/test_with_predictions.json" \
  use_cer=True \
  only_score_manifest=True