Mount Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive/.
# Every project path used later in this notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


Installing Packages

In [None]:
!pip install nemo_toolkit['all']
!pip install hydra-core==1.1
!pip install evaluate
!pip install import-ipynb
!pip install jiwer

Changing to required directory

In [None]:
# Make the project folder on Drive the working directory, so relative
# references (e.g. the modifiedModel.ipynb notebook run below) resolve from here.
%cd "/content/drive/MyDrive/Colab Notebooks/Paper 1 Implementation"

/content/drive/MyDrive/Colab Notebooks/Paper 1 Implementation


Importing Packages

In [None]:
from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.collections.asr.losses.ctc import CTCLoss
# from ctc_tried import CTCLoss
import omegaconf
from omegaconf import OmegaConf
from omegaconf import DictConfig
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.plugins import DDPPlugin
import sys
import logging
import import_ipynb
%run modifiedModel.ipynb import ModifiedModel
import os
import pickle

ERROR:root:File `'modifiedModel.ipynb.py'` not found.


In [None]:
# Replace the kernel's launch arguments with a single empty program name.
# NOTE(review): presumably needed so the Hydra-decorated main() below does not
# try to parse the kernel's own arguments as config overrides - confirm.
# `sys` is deliberately not deleted afterwards, so this cell can be re-run
# without raising a NameError.
sys.argv = ['']

Setting paths to JSON manifest files

In [None]:
# Name of the dataset folder and location of the project on Drive.
LANGUAGE = "slices"
path = '/content/drive/MyDrive/Colab Notebooks/Paper 1 Implementation'
manifest_dir = os.path.join(path, LANGUAGE)

# Every split keeps its JSON manifest at <manifest_dir>/<split>/<split>.json.
train_manifest, dev_manifest, test_manifest = (
    f"{manifest_dir}/{split}/{split}.json" for split in ("train", "dev", "test")
)

Creating lists of train and dev audio file paths

In [None]:
# Collect the absolute path of every entry in the train split's recordings folder.
# The listing is done relative to the split folder, so the working directory is
# switched there first.
os.chdir('/content/drive/MyDrive/Colab Notebooks/Paper 1 Implementation/slices/train')
directory = 'recordings2'
manifest_dir_url = '/content/drive/MyDrive/Colab Notebooks/Paper 1 Implementation/slices/train/recordings2'
list_of_train_paths = [f"{manifest_dir_url}/{entry}" for entry in os.listdir(directory)]

# Same procedure for the dev split.
# Note: after this cell the working directory stays at .../slices/dev, so any
# relative path used later resolves against that folder.
os.chdir('/content/drive/MyDrive/Colab Notebooks/Paper 1 Implementation/slices/dev')
directory = 'recordings2'
manifest_dir_url = '/content/drive/MyDrive/Colab Notebooks/Paper 1 Implementation/slices/dev/recordings2'
list_of_dev_paths = [f"{manifest_dir_url}/{entry}" for entry in os.listdir(directory)]

# Train entries first, then dev entries.
list_of_all_paths = [*list_of_train_paths, *list_of_dev_paths]
print(list_of_all_paths)

Training: teacher inference and student fine-tuning

In [None]:
def _dump_pickle(obj, filename):
    """Pickle ``obj`` into ``filename``; the file is closed even if dumping fails."""
    with open(filename, "wb") as file_obj:  # write binary
        pickle.dump(obj, file_obj)


@hydra_runner(config_path=r"/content/drive/MyDrive/Colab Notebooks/Paper 1 Implementation/conformer", config_name="conformer_ctc_bpe")
def main(cfg):
    """Run teacher inference, persist its outputs, then fine-tune and save the student.

    Steps performed:
      1. Point the train/validation/test dataset configs at the manifest files
         defined earlier in the notebook.
      2. Build a checkpoint callback and a Lightning trainer.
      3. Load teacher and student from the "stt_en_conformer_ctc_large"
         pretrained model.
      4. Transcribe ``list_of_all_paths`` with the teacher twice (once for log
         probabilities, once for self-attention outputs) and pickle both results
         to ``teacher_logits.pkl`` and ``teacher_feature_map.pkl`` in the current
         working directory.
      5. Configure data, spec augmentation and optimizer on the student, fit it,
         and save it as ``student_model.nemo``.

    Args:
        cfg: configuration supplied by ``hydra_runner``. This function reads
            ``cfg.model.train_ds``, ``cfg.model.validation_ds``,
            ``cfg.model.test_ds`` and ``cfg.model.optim``.
    """
    cfg['model']['train_ds']['manifest_filepath'] = train_manifest
    cfg['model']['validation_ds']['manifest_filepath'] = dev_manifest
    cfg['model']['test_ds']['manifest_filepath'] = test_manifest

    # NOTE(review): the filename template references `other_metric`, which this
    # function never logs, and every_n_epochs=10 exceeds max_epochs=1 below -
    # confirm both are intended.
    checkpoint_callback = ModelCheckpoint(dirpath='/content/drive/MyDrive/Colab Notebooks/Paper 1 Implementation', 
                                        save_last=True, save_top_k=20,
                                        filename='{epoch}-{val_wer:.2f}-{other_metric:.2f}',
                                        monitor="val_wer", every_n_epochs=10)
    # No checkpoint to resume from: training starts from the pretrained weights.
    checkpoint_path = None
    # NOTE(review): accelerator='cpu' combined with auto_select_gpus=True looks
    # contradictory - confirm the intended device.
    trainer = pl.Trainer(accelerator='cpu', max_epochs=1, callbacks=[checkpoint_callback], auto_select_gpus=True)

    # Object Creation
    teacher_model = ModifiedModel.from_pretrained("stt_en_conformer_ctc_large")
    # student model might have fewer params than teacher model
    student_model = ModifiedModel.from_pretrained("stt_en_conformer_ctc_large")

    # Getting Teacher Model's Softmax Outputs
    teacher_logits = teacher_model.transcribe(paths2audio_files=list_of_all_paths, batch_size = 4, logprobs = True)
    # Getting Teacher Model's SAB Layer Outputs as Feature Maps
    teacher_feature_map = teacher_model.transcribe(paths2audio_files=list_of_all_paths, batch_size = 4, return_self_attention_outputs = True)

    # Persist the teacher outputs. The file names are relative, so the files
    # land in whatever the working directory is when main() runs.
    _dump_pickle(teacher_logits, "teacher_logits.pkl")
    _dump_pickle(teacher_feature_map, "teacher_feature_map.pkl")

    # Random sample should be printed in the output at each step, along with its predicted transcript.
    student_model._wer.log_prediction = True

    # Setting the trainer 
    student_model.set_trainer(trainer)

    # Data loaders, augmentation and optimizer all come from the `model` section
    # of the config.
    param_config = DictConfig(cfg['model'])
    student_model.setup_training_data(param_config.train_ds)
    student_model.setup_multiple_validation_data(val_data_config=param_config.validation_ds)
    student_model.setup_multiple_test_data(test_data_config=param_config.test_ds)
    student_model.spec_augmentation = student_model.from_config_dict(student_model.cfg.spec_augment)
    student_model.setup_optimization(DictConfig(cfg['model']['optim']))
    # Train both the encoder and the decoder of the student.
    student_model.encoder.unfreeze()
    student_model.decoder.unfreeze()

    trainer.fit(student_model, ckpt_path=checkpoint_path)
    # Bare expressions display nothing inside a function, so report the
    # checkpoint results explicitly.
    print(f"Best checkpoint: {checkpoint_callback.best_model_path}")
    print(f"Best val_wer: {checkpoint_callback.best_model_score}")

    student_model.save_to("/content/drive/MyDrive/Colab Notebooks/Paper 1 Implementation/student_model.nemo")
    
    # The trailing `and False` hard-disables this post-training test run; drop
    # it to re-enable testing.
    if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None and False:
        gpu = 1 if cfg.trainer.gpus != 0 else 0
        test_trainer = pl.Trainer(
            gpus=gpu,
            precision=trainer.precision,
            amp_level=trainer.accelerator_connector.amp_level,
            amp_backend=cfg.trainer.get("amp_backend", "native"),
        )
        if student_model.prepare_test(test_trainer):
            test_trainer.test(student_model)

In [None]:
# Entry point: start the run when this notebook executes as the main program.
# main() is called without arguments; its `cfg` parameter is filled in by the
# hydra_runner decorator.
if __name__ == '__main__':
    main()