In [None]:
!pip install speechbrain



In [None]:
# Extra dependencies. librosa is used by the audio pipeline in `dataio_prep`.
# NOTE(review): transformers is presumably required by the HuggingFace encoder
# wrapper configured in the hparams YAML, and textgrid is not referenced by any
# visible cell -- confirm both are still needed, and consider pinning versions.
!pip install textgrid transformers librosa



In [None]:
import os
import sys
import torch
import logging
import speechbrain as sb
from hyperpyyaml import load_hyperpyyaml
import librosa
import csv
from google.colab import drive, files


In [None]:
# Mount Google Drive at /content/drive so that the project folder and the
# hparams file used by the following cells are reachable.
# force_remount=True is passed, presumably to refresh an already-mounted drive
# when the cell is re-run -- see the Colab documentation.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Module-level logger; it is not used by any of the visible cells.
logger = logging.getLogger(__name__)

In [None]:
# Switch the working directory to the project folder on Drive.
# Presumably this is what lets the local `mpd_eval_v3` module be imported in
# the next cell and makes relative output paths (the logs show
# `results/hubert-base_ctc/`) land inside the project folder -- confirm.
folder_path = '/content/drive/MyDrive/CS5647_Project'
os.chdir(folder_path)
current_directory = os.getcwd()
print("Current Working Directory after change:", current_directory)

Current Working Directory after change: /content/drive/MyDrive/CS5647_Project


In [None]:
from mpd_eval_v3 import MpdStats

In [None]:
def make_attn_mask(wavs, wav_lens):
    """Build a padding mask for a batch of padded waveforms.

    Arguments
    ---------
    wavs : torch.Tensor
        Padded batch; dimension 1 is treated as the time axis.
    wav_lens : torch.Tensor
        Relative lengths (i.e. 0-1) of the batch items, shape (bs, ).

    Returns
    -------
    torch.LongTensor
        Tensor with the same shape as ``wavs`` (i.e. (bs, seq_len) for raw
        waveforms), representing the mask on allowed positions:
        1 for regular tokens, 0 for padded tokens.
    """
    seq_len = wavs.shape[1]
    # Round instead of truncating: a relative length is a float ratio, and
    # multiplying it back by seq_len can land just below the true integer
    # length (e.g. (1 / 49) * 49 == 0.9999999999999999), which truncation
    # would turn into an off-by-one mask.
    abs_lens = torch.round(wav_lens * seq_len).long().to(wavs.device)
    positions = torch.arange(seq_len, device=wavs.device)
    # One broadcast comparison replaces the per-item Python loop: position t of
    # item i is valid iff t < abs_lens[i].
    attn_mask = positions.unsqueeze(0) < abs_lens.unsqueeze(1)
    # Broadcast over any trailing dimensions so the result matches wavs.shape.
    attn_mask = attn_mask.reshape(attn_mask.shape + (1,) * (wavs.dim() - 2))
    return attn_mask.expand(wavs.shape).long()

In [None]:
class ASR(sb.Brain):
    """CTC phoneme recogniser built on ``sb.Brain``.

    Forward pass: ``modules.wav2vec2`` -> ``modules.enc`` -> ``modules.ctc_lin``
    -> ``hparams.log_softmax``. Training minimises ``hparams.ctc_cost``; during
    validation/test the greedy CTC decoding is additionally scored with
    ``hparams.per_stats`` and ``MpdStats``.

    Two optimizers are used (see ``init_optimizers``). A ``label_encoder``
    attribute must be attached to the instance before any validation/test
    stage, because its ``decode_ndim`` is handed to the metric trackers.
    """

    def _compile_jit(self):
        """Call ``_compile_jit()`` on every item of ``self.modules`` that has one."""
        # NOTE(review): if ``self.modules`` is a ``torch.nn.ModuleDict``,
        # iterating it yields key strings and this loop never finds the
        # attribute -- confirm whether that is intended.
        for item in self.modules:
            if not hasattr(item, "_compile_jit"):
                continue
            item._compile_jit()

    def compute_forward(self, batch, stage):
        """Compute CTC log-probabilities over phonemes for one batch.

        Returns
        -------
        tuple
            ``(p_ctc, wav_lens)`` where ``wav_lens`` comes from ``batch.sig``.
        """
        batch = batch.to(self.device)
        wavs, wav_lens = batch.sig

        # Waveform augmentation is optional and only applied while training.
        if stage == sb.Stage.TRAIN and hasattr(self.hparams, "augmentation"):
            wavs = self.hparams.augmentation(wavs, wav_lens)

        # The encoder is called on the waveforms only; no attention mask is
        # passed.
        encoded = self.modules.enc(self.modules.wav2vec2(wavs))

        # Output layer producing the CTC log-probabilities.
        p_ctc = self.hparams.log_softmax(self.modules.ctc_lin(encoded))
        return p_ctc, wav_lens

    def compute_objectives(self, predictions, batch, stage):
        """Compute the CTC loss and, outside training, update the metrics.

        For validation/test stages the greedy CTC decoding is appended to the
        CTC, PER and MPD metric trackers.
        """
        p_ctc, wav_lens = predictions
        ids = batch.id
        targets, target_lens = batch.phn_encoded_target

        evaluating = stage != sb.Stage.TRAIN
        if evaluating:
            canonicals, canonical_lens = batch.phn_encoded_canonical
            perceiveds, perceived_lens = batch.phn_encoded_perceived

        loss = self.hparams.ctc_cost(p_ctc, targets, wav_lens, target_lens)
        if not evaluating:
            return loss

        # Per the original note: ctc_greedy_decode also removes padded tokens,
        # i.e. it returns a list of lists with different lengths, which is why
        # predict_len is None below.
        hyps = sb.decoders.ctc_greedy_decode(
            p_ctc, wav_lens, blank_id=self.hparams.blank_index
        )
        self.ctc_metrics.append(ids, p_ctc, targets, wav_lens, target_lens)

        decode_fn = self.label_encoder.decode_ndim
        self.per_metrics.append(
            ids=ids,
            predict=hyps,
            target=targets,
            predict_len=None,
            target_len=target_lens,
            ind2lab=decode_fn,
        )
        self.mpd_metrics.append(
            ids=ids,
            predict=hyps,
            canonical=canonicals,
            perceived=perceiveds,
            predict_len=None,
            canonical_len=canonical_lens,
            perceived_len=perceived_lens,
            ind2lab=decode_fn,
        )
        return loss

    def evaluate_batch(self, batch, stage):
        """Run forward + objectives for one validation/test batch.

        Returns the detached loss.
        """
        loss = self.compute_objectives(
            self.compute_forward(batch, stage=stage), batch, stage=stage
        )
        return loss.detach()

    def on_stage_start(self, stage, epoch):
        """Reset the metric trackers and toggle SpecAugment for the stage."""
        self.ctc_metrics = self.hparams.ctc_stats()
        if self.hparams.wav2vec2_specaug:
            self.modules.wav2vec2.model.config.apply_spec_augment = True

        if stage == sb.Stage.TRAIN:
            return

        # Validation/test: SpecAugment is switched off and fresh PER / MPD
        # trackers are created.
        self.modules.wav2vec2.model.config.apply_spec_augment = False
        self.per_metrics = self.hparams.per_stats()
        self.mpd_metrics = MpdStats()

    def on_stage_end(self, stage, stage_loss, epoch):
        """Summarise a finished stage.

        TRAIN: remember the training loss.
        VALID: anneal both learning rates on the validation loss, log the
        epoch statistics and save a checkpoint.
        TEST: log the statistics and write them to ``hparams.wer_file`` and
        ``hparams.mpd_file``.
        """
        if stage == sb.Stage.TRAIN:
            self.train_loss = stage_loss
            return

        per = self.per_metrics.summarize("error_rate")
        mpd_f1 = self.mpd_metrics.summarize("mpd_f1")

        if stage == sb.Stage.VALID:
            old_lr_model, new_lr_model = self.hparams.lr_annealing_model(
                stage_loss
            )
            old_lr_wav2vec2, new_lr_wav2vec2 = self.hparams.lr_annealing_wav2vec2(
                stage_loss
            )
            sb.nnet.schedulers.update_learning_rate(
                self.adam_optimizer, new_lr_model
            )
            sb.nnet.schedulers.update_learning_rate(
                self.wav2vec_optimizer, new_lr_wav2vec2
            )

            # The logged learning rates are the values returned as "old" by
            # the annealers, not the newly set ones.
            epoch_meta = {
                "epoch": epoch,
                "lr_model": old_lr_model,
                "lr_wav2vec2": old_lr_wav2vec2,
            }
            valid_stats = {
                "loss": stage_loss,
                "ctc_loss": self.ctc_metrics.summarize("average"),
                "PER": per,
                "mpd_f1": mpd_f1,
            }
            self.hparams.train_logger.log_stats(
                stats_meta=epoch_meta,
                train_stats={"loss": self.train_loss},
                valid_stats=valid_stats,
            )
            self.checkpointer.save_and_keep_only(
                meta={"PER": per, "mpd_f1": mpd_f1},
                min_keys=["PER"],
                max_keys=["mpd_f1"],
            )
        elif stage == sb.Stage.TEST:
            self.hparams.train_logger.log_stats(
                stats_meta={"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats={"loss": stage_loss, "PER": per, "mpd_f1": mpd_f1},
            )
            with open(self.hparams.wer_file, "w") as wer_out:
                wer_out.write("CTC loss stats:\n")
                self.ctc_metrics.write_stats(wer_out)
                wer_out.write("\nPER stats:\n")
                self.per_metrics.write_stats(wer_out)
                print(
                    "CTC and PER stats written to file",
                    self.hparams.wer_file,
                )
            with open(self.hparams.mpd_file, "w") as mpd_out:
                mpd_out.write("MPD results and stats:\n")
                self.mpd_metrics.write_stats(mpd_out)
                print(
                    "MPD results and stats written to file",
                    self.hparams.mpd_file,
                )

    def fit_batch(self, batch):
        """Train on one batch, updating both optimizers.

        Arguments
        ---------
        batch : object
            Training batch, forwarded to ``compute_forward`` and
            ``compute_objectives``.

        Returns
        -------
        torch.Tensor
            The detached loss, moved to CPU.
        """
        optimizers = (self.wav2vec_optimizer, self.adam_optimizer)

        if not self.auto_mix_prec:
            predictions = self.compute_forward(batch, sb.Stage.TRAIN)
            loss = self.compute_objectives(predictions, batch, sb.Stage.TRAIN)
            # Normalise the loss by the gradient accumulation factor.
            (loss / self.hparams.gradient_accumulation).backward()

            if self.step % self.hparams.gradient_accumulation == 0:
                # check_gradients (inherited) decides whether to step.
                if self.check_gradients(loss):
                    for optimizer in optimizers:
                        optimizer.step()
                for optimizer in optimizers:
                    optimizer.zero_grad()
            return loss.detach().cpu()

        # Automatic mixed precision path. Unlike the full-precision path
        # above, it updates on every batch and does not consult
        # hparams.gradient_accumulation.
        for optimizer in optimizers:
            optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            predictions = self.compute_forward(batch, sb.Stage.TRAIN)
            loss = self.compute_objectives(predictions, batch, sb.Stage.TRAIN)

        self.scaler.scale(loss).backward()
        for optimizer in optimizers:
            self.scaler.unscale_(optimizer)

        if self.check_gradients(loss):
            for optimizer in optimizers:
                self.scaler.step(optimizer)

        self.scaler.update()
        return loss.detach().cpu()

    def init_optimizers(self):
        """Create the wav2vec2 optimizer and the model optimizer.

        Both are registered with the checkpointer when one is available.
        """
        encoder_params = self.modules.wav2vec2.model.parameters()
        self.wav2vec_optimizer = self.hparams.wav2vec_opt_class(encoder_params)

        model_params = self.hparams.model.parameters()
        self.adam_optimizer = self.hparams.adam_opt_class(model_params)

        if self.checkpointer is None:
            return
        self.checkpointer.add_recoverable("wav2vec_opt", self.wav2vec_optimizer)
        self.checkpointer.add_recoverable("adam_opt", self.adam_optimizer)

    def on_fit_start(self):
        """Prepare modules and optimizers at the start of ``fit()``.

        Runs the jit hook, wraps the modules for distributed training,
        initialises the optimizers and, when a checkpointer is present,
        tries to recover a checkpoint.
        """
        # Original note: run this after starting all processes, since jit
        # modules cannot be pickled.
        self._compile_jit()

        # Wrap modules with the parallel backend after jit.
        self._wrap_distributed()

        # Optimizers are created once the parameters are configured.
        self.init_optimizers()

        if self.checkpointer is None:
            return
        # min_key="PER" is passed so that training continues from the "best"
        # checkpoint (per the original note) rather than just the latest one.
        self.checkpointer.recover_if_possible(
            device=torch.device(self.device),
            min_key="PER",
        )



In [None]:
def dataio_prep(hparams):
    """Build the train/valid/test datasets and the phoneme label encoder.

    Arguments
    ---------
    hparams : dict
        Loaded hyperparameters. The keys read here are ``data_folder_save``,
        ``train_annotation``, ``valid_annotation``, ``test_annotation``,
        ``sorting``, ``train_dataloader_opts``, ``sample_rate``, ``wav2vec2``,
        ``save_folder`` and ``blank_index``. When ``sorting`` is "ascending"
        or "descending", ``hparams["train_dataloader_opts"]["shuffle"]`` is
        set to False in place.

    Returns
    -------
    tuple
        ``(train_data, valid_data, test_data, label_encoder)``.

    Raises
    ------
    NotImplementedError
        If ``hparams["sorting"]`` is not random, ascending or descending.
    """
    data_folder = hparams["data_folder_save"]

    def load_split(annotation_key):
        # Read one CSV annotation file into a dataset.
        return sb.dataio.dataset.DynamicItemDataset.from_csv(
            csv_path=hparams[annotation_key],
            replacements={"data_root": data_folder},
        )

    # 1. Declarations:
    train_data = load_split("train_annotation")

    sorting = hparams["sorting"]
    if sorting in ("ascending", "descending"):
        # Sorting the training data speeds up training and gives better
        # results (original note).
        sort_kwargs = {"sort_key": "duration"}
        if sorting == "descending":
            sort_kwargs["reverse"] = True
        train_data = train_data.filtered_sorted(**sort_kwargs)
        # When sorting, shuffling in the dataloader would make it pointless.
        hparams["train_dataloader_opts"]["shuffle"] = False
    elif sorting != "random":
        raise NotImplementedError(
            "sorting must be random, ascending or descending"
        )

    valid_data = load_split("valid_annotation").filtered_sorted(
        sort_key="duration"
    )
    test_data = load_split("test_annotation").filtered_sorted(
        sort_key="duration"
    )

    datasets = [train_data, valid_data, test_data]
    label_encoder = sb.dataio.encoder.CTCTextEncoder()

    # 2. Define audio pipeline:
    @sb.utils.data_pipeline.takes("wav")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(wav):
        # Load the audio at the configured sample rate, then pass it through
        # the encoder's feature extractor (used for normalization, per the
        # original note) before converting it to a tensor.
        samples = librosa.core.load(wav, sr=hparams["sample_rate"])[0]
        normalized = hparams["wav2vec2"].feature_extractor(
            samples,
            sampling_rate=hparams["sample_rate"],
        ).input_values[0]
        return torch.Tensor(normalized)

    sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline)

    # 3. Define text pipelines:
    def phoneme_views(text):
        # Lazily yields, in order: the phoneme list, its encoded list, and the
        # encoded LongTensor. The raw list is yielded before anything is
        # encoded, so requesting only the list never calls the encoder.
        symbols = text.strip().split()
        yield symbols
        indices = label_encoder.encode_sequence(symbols)
        yield indices
        yield torch.LongTensor(indices)

    @sb.utils.data_pipeline.takes("perceived_train_target")
    @sb.utils.data_pipeline.provides(
        "phn_list_target",
        "phn_encoded_list_target",
        "phn_encoded_target",
    )
    def text_pipeline_train(phn):
        yield from phoneme_views(phn)

    @sb.utils.data_pipeline.takes("perceived_train_target", "canonical_aligned", "perceived_aligned")
    @sb.utils.data_pipeline.provides(
        "phn_list_target",
        "phn_encoded_list_target",
        "phn_encoded_target",
        "phn_list_canonical",
        "phn_encoded_list_canonical",
        "phn_encoded_canonical",
        "phn_list_perceived",
        "phn_encoded_list_perceived",
        "phn_encoded_perceived",
    )
    def text_pipeline_test(target, canonical, perceived):
        # Same three views for each transcription, in the order declared in
        # provides(): target, canonical, perceived.
        for text in (target, canonical, perceived):
            yield from phoneme_views(text)

    sb.dataio.dataset.add_dynamic_item([train_data], text_pipeline_train)
    sb.dataio.dataset.add_dynamic_item([valid_data, test_data], text_pipeline_test)

    # 4. Fit encoder:
    # Load the label encoder from disk, or create it from the training
    # phoneme lists.
    lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt")
    label_encoder.load_or_create(
        path=lab_enc_file,
        from_didatasets=[train_data],
        output_key="phn_list_target",
        special_labels={"blank_label": hparams["blank_index"]},
        sequence_input=True,
    )

    # 5. Set output:
    train_keys = ["id", "sig", "phn_encoded_target"]
    eval_keys = ["id", "sig", "phn_encoded_target", "phn_encoded_canonical", "phn_encoded_perceived"]
    sb.dataio.dataset.set_output_keys([train_data], train_keys)
    sb.dataio.dataset.set_output_keys([valid_data, test_data], eval_keys)

    return train_data, valid_data, test_data, label_encoder


In [None]:

# Absolute path of the hyperparameter YAML on the mounted Drive.
hparams_file = '/content/drive/MyDrive/CS5647_Project/hparams/hubert_train.yaml'

# Load the hyperparameters file. No overrides are passed here, so the values
# come from the YAML file alone.
with open(hparams_file) as fin:
    hparams = load_hyperpyyaml(fin)


# Create the experiment directory (hparams["output_folder"]) and pass the
# hyperparameter file along via `hyperparams_to_save`.
sb.create_experiment_directory(
    experiment_directory=hparams["output_folder"],
    hyperparams_to_save=hparams_file,
)

# Dataset IO prep: creating Dataset objects and proper encodings for phones
train_data, valid_data, test_data, label_encoder = dataio_prep(hparams)



Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


speechbrain.lobes.models.huggingface_wav2vec - speechbrain.lobes.models.huggingface_wav2vec - wav2vec 2.0 feature extractor is frozen.
speechbrain.core - Beginning experiment!
speechbrain.core - Experiment folder: results/hubert-base_ctc/
speechbrain.dataio.encoder - Load called, but CTCTextEncoder is not empty. Loaded data will overwrite everything. This is normal if there is e.g. an unk label defined at init.


In [None]:
# `os` is already imported in the imports cell at the top; this re-import is
# redundant but harmless.
import os

# Set NUMEXPR_MAX_THREADS for the current process environment.
# NOTE(review): numexpr presumably reads this variable when it is first
# imported; if an earlier cell already imported it (directly or indirectly),
# setting it here may have no effect -- confirm, and consider moving this
# above the imports.
os.environ["NUMEXPR_MAX_THREADS"] = "12"

In [None]:
# Trainer initialization
asr_brain = ASR(
    modules=hparams["modules"],
    hparams=hparams,
    checkpointer=hparams["checkpointer"],
    run_opts = {"device": "cuda"}
)
asr_brain.label_encoder = label_encoder

# Training/validation loop
asr_brain.fit(
    asr_brain.hparams.epoch_counter,
    train_data,
    valid_data,
    train_loader_kwargs=hparams["train_dataloader_opts"],
    valid_loader_kwargs=hparams["valid_dataloader_opts"],

)



speechbrain.core - Info: auto_mix_prec arg from hparam file is used
speechbrain.core - 311.8M trainable parameters in ASR
speechbrain.utils.checkpoints - Loading a checkpoint from results/hubert-base_ctc/save/CKPT+2023-11-22+12-00-55+00




speechbrain.utils.epoch_loop - Going into epoch 36


100%|██████████| 38/38 [00:43<00:00,  1.16s/it, train_loss=0.636]
100%|██████████| 19/19 [00:29<00:00,  1.55s/it]

speechbrain.utils.train_logger - epoch: 36, lr_model: 1.69e-04, lr_wav2vec2: 5.63e-06 - train loss: 6.36e-01 - valid loss: 4.79e-01, valid ctc_loss: 4.79e-01, valid PER: 16.03, valid mpd_f1: 3.86e-01





speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-19-21+00
speechbrain.utils.checkpoints - Deleted checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+12-00-55+00
speechbrain.utils.epoch_loop - Going into epoch 37


100%|██████████| 38/38 [00:27<00:00,  1.38it/s, train_loss=0.654]
100%|██████████| 19/19 [00:13<00:00,  1.38it/s]

speechbrain.utils.train_logger - epoch: 37, lr_model: 1.69e-04, lr_wav2vec2: 5.63e-06 - train loss: 6.54e-01 - valid loss: 4.76e-01, valid ctc_loss: 4.75e-01, valid PER: 15.88, valid mpd_f1: 4.04e-01





speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-20-13+00
speechbrain.utils.checkpoints - Deleted checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-19-21+00
speechbrain.utils.epoch_loop - Going into epoch 38


100%|██████████| 38/38 [00:26<00:00,  1.43it/s, train_loss=0.662]
100%|██████████| 19/19 [00:13<00:00,  1.44it/s]

speechbrain.utils.train_logger - epoch: 38, lr_model: 1.69e-04, lr_wav2vec2: 5.63e-06 - train loss: 6.62e-01 - valid loss: 4.72e-01, valid ctc_loss: 4.71e-01, valid PER: 15.43, valid mpd_f1: 4.08e-01





speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-21-03+00
speechbrain.utils.checkpoints - Deleted checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-20-13+00
speechbrain.utils.epoch_loop - Going into epoch 39


100%|██████████| 38/38 [00:26<00:00,  1.42it/s, train_loss=0.666]
100%|██████████| 19/19 [00:13<00:00,  1.40it/s]

speechbrain.nnet.schedulers - Changing lr from 0.00017 to 0.00013
speechbrain.nnet.schedulers - Changing lr from 5.6e-06 to 4.2e-06
speechbrain.utils.train_logger - epoch: 39, lr_model: 1.69e-04, lr_wav2vec2: 5.63e-06 - train loss: 6.66e-01 - valid loss: 4.71e-01, valid ctc_loss: 4.70e-01, valid PER: 15.14, valid mpd_f1: 3.94e-01





speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-21-55+00
speechbrain.utils.checkpoints - Deleted checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-21-03+00
speechbrain.utils.epoch_loop - Going into epoch 40


100%|██████████| 38/38 [00:26<00:00,  1.45it/s, train_loss=0.641]
100%|██████████| 19/19 [00:12<00:00,  1.46it/s]

speechbrain.utils.train_logger - epoch: 40, lr_model: 1.27e-04, lr_wav2vec2: 4.22e-06 - train loss: 6.41e-01 - valid loss: 4.60e-01, valid ctc_loss: 4.59e-01, valid PER: 14.59, valid mpd_f1: 4.16e-01





speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-22-47+00
speechbrain.utils.checkpoints - Deleted checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-21-55+00
speechbrain.utils.epoch_loop - Going into epoch 41


100%|██████████| 38/38 [00:29<00:00,  1.29it/s, train_loss=0.644]
100%|██████████| 19/19 [00:13<00:00,  1.37it/s]

speechbrain.utils.train_logger - epoch: 41, lr_model: 1.27e-04, lr_wav2vec2: 4.22e-06 - train loss: 6.44e-01 - valid loss: 4.55e-01, valid ctc_loss: 4.54e-01, valid PER: 14.68, valid mpd_f1: 4.22e-01





speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-23-42+00
speechbrain.utils.checkpoints - Deleted checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+11-41-56+00
speechbrain.utils.epoch_loop - Going into epoch 42


100%|██████████| 38/38 [00:27<00:00,  1.41it/s, train_loss=0.675]
100%|██████████| 19/19 [00:13<00:00,  1.40it/s]

speechbrain.utils.train_logger - epoch: 42, lr_model: 1.27e-04, lr_wav2vec2: 4.22e-06 - train loss: 6.75e-01 - valid loss: 4.58e-01, valid ctc_loss: 4.57e-01, valid PER: 14.68, valid mpd_f1: 4.12e-01





speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-24-36+00
speechbrain.utils.epoch_loop - Going into epoch 43


100%|██████████| 38/38 [00:27<00:00,  1.41it/s, train_loss=0.65]
100%|██████████| 19/19 [00:13<00:00,  1.41it/s]

speechbrain.utils.train_logger - epoch: 43, lr_model: 1.27e-04, lr_wav2vec2: 4.22e-06 - train loss: 6.50e-01 - valid loss: 4.53e-01, valid ctc_loss: 4.53e-01, valid PER: 14.35, valid mpd_f1: 4.18e-01





speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-25-27+00
speechbrain.utils.checkpoints - Deleted checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-22-47+00
speechbrain.utils.checkpoints - Deleted checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-24-36+00
speechbrain.utils.epoch_loop - Going into epoch 44


100%|██████████| 38/38 [00:26<00:00,  1.43it/s, train_loss=0.599]
100%|██████████| 19/19 [00:13<00:00,  1.39it/s]

speechbrain.nnet.schedulers - Changing lr from 0.00013 to 9.5e-05
speechbrain.nnet.schedulers - Changing lr from 4.2e-06 to 3.2e-06
speechbrain.utils.train_logger - epoch: 44, lr_model: 1.27e-04, lr_wav2vec2: 4.22e-06 - train loss: 5.99e-01 - valid loss: 4.54e-01, valid ctc_loss: 4.54e-01, valid PER: 14.18, valid mpd_f1: 4.17e-01





speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-26-21+00
speechbrain.utils.checkpoints - Deleted checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-25-27+00
speechbrain.utils.epoch_loop - Going into epoch 45


100%|██████████| 38/38 [00:27<00:00,  1.40it/s, train_loss=0.595]
100%|██████████| 19/19 [00:13<00:00,  1.45it/s]

speechbrain.utils.train_logger - epoch: 45, lr_model: 9.49e-05, lr_wav2vec2: 3.16e-06 - train loss: 5.95e-01 - valid loss: 4.52e-01, valid ctc_loss: 4.51e-01, valid PER: 14.22, valid mpd_f1: 4.27e-01





speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-27-13+00
speechbrain.utils.checkpoints - Deleted checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-23-42+00
speechbrain.utils.epoch_loop - Going into epoch 46


100%|██████████| 38/38 [00:26<00:00,  1.41it/s, train_loss=0.603]
100%|██████████| 19/19 [00:13<00:00,  1.38it/s]

speechbrain.utils.train_logger - epoch: 46, lr_model: 9.49e-05, lr_wav2vec2: 3.16e-06 - train loss: 6.03e-01 - valid loss: 4.52e-01, valid ctc_loss: 4.52e-01, valid PER: 14.35, valid mpd_f1: 4.22e-01





speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-28-07+00
speechbrain.utils.epoch_loop - Going into epoch 47


100%|██████████| 38/38 [00:26<00:00,  1.46it/s, train_loss=0.639]
100%|██████████| 19/19 [00:13<00:00,  1.45it/s]

speechbrain.nnet.schedulers - Changing lr from 9.5e-05 to 7.1e-05
speechbrain.nnet.schedulers - Changing lr from 3.2e-06 to 2.4e-06
speechbrain.utils.train_logger - epoch: 47, lr_model: 9.49e-05, lr_wav2vec2: 3.16e-06 - train loss: 6.39e-01 - valid loss: 4.53e-01, valid ctc_loss: 4.53e-01, valid PER: 14.05, valid mpd_f1: 4.19e-01





speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-28-59+00
speechbrain.utils.checkpoints - Deleted checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-28-07+00
speechbrain.utils.checkpoints - Deleted checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-26-21+00
speechbrain.utils.epoch_loop - Going into epoch 48


100%|██████████| 38/38 [00:26<00:00,  1.44it/s, train_loss=0.584]
100%|██████████| 19/19 [00:12<00:00,  1.47it/s]

speechbrain.utils.train_logger - epoch: 48, lr_model: 7.12e-05, lr_wav2vec2: 2.37e-06 - train loss: 5.84e-01 - valid loss: 4.47e-01, valid ctc_loss: 4.47e-01, valid PER: 14.29, valid mpd_f1: 4.32e-01





speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-29-52+00
speechbrain.utils.checkpoints - Deleted checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-27-13+00
speechbrain.utils.epoch_loop - Going into epoch 49


100%|██████████| 38/38 [00:26<00:00,  1.45it/s, train_loss=0.569]
100%|██████████| 19/19 [00:13<00:00,  1.45it/s]

speechbrain.utils.train_logger - epoch: 49, lr_model: 7.12e-05, lr_wav2vec2: 2.37e-06 - train loss: 5.69e-01 - valid loss: 4.47e-01, valid ctc_loss: 4.46e-01, valid PER: 14.02, valid mpd_f1: 4.28e-01





speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-30-48+00
speechbrain.utils.checkpoints - Deleted checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-28-59+00
speechbrain.utils.epoch_loop - Going into epoch 50


100%|██████████| 38/38 [00:26<00:00,  1.46it/s, train_loss=0.644]
100%|██████████| 19/19 [00:12<00:00,  1.49it/s]

speechbrain.nnet.schedulers - Changing lr from 7.1e-05 to 5.3e-05
speechbrain.nnet.schedulers - Changing lr from 2.4e-06 to 1.8e-06
speechbrain.utils.train_logger - epoch: 50, lr_model: 7.12e-05, lr_wav2vec2: 2.37e-06 - train loss: 6.44e-01 - valid loss: 4.48e-01, valid ctc_loss: 4.47e-01, valid PER: 13.92, valid mpd_f1: 4.28e-01





speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-31-38+00
speechbrain.utils.checkpoints - Deleted checkpoint in results/hubert-base_ctc/save/CKPT+2023-11-22+13-30-48+00


In [None]:
# Test: evaluate on the test set. min_key="PER" is passed so that the
# checkpoint is selected by lowest PER (the checkpoints are saved with a "PER"
# meta value in ASR.on_stage_end). The results are written to
# hparams.wer_file and hparams.mpd_file by ASR.on_stage_end.
asr_brain.evaluate(
    test_data,
    min_key="PER",
    test_loader_kwargs=hparams["test_dataloader_opts"],
)

speechbrain.utils.checkpoints - Loading a checkpoint from results/hubert-base_ctc/save/CKPT+2023-11-22+13-31-38+00


100%|██████████| 300/300 [02:13<00:00,  2.24it/s]

speechbrain.utils.train_logger - Epoch loaded: 50 - test loss: 5.70e-01, test PER: 15.38, test mpd_f1: 3.45e-01





CTC and PER stats written to file results/hubert-base_ctc//wer.txt
MPD results and stats written to file results/hubert-base_ctc//mpd.txt


0.5704640717059375

In [None]:
# Archive the whole `save` folder into /content/checkpoint_latest.zip.
# As the output below shows, this also includes the cached HuggingFace model
# files under `hubert_checkpoint/`, not only the training checkpoints.
!zip -r /content/checkpoint_latest.zip /content/drive/MyDrive/CS5647_Project/results/hubert-base_ctc/save

  adding: content/drive/MyDrive/CS5647_Project/results/hubert-base_ctc/save/ (stored 0%)
  adding: content/drive/MyDrive/CS5647_Project/results/hubert-base_ctc/save/hubert_checkpoint/ (stored 0%)
  adding: content/drive/MyDrive/CS5647_Project/results/hubert-base_ctc/save/hubert_checkpoint/models--facebook--hubert-large-ls960-ft/ (stored 0%)
  adding: content/drive/MyDrive/CS5647_Project/results/hubert-base_ctc/save/hubert_checkpoint/models--facebook--hubert-large-ls960-ft/blobs/ (stored 0%)
  adding: content/drive/MyDrive/CS5647_Project/results/hubert-base_ctc/save/hubert_checkpoint/models--facebook--hubert-large-ls960-ft/blobs/36ebe8b7c1cc967b3059f0494ae8a1069dd67655 (deflated 35%)
  adding: content/drive/MyDrive/CS5647_Project/results/hubert-base_ctc/save/hubert_checkpoint/models--facebook--hubert-large-ls960-ft/blobs/2cd99ac14fa1cde977c292af904d66aaa607120f (deflated 61%)
  adding: content/drive/MyDrive/CS5647_Project/results/hubert-base_ctc/save/hubert_checkpoint/models--facebook--