In [None]:
!pip install speechbrain

Collecting speechbrain
  Downloading speechbrain-0.5.15-py3-none-any.whl (553 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m553.8/553.8 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl (16 kB)
Collecting sentencepiece (from speechbrain)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub (from speechbrain)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Collecting ruamel.yaml>=0.17.28 (from hyperpyyaml->speechbrain)
  Downloading ruamel.yaml-0.17.36-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.9

In [None]:
!pip install textgrid transformers librosa

Collecting textgrid
  Downloading TextGrid-1.5-py3-none-any.whl (10.0 kB)
Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m64.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import sys
import torch
import logging
import speechbrain as sb
from hyperpyyaml import load_hyperpyyaml
import librosa
from tqdm import tqdm
import json
from google.colab import drive, files
import pandas as pd

In [None]:
# Mount Google Drive at /content/drive; the project folder used by the cells
# below lives under MyDrive on that mount.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Logger for this notebook, named after the current module (__name__).
logger = logging.getLogger(__name__)

In [None]:
# Switch the working directory to the project folder on the mounted Drive, then
# read it back and print it to confirm the change.
folder_path = '/content/drive/MyDrive/CS5647_Project'
os.chdir(folder_path)
current_directory = os.getcwd()
print("Current Working Directory after change:", current_directory)

Current Working Directory after change: /content/drive/MyDrive/CS5647_Project


In [None]:
def make_attn_mask(wavs, wav_lens):
    """Build an attention mask marking the non-padded positions of each item.

    Args:
        wavs: padded batch tensor; dim 0 is the batch, dim 1 the padded length.
        wav_lens: relative lengths (i.e. 0-1) of the batch items. shape: (bs, )

    Returns:
        A long tensor with the same shape and device as ``wavs``:
        1 for regular tokens, 0 for padded tokens.
    """
    # Round instead of truncating: ``wav_lens * seq_len`` is a float product and
    # can land just below the true integer length (e.g. 2.9999), in which case
    # a bare ``.long()`` would mask out the last valid position.
    abs_lens = torch.round(wav_lens * wavs.shape[1]).long()
    attn_mask = torch.zeros(wavs.shape, dtype=torch.long, device=wavs.device)
    for i, n_valid in enumerate(abs_lens.tolist()):
        attn_mask[i, :n_valid] = 1
    return attn_mask

In [None]:
# Brain subclass used for inference: forward pass + greedy CTC decoding
class ASR(sb.Brain):
    def compute_forward(self, batch, stage):
        """Run the model on a batch and decode it with greedy CTC.

        Args:
            batch: batch object exposing ``id`` and ``sig`` (signals plus their
                relative lengths); it is moved to ``self.device`` first.
            stage: an ``sb.Stage``; the optional ``augmentation`` hparam is only
                applied when this is ``sb.Stage.TRAIN``.

        Returns:
            (ids, transcriptions): the batch ids and, for each item, one string
            made of the decoded labels joined by single spaces.
        """
        batch = batch.to(self.device)
        ids = batch.id
        wavs, wav_lens = batch.sig

        if stage == sb.Stage.TRAIN and hasattr(self.hparams, "augmentation"):
            wavs = self.hparams.augmentation(wavs, wav_lens)

        # some wav2vec models (e.g. large-lv60) need an attention_mask
        if self.modules.wav2vec2.feature_extractor.return_attention_mask:
            attn_mask = make_attn_mask(wavs, wav_lens)
            feats = self.modules.wav2vec2(wavs, attention_mask=attn_mask)
        else:
            feats = self.modules.wav2vec2(wavs)

        x = self.modules.enc(feats)

        # output layer for ctc log-probabilities
        logits = self.modules.ctc_lin(x)
        p_ctc = self.hparams.log_softmax(logits)
        # Note: sb.decoders.ctc_greedy_decode will also remove padded tokens
        # that is, it returns a list of lists with different lengths
        sequence = sb.decoders.ctc_greedy_decode(
            p_ctc, wav_lens, blank_id=self.hparams.blank_index
        )
        # self.label_encoder is not set by this class; the caller must assign it
        # on the instance before decoding.
        transcriptions = [" ".join(self.label_encoder.decode_ndim(s)) for s in sequence]

        return ids, transcriptions

    def transcribe_dataset(
            self,
            dataset, # Must be obtained from the dataio_function
            min_key, # Metric name forwarded to on_evaluate_start (e.g. "PER")
            loader_kwargs # opts for the dataloading
        ):
        """Decode every batch of ``dataset`` and collect ids and transcripts.

        Args:
            dataset: a ``torch.utils.data.DataLoader``, or a dataset that is
                wrapped into one via ``self.make_dataloader``.
            min_key: passed to ``self.on_evaluate_start(min_key=...)``, which is
                what loads the checkpoint to evaluate.
            loader_kwargs: keyword arguments for ``make_dataloader``; only read
                when ``dataset`` is not already a DataLoader. Not modified.

        Returns:
            (wav_ids, transcripts): two flat lists built by extending with the
            ids / decoded strings of each batch, in iteration order.
        """
        # If dataset isn't a Dataloader, we create it.
        if not isinstance(dataset, torch.utils.data.DataLoader):
            # Work on a copy: the caller's dict (typically an hparams entry)
            # must not gain a "ckpt_prefix" key as a side effect of this call.
            loader_kwargs = {**loader_kwargs, "ckpt_prefix": None}
            dataset = self.make_dataloader(
                dataset, sb.Stage.TEST, **loader_kwargs
            )

        self.on_evaluate_start(min_key=min_key) # We call the on_evaluate_start that will load the best model
        self.modules.eval() # We set the model to eval mode (remove dropout etc)
        self.modules.wav2vec2.model.config.apply_spec_augment = False  # make sure no spec aug applied on wav2vec2

        wav_ids = []
        transcripts = []

        # Now we iterate over the dataset and we simply compute_forward and decode
        with torch.no_grad():
            for batch in tqdm(dataset, dynamic_ncols=True):
                ids, preds = self.compute_forward(batch, stage=sb.Stage.TEST)

                transcripts.extend(preds)
                wav_ids.extend(ids)

        return wav_ids, transcripts


def dataio_prep(hparams):
    """Build the inference dataset and restore the label encoder.

    The dataset is read from the CSV at ``hparams["inference_annotation"]``
    (with ``data_root`` replaced by ``hparams["data_folder_save"]``), sorted by
    ``duration``, and given an audio pipeline that provides ``sig``. The label
    encoder is loaded from ``label_encoder.txt`` under ``hparams["save_folder"]``.

    Returns:
        (inference_data, label_encoder)
    """
    data_root = hparams["data_folder_save"]

    # 1. Dataset: built from the annotation CSV, then sorted by duration.
    unsorted_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["inference_annotation"],
        replacements={"data_root": data_root},
    )
    inference_data = unsorted_data.filtered_sorted(sort_key="duration")

    # 2. Audio pipeline: "wav" path in, "sig" tensor out.
    @sb.utils.data_pipeline.takes("wav")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(wav):
        # Load at the configured sample rate, then run the samples through the
        # wav2vec2 feature extractor (used here for normalization).
        samples, _ = librosa.core.load(wav, sr=hparams["sample_rate"])
        extracted = hparams["wav2vec2"].feature_extractor(
            samples,
            sampling_rate=hparams["sample_rate"],
        )
        return torch.Tensor(extracted.input_values[0])

    sb.dataio.dataset.add_dynamic_item([inference_data], audio_pipeline)

    # 3. Label encoder: not fitted here, only loaded from the saved file.
    label_encoder = sb.dataio.encoder.CTCTextEncoder()
    label_encoder.load(os.path.join(hparams["save_folder"], "label_encoder.txt"))

    # 4. Keys exposed by each dataset item.
    sb.dataio.dataset.set_output_keys([inference_data], ["id", "sig"])

    return inference_data, label_encoder

In [None]:
hparams_file = '/content/drive/MyDrive/CS5647_Project/hparams/transcribe.yaml'

# Load the hyperparameters file (no overrides are passed here)
with open(hparams_file) as fin:
    hparams = load_hyperpyyaml(fin)





In [None]:

# Set up the output folder for this run; the hparams file is handed over for saving.
sb.create_experiment_directory(
    hyperparams_to_save=hparams_file,
    experiment_directory=hparams["output_folder"],
)

# Inference dataset and the label encoder used to turn indices back into labels.
inference_data, label_encoder = dataio_prep(hparams)

# Build the Brain from the modules and checkpointer declared in the hparams.
asr_brain = ASR(
    checkpointer=hparams["checkpointer"],
    hparams=hparams,
    modules=hparams["modules"],
)
# compute_forward reads self.label_encoder, so it has to be attached before decoding.
asr_brain.label_encoder = label_encoder

# Decode the whole dataset; "PER" is the key handed to on_evaluate_start to pick
# the checkpoint.
wav_ids, transcripts = asr_brain.transcribe_dataset(
    inference_data,
    "PER",
    hparams["inference_dataloader_opts"],
)



speechbrain.core - Beginning experiment!
speechbrain.core - Experiment folder: results/wav2vec2-base_ctc
speechbrain.core - 90.6M trainable parameters in ASR
speechbrain.utils.checkpoints - Loading a checkpoint from results/wav2vec2-base_ctc/save/CKPT+2023-10-19+00-55-18+00


100%|██████████| 300/300 [04:27<00:00,  1.12it/s]


In [None]:
import csv
import json

# Read the annotation CSV into a dict keyed by the "ID" column.
# newline="" is what the csv module asks for, so quoted fields that contain
# line breaks are parsed correctly.
csv_data = {}
with open(hparams["inference_annotation"], "r", newline="") as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for row in csv_reader:
        csv_data[row["ID"]] = row

# Attach each prediction to its row under "pred_phns"; ids without a matching
# row are reported instead of being dropped silently.
unmatched_ids = []
for wav_id, transcript in zip(wav_ids, transcripts):
    if wav_id in csv_data:
        csv_data[wav_id]["pred_phns"] = transcript
    else:
        unmatched_ids.append(wav_id)

if unmatched_ids:
    logger.warning(
        "%d predicted ids have no row in %s and were skipped: %s",
        len(unmatched_ids),
        hparams["inference_annotation"],
        unmatched_ids,
    )

# Same id -> row mapping as csv_data (shallow copy), kept under its own name
updated_data = dict(csv_data)

# Save the updated data as a new JSON file
with open(hparams["inference_annotation_saved"], "w") as json_f_save:
    json.dump(updated_data, json_f_save, indent=2)