# Fine-tuning ECAPA-TDNN on [CryCeleb2023](https://huggingface.co/spaces/competitions/CryCeleb2023) using [SpeechBrain](https://speechbrain.readthedocs.io)

This notebook should help you get started training your own models for the CryCeleb2023 challenge.

Note that it provides only a basic example, kept minimal for simplicity and speed.

Author: David Budaghyan (Ubenwa)


### Imports

In [1]:
# For Colab - run the following to install SpeechBrain and set up the repo
!pip install speechbrain
# the Ubenwa clone below is commented out; the repository on the next line is cloned instead
#!git clone https://github.com/Ubenwa/cryceleb2023.git
!git clone https://github.com/DineshKPendyala/cryceleb2023.git
# make the cloned repo the working directory for the rest of the notebook
%cd cryceleb2023

Cloning into 'cryceleb2023'...
remote: Enumerating objects: 48, done.[K
remote: Counting objects: 100% (48/48), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 48 (delta 21), reused 33 (delta 13), pack-reused 0 (from 0)[K
Receiving objects: 100% (48/48), 69.97 KiB | 577.00 KiB/s, done.
Resolving deltas: 100% (21/21), done.
/content/cryceleb2023


In [2]:
%%capture
%load_ext autoreload
%autoreload 2

import pathlib
import random

import numpy as np
import pandas as pd
import seaborn as sns
import speechbrain as sb
import torch
from huggingface_hub import hf_hub_download
from hyperpyyaml import load_hyperpyyaml
from IPython.display import display
from speechbrain.dataio.dataio import read_audio, write_audio
from speechbrain.dataio.dataset import DynamicItemDataset
from speechbrain.dataio.encoder import CategoricalEncoder

from crybrain import CryBrain, download_data, download_data_drive

#dataset_path = "data"

In [None]:
# Mount Google Drive at /content/drive using the google.colab helper.
# NOTE(review): the same mount is repeated in the data-download cell further down.
from google.colab import drive
drive.mount('/content/drive')

### Download data

You need to log in to HuggingFace to be able to download the dataset

In [3]:
#from huggingface_hub import notebook_login

#notebook_login()

In [4]:
from crybrain import CryBrain, download_data , download_data_drive


In [5]:
from crybrain import CryBrain, download_data , download_data_drive # Ensure this line is executed before using download_data_drive
from google.colab import drive
# make Google Drive available under /content/drive
drive.mount('/content/drive')
import os
# directory on Drive holding the dataset; later cells read metadata.csv and the audio files relative to it
dataset_path = "/content/drive/MyDrive/huggingface_dataset"
# NOTE(review): judging by the printed output, this appears to skip the download
# when the target directory already exists - confirm in crybrain
download_data_drive(dataset_path)

Mounted at /content/drive
It appears that data is already downloaded. 
If you think it should be re-downloaded, remove /content/drive/MyDrive/huggingface_dataset/ directory and re-run


In [6]:
# read metadata
# load the metadata table; baby_id and chronological_index are read as strings
metadata = pd.read_csv(
    f"{dataset_path}/metadata.csv", dtype={"baby_id": str, "chronological_index": str}
)
# keep an independent copy of the rows belonging to the train split
is_train = metadata["split"] == "train"
train_metadata = metadata.loc[is_train].copy()

# preview the first rows with an enlarged caption, then the summary statistics
caption_style = [{"selector": "caption", "props": [("font-size", "20px")]}]
styled_head = train_metadata.head().style.set_caption("train_metadata")
display(styled_head.set_table_styles(caption_style))
display(train_metadata.describe())

Unnamed: 0,baby_id,period,duration,split,chronological_index,file_name,file_id
45,369,B,1.16,train,0,audio/train/0369/B/0369_B_000.wav,0369_B_000
46,369,B,0.83,train,1,audio/train/0369/B/0369_B_001.wav,0369_B_001
47,369,B,1.32,train,2,audio/train/0369/B/0369_B_002.wav,0369_B_002
48,369,B,0.89,train,3,audio/train/0369/B/0369_B_003.wav,0369_B_003
49,369,B,0.92,train,4,audio/train/0369/B/0369_B_004.wav,0369_B_004


Unnamed: 0,duration
count,18190.0
mean,0.884927
std,0.539273
min,0.08
25%,0.58
50%,0.77
75%,1.05
max,10.34


### Concatenate cry sounds

We are given short cry sounds for each baby. Here we simply concatenate them.

In [None]:
# read every training segment into memory as a numpy array (one array per metadata row)
train_metadata["cry"] = train_metadata.apply(
    lambda row: read_audio(os.path.join(dataset_path,row["file_name"])).numpy(), axis=1
)
# concatenate all segments for each (baby_id, period) group
# NOTE(review): segments are joined in the row order of train_metadata; no explicit
# sort on chronological_index is done here - confirm the CSV is already ordered
manifest_df = pd.DataFrame(
    train_metadata.groupby(["baby_id", "period"])["cry"].agg(lambda x: np.concatenate(x.values)),
    columns=["cry"],
).reset_index()
# all files have 16000 sampling rate
# duration in seconds = number of samples / sampling rate
manifest_df["duration"] = manifest_df["cry"].apply(len) / 16000
# output directory for the concatenated recordings (created if missing)
pathlib.Path(f"{dataset_path}/concatenated_audio_train").mkdir(parents=True,exist_ok=True)
# one wav file per (baby_id, period): <baby_id>_<period>.wav
manifest_df["file_path"] = manifest_df.apply(
    lambda row: f"{dataset_path}/concatenated_audio_train/{row['baby_id']}_{row['period']}.wav",
    axis=1,
)
# write each concatenated signal to its file_path; apply is used only for its side effect
manifest_df.apply(
    lambda row: write_audio(
        filepath=f'{row["file_path"]}', audio=torch.tensor(row["cry"]), samplerate=16000
    ),
    axis=1,
)
# the raw samples are on disk now, so drop them from the manifest
manifest_df = manifest_df.drop(columns=["cry"])
display(manifest_df)
# distribution of the concatenated recording lengths (seconds)
ax = sns.histplot(manifest_df, x="duration")
ax.set_title("Histogram of Concatenated Cry Sound Lengths")

During training, we will extract random cuts of 3-5 seconds from concatenated audio

In [8]:
def create_cut_length_interval(row, cut_length_interval, sample_rate=16000):
    """Compute the valid range of cut lengths, in samples, for one audio file.

    Arguments
    ---------
    row : mapping
        Must provide ``row["duration"]``, the length of the audio in seconds.
    cut_length_interval : tuple
        ``(min_seconds, max_seconds)`` range of lengths we want the cuts to be.
    sample_rate : int
        Samples per second used to convert seconds to samples (default 16000).

    Returns
    -------
    tuple
        ``(min_cut_length, max_cut_length)`` in samples, adjusted so that
        neither bound exceeds the length of the audio. The result is always a
        tuple, whichever branch produced it.
    """
    # the lengths are in seconds, convert them to samples
    min_cut_length, max_cut_length = (
        round(length * sample_rate) for length in cut_length_interval
    )
    cry_length = round(row["duration"] * sample_rate)
    # the cry is no longer than the shortest allowed cut: don't cut, use the whole cry
    if min_cut_length >= cry_length:
        return (cry_length, cry_length)
    # the cry is shorter than the longest allowed cut: cap the upper bound at the cry length
    if max_cut_length >= cry_length:
        return (min_cut_length, cry_length)
    return (min_cut_length, max_cut_length)


# desired range of training cut lengths, in seconds
cut_length_interval = (3, 5)
# per-file valid cut range; despite the column name, the values are sample counts
# (seconds multiplied by the sampling rate inside create_cut_length_interval)
manifest_df["cut_length_interval_in_frames"] = manifest_df.apply(
    lambda row: create_cut_length_interval(row, cut_length_interval=cut_length_interval), axis=1
)

NameError: name 'manifest_df' is not defined

In [None]:
# inspect the manifest, which now carries the cut_length_interval_in_frames column
display(manifest_df)

### Split into train and val

For training a classifier, we can split the data into train/val in any way, as long as val does not contain new classes.

One way to split is to split by period: train on birth recordings and validate on discharge

In [None]:
# we can train on any subset of babies (e.g. to reduce the number of classes, only keep babies with long enough cries, etc)
def get_babies_with_both_recordings(manifest_df):
    """Return the index of baby_ids that have exactly two rows in the manifest.

    The manifest holds one row per (baby_id, period), so two rows means the
    baby has a recording from both periods.
    """
    recordings_per_baby = manifest_df.groupby("baby_id")["period"].count()
    has_two_recordings = recordings_per_baby == 2
    return recordings_per_baby.loc[has_two_recordings].index


# def get_babies_with_a_birth_recording(manifest_df):
#   bool_series = manifest_df.groupby('baby_id')['period'].unique().apply(set(['B']).issubset)
#   baby_ids_with_a_recordings_from_birth = bool_series[bool_series].index
#   return baby_ids_with_a_recordings_from_birth


def split_by_period(row, included_baby_ids):
    """Assign a manifest row to a split based on its recording period.

    Rows whose baby_id is not in `included_baby_ids` are marked "not_used".
    For included babies, period "B" goes to "train" and any other period
    goes to "val".
    """
    # babies outside the selected subset take no part in training or validation
    if row["baby_id"] not in included_baby_ids:
        return "not_used"
    return "train" if row["period"] == "B" else "val"


# only babies recorded in both periods take part in training/validation
babies_with_both_recordings = get_babies_with_both_recordings(manifest_df)
manifest_df["split"] = manifest_df.apply(
    split_by_period, axis=1, included_baby_ids=babies_with_both_recordings
)

# unique identifier of each instance: "<baby_id>_<period>"
manifest_df["id"] = manifest_df["baby_id"] + "_" + manifest_df["period"]
display(manifest_df)

# how many recordings ended up in each split
split_counts = manifest_df["split"].value_counts()
display(split_counts.rename("use_babies_with_both_recordings_and_split_by_period"))

# export the manifest keyed by id; the next cell loads it with DynamicItemDataset.from_json
manifest_by_id = manifest_df.set_index("id")
manifest_by_id.to_json("manifest.json", orient="index")

### Create dynamic datasets

See the SpeechBrain documentation for details on dynamic item datasets.

In [None]:
# create a dynamic dataset from the JSON manifest, only used to create train and val datasets
dataset = DynamicItemDataset.from_json("manifest.json")
baby_id_encoder = CategoricalEncoder()
datasets = {}
# create a dataset for each split
for split in ["train", "val"]:
    # retrieve the desired slice (train or val) and sort by length to minimize amount of padding
    # NOTE(review): the lambda reads the loop variable `split` when it is called; this
    # relies on filtered_sorted applying the filter immediately - confirm
    datasets[split] = dataset.filtered_sorted(
        key_test={"split": lambda value: value == split}, sort_key="duration"
    )  # select_n=100
    # create the baby_id_encoded field
    datasets[split].add_dynamic_item(
        baby_id_encoder.encode_label_torch, takes="baby_id", provides="baby_id_encoded"
    )
    # set visible fields ("sig" is provided by dynamic items registered further down)
    datasets[split].set_output_keys(["id", "baby_id", "baby_id_encoded", "sig"])


# create the signal field for the val split (no chunking)
datasets["val"].add_dynamic_item(sb.dataio.dataio.read_audio, takes="file_path", provides="sig")

# the label encoder will map the baby_ids to target classes 0, 1, 2, ...
# only use the classes which appear in `train`,
baby_id_encoder.update_from_didataset(datasets["train"], "baby_id")


# for reading the train split, we add chunking
def audio_pipeline(file_path, cut_length_interval_in_frames):
    """Load a signal and, if an interval is given, return a random cut of it.

    Arguments
    ---------
    file_path : str
        Path of the audio file, read with ``sb.dataio.dataio.read_audio``.
    cut_length_interval_in_frames : sequence of two ints or None
        Inclusive ``(min, max)`` bounds for the cut length, in samples.
        When ``None``, the whole signal is returned.

    Returns
    -------
    The full signal, or a contiguous slice of it. The slice length is drawn
    with ``random.randint`` from the interval (capped at the signal length),
    and the start index is drawn among the offsets that keep the slice inside
    the signal.
    """
    sig = sb.dataio.dataio.read_audio(file_path)
    if cut_length_interval_in_frames is None:
        return sig
    # never request more samples than the file holds: otherwise the start-index
    # draw below would call random.randint on an empty range and raise ValueError
    cut_length = min(random.randint(*cut_length_interval_in_frames), len(sig))
    # pick the start index of the cut
    left_index = random.randint(0, len(sig) - cut_length)
    # cut the signal
    return sig[left_index : left_index + cut_length]


# create the signal field (with chunking): "sig" for the train split comes from audio_pipeline
datasets["train"].add_dynamic_item(
    audio_pipeline, takes=["file_path", "cut_length_interval_in_frames"], provides="sig"
)

# sanity check: show the first training item with its output keys
display(datasets["train"][0])

### Fine-tune the classifier

Here we use a very basic example that fine-tunes for the number of epochs set by `n_epochs` in the overrides below (100 in this run).

In [None]:
config_filename = "hparams/ecapa_voxceleb_basic.yaml"
# values substituted into the YAML recipe when it is loaded
overrides = {
    "seed": 3011,
    # one output class per baby seen in the train split
    "n_classes": len(baby_id_encoder),
    "experiment_name": "ecapa_voxceleb_ft_basic",
    "bs": 32,
    "n_epochs": 100,
}
# use the GPU when one is visible; fall back to CPU instead of failing on a hard-coded "cuda"
device = "cuda" if torch.cuda.is_available() else "cpu"
run_opts = {"device": device}
###########################################
# Load hyperparameters file with the overrides defined above.
with open(config_filename) as fin:
    hparams = load_hyperpyyaml(fin, overrides)
# Create experiment directory
sb.create_experiment_directory(
    experiment_directory=hparams["experiment_dir"],
    hyperparams_to_save=config_filename,
    overrides=overrides,
)

# Initialize the Brain object to prepare for training.
crybrain = CryBrain(
    modules=hparams["modules"],
    opt_class=hparams["opt_class"],
    hparams=hparams,
    run_opts=run_opts,
    checkpointer=hparams["checkpointer"],
)

# if a pretrained model is specified, load it
if "pretrained_embedding_model" in hparams:
    sb.utils.distributed.run_on_main(hparams["pretrained_embedding_model"].collect_files)
    hparams["pretrained_embedding_model"].load_collected()

# train on the chunked train split and validate on the unchunked val split
crybrain.fit(
    epoch_counter=crybrain.hparams.epoch_counter,
    train_set=datasets["train"],
    valid_set=datasets["val"],
    train_loader_kwargs=hparams["train_dataloader_options"],
    valid_loader_kwargs=hparams["val_dataloader_options"],
)

You can now take `embedding_model.ckpt` from this recipe and use it in `evaluate.ipynb` to verify pairs of cries and submit your results!