In [None]:
# FileFinder locates each file's audio, even when it lives outside the current folder.
# get_torchaudio_info provides torchaudio.info for each file, which speeds up
# data loading and training.
import torch
import torchmetrics
from pyannote.database import FileFinder
from pyannote.audio.core.io import get_torchaudio_info

preprocessors = {
    "audio": FileFinder(),
    "torchaudio.info": get_torchaudio_info,
}

In [None]:
# Load the AMI protocol that provides the training data, applying the preprocessors.

from pyannote.database import get_protocol

ami = get_protocol(
    "AMI.SpeakerDiarization.only_words",
    preprocessors=preprocessors,
)

In [None]:
# ami.train() iterates over the training files; ami.development() and ami.test()
# do the same for the development (validation) and test subsets.
# The loop below prints the torchaudio info of every training file.

for training_file in ami.train():
    print(get_torchaudio_info(training_file))

## Training a voice activity detection model from scratch

Voice activity detection (VAD) is the task of detecting speech regions in a given audio stream or recording.

We initialize a VAD *task* that describes how the model will be trained:

* `ami` indicates that we will use files available in `ami.train()`.
* `duration=2.` and `batch_size=128` indicate that the model will ingest batches of 128 two-second-long audio chunks.

In [None]:
# Important: when training on NSCC, set num_workers to 5 to fit NSCC's memory allocation.
# Any number of metrics can be passed through `metric`, in the format shown below.

from pyannote.audio.tasks import VoiceActivityDetection

vad_task = VoiceActivityDetection(
    ami,
    duration=2.0,
    batch_size=128,
    num_workers=5,
    metric=(
        torchmetrics.F1Score(threshold=0.5, average="micro"),
        torchmetrics.AUROC(num_classes=2),
    ),
)

We initialize one *model* with the `PyanNet` architecture used [in that paper](https://arxiv.org/abs/2104.04045).  
In particular, we increase the default stride of the initial `sincnet` feature extraction layer to `10`.

The model is also provided with the task (`task=vad_task`) for which it is being trained:

In [None]:
# Build the VAD model for the custom VAD task initialized above.

from pyannote.audio.models.segmentation import PyanNet

vad_model = PyanNet(
    sincnet={"stride": 10},
    task=vad_task,
)

In [None]:
# To change the learning rate, weight decay, or learning-rate scheduler,
# edit the function below.

def configure_optimizers(model):
    """Create the optimizer and learning-rate scheduler for ``model``.

    Args:
        model: object exposing ``parameters()``; those parameters are optimized.

    Returns:
        dict with an ``"optimizer"`` entry (Adam, lr=5e-5) and an
        ``"lr_scheduler"`` entry (cosine annealing with ``T_max=20``).
    """
    adam = torch.optim.Adam(
        model.parameters(),
        lr=5e-5,
        eps=1e-08,
        maximize=False,
        weight_decay=0,
    )
    cosine_schedule = torch.optim.lr_scheduler.CosineAnnealingLR(adam, T_max=20)
    return {"optimizer": adam, "lr_scheduler": cosine_schedule}


In [None]:
# Bind the custom configure_optimizers to this model instance so that it is used
# in place of the model's own method.

from types import MethodType

vad_model.configure_optimizers = MethodType(
    configure_optimizers,
    vad_model,
)

Now that everything is ready, let's train with `pytorch-lightning`!

In [None]:
import pytorch_lightning as pl

# `gpus=` was deprecated in PyTorch Lightning 1.7 and removed in 2.0;
# `accelerator="gpu", devices=1` selects the same single GPU.
trainer = pl.Trainer(accelerator="gpu", devices=1, max_epochs=1)
# The model was built with its task attached, so no dataloader is passed here.
trainer.fit(vad_model)

Once trained, the model can be applied to a test file:

In [None]:
# Take the first file yielded by the protocol's test subset; any audio file
# would work as well, e.g. test_file = "/path/to/test.wav".

test_file = next(ami.test())

Because the model was trained on 2s audio chunks and test files are likely to be much longer than that, we wrap `vad_model` with an `Inference` instance: it takes care of sliding a 2s window over the whole file and aggregating the output of the model.

In [None]:
# Wrap the trained model in an Inference instance so it can process a whole file.
from pyannote.audio import Inference
vad = Inference(vad_model)

# Apply it to the test file; the trailing bare expression displays the result.
vad_probability = vad(test_file)
vad_probability

A perfect voice activity detection output should look like this:

In [None]:
# Reference speech regions derived from the test file's annotation;
# the trailing bare expression displays them.
expected_output = test_file["annotation"].get_timeline().support()
expected_output