# Extraction of participant only audios from manual diarizations

In [6]:
"""
This scripts takes care of diarization of the audio files, via the pyannote-audio library.
First, we extract the segmented speakers in the form of a text file. Then, we determine the 
speaker with the highest speaking time in seconds. We then extract the audio of the speaker 
and save it in a separate file."""

import os
import pandas as pd
import tqdm
from pyannote.audio import Pipeline
import torch
import torchaudio
import soundfile as sf
import numpy as np

from diarization.passwords import AUTH_TOKEN
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1",
                                    use_auth_token=AUTH_TOKEN)
# check if GPU is available
if torch.cuda.is_available():
    pipeline.to(torch.device("cuda"))
else:
    pipeline.to(torch.device("cpu"))

def diarize(audio_file):
    """
    This function takes an audio file as input and returns the diarization object,
    diarization dataframe and a dictionary containing the number of speakers and 
    their speaking time."""
    waveform, sr = torchaudio.load(audio_file) # Audio must be a torch tensor
    diarization = pipeline({'waveform':waveform,'sample_rate':sr})
    diarization_df = pd.DataFrame()

    # Count number of speakers
    speakers = []
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        if speaker not in speakers:
            speakers.append(speaker)

        dur = turn.end-turn.start
        new_row = {'start': int(turn.start * sr), 
                   'end': int(turn.end * sr), 
                   'start [s]': round(turn.start,3), 
                   'end [s]': round(turn.end,3), 
                   'dur [s]' : round(dur,3), 
                   'speaker': speaker}
        diarization_df = pd.concat([diarization_df, pd.DataFrame(new_row, index=[0])], 
                                   ignore_index=True)

    num_speakers = len(speakers)
    speaker_durs = diarization_df.groupby(['speaker'], as_index=False).sum()

    output_dict = dict()
    output_dict['filename'] = audio_file
    output_dict['num_speakers'] = num_speakers

    for _, row in speaker_durs.iterrows():
        output_dict[str(row['speaker'])] = row['dur [s]']

    return diarization, diarization_df, output_dict

def get_longest_speaker(diarization_df):
    """
    This function takes the diarization dataframe as input and returns the speaker
    with the highest speaking time in seconds."""
    speaker_durs = diarization_df.groupby(['speaker'], as_index=False).sum()
    speaker_durs = speaker_durs.sort_values(by='dur [s]', ascending=False)
    longest_speaker = speaker_durs.iloc[0]['speaker']
    return longest_speaker

def extract_audio(audio_file, diarization_df, longest_speaker):
    """
    This function takes the audio file, diarization dataframe and the speaker with the
    highest speaking time as input and deletes any intervention from other speakers.
    Resulting audio should be of equal lenght to the original audio."""
    waveform, sr = sf.read(audio_file)
    new_waveform = np.zeros_like(waveform)
    for _, row in diarization_df.iterrows():
        if row['speaker'] != longest_speaker:
            new_waveform[row['start']:row['end']] = 0   # Delete audio from other speakers
        else:
            new_waveform[row['start']:row['end']] = waveform[row['start']:row['end']]
    output_file = os.path.join(base_dir, 'diarization', f'{audio_file.split("/")[-1][:-4]}_participant.wav')
    sf.write(output_file, new_waveform, sr)


In [31]:
def extract_audio_manual_diarization(base_dir, audio_path, manual_diarization_path):
    """
    This function takes the audio file and the manual diarization file as input and deletes any intervention from other speakers.
    Resulting audio should be of equal lenght to the original audio."""
    waveform, sr = sf.read(audio_path)
    new_waveform = np.zeros_like(waveform)
    with open(manual_diarization_path, 'r', encoding="utf-8") as f:
        for line in f:
            start, end, speaker = line.split()
            speaker = speaker.lower()
            if speaker == "p":
                #keep audio from participant
                start = int(float(start) * sr)
                end = int(float(end) * sr)
                new_waveform[start:end] = waveform[start:end]
            else:
                start = int(float(start) * sr)
                end = int(float(end) * sr)
                new_waveform[start:end] = 0
    output_file = os.path.join(base_dir, f'{audio_path.split("/")[-1][:-4]}_manual_participant.wav')
    sf.write(output_file, new_waveform, sr)

In [35]:
# read der_report.csv

der_report = pd.read_csv("DER_report.csv")
output_dir = "/home/aleph/diariziation_error_rate/filtered_missed_detection"

# iterate over every filename in the "filename" column
for filename in tqdm.tqdm(der_report['filename']):
    # print(filename)
    FOLDER = "/home/aleph/diariziation_error_rate/combined_database/reference2"
    # search for the file in the folder
    for dirpath, dirnames, filenames in os.walk(FOLDER):
            if filename in filenames:
                manual_diarization_path = os.path.join(dirpath, filename)
                audio_path = os.path.join(dirpath, filename[:-4] + ".wav")
                break
            else:
                manual_diarization_path = None
    # check if the file was found

    if not manual_diarization_path:
        #check in different folder
        FOLDER = "/home/aleph/diariziation_error_rate/combined_database/reference"
        for dirpath, dirnames, filenames in os.walk(FOLDER):
            if filename in filenames:
                manual_diarization_path = os.path.join(dirpath, filename)
                audio_path = os.path.join(dirpath, filename[:-4] + ".wav")
                break
            else:
                manual_diarization_path = None
    # check if the file was found
    if not manual_diarization_path:
        print(f"{filename} not found")
        continue

    # call to extract_audio_manual_diarization
    try:
        extract_audio_manual_diarization(output_dir, audio_path, manual_diarization_path) 
    # except LibsndfileError
    except Exception as e:
        print(f"Error with {filename}: {e}")
        continue



 13%|█▎        | 37/276 [00:00<00:05, 41.11it/s]

Error with Slachevsky_EP_15_Intereses.txt: Error opening '/home/aleph/diariziation_error_rate/combined_database/reference2/FA/Slachevsky_EP_15_Intereses.wav': System error.


 83%|████████▎ | 230/276 [00:04<00:00, 47.11it/s]

Error with FONDECYT_SL0349_Lectura.txt: Error opening '/home/aleph/diariziation_error_rate/combined_database/reference/SG/FONDECYT_SL0349_Lectura.wav': System error.


100%|██████████| 276/276 [00:06<00:00, 45.10it/s]

TOTAL not found





# vad the audio

In [18]:
import pandas as pd
from pyannote.audio import Model
from diarization.passwords import AUTH_TOKEN
from pyannote.audio.pipelines import VoiceActivityDetection
from pyannote.audio import Pipeline

model = Model.from_pretrained(
  "pyannote/segmentation-3.0", 
  use_auth_token=AUTH_TOKEN)



def vad_pyannote(audio_path):
  
  HYPER_PARAMETERS = {
    # remove speech regions shorter than that many seconds.
    "min_duration_on": 0.0,
    # fill non-speech regions shorter than that many seconds.
    "min_duration_off": 0.0
  }
  pipeline.instantiate(HYPER_PARAMETERS)
  pipeline = VoiceActivityDetection(segmentation=model)

  vad = pipeline(audio_path)
  diarization_df = pd.DataFrame()
  speakers = []
  sr = 16000
  for turn, _, speaker in vad.itertracks(yield_label=True):
          if speaker not in speakers:
              speakers.append(speaker)

          dur = turn.end-turn.start
          new_row = {'start': int(turn.start * sr), 
                  'end': int(turn.end * sr), 
                  'start [s]': round(turn.start,3), 
                  'end [s]': round(turn.end,3), 
                  'dur [s]' : round(dur,3), 
                  'speaker': "participant"}
          diarization_df = pd.concat([diarization_df, pd.DataFrame(new_row, index=[0])], 
                                  ignore_index=True)
  # save to file
  diarization_df.to_csv(audio_path[:-4] + "_vad.csv", sep=" ", index=False, header=False)

In [19]:
vad_pyannote("/home/aleph/diariziation_error_rate/filtered_missed_detection/Slachevsky_EP_26_Letra_P_manual_participant.wav")

UnboundLocalError: local variable 'pipeline' referenced before assignment