In [13]:

import glob
import numpy as np
from scipy.signal import butter, filtfilt
from pydub import AudioSegment
import scipy.io.wavfile as wav
from tqdm import tqdm
from pydub import AudioSegment


In [14]:
# Note: the threshold also depends on the input volume set on the computer.
def _get_voice_onset(signal, threshold = 200, fs=44100, min_time=100):
    '''
    signal : numpy.ndarray
             signal in. Should be the envelope of the raw signal for accurate results
    threshold : int
                Amplitude threshold for voice onset.
                (Threshold = 200 with NYUAD MEG mic at 75% input volume seems to work well)
    fs : int
         Sampling frequency
    min_time : int (ms)
             Time in ms after the threshold is crossed used to calculate
              the median amplitude and decide if it was random burst of noise
              or speech onset.
    '''

    n_above_thresh = int(fs/min_time) # convert time above threshold to number of samples.

    indices_onset = np.where(signal >= threshold)[0] # All indices above threshold
    # Next, find the first index that where the MEDIAN stays above threshold for the next 10ms
    # Not using the MEAN because sensitive to a single extreme value
    # Note 44.1 points per millesconds (for fs=44100)
    # 10ms = 441 points
    for i in indices_onset:
        median_mintime = np.median(np.abs(signal[i:i+n_above_thresh])) # median value in the timewindow of length min_time
        if median_mintime >= threshold:
            idx_onset = i
            onset_time = idx_onset / float(fs) * 1000.0

            return idx_onset, onset_time
    return np.nan, np.nan # if no point exceeds the threshold.
                          # Return "None" instead of None in order to be able to append it to a list later on

In [15]:
#--- Based on Jarne (2017) "Simple empirical algorithm to obtain signal envelope in three steps"
def _get_envelope(signal, fs=44100, N=200, cutoff=2000):
    '''
    signal: input wav (numpy.ndarray)
    fs: sampling frequency
    N: number of samples per chunk (in part (2))
    cutoff: LPF cutoff, the smaller the cuttoff the stronger the filter. (tweek this).
    '''
    # 1) Take the absolute value of the signal
    abs_signal = abs(signal)
    # 2) Seperate into samples of N, and get peak value of each sample.
    chunked_signal = [abs_signal[i:i+N] for i in range(0, len(abs_signal), N)]
    new_signal = []
    for chunk in chunked_signal: #Then for each chunk, replace all values by max value
        max_value = np.max(chunk)
        new_chunk = [max_value for i in range(len(chunk))]
        new_signal.append(new_chunk)
    # new_signal = np.array(new_signal).flatten()
    new_signal = np.array([item for sublist in new_signal for item in sublist]) # flatten list of lists
    # 3) LPF the new_signal (the envelope, not the original signal)
    def FilterSignal(signal_in, fs, cutoff):
        B, A = butter(1, cutoff / (fs / 2.0), btype='low')
        filtered_signal = filtfilt(B, A, signal_in, axis=0)
        return filtered_signal
    filteredSignal = FilterSignal(new_signal, fs, cutoff)

    return filteredSignal

In [18]:
def convert_ogg_to_wav(input_file, output_file):
    '''
    Decode an Ogg audio file and write it out again as a WAV file.

    input_file : path of the .ogg file to read
    output_file : path of the .wav file to write

    Returns None.
    '''
    # Load .ogg file
    audio = AudioSegment.from_ogg(input_file)

    # Export as .wav file. export() hands back the output file object still open;
    # close it so the handle is released before the file is read again.
    exported = audio.export(output_file, format="wav")
    exported.close()


# Source recording (.ogg) and the .wav file the conversion writes
input_file = 'C:/Users/DanielZander/Documents/DuckSoup/Development/pilot_ultimatumtest/audio_preproc/trimed/20-p4p6/i-e86c4940d859745f7abfd3f642401ab5-a-20240911-144740.854-s-mkpsyphysical_main1-n-20-p4p6-u-p6-c-1-audio-dry.ogg'
output_file = 'C:/Users/DanielZander/Documents/DuckSoup/Development/pilot_ultimatumtest/audio_preproc/trimed/20-p4p6/i-e86c4940d859745f7abfd3f642401ab5-a-20240911-144740.854-s-mkpsyphysical_main1-n-20-p4p6-u-p6-c-1-audio-dry.wav'
convert_ogg_to_wav(input_file, output_file)

# Read the converted file back: sampling rate and sample array
fs, signal = wav.read(output_file)

# Envelope of the recording, computed at the file's own sampling rate
filtered_signal = _get_envelope(signal, fs=fs)

# Search the envelope for the speech onset
idx_onset, onset_time = _get_voice_onset(
    filtered_signal,
    threshold=20000000,
    fs=fs,
    min_time=100,
)

# Report the outcome
if np.isnan(onset_time):
    print("No speech onset detected.")
else:
    print(f"Speech onset detected at index {idx_onset} which corresponds to {onset_time:.2f} ms.")




Speech onset detected at index 53994 which corresponds to 1124.88 ms.
