# In [13]:
import os

import librosa
import numpy as np
import soundfile as sf
import torch
from pyannote.audio import Pipeline

def pad_segment_bounds(segments, sr, total_samples, buffer_samples):
    """Convert speech segments given in seconds to buffered sample ranges.

    Args:
        segments: Iterable of ``(start_seconds, end_seconds)`` pairs.
        sr: Sample rate in Hz, used to convert seconds to sample indices.
        total_samples: Length of the audio signal; ranges are clipped to it.
        buffer_samples: Number of context samples added on each side.

    Returns:
        A list of ``(start, end)`` sample-index tuples in input order, each
        clipped to ``[0, total_samples]``. Ranges are not merged, so
        neighbouring ranges may overlap once the buffer has been added.
    """
    return [
        (
            max(0, int(start_s * sr) - buffer_samples),
            min(total_samples, int(end_s * sr) + buffer_samples),
        )
        for start_s, end_s in segments
    ]


def split_into_snippets(y, sample_ranges, snippet_samples):
    """Cut every sample range of ``y`` into fixed-length snippets.

    Each range is walked in steps of ``snippet_samples``. The last chunk of a
    range is zero-padded at the end when it is shorter than a full snippet,
    so no audio inside a range is discarded. Chunks never read past the end
    of their own range. Empty ranges (``start >= end``) yield no snippet.

    Args:
        y: 1-D NumPy array holding the audio samples.
        sample_ranges: Iterable of ``(start, end)`` sample-index pairs.
        snippet_samples: Length of every returned snippet, in samples.

    Returns:
        A list of 1-D arrays, each exactly ``snippet_samples`` long.

    Raises:
        ValueError: If ``snippet_samples`` is not positive.
    """
    if snippet_samples <= 0:
        raise ValueError("snippet_samples must be a positive integer")

    chunks = []
    for start, end in sample_ranges:
        for chunk_start in range(start, end, snippet_samples):
            chunk_end = min(chunk_start + snippet_samples, end)
            chunk = y[chunk_start:chunk_end]
            missing = snippet_samples - len(chunk)
            if missing > 0:
                # Only the final chunk of a range can be short; pad it with
                # silence instead of dropping it.
                chunk = np.pad(chunk, (0, missing), mode='constant')
            chunks.append(chunk)
    return chunks


# The guard keeps the helpers importable without running the model. When the
# file is run as a script or inside a notebook, __name__ is "__main__", so the
# variables assigned below are still module-level globals.
if __name__ == "__main__":
    # Load the audio file; sr=None keeps the file's native sampling rate.
    audio_path = "data/mlpc24_speech_commands/scenes/2_speech_true_Ofen_aus.wav"
    y, sr = librosa.load(audio_path, sr=None)

    # Convert the audio signal to the required format (1, time) as a tensor.
    waveform = torch.tensor(y).unsqueeze(0)  # Add a channel dimension

    # Authenticate with Hugging Face and load the VAD pipeline.
    # SECURITY: the access token must never be committed to source control.
    # Any token that was previously hard-coded here should be revoked.
    token = os.environ.get("HF_TOKEN")
    if not token:
        raise RuntimeError(
            "Set the HF_TOKEN environment variable to a Hugging Face access "
            "token that can read pyannote/voice-activity-detection."
        )
    pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=token)
    if pipeline is None:
        # NOTE(review): some pyannote.audio versions appear to return None
        # instead of raising when the download is refused - confirm.
        raise RuntimeError("Could not load pyannote/voice-activity-detection")

    # Apply VAD to the in-memory audio.
    vad = pipeline({'waveform': waveform, 'sample_rate': sr})

    # Get speech segments
    speech_segments = vad.get_timeline().support()

    # Add buffer around speech segments
    buffer_duration = 0.5  # buffer duration in seconds
    buffer_samples = int(buffer_duration * sr)

    segments_with_buffer = pad_segment_bounds(
        ((segment.start, segment.end) for segment in speech_segments),
        sr,
        len(y),
        buffer_samples,
    )

    # Extract 1.1-second snippets
    snippet_duration = 1.1  # snippet duration in seconds
    snippet_samples = int(snippet_duration * sr)

    snippets = split_into_snippets(y, segments_with_buffer, snippet_samples)

    # Save snippets
    for i, snippet in enumerate(snippets):
        sf.write(f'snippet_{i}.wav', snippet, sr)
