In [None]:
import pandas as pd
import os
import webrtcvad
import librosa
import librosa.display
import IPython.display as ipd
from pydub import AudioSegment
import wave
import io
import soundfile as sf
import contextlib

In [None]:
# Audio format constants
TARGET_SAMPLE_RATE = 16_000  # sample rate (Hz) every clip is resampled to
WAVE_TYPE = "PCM_16"

# Input / output directories
READ_PATH = "./Samples_Unprocessed/"
WRITE_PATH = "./Samples_Processed/"

## PREPROCESSING

In [None]:
# Load every file found (recursively) below READ_PATH.
# Each clip is read as mono at its native sample rate and additionally
# resampled to TARGET_SAMPLE_RATE; both versions are kept per entry.
samples = []

for subdir, dirs, files in os.walk(READ_PATH):
    for file in files:
        filepath = os.path.join(subdir, file)

        # sr=None keeps the sample rate stored in the file
        audio_time_series, sample_rate = librosa.load(filepath, sr=None, mono=True)
        resampled = librosa.resample(
            audio_time_series,
            orig_sr=sample_rate,
            target_sr=TARGET_SAMPLE_RATE,
        )

        # Append one record per file
        record = {
            "filename": file,
            "filepath": filepath,
            "audio_time_series": audio_time_series,
            "audio_time_series_16kHZ": resampled,
            "sample_rate": sample_rate,
        }
        samples.append(record)

In [None]:
# Export the resampled clips to WRITE_PATH as WAV files with subtype WAVE_TYPE.
# sf.write does not create missing directories, so make sure the target exists.
os.makedirs(WRITE_PATH, exist_ok=True)

for entry in samples:
    # NOTE: only the bare file name is kept, so equally named files from
    # different sub-folders of the input tree end up at the same output path.
    filepath = os.path.join(WRITE_PATH, entry["filename"])
    sf.write(
        file=filepath,
        data=entry["audio_time_series_16kHZ"],
        samplerate=TARGET_SAMPLE_RATE,
        format="WAV",
        subtype=WAVE_TYPE,
    )

## Testen

In [None]:
# Framing constants, both in milliseconds
FRAME_SIZE = 30  # length of one frame
HOP_SIZE = 10    # step between the starts of consecutive frames

In [None]:
# Load every processed clip below WRITE_PATH and cut it into overlapping frames
# of FRAME_SIZE ms whose start positions are HOP_SIZE ms apart.
samples = []

for subdir, dirs, files in os.walk(WRITE_PATH):
    for file in files:
        # Load the audio (pydub measures length and slice positions in milliseconds)
        filepath = os.path.join(subdir, file)
        song = AudioSegment.from_wav(filepath)

        # Last start position at which a complete frame still fits into the clip;
        # clips shorter than FRAME_SIZE produce an empty range and are skipped.
        last_start = len(song) - FRAME_SIZE
        for start in range(0, last_start + 1, HOP_SIZE):
            part = song[start:start + FRAME_SIZE].get_array_of_samples()

            samples.append({
                "filename": file,
                "filepath": filepath,
                "start_ms": start,
                "part": part,
            })

In [None]:
# Length of the most recently loaded clip divided by 30
len(song) / 30

In [None]:
# Ask WebRTC VAD whether one FRAME_SIZE-ms frame of the most recently loaded
# clip contains speech.
vad = webrtcvad.Vad()

probe_start = 1230  # start of the probed frame in ms
probe_frame = song[probe_start:probe_start + FRAME_SIZE]

# webrtcvad expects the frame as raw PCM bytes; raw_data exposes the segment's
# underlying byte string directly, so no bytearray conversion is needed.
vad.is_speech(probe_frame.raw_data, TARGET_SAMPLE_RATE)

# Silero

In [1]:
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the Silero VAD entry point through torch.hub and bind the two returned
# objects to `model` and `util`.
# force_reload=False reuses the copy already cached by torch.hub instead of
# downloading the repository again on every run; set it to True once if the
# cached copy has to be refreshed.
# trust_repo=False makes torch.hub ask for confirmation unless the repository
# is already on the local trusted list.
model, util = torch.hub.load(
    repo_or_dir='snakers4/silero-vad',
    model='silero_vad',
    force_reload=False,
    trust_repo=False
)

The repository is already trusted.


Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to C:\Users\Jonas/.cache\torch\hub\master.zip


In [3]:
# Download the English example clip and store it as en_example.wav in the
# current working directory.
torch.hub.download_url_to_file(
    url='https://models.silero.ai/vad_models/en.wav',
    dst='en_example.wav',
)

100%|██████████| 1.83M/1.83M [00:00<00:00, 12.1MB/s]


In [3]:
# Total number of elements over all parameter tensors of the model
sum(tensor.numel() for tensor in model.parameters())

180282

In [6]:
# Show a layer-by-layer overview of the model including parameter counts
from torchinfo import summary
summary(model)

Layer (type:depth-idx)                        Param #
VADRNNJITMerge                                --
├─VADRNNJIT: 1-1                              --
│    └─AdaptiveAudioNormalizationNew: 2-1     --
│    └─STFT: 2-2                              --
│    └─Sequential: 2-3                        --
│    │    └─ConvBlock: 3-1                    9,836
│    │    └─Dropout: 3-2                      --
│    └─Sequential: 2-4                        --
│    │    └─Conv1d: 3-3                       272
│    │    └─BatchNorm1d: 3-4                  32
│    │    └─ReLU: 3-5                         --
│    │    └─Sequential: 3-6                   1,184
│    │    └─Conv1d: 3-7                       1,056
│    │    └─BatchNorm1d: 3-8                  64
│    │    └─ReLU: 3-9                         --
│    │    └─Sequential: 3-10                  1,248
│    │    └─Conv1d: 3-11                      1,056
│    │    └─BatchNorm1d: 3-12                 64
│    │    └─ReLU: 3-13                        --

In [None]:
from pthflops import count_ops

# Pyannote