In [2]:
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

In [3]:
class Config:
 
    DEBUG_MODE = False
    
    OUTPUT_DIR = './working/'
    DATA_ROOT = './Data'
    FS = 32000
    
    # Mel spectrogram parameters
    N_FFT = 1024
    HOP_LENGTH = 512
    N_MELS = 128
    FMIN = 50
    FMAX = 14000
    
    TARGET_DURATION = 5.0
    TARGET_SHAPE = (256, 256)  
    
    N_MAX = 50 if DEBUG_MODE else None  

config = Config()

In [7]:
df = pd.read_csv(config.DATA_ROOT + '/train.csv')
fabio = df[df.author == 'Fabio A. Sarria-S'].copy()

print(f'We have {len(fabio)} Fabio\'s recordings in total')
fabio['collection'].unique()

We have 26 Fabio's recordings in total


array(['CSA'], dtype=object)

In [17]:
df = pd.read_csv(config.DATA_ROOT + '/train.csv')
df = df[df.collection == 'XC'].copy()
df['author'].unique()
#df['collection'].unique() - iNat, XC, CSA

1145

In [7]:
torch.set_num_threads(1)

model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True)

(get_speech_timestamps,
 _, read_audio,
 *_) = utils

sampling_rate = 16000 # also accepts 8000


df = pd.read_csv(config.DATA_ROOT + '/train.csv')
df = df[df.collection == 'iNat'].copy()

author_map = {
    'Paula Caycedo-Rosales | Juan-Pablo López': 'Paula Caycedo-Rosales',
    'Eliana Barona-Cortés | Daniela García-Cobos': 'Eliana Barona-Cortés',
    'Ana María Ospina-Larrea | Daniela Murillo': 'Ana María Ospina-Larrea',
    'Alexandra Butrago-Cardona': 'Alexandra Buitrago-Cardona',
    'Eliana Barona- Cortés': 'Eliana Barona-Cortés',
    'Diego A Gómez-Morales': 'Diego A. Gomez-Morales',
}
author_map_func = lambda x: author_map[x] if x in author_map.keys() else x

df.author = df.author.map(author_map_func)
authors = sorted(df.author.unique())

# Here, I limit the output to 2 authors. Otherwise, the webpage becomes too heavy to load.
# If your are interested, please check the previous version of the notebook!
for author in authors[:]:
    selection = df[df.author == author].copy()
    print(f'We have {len(selection)} CSA recordings by {author} in total')
    
    N = len(selection)
    chunk_len = 0.2 # Chunk len in seconds
    
    for n in range(N):
        # Load the data
        rec = selection.iloc[n]
        fname = config.DATA_ROOT + f'/train_audio/{rec.filename}'
        wav, sr = librosa.load(fname)
    
        # Calculate the sound power
        power = wav ** 2
        
        # Split the data into chunks and sum the energy in every chunk
        chunk = int(chunk_len * sr)
        
        pad = int(np.ceil(len(power) / chunk) * chunk - len(power))
        power = np.pad(power, (0, pad))
        power = power.reshape((-1, chunk)).sum(axis=1)

        speech_timestamps = get_speech_timestamps(torch.Tensor(wav), model)
        segmentation = np.zeros_like(wav)
        for st in speech_timestamps:
            segmentation[st['start']: st['end']] = 20
    
        fig = plt.figure(figsize=(24, 3))
        fig.suptitle(f'{rec.filename} by {rec.author}')
        
        t = np.arange(len(power)) * chunk_len
        plt.plot(t, 10 * np.log10(power), 'b')
        
        t = np.arange(len(segmentation)) / sr
        plt.plot(t, segmentation, 'r')        
        #plt.show()
        
        #display(ipd.Audio(fname))

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to /home/cele/.cache/torch/hub/master.zip


We have 1 CSA recordings by Alejandro Mendoza | Mónica Izquierdo in total
We have 7 CSA recordings by Alexandra Buitrago-Cardona in total
We have 6 CSA recordings by Ana María Ospina-Larrea in total
We have 19 CSA recordings by Angela M. Mendoza-Henao in total


  fig = plt.figure(figsize=(24, 3))


We have 37 CSA recordings by Diego A. Gomez-Morales in total
We have 10 CSA recordings by Eliana Barona-Cortés in total
We have 26 CSA recordings by Fabio A. Sarria-S in total


: 

In [4]:
import numpy as np
import pickle

with open("train_voice_data.pkl", "rb") as fr:
    data = pickle.load(fr)
data

data

{'/kaggle/input/birdclef-2025/train_audio/1139490/CSA36385.ogg': [{'start': 145440,
   'end': 232416},
  {'start': 234016, 'end': 263136},
  {'start': 265760, 'end': 291296},
  {'start': 299040, 'end': 343008},
  {'start': 351776, 'end': 398816},
  {'start': 403488, 'end': 453088},
  {'start': 455712, 'end': 546784},
  {'start': 549408, 'end': 703968},
  {'start': 705568, 'end': 738784},
  {'start': 741408, 'end': 798176},
  {'start': 802336, 'end': 822240},
  {'start': 824864, 'end': 868832},
  {'start': 874016, 'end': 936416},
  {'start': 941600, 'end': 1016288},
  {'start': 1020448, 'end': 1041888},
  {'start': 1047072, 'end': 1096672},
  {'start': 1104928, 'end': 1131488},
  {'start': 1137696, 'end': 1196000},
  {'start': 1202720, 'end': 1251808},
  {'start': 1256480, 'end': 1308640},
  {'start': 1310752, 'end': 1353696},
  {'start': 1357344, 'end': 1434080},
  {'start': 1435680, 'end': 1454560},
  {'start': 1461280, 'end': 1474016},
  {'start': 1489952, 'end': 1504736},
  {'start'