In [2]:
from pydub import AudioSegment
import os

orig_folder = "bird_voices"
mod_folder = "bird_voices_mod"
f_format = "mp3"
mp3_files = [f for f in os.listdir(orig_folder) if f.lower().endswith(".mp3")]

for mp3_file in mp3_files:

    f = AudioSegment.from_file(
        os.path.join(orig_folder, mp3_file),
        f_format
    )

    # оставляем часть файла с 3 секунды без голоса диктора
    f = f[3000:]

    f.export(
        os.path.join(mod_folder, mp3_file),
        f_format
    )

print("Успешная обрезка аудио")

Успешная обрезка аудио


In [2]:
import os
import numpy as np
import librosa
from tqdm import tqdm

# === Настройки ===
input_dir = "bird_voices_mod"
output_dir = "mel_data"
segment_length = 4.0        # 4 секунды
overlap = 0.5               # 50%
sample_rate = 16000
n_mels = 64

os.makedirs(output_dir, exist_ok=True)

def process_audio(file_path, label):
    # --- Загружаем аудио ---
    y, sr = librosa.load(file_path, sr=sample_rate, mono=True)

    # --- Нормализация громкости ---
    y = librosa.util.normalize(y)

    # --- Параметры сегментации ---
    step = int(segment_length * sample_rate * (1 - overlap))  # шаг сдвига
    seg_len = int(segment_length * sample_rate)

    total_len = len(y)
    if total_len < seg_len:
        y = np.pad(y, (0, seg_len - total_len))  # добиваем до 4 секунд

    num_segments = max(1, (total_len - seg_len) // step + 1)

    # --- Папка для текущей птицы ---
    label_dir = os.path.join(output_dir, label)
    os.makedirs(label_dir, exist_ok=True)

    # --- Преобразование в мел-спектрограммы ---
    for i in range(num_segments):
        start = i * step
        end = start + seg_len
        segment = y[start:end]

        # Мел-спектрограмма
        mel = librosa.feature.melspectrogram(
            y=segment,
            sr=sample_rate,
            n_mels=n_mels,
            fmax=8000
        )

        mel_db = librosa.power_to_db(mel, ref=np.max)

        # Нормализация
        mel_db = (mel_db - np.mean(mel_db)) / (np.std(mel_db) + 1e-6)

        # --- Сохраняем ---
        base = os.path.splitext(os.path.basename(file_path))[0]
        out_path = os.path.join(label_dir, f"{base}_{i}.npy")
        np.save(out_path, mel_db.astype(np.float32))

# === Основной цикл ===
for fname in tqdm(os.listdir(input_dir)):
    if not fname.endswith(".mp3"):
        continue
    label = fname.split('_')[0]  # "sparrow_001.mp3" => "sparrow"
    fpath = os.path.join(input_dir, fname)
    process_audio(fpath, label)

print("✅ Обработка завершена! Все сегменты сохранены в", output_dir)


100%|██████████| 127/127 [00:58<00:00,  2.18it/s]

✅ Обработка завершена! Все сегменты сохранены в mel_data





In [3]:
import numpy as np
import glob
import os

def load_mel_dataset(root_dir):
    X, y = [], []
    labels = sorted(os.listdir(root_dir))
    label_to_idx = {lbl: i for i, lbl in enumerate(labels)}

    for lbl in labels:
        files = glob.glob(os.path.join(root_dir, lbl, "*.npy"))
        for f in files:
            mel = np.load(f)
            X.append(mel)
            y.append(label_to_idx[lbl])

    X = np.array(X)[..., np.newaxis]  # добавляем канал
    y = np.array(y)
    return X, y

X, y = load_mel_dataset("mel_data")
print(X.shape, y.shape)


(5023, 64, 126, 1) (5023,)


In [None]:
import tensorflow as tf


def pairwise_distance(embeddings, squared=False):
    """Вычислить расстояние между эмбедингами"""
    dot_product = tf.matmul(embeddings, embeddings, transpose_b=True)
    square_norm = tf.linalg.diag_part(dot_product)
    distances = tf.expand_dims(square_norm, 1) - 2.0 * dot_product + tf.expand_dims(square_norm, 0)
    distances = tf.maximum(distances, 0.0)
    if not squared:
        mask = tf.equal(distances, 0.0)
        distances = tf.sqrt(distances + tf.cast(mask, tf.float32) * 1e-16)
    return distances


def triplet_semihard_loss(y_true, y_pred, margin=0.2):
    """Вычислить triplet semi-hard потери"""
    labels = tf.reshape(y_true, [-1])
    pdist_matrix = pairwise_distance(y_pred, squared=True)

    adjacency = tf.equal(tf.expand_dims(labels, 0), tf.expand_dims(labels, 1))

    # Позитивная маска (но исключающая сравнение с самим собой)
    mask_positives = tf.cast(adjacency, tf.float32) - tf.eye(tf.shape(labels)[0])

    # Вычислить положительные расстояния
    pos_dist = tf.multiply(mask_positives, pdist_matrix)
    hardest_pos_dist = tf.reduce_max(pos_dist, axis=1, keepdims=True)

    # Условие: pos < neg < pos + margin
    condition = tf.logical_and(
        pdist_matrix > hardest_pos_dist,
        pdist_matrix < hardest_pos_dist + margin
    )

    mask_condition = tf.cast(condition, tf.float32)

    semi_hard_negatives = pdist_matrix + (1.0 - mask_condition) * 1e12
    semi_hard_neg_dist = tf.reduce_min(semi_hard_negatives, axis=1, keepdims=True)

    # Triplet semi-hard потеря
    loss = tf.maximum(hardest_pos_dist - semi_hard_neg_dist + margin, 0.0)
    return tf.reduce_mean(loss)


In [None]:
import tensorflow as tf

# Проверяем устройство
print(tf.config.list_physical_devices())

# Создание модели (эмбеддера)
def create_embedding_model():
    inp = tf.keras.Input(shape=(64, 126, 1))
    x = tf.keras.layers.Conv2D(32, 3, activation='relu', padding='same')(inp)
    x = tf.keras.layers.MaxPool2D()(x)
    x = tf.keras.layers.Conv2D(64, 3, activation='relu', padding='same')(x)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.Dense(128)(x)

    # нормализация через Lambda-слой
    out = tf.keras.layers.Lambda(lambda t: tf.math.l2_normalize(t, axis=1))(x)

    return tf.keras.Model(inp, out)

embedding_model = create_embedding_model()

# Triplet semi-hard loss - самый оптимальный вариант потерь
loss_fn = triplet_semihard_loss

embedding_model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss=loss_fn
)

embedding_model.fit(
    X, y,
    batch_size=64,       # CPU => лучше маленький batch
    epochs=10
)


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

Epoch 1/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 105ms/step - loss: 0.1243
Epoch 2/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 105ms/step - loss: 0.0936
Epoch 3/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 104ms/step - loss: 0.0930
Epoch 4/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 100ms/step - loss: 0.0911
Epoch 5/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 101ms/step - loss: 0.0917
Epoch 6/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 106ms/step - loss: 0.0848
Epoch 7/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 96ms/step - loss: 0.0791
Epoch 8/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 98ms/step - loss: 0.0795
Epoch 9/10
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 97ms/step - loss: 0.0777
Epoch 10/10
[1m79/79[0m 

<keras.src.callbacks.history.History at 0x2341364d000>