In [None]:
from IPython.display import display, Audio
from keras.models import Sequential, Model
from keras.layers import Dense, LeakyReLU, BatchNormalization, Reshape, Conv1D, ReLU, Dropout, Flatten, AveragePooling1D, Input

import librosa
import numpy as np
import os
import tensorflow as tf

In [None]:
# Length of the latent noise vector fed to the generator.
NOISE_DIM = 500
# Sample rate (Hz) assumed for every clip, both when sizing clips and when playing audio.
SAMPLE_RATE = 32000
# Clip length in seconds.
DURATION = 5
# Number of samples per clip (SAMPLE_RATE * DURATION = 160000).
AUDIO_DIM = SAMPLE_RATE * DURATION

In [None]:
# Generator Model
def generator(noise_dim, audio_dim):
    """Build the generator network: latent vector -> flat waveform.

    The latent vector is passed through a Dense layer, reshaped to a
    ``(noise_dim, 1)`` sequence and processed by a stack of 'same'-padded
    Conv1D layers. The last convolution emits ``audio_dim // noise_dim``
    channels so that flattening its ``(noise_dim, channels)`` output yields
    exactly ``audio_dim`` values per sample.

    Args:
        noise_dim: Length of the latent input vector (positive int).
        audio_dim: Number of values to generate per sample. Must be a
            positive multiple of ``noise_dim``.

    Returns:
        An uncompiled ``Sequential`` model mapping ``(batch, noise_dim)`` to
        ``(batch, audio_dim)``.

    Raises:
        ValueError: If ``audio_dim`` is not a positive multiple of
            ``noise_dim``.
    """
    # Validate first: the output width is noise_dim * out_channels, so any
    # remainder would silently produce a model of the wrong output size.
    if noise_dim <= 0 or audio_dim <= 0 or audio_dim % noise_dim != 0:
        raise ValueError(
            f"audio_dim ({audio_dim}) must be a positive multiple of "
            f"noise_dim ({noise_dim})"
        )
    # 320 for the notebook defaults (160000 // 500); previously hard-coded.
    out_channels = audio_dim // noise_dim

    model = Sequential()
    model.add(Input(shape=(noise_dim,)))
    model.add(Dense(noise_dim))
    model.add(LeakyReLU(negative_slope=0.01))
    model.add(BatchNormalization(momentum=0.9))
    # Treat the dense output as a 1-channel sequence of length noise_dim.
    model.add(Reshape((noise_dim, 1)))

    # Three identical conv blocks that only differ in width and kernel size.
    for filters, kernel_size in ((16, 20), (32, 25), (64, 50)):
        model.add(Conv1D(filters, kernel_size, padding='same'))
        model.add(ReLU())
        model.add(BatchNormalization(momentum=0.9))
        model.add(Dropout(rate=0.1))

    # Output projection: (noise_dim, out_channels) flattens to audio_dim.
    model.add(Conv1D(out_channels, 100, padding='same'))
    model.add(Dropout(rate=0.3))
    model.add(Flatten())
    return model

# Discriminator Model
def discriminator(audio_dim):
    """Build the discriminator: flat waveform -> single sigmoid score.

    Args:
        audio_dim: Number of values in each input sample.

    Returns:
        An uncompiled ``Sequential`` model taking ``(batch, audio_dim)`` inputs
        and producing one sigmoid output per sample.
    """
    # One row per strided convolution block:
    # (filters, kernel_size, stride, average-pool size or None for no pooling)
    conv_stack = (
        (32, 100, 7, 4),
        (16, 50, 5, None),
        (8, 25, 3, None),
    )

    net = Sequential()
    net.add(Input(shape=(audio_dim,)))
    # Conv1D needs a channel axis, so view the waveform as (audio_dim, 1).
    net.add(Reshape((audio_dim, 1)))

    for n_filters, kernel, stride, pool in conv_stack:
        net.add(Conv1D(n_filters, kernel, strides=stride, padding='valid'))
        net.add(ReLU())
        if pool is not None:
            net.add(AveragePooling1D(pool))
        net.add(BatchNormalization(momentum=0.9))
        net.add(Dropout(rate=0.1))

    # Dense classification head.
    net.add(Flatten())
    net.add(Dense(1024))
    net.add(LeakyReLU(negative_slope=0.01))
    net.add(BatchNormalization(momentum=0.9))
    net.add(Dense(1, activation='sigmoid'))
    return net

In [None]:
# List the GPUs TensorFlow can see, print them, and enable memory growth on each one.
gpu_devices = tf.config.list_physical_devices('GPU')
print(gpu_devices)
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

In [None]:
# Build the generator for the configured latent/audio sizes and print its layer summary.
G = generator(NOISE_DIM, AUDIO_DIM)
G.summary()

In [None]:
# Build the discriminator for AUDIO_DIM-long inputs and print its layer summary.
D = discriminator(AUDIO_DIM)
D.summary()

In [None]:
def load_data(data_dir='./birdclef-2023/train_audio', nb_birds=1):
    """Load fixed-length training clips from the first ``nb_birds`` bird folders.

    Each immediate sub-directory of ``data_dir`` is treated as one bird.
    Every file found under a selected bird folder is decoded with librosa at
    ``SAMPLE_RATE`` and then truncated or zero-padded at the end to exactly
    ``AUDIO_DIM`` samples.

    Args:
        data_dir: Root folder containing one sub-directory per bird.
        nb_birds: How many bird folders to load, taken in sorted (and
            therefore reproducible) name order. ``None`` loads all of them.

    Returns:
        A list of 1-D arrays, each of length ``AUDIO_DIM``. The list is empty
        when ``data_dir`` does not exist or contains no audio.
    """
    clips = []
    if not os.path.isdir(data_dir):
        return clips

    # Only the immediate children are bird folders; sorting makes the
    # selection deterministic (directory listing order is arbitrary).
    bird_dirs = sorted(
        entry for entry in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, entry))
    )

    # TODO: raise nb_birds to train on more than one bird.
    for i, bird_dir in enumerate(bird_dirs[:nb_birds]):
        bird_path = os.path.join(data_dir, bird_dir)
        for root, _, files in os.walk(bird_path):
            for file in sorted(files):
                # Join with the directory actually being walked so files in
                # nested folders resolve to a valid path.
                sound_path = os.path.join(root, file)
                # Decode at the notebook's sample rate; librosa otherwise
                # resamples to its own default, making clips shorter in time
                # than DURATION and wrong-pitched when played at SAMPLE_RATE.
                audio_data, _ = librosa.load(sound_path, sr=SAMPLE_RATE)
                # Size by sample count directly: going through a float
                # duration and back can be off by one sample.
                n_samples = len(audio_data)
                if n_samples >= AUDIO_DIM:
                    audio_data = audio_data[:AUDIO_DIM]
                else:
                    audio_data = np.pad(audio_data, (0, AUDIO_DIM - n_samples), 'constant')
                clips.append(audio_data)
        print(f'bird dir number {i}')
    return clips

In [None]:
# Compile the discriminator for standalone training: binary cross-entropy loss,
# Adam (learning_rate=0.0002, beta_1=0.5), and accuracy as an extra metric.
D.compile(loss='binary_crossentropy',
          optimizer=tf.keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5),
          metrics=['accuracy'])

In [None]:
# Chain the two networks symbolically: latent input -> G -> D.
z = Input(shape=(NOISE_DIM,))
audio = G(z)
validity = D(audio)

In [None]:
# Combined model mapping latent noise to the discriminator's score of the
# generated audio, compiled with its own Adam optimizer.
# NOTE(review): D.trainable is never set to False, so GAN's trainable weights
# include D's. Any optimizer step taken over all of GAN's trainable weights
# therefore also updates the discriminator, not just the generator.
GAN = Model(z, validity)
GAN.compile(loss='binary_crossentropy',
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5))

In [None]:
# Train the GAN
def train(iterations, batch_size, real_data=None):
    """Alternate discriminator and generator updates for ``iterations`` steps.

    Each iteration:
      1. trains ``D`` on one batch of real clips (label 1) and one batch of
         generated clips (label 0);
      2. updates ``G`` only, so that ``D`` scores freshly generated clips as
         real. Gradients flow through ``D`` but are applied solely to ``G``'s
         trainable variables, using the optimizer ``GAN`` was compiled with.

    Args:
        iterations: Number of training iterations.
        batch_size: Number of samples per batch.
        real_data: Optional sequence of equal-length real clips. When omitted,
            the clips are loaded with ``load_data()``.

    Raises:
        ValueError: If there are no real clips to train on.
    """
    # Load the real bird recordings unless the caller already has them.
    if real_data is None:
        real_data = load_data()
    if len(real_data) == 0:
        raise ValueError("No training audio found: real_data is empty")

    # Discriminator targets are the same for every iteration.
    valid = np.ones((batch_size, 1))
    fake = np.zeros((batch_size, 1))

    for iteration in range(iterations):
        # -----------------------
        # Discriminator training
        # -----------------------

        # Sample a batch of latent vectors and generate audio from them.
        # verbose=0 keeps predict() from printing a progress bar every step.
        noise = np.random.normal(0, 1, (batch_size, NOISE_DIM))
        generated_audio = G.predict(noise, verbose=0)

        # Pick a random batch of real clips (sampling with replacement).
        idx = np.random.randint(0, len(real_data), batch_size)
        real_audio = np.array([real_data[i] for i in idx])

        # Train the discriminator on real and generated samples.
        d_loss_real = D.train_on_batch(real_audio, valid)
        d_loss_fake = D.train_on_batch(generated_audio, fake)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        # -----------------------
        # Generator training
        # -----------------------

        # Fresh latent batch for the generator step.
        noise = tf.convert_to_tensor(
            np.random.normal(0, 1, (batch_size, NOISE_DIM)), dtype=tf.float32
        )

        # GAN.train_on_batch would update every trainable weight of the
        # combined model, including the discriminator's, pushing D to call its
        # own fakes real. Take the step manually and apply it to G only.
        with tf.GradientTape() as tape:
            validity = GAN(noise, training=True)
            # Generator target: the discriminator should answer "real" (1).
            g_loss = tf.reduce_mean(
                tf.keras.losses.binary_crossentropy(tf.ones_like(validity), validity)
            )
        g_weights = G.trainable_variables
        g_grads = tape.gradient(g_loss, g_weights)
        GAN.optimizer.apply_gradients(zip(g_grads, g_weights))
        g_loss = float(g_loss)

        # Report progress.
        print(f"Iteration: {iteration}, Discriminator Loss: {d_loss[0]}, Generator Loss: {g_loss}")
        

# Training parameters
iterations = 10
batch_size = 32

# Train the GAN
train(iterations, batch_size)

In [None]:
# Sample a batch of latent vectors (reuses batch_size from the training cell).
noise = np.random.normal(0, 1, (batch_size, NOISE_DIM))
# Generate a batch of audio from the generator
generated_audio = G.predict(noise)

# Play the first generated clip at SAMPLE_RATE.
display(Audio(generated_audio[0], rate=SAMPLE_RATE, autoplay=False))

In [None]:
# Load the real clips (this calls load_data() again, independently of train())
# and play the first one at SAMPLE_RATE.
real_data = load_data()
display(Audio(real_data[0], rate=SAMPLE_RATE, autoplay=False))