### Music Generator based on Genre using VanillaGAN

In [93]:
# importing necessary libraries

import os
import numpy as np
import librosa
import librosa.display
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

In [94]:
# Path to dataset
DATA_PATH = r'H:\Deep Learning\Music Generator by Genre\GTZAN Data\genres_original'

# Define fixed length for spectrograms (e.g., 128 time frames)
FIXED_LENGTH = 1000

In [95]:
# Function to load and process Mel-Spectrograms for a specific genre
def load_data(genre, sr=22050, n_fft=2048, hop_length=512, n_mels=128):
    genre_path = os.path.join(DATA_PATH, genre)
    mel_spectrograms = []

    for file_name in os.listdir(genre_path):
        file_path = os.path.join(genre_path, file_name)
        if file_name.endswith('.wav'):
            y, _ = librosa.load(file_path, sr=sr)
            mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
            mel_db = librosa.power_to_db(mel, ref=np.max)

            # Fix the shape of spectrograms (padding or truncating)
            if mel_db.shape[1] < FIXED_LENGTH:
                # Pad with zeros if shorter than FIXED_LENGTH
                pad_width = FIXED_LENGTH - mel_db.shape[1]
                mel_db = np.pad(mel_db, ((0, 0), (0, pad_width)), mode='constant')
            else:
                # Truncate if longer than FIXED_LENGTH
                mel_db = mel_db[:, :FIXED_LENGTH]

            mel_spectrograms.append(mel_db)

    return np.array(mel_spectrograms)

In [96]:
def preprocess_data(mel_spectrograms):
    mel_spectrograms = (mel_spectrograms - mel_spectrograms.min()) / (mel_spectrograms.max() - mel_spectrograms.min())
    mel_spectrograms = mel_spectrograms.astype('float32')
    mel_spectrograms = mel_spectrograms[..., np.newaxis]
    return mel_spectrograms

In [97]:
# Load data for a specific genre
genre = "classical"  # Change genre as needed
mel_spectrograms = load_data(genre)
mel_spectrograms = preprocess_data(mel_spectrograms)

print(f'Classical Genre Mel Spectrograms shape: {mel_spectrograms.shape}')

Classical Genre Mel Spectrograms shape: (100, 128, 1000, 1)


In [98]:
# GAN Parameters
latent_dim = 100
input_shape = mel_spectrograms.shape[1:]

print(input_shape)

(128, 1000, 1)


In [99]:
# Building Generator Model

def build_generator(latent_dim):
    model = tf.keras.Sequential([

        # project noise vector to initial feature map
        layers.Dense(512 * 4 *4, input_dim=latent_dim),
        layers.Reshape((4,4,512)),

        # Upsampling layers with increasing resolution
        layers.Conv2DTranspose(256, kernel_size=5, strides=(2, 2), padding='same', activation='relu'),
        layers.BatchNormalization(),

        layers.Conv2DTranspose(128, kernel_size=5, strides=(2, 5), padding='same', activation='relu'),
        layers.BatchNormalization(),

        layers.Conv2DTranspose(64, kernel_size=5, strides=(2, 5), padding='same', activation='relu'),
        layers.BatchNormalization(),

        layers.Conv2DTranspose(32, kernel_size=3, strides=(2, 5), padding='same', activation='relu'),
        layers.BatchNormalization(),


        # Output layer (mel-spectrogram grayscale image)
        layers.Conv2DTranspose(1, kernel_size=5, strides=(2,1), padding='same', activation='tanh')

    ])

    return model

In [100]:
# Building Discriminator Model

def build_discriminator(input_shape):
    model = tf.keras.Sequential([
        # First convolutional layer
        layers.Conv2D(64, kernel_size=5, strides=2, padding='same', input_shape=input_shape),
        layers.LeakyReLU(alpha=0.2),
        layers.Dropout(0.3),

        # Second convolutional layer
        layers.Conv2D(128, kernel_size=5, strides=2, padding='same'),
        layers.LeakyReLU(alpha=0.2),
        layers.Dropout(0.3),

        # Third convolutional layer
        layers.Conv2D(256, kernel_size=5, strides=2, padding='same'),
        layers.LeakyReLU(alpha=0.2),
        layers.Dropout(0.3),

        # Fourth convolutional layer
        layers.Conv2D(512, kernel_size=5, strides=2, padding='same'),
        layers.LeakyReLU(alpha=0.2),
        layers.Dropout(0.3),

        layers.Flatten(),

        # Dense layers for classification
        layers.Dense(1024),
        layers.LeakyReLU(alpha=0.2),
        layers.Dropout(0.5),

        layers.Dense(1, activation='sigmoid')  # Binary classification (real/fake)
    ])

    return model

In [101]:
# Instantiate Generator and Discriminator
generator = build_generator(latent_dim)
discriminator = build_discriminator(input_shape)

# Compile Discriminator
discriminator.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4), 
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), 
    metrics=['accuracy']
)


In [102]:
# Build GAN Model
discriminator.trainable = False
gan_input = layers.Input(shape=(latent_dim,))
fake_image = generator(gan_input)
gan_output = discriminator(fake_image)
gan = tf.keras.Model(gan_input, gan_output)
gan.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4), 
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True)
)

In [103]:
gan.summary()

In [104]:
# Training Function
def train_gan(generator, discriminator, gan, data, epochs, batch_size):
    half_batch = batch_size // 2
    for epoch in range(epochs):
        # Train Discriminator
        idx = np.random.randint(0, data.shape[0], half_batch)
        real_images = tf.convert_to_tensor(data[idx], dtype=tf.float32)
        real_labels = np.ones((half_batch, 1))

        noise = np.random.normal(0, 1, (half_batch, latent_dim))
        fake_images = generator.predict(noise)
        fake_labels = np.zeros((half_batch, 1))

        d_loss_real = discriminator.train_on_batch(real_images, real_labels)
        d_loss_fake = discriminator.train_on_batch(fake_images, fake_labels)

        # Train Generator
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        misleading_labels = np.ones((batch_size, 1))
        g_loss = gan.train_on_batch(noise, misleading_labels)

        # Log losses
        if epoch % 10 == 0:
            print(f"Epoch {epoch}/{epochs} | D Loss: {d_loss_real[0]+d_loss_fake[0]:.4f} | G Loss: {g_loss:.4f}")


In [105]:
# Train the GAN
train_gan(generator, discriminator, gan, mel_spectrograms, epochs=50, batch_size=64)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 613ms/step


AttributeError: 'NoneType' object has no attribute 'update_state'

In [None]:
# Generate New Audio
def generate_audio(generator, latent_dim, sr=22050):
    noise = np.random.normal(0, 1, (1, latent_dim))
    generated_mel = generator.predict(noise)
    generated_mel = generated_mel.squeeze()  # Remove extra dimensions
    generated_mel = (generated_mel * 255).astype(np.uint8)  # Rescale

    # Convert Mel-Spectrogram back to audio
    mel_spectrogram = librosa.feature.inverse.mel_to_audio(generated_mel, sr=sr)
    return mel_spectrogram

# Example: Generate audio
generated_audio = generate_audio(generator, latent_dim)
librosa.output.write_wav('generated_audio.wav', generated_audio, sr=22050)