# WAV to Mel-Spectogram Converter

For this code, we are creating multiple mel-spectrograms, saving them to eventually be used for our multiple neural networks/ensembles in order to build a more accurate model

## 1 - Imports

In [1]:
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import gc

import sys

from IPython.display import clear_output
clear_output()

print(sys.executable)

/Library/Developer/CommandLineTools/usr/bin/python3


## 2 - Grab the file paths for all the music files

In [2]:
DATASET_PATH = "../Data/genres_original"
genres = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
file_paths = []
genre_count = []

# Iterate through each genre and load the audio files
for genre in genres:
    genre_path = os.path.join(DATASET_PATH, genre)
    if os.path.isdir(genre_path):
        for file in os.listdir(genre_path):
            if file.endswith(".wav"):
                file_path = os.path.join(genre_path, file)
                try:
                    # Add file path to the list
                    file_paths.append(file_path)
                    # Assign label based on genre index
                    genre_count.append(genres.index(genre))
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")
                    continue

# Print the number of loaded files and labels
print("Files loaded:", len(file_paths))
print("Labels loaded:", len(genre_count))
print("Genres loaded:", len(genres))

genre_count[400], file_paths[400]

Files loaded: 1000
Labels loaded: 1000
Genres loaded: 10


(4, '../Data/genres_original/hiphop/hiphop.00022.wav')

## 3 - Function for creating either 3-second or 30-second .wav Mel-Spectrogram images

In [3]:
def _save_mel_spectrogram_image(melSpectrogram, sr, hop_length, cmap, file_path, genre, duration=30):
    
	print(file_path)

	plt.figure(figsize=(12, 4))
 
	librosa.display.specshow(
		melSpectrogram,
		sr=sr,
		hop_length=hop_length,
		x_axis='time',
		y_axis='mel',
		cmap=cmap
	)

	plt.xlim(0, duration)
	plt.gca().set_axis_off()
	plt.gcf().set_facecolor("none")
 	
	saved_path = os.path.join(
    	"../Images", "Mel Spectrograms", genre, os.path.basename(file_path).replace("wav", "png")
	)

	os.makedirs(os.path.dirname(saved_path), exist_ok=True)

	plt.savefig(saved_path, bbox_inches='tight', pad_inches=0)
	plt.close()
	
	return saved_path

def _save_mel_spectrogram_segmented(melSpectrogram, sr, hop_length, cmap, file_path, genre, num_segments=10):
    
	segments = np.array_split(melSpectrogram, num_segments, axis=1)
	base_filename = os.path.splitext(os.path.basename(file_path))[0]
	saved_paths = []
    
	for segmentIndex, segment in enumerate(segments, start=1):
		plt.figure(figsize=(12/num_segments, 4))
		
		librosa.display.specshow(
			segment,
			sr=sr,
			hop_length=hop_length,
			x_axis='time',
			y_axis='mel',
			cmap=cmap
		)
		
		segment_duration = segment.shape[1] * hop_length / sr
  
		plt.xlim(0, segment_duration)
		plt.gca().set_axis_off()
		plt.gcf().set_facecolor("none")
		saved_path = os.path.join(
			"../Images", "Mel Spectrograms (3 seconds)", genre,
   			f"{base_filename}_segment_{segmentIndex}.png"
		)
  
		os.makedirs(os.path.dirname(saved_path), exist_ok=True)
  
		plt.savefig(saved_path, bbox_inches='tight', pad_inches=0)
		plt.close()
  
		saved_paths.append(saved_path)
  
	return saved_paths  # Return the list of saved paths for segmented mel spectrograms
    

def createMelSpectrogramImages(inputDuration = 30, inputSampleRate = 22050, inputNFFT = 2048, inputHopLength = 512, inputCMAP = 'viridis'):
	try:
		file_paths, genres, genre_count
	except NameError as e:
		print("Error: Ensure that the file paths and genres are defined before calling this function.")
		return

	for i in range(len(file_paths)):
		try:
			y, sr = librosa.load(
				file_paths[i],
    			sr=inputSampleRate,
			)
   
			target_length = inputSampleRate * inputDuration
   
			y = y[:target_length]
   
			if len(y) < target_length:
				y = np.pad(y, (0, target_length - len(y)), mode='constant')
    
			melSpectrogram = librosa.feature.melspectrogram(
				y=y,
				sr=sr,
				n_fft=inputNFFT,
				hop_length=inputHopLength,
				n_mels=128,
				fmax=sr / 2
			)
   
			melSpectrogram_db = librosa.power_to_db(melSpectrogram, ref=np.max)
			melSpectrogram_normalized = librosa.util.normalize(melSpectrogram_db)
   
			if inputDuration == 30:
				savedPath = _save_mel_spectrogram_image(
					melSpectrogram_normalized, sr, inputHopLength, inputCMAP,
					file_paths[i], genres[genre_count[i]]
				)

			elif inputDuration == 3:
				savedPath = _save_mel_spectrogram_segmented(
					melSpectrogram_normalized, sr, inputHopLength, inputCMAP,
					file_paths[i], genres[genre_count[i]]
				)
    
		except Exception as e:
			print(f"Error processing {file_paths[i]}: {e}")
			continue

## 4 - Activate the 3-second or 30-second inputs

In [4]:
createMelSpectrogramImages(
	inputDuration=3,
	inputSampleRate=22050,
	inputNFFT=2048,
	inputHopLength=512,
	inputCMAP='inferno'
)

In [5]:
createMelSpectrogramImages(
	inputDuration=30,
	inputSampleRate=22050,
	inputNFFT=2048,
	inputHopLength=512,
	inputCMAP='inferno'
)

../Data/genres_original/blues/blues.00093.wav
../Data/genres_original/blues/blues.00087.wav
../Data/genres_original/blues/blues.00050.wav
../Data/genres_original/blues/blues.00044.wav
../Data/genres_original/blues/blues.00078.wav
../Data/genres_original/blues/blues.00079.wav
../Data/genres_original/blues/blues.00045.wav
../Data/genres_original/blues/blues.00051.wav
../Data/genres_original/blues/blues.00086.wav
../Data/genres_original/blues/blues.00092.wav
../Data/genres_original/blues/blues.00084.wav
../Data/genres_original/blues/blues.00090.wav
../Data/genres_original/blues/blues.00047.wav
../Data/genres_original/blues/blues.00053.wav
../Data/genres_original/blues/blues.00052.wav
../Data/genres_original/blues/blues.00046.wav
../Data/genres_original/blues/blues.00091.wav
../Data/genres_original/blues/blues.00085.wav
../Data/genres_original/blues/blues.00081.wav
../Data/genres_original/blues/blues.00095.wav
../Data/genres_original/blues/blues.00042.wav
../Data/genres_original/blues/blue

## 5 - Load the images

In [40]:
import os
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import models
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Define genres and constants
genres = ['blues', 'classical', 'country', 'disco', 'hiphop',
          'jazz', 'metal', 'pop', 'reggae', 'rock']
GENRE_TO_INDEX = {genre: index for index, genre in enumerate(genres)}
FILEPATH = os.path.join("..", "Images", "Mel Spectrograms")

# Data loading and augmentation
X = []
y = []

def augmentImage(image):
    image = tf.image.random_brightness(image, max_delta=0.2)
    image = tf.image.random_contrast(image, lower=0.8, upper=1.2)
    return image

for genre in genres:
    genre_directory = os.path.join(FILEPATH, genre)
    if os.path.exists(genre_directory):
        for filename in os.listdir(genre_directory):
            file_path = os.path.join(genre_directory, filename)
            image = tf.io.read_file(file_path)
            image = tf.image.decode_png(image, channels=1)
            image = tf.image.convert_image_dtype(image, tf.float32)  # values in [0, 1]
            image = tf.image.resize(image, [256, 256])
            image = augmentImage(image)
            X.append(image.numpy())
            y.append(GENRE_TO_INDEX[genre])

# Convert to NumPy arrays
X = np.array(X, dtype=np.float32)
y = np.array(y, dtype=np.int32)

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

## 6 - Create the CNN

In [41]:
import os
from sklearn.model_selection import train_test_split
import tensorflow as tf

from tensorflow.keras import models
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization

def create_cnn(input_shape, num_classes):
	model = models.Sequential([
		Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
		MaxPooling2D((2, 2)),
		BatchNormalization(),
  
		Conv2D(64, (3, 3), activation='relu'),
		MaxPooling2D((2, 2)),
		BatchNormalization(),
  
		Conv2D(128, (3, 3), activation='relu'),
		MaxPooling2D((2, 2)),
		BatchNormalization(),
  
		Flatten(),
  
		Dense(256, activation='relu'),
		Dropout(0.5),
  
		Dense(num_classes, activation='softmax')
	])
	
	return model

## 7 - Create and compile the model

In [42]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau

model = create_cnn(input_shape=(256, 256, 1), num_classes=len(genres))
model.compile(
    optimizer=Adam(learning_rate=0.001),
	loss='sparse_categorical_crossentropy',
	metrics=['accuracy']
)

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-6)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


## 8 - Train the model

In [43]:
history = model.fit(
	X_train, y_train,
	validation_data=(X_val, y_val),
	epochs=20,
	batch_size=32,
	callbacks=[reduce_lr],
)

Epoch 1/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 533ms/step - accuracy: 0.2075 - loss: 33.5320 - val_accuracy: 0.1000 - val_loss: 6.7656 - learning_rate: 0.0010
Epoch 2/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 532ms/step - accuracy: 0.2998 - loss: 8.7799 - val_accuracy: 0.1000 - val_loss: 14.1939 - learning_rate: 0.0010
Epoch 3/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 525ms/step - accuracy: 0.2355 - loss: 2.7071 - val_accuracy: 0.1000 - val_loss: 26.1923 - learning_rate: 0.0010
Epoch 4/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 519ms/step - accuracy: 0.2694 - loss: 2.3328 - val_accuracy: 0.1000 - val_loss: 23.3038 - learning_rate: 0.0010
Epoch 5/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 532ms/step - accuracy: 0.2664 - loss: 2.1787 - val_accuracy: 0.1000 - val_loss: 22.5396 - learning_rate: 0.0010
Epoch 6/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

KeyboardInterrupt: 

## Load the images

In [None]:
# get the accuracy and loss from the history
accuracy = history.history['accuracy']
loss = history.history['loss']
val_accuracy = history.history['val_accuracy']
val_loss = history.history['val_loss']
print("Training Accuracy:", accuracy[-1])
print("Validation Accuracy:", val_accuracy[-1])
print("Training Loss:", loss[-1])
print("Validation Loss:", val_loss[-1])

Training Accuracy: 0.4387499988079071
Validation Accuracy: 0.15000000596046448
Training Loss: 1.3710315227508545
Validation Loss: 6.674400806427002


In [44]:
# Direct WAV to Mel-Spectrogram CNN Processing
import os
import librosa
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import models, layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
import matplotlib.pyplot as plt

# 1. Data Loading and Preprocessing Functions
def load_audio_file(file_path, sr=22050, duration=30):
    """Load and preprocess audio file"""
    try:
        # Load audio file
        y, _ = librosa.load(file_path, sr=sr, duration=duration)
        
        # Ensure consistent length
        target_length = sr * duration
        if len(y) < target_length:
            # Pad with zeros if too short
            y = np.pad(y, (0, target_length - len(y)), mode='constant')
        else:
            # Truncate if too long
            y = y[:target_length]
            
        return y
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None

def audio_to_melspectrogram(y, sr=22050, n_fft=2048, hop_length=512, n_mels=128):
    """Convert audio to mel-spectrogram"""
    # Create mel-spectrogram
    mel_spec = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        fmax=sr/2
    )
    
    # Convert to dB scale
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    
    # Normalize to [0, 1] range
    mel_spec_norm = (mel_spec_db - mel_spec_db.min()) / (mel_spec_db.max() - mel_spec_db.min())
    
    return mel_spec_norm

def create_segments(mel_spec, num_segments=10):
    """Split mel-spectrogram into segments"""
    # Split along time axis (axis=1)
    segments = np.array_split(mel_spec, num_segments, axis=1)
    
    # Pad segments to same size if needed
    max_length = max(seg.shape[1] for seg in segments)
    padded_segments = []
    
    for seg in segments:
        if seg.shape[1] < max_length:
            pad_width = max_length - seg.shape[1]
            seg = np.pad(seg, ((0, 0), (0, pad_width)), mode='constant')
        padded_segments.append(seg)
    
    return np.array(padded_segments)

# 2. Dataset Loading
def load_dataset(dataset_path, genres, sr=22050, duration=30, segment=True, num_segments=10):
    """Load entire dataset and convert to mel-spectrograms"""
    X = []
    y = []
    
    print("Loading dataset...")
    
    for genre_idx, genre in enumerate(genres):
        genre_path = os.path.join(dataset_path, genre)
        print(f"Processing {genre}...")
        
        if not os.path.isdir(genre_path):
            continue
            
        for file in os.listdir(genre_path):
            if not file.endswith(".wav"):
                continue
                
            file_path = os.path.join(genre_path, file)
            
            # Load audio
            audio = load_audio_file(file_path, sr=sr, duration=duration)
            if audio is None:
                continue
            
            # Convert to mel-spectrogram
            mel_spec = audio_to_melspectrogram(audio, sr=sr)
            
            if segment:
                # Create segments
                segments = create_segments(mel_spec, num_segments)
                
                # Add each segment as a separate sample
                for seg in segments:
                    # Add channel dimension for CNN
                    X.append(seg[..., np.newaxis])
                    y.append(genre_idx)
            else:
                # Use full mel-spectrogram
                X.append(mel_spec[..., np.newaxis])
                y.append(genre_idx)
    
    return np.array(X), np.array(y)

# 3. CNN Model Architecture
def create_melspec_cnn(input_shape, num_classes):
    """Create CNN for mel-spectrogram classification"""
    model = models.Sequential([
        # First Conv Block
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.25),
        
        # Second Conv Block
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.25),
        
        # Third Conv Block
        layers.Conv2D(128, (3, 3), activation='relu'),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.25),
        
        # Fourth Conv Block
        layers.Conv2D(256, (3, 3), activation='relu'),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.25),
        
        # Dense layers
        layers.GlobalAveragePooling2D(),  # Alternative to Flatten
        layers.Dense(512, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        
        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        
        # Output layer
        layers.Dense(num_classes, activation='softmax')
    ])
    
    return model

# 4. Training Pipeline
def train_model():
    # Dataset configuration
    DATASET_PATH = "../Data/genres_original"
    genres = ['blues', 'classical', 'country', 'disco', 'hiphop', 
              'jazz', 'metal', 'pop', 'reggae', 'rock']
    
    # Audio parameters
    SR = 22050
    DURATION = 30  # seconds
    SEGMENT = True
    NUM_SEGMENTS = 10
    
    # Load dataset
    X, y = load_dataset(
        DATASET_PATH, 
        genres, 
        sr=SR, 
        duration=DURATION, 
        segment=SEGMENT, 
        num_segments=NUM_SEGMENTS
    )
    
    print(f"Dataset loaded: {X.shape}, {y.shape}")
    print(f"Input shape: {X.shape[1:]}")
    
    # Split dataset
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    
    # Create model
    model = create_melspec_cnn(
        input_shape=X.shape[1:], 
        num_classes=len(genres)
    )
    
    # Compile model
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    
    # Print model summary
    model.summary()
    
    # Callbacks
    callbacks = [
        ReduceLROnPlateau(
            monitor='val_loss', 
            factor=0.5, 
            patience=5, 
            min_lr=1e-7,
            verbose=1
        ),
        EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True,
            verbose=1
        )
    ]
    
    # Train model
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=50,
        batch_size=32,
        callbacks=callbacks,
        verbose=1
    )
    
    return model, history

# 5. Evaluation and Visualization
def plot_training_history(history):
    """Plot training history"""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
    
    # Accuracy
    ax1.plot(history.history['accuracy'], label='Training Accuracy')
    ax1.plot(history.history['val_accuracy'], label='Validation Accuracy')
    ax1.set_title('Model Accuracy')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Accuracy')
    ax1.legend()
    
    # Loss
    ax2.plot(history.history['loss'], label='Training Loss')
    ax2.plot(history.history['val_loss'], label='Validation Loss')
    ax2.set_title('Model Loss')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Loss')
    ax2.legend()
    
    plt.tight_layout()
    plt.show()

# 6. Prediction Function
def predict_genre(model, audio_file_path, genres, sr=22050, duration=30):
    """Predict genre of a single audio file"""
    # Load and preprocess audio
    audio = load_audio_file(audio_file_path, sr=sr, duration=duration)
    if audio is None:
        return None
    
    # Convert to mel-spectrogram
    mel_spec = audio_to_melspectrogram(audio, sr=sr)
    
    # Add batch and channel dimensions
    mel_spec = mel_spec[np.newaxis, ..., np.newaxis]
    
    # Predict
    predictions = model.predict(mel_spec)
    predicted_class = np.argmax(predictions[0])
    confidence = predictions[0][predicted_class]
    
    return genres[predicted_class], confidence

# 7. Main execution
if __name__ == "__main__":
    # Train the model
    model, history = train_model()
    
    # Plot results
    plot_training_history(history)
    
    # Save model
    model.save("melspec_genre_classifier.h5")
    
    # Example prediction
    genres = ['blues', 'classical', 'country', 'disco', 'hiphop', 
              'jazz', 'metal', 'pop', 'reggae', 'rock']
    
    # Uncomment to test prediction on a specific file
    # genre, confidence = predict_genre(model, "path/to/test/file.wav", genres)
    # print(f"Predicted genre: {genre} (confidence: {confidence:.2f})")

Loading dataset...
Processing blues...
Processing classical...
Processing country...
Processing disco...
Processing hiphop...
Processing jazz...
Processing metal...
Processing pop...
Processing reggae...
Processing rock...
Dataset loaded: (10000, 128, 130, 1), (10000,)
Input shape: (128, 130, 1)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 212ms/step - accuracy: 0.3007 - loss: 2.3469 - val_accuracy: 0.1000 - val_loss: 5.2492 - learning_rate: 0.0010
Epoch 2/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 200ms/step - accuracy: 0.4859 - loss: 1.5477 - val_accuracy: 0.1370 - val_loss: 8.1934 - learning_rate: 0.0010
Epoch 3/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 195ms/step - accuracy: 0.5859 - loss: 1.2278 - val_accuracy: 0.3255 - val_loss: 2.8286 - learning_rate: 0.0010
Epoch 4/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 196ms/step - accuracy: 0.6374 - loss: 1.0692 - val_accuracy: 0.3550 - val_loss: 2.1747 - learning_rate: 0.0010
Epoch 5/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 198ms/step - accuracy: 0.6894 - loss: 0.9032 - val_accuracy: 0.4975 - val_loss: 1.7170 - learning_rate: 0.0010
Epoch 6/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0

KeyboardInterrupt: 