In [1]:
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from sklearn.metrics import confusion_matrix, classification_report
import itertools

In [2]:
# --- Configuration ---------------------------------------------------------
# Root folder that holds one sub-directory per genre. It can be overridden
# with the SONG_DATASET_PATH environment variable so the notebook is not tied
# to a single machine; the original location is kept as the default.
DATASET_PATH = os.environ.get("SONG_DATASET_PATH", r"C:\Users\USER\Desktop\Song_dataset")
SR = 22050  # sample rate (Hz) passed to librosa.load
DURATION = 30.0  # seconds of audio read per clip
N_MELS = 128  # number of mel bands
N_FFT = 2048  # FFT window size
HOP_LENGTH = 512  # samples between successive frames
# Number of spectrogram frames every clip is padded or truncated to.
MAX_PAD_LEN = int(np.ceil((DURATION * SR) / HOP_LENGTH))
FEATURES_FILE = 'X.npy'  # on-disk cache of the extracted spectrograms
LABELS_FILE = 'y.npy'  # on-disk cache of the matching genre labels

In [3]:
def extract_mel_spectrogram(file_path, sr=SR, duration=DURATION, n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP_LENGTH, max_pad_len=MAX_PAD_LEN):
    """Load an audio file and return a fixed-width log-mel spectrogram.

    Args:
        file_path: Path of the audio file to load.
        sr: Sample rate passed to ``librosa.load``.
        duration: Maximum number of seconds read from the start of the file.
        n_mels: Number of mel bands.
        n_fft: FFT window size.
        hop_length: Number of samples between successive frames.
        max_pad_len: Number of time frames in the returned spectrogram.

    Returns:
        A 2-D array with ``max_pad_len`` columns holding the mel spectrogram
        in dB relative to the clip's own peak, or ``None`` if the file could
        not be processed (the error is printed).
    """
    try:
        audio, sr = librosa.load(file_path, sr=sr, duration=duration, res_type='kaiser_fast')
        mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        n_frames = mel_spec_db.shape[1]
        if n_frames < max_pad_len:
            # With ref=np.max the loudest bin of the clip is 0 dB, so padding
            # with zeros would append full-volume frames. Pad with the
            # quietest value observed in this clip so the tail reads as silence.
            mel_spec_db = np.pad(
                mel_spec_db,
                pad_width=((0, 0), (0, max_pad_len - n_frames)),
                mode='constant',
                constant_values=mel_spec_db.min(),
            )
        else:
            # Long enough already: keep only the first max_pad_len frames.
            mel_spec_db = mel_spec_db[:, :max_pad_len]

        return mel_spec_db
    except Exception as e:
        # Best-effort extraction: report the failing file and let the caller
        # skip it rather than aborting the whole dataset scan.
        print(f"Error processing {file_path}: {e}")
        return None

In [4]:
# Load cached features when both cache files exist; otherwise walk the dataset
# folder (one sub-directory per genre), extract a spectrogram per audio file,
# and write the cache.
# NOTE: the cache is keyed only on file existence. Delete X.npy / y.npy after
# changing the extraction settings or the dataset, or stale features are reused.
if os.path.exists(FEATURES_FILE) and os.path.exists(LABELS_FILE):
    print("Loading saved features...")
    X = np.load(FEATURES_FILE)
    y = np.load(LABELS_FILE)
else:
    print("Extracting features from audio files. This may take a while...")
    X = []
    y = []

    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore any seeded split made from it) reproducible.
    genres = sorted(d for d in os.listdir(DATASET_PATH) if os.path.isdir(os.path.join(DATASET_PATH, d)))
    print(f"Found genres: {genres}")

    for genre in genres:
        genre_path = os.path.join(DATASET_PATH, genre)
        files = sorted(f for f in os.listdir(genre_path) if f.lower().endswith(('.mp3', '.wav', '.flac', '.ogg')))
        print(f"Processing {len(files)} files in genre '{genre}'")

        for file in files:
            file_path = os.path.join(genre_path, file)
            mel = extract_mel_spectrogram(file_path)
            # Files that failed to decode come back as None and are skipped.
            if mel is not None:
                X.append(mel)
                y.append(genre)

    # Fail fast instead of caching an empty dataset that every later run
    # would silently reload.
    if not X:
        raise RuntimeError(f"No audio features could be extracted from {DATASET_PATH}")

    X = np.array(X)
    y = np.array(y)

    # Save for faster reuse
    np.save(FEATURES_FILE, X)
    np.save(LABELS_FILE, y)
    print(f"Saved features to {FEATURES_FILE} and labels to {LABELS_FILE}")


print("Feature array shape:", X.shape)
print("Labels shape:", y.shape)

Extracting features from audio files. This may take a while...
Found genres: ['melody', 'rap']
Processing 20 files in genre 'melody'
Processing 21 files in genre 'rap'
Saved features to X.npy and labels to y.npy
Feature array shape: (41, 128, 1292)
Labels shape: (41,)


In [5]:
# Encode genre names as integer ids, then as one-hot rows for the softmax output.
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_onehot = to_categorical(y_encoded)

# Add channel dimension for CNN: (samples, n_mels, time_steps, 1).
# Guarded on ndim so re-running this cell does not keep appending axes.
if X.ndim == 3:
    X = X[..., np.newaxis]

# Normalize X to range 0-1. Re-running on already-normalized data is
# effectively a no-op (min is 0, max is ~1).
# NOTE(review): min/max are taken over the full dataset, i.e. before the
# train/test split below.
X_min = X.min()
X_max = X.max()
X = (X - X_min) / (X_max - X_min + 1e-9)  # epsilon avoids division by zero

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y_onehot, test_size=0.2, random_state=42, stratify=y_onehot)


print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

Train shape: (32, 128, 1292, 1) Test shape: (9, 128, 1292, 1)


In [6]:
# Number of output classes = width of the one-hot label matrix; per-sample
# input dimensions = everything after the leading sample axis of X_train.
num_classes = y_onehot.shape[1]
input_shape = X_train.shape[1:]


def build_model(input_shape, num_classes):
    """Build and compile the CNN classifier.

    The network is three Conv2D -> BatchNormalization -> MaxPooling2D ->
    Dropout stages (32, 64 and 128 filters), followed by a 256-unit dense
    layer and a softmax output.

    Args:
        input_shape: Shape of a single input sample, passed to the first
            convolution.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The model, compiled with Adam and categorical cross-entropy.
    """
    # (filters, dropout rate) for each convolutional stage, in order.
    conv_stages = ((32, 0.2), (64, 0.2), (128, 0.3))

    net = Sequential()
    for stage_idx, (n_filters, drop_rate) in enumerate(conv_stages):
        conv_kwargs = {'activation': 'relu', 'padding': 'same'}
        if stage_idx == 0:
            # Only the first layer declares the input shape.
            conv_kwargs['input_shape'] = input_shape
        net.add(Conv2D(n_filters, (3,3), **conv_kwargs))
        net.add(BatchNormalization())
        net.add(MaxPooling2D((2,2)))
        net.add(Dropout(drop_rate))

    # Classifier head.
    for head_layer in (
        Flatten(),
        Dense(256, activation='relu'),
        Dropout(0.4),
        Dense(num_classes, activation='softmax'),
    ):
        net.add(head_layer)

    net.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return net


# Instantiate the network for this dataset's dimensions and print its layer table.
model = build_model(input_shape=input_shape, num_classes=num_classes)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 128, 1292, 32)     320       
                                                                 
 batch_normalization (BatchN  (None, 128, 1292, 32)    128       
 ormalization)                                                   
                                                                 
 max_pooling2d (MaxPooling2D  (None, 64, 646, 32)      0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 64, 646, 32)       0         
                                                                 
 conv2d_1 (Conv2D)           (None, 64, 646, 64)       18496     
                                                                 
 batch_normalization_1 (Batc  (None, 64, 646, 64)      2

In [7]:
# Training hyper-parameters.
EPOCHS = 30
BATCH_SIZE = 16

# Write best_model.h5 only when validation accuracy improves.
checkpoint = ModelCheckpoint(
    'best_model.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
)

# Halve the learning rate after 3 epochs without a val_loss improvement,
# never going below 1e-6.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=1e-6,
    verbose=1,
)

# 20% of the training data is held out for validation during fitting.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
    callbacks=[checkpoint, reduce_lr],
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 7: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 10: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 13: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 16: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 19: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 22: ReduceLROnPlateau reducing learning rate to 7.812500371073838e-06.
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 25: ReduceLROnPlateau reducing learning rate to 3.906250185536919e-06.
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 28: ReduceLROnPlateau reducing learning rate to 1.953