In [1]:
import kagglehub

# Download the latest version of the dataset
path = kagglehub.dataset_download("andradaolteanu/gtzan-dataset-music-genre-classification")

# Print the path where the dataset is saved
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/gtzan-dataset-music-genre-classification


In [2]:
import numpy as np
from scipy.io import wavfile
import librosa

def safe_load_wav(audio_path, duration=30, sr=22050):
    try:
        # Read using scipy
        orig_sr, audio = wavfile.read(audio_path)
        audio = audio.astype(np.float32)

        # Resample if needed
        if orig_sr != sr:
            audio = librosa.resample(audio, orig_sr, sr)

        # Trim or pad to fixed length
        max_len = duration * sr
        if len(audio) > max_len:
            audio = audio[:max_len]
        elif len(audio) < max_len:
            audio = np.pad(audio, (0, max_len - len(audio)))

        return audio, sr
    except Exception as e:
        print(f"Failed to load {audio_path}: {e}")
        return None, None


In [4]:
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tqdm import tqdm

image_dir = "/kaggle/input/gtzan-dataset-music-genre-classification/Data/images_original"
genres = os.listdir(image_dir)

img_height, img_width = 128, 128

X, y = [], []

for genre in genres:
    genre_path = os.path.join(image_dir, genre)
    for img_file in tqdm(os.listdir(genre_path), desc=f"Loading {genre}"):
        try:
            img_path = os.path.join(genre_path, img_file)
            img = load_img(img_path, target_size=(img_height, img_width))
            img_array = img_to_array(img) / 255.0
            X.append(img_array)
            y.append(genre)
        except Exception as e:
            print(f"Skipped {img_path}: {e}")

X = np.array(X)
y = np.array(y)

le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_categorical = to_categorical(y_encoded)

X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)

model = Sequential([
    Input(shape=(img_height, img_width, 3)),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(10, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"✅ Test accuracy: {test_acc:.4f}")


Loading disco: 100%|██████████| 100/100 [00:01<00:00, 97.86it/s]
Loading metal: 100%|██████████| 100/100 [00:00<00:00, 103.96it/s]
Loading reggae: 100%|██████████| 100/100 [00:00<00:00, 108.40it/s]
Loading blues: 100%|██████████| 100/100 [00:00<00:00, 114.65it/s]
Loading rock: 100%|██████████| 100/100 [00:00<00:00, 102.34it/s]
Loading classical: 100%|██████████| 100/100 [00:00<00:00, 114.41it/s]
Loading jazz: 100%|██████████| 99/99 [00:00<00:00, 111.58it/s]
Loading hiphop: 100%|██████████| 100/100 [00:00<00:00, 112.99it/s]
Loading country: 100%|██████████| 100/100 [00:00<00:00, 107.24it/s]
Loading pop: 100%|██████████| 100/100 [00:00<00:00, 113.49it/s]


Epoch 1/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 837ms/step - accuracy: 0.1168 - loss: 3.1837 - val_accuracy: 0.1450 - val_loss: 2.2178
Epoch 2/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 828ms/step - accuracy: 0.2062 - loss: 2.1386 - val_accuracy: 0.2650 - val_loss: 1.9388
Epoch 3/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 819ms/step - accuracy: 0.3666 - loss: 1.8357 - val_accuracy: 0.4200 - val_loss: 1.6244
Epoch 4/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 759ms/step - accuracy: 0.5161 - loss: 1.4317 - val_accuracy: 0.5100 - val_loss: 1.4688
Epoch 5/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 846ms/step - accuracy: 0.6103 - loss: 1.1595 - val_accuracy: 0.5550 - val_loss: 1.3476
Epoch 6/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 762ms/step - accuracy: 0.7015 - loss: 0.9124 - val_accuracy: 0.5500 - val_loss: 1.3055
Epoch 7/50
[1m25/25[