## **Importing Libraries & Parameter Setup**

In [1]:
!pip install pydub
!apt-get install ffmpeg

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.


In [2]:
import os
import h5py
import librosa
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import shutil
import soundfile as sf
import pandas as pd
import random
import h5py
from google.colab import files
from pydub import AudioSegment
from tqdm import tqdm
from tensorflow.image import resize
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers, models, backend as K
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.utils.class_weight import compute_class_weight

In [3]:
from google.colab import drive
drive.mount('/content/drive')

dataset_path = "/content/drive/MyDrive/FMA_DATASET"
audio_path = os.path.join(dataset_path, "fma_medium_wav/fma_medium")
metadata_path = os.path.join(dataset_path, "fma_metadata")

chunk_duration = 4  # saniye
overlap_duration = 2  # saniye
target_sr = 22050
target_shape = (128, 128)
n_mels = 128
fmax = 8000
batch_size = 100

SEED = 42
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## **Dataset Inspect**

In [None]:
tracks_csv_path = os.path.join(metadata_path, "tracks.csv")
tracks_df = pd.read_csv(tracks_csv_path, index_col=0, header=[0, 1])
genre_data = tracks_df[[("track", "genre_top")]].dropna()
genre_data.columns = ["genre_top"]
genre_classes = sorted(genre_data["genre_top"].unique().tolist())
id_to_genre = genre_data.to_dict()["genre_top"]

print(f"🎶 Genre Classes: {genre_classes}")

🎶 Genre Classes: ['Blues', 'Classical', 'Country', 'Easy Listening', 'Electronic', 'Experimental', 'Folk', 'Hip-Hop', 'Instrumental', 'International', 'Jazz', 'Old-Time / Historic', 'Pop', 'Rock', 'Soul-RnB', 'Spoken']


In [4]:
genre_classes = [
    'Blues', 'Classical', 'Country', 'Easy Listening', 'Electronic',
    'Experimental', 'Folk', 'Hip-Hop', 'Instrumental', 'International',
    'Jazz', 'Old-Time / Historic', 'Pop', 'Rock', 'Soul-RnB', 'Spoken'
]

## **Melspectrogram & Augment the Data**

In [None]:
def extract_mel_spectrogram(y, sr, n_mels=128, fmax=8000, shape=(128, 128)):
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, fmax=fmax)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    min_val = mel_spec_db.min()
    max_val = mel_spec_db.max()
    if max_val - min_val == 0:
        mel_spec_norm = np.zeros_like(mel_spec_db)
    else:
        mel_spec_norm = (mel_spec_db - min_val) / (max_val - min_val)
    mel_spec_resized = resize(np.expand_dims(mel_spec_norm, axis=-1), shape)
    return mel_spec_resized

In [None]:
def augment_audio(y, sr):
    if random.random() < 0.3:
        rate = random.uniform(0.8, 1.2)
        y = librosa.effects.time_stretch(y, rate=rate)
    if random.random() < 0.3:
        n_steps = random.randint(-2, 2)
        y = librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)
    if random.random() < 0.3:
        noise_amp = 0.005 * np.random.uniform() * np.amax(y)
        y = y + noise_amp * np.random.normal(size=y.shape[0])
    return y

## **Process Audio**

In [None]:
def process_audio_chunks(y, sr, data, labels, genre, shape=(128, 128)):
    chunk_samples = chunk_duration * sr
    overlap_samples = overlap_duration * sr
    num_chunks = int(np.ceil((len(y) - chunk_samples) / (chunk_samples - overlap_samples))) + 1

    for i in range(num_chunks):
        start = i * (chunk_samples - overlap_samples)
        end = start + chunk_samples
        chunk = y[start:end]

        if len(chunk) < chunk_samples:
            chunk = np.pad(chunk, (0, chunk_samples - len(chunk)), mode='constant')

        spec = extract_mel_spectrogram(chunk, sr, shape=shape)
        data.append(spec)
        labels.append(genre_classes.index(genre))

## **Preprocess Data**

In [None]:
def load_process_and_save_dataset(audio_path, output_path, shape=(128, 128), batch_size=100):
    folders = sorted(os.listdir(audio_path))
    batch_number = 0

    with h5py.File(output_path, 'w') as f:
        for folder in tqdm(folders, desc="Processing Folders"):
            folder_path = os.path.join(audio_path, folder)
            if not os.path.isdir(folder_path):
                continue

            files = [f for f in os.listdir(folder_path) if f.endswith('.wav')]
            for i in range(0, len(files), batch_size):
                batch_files = files[i:i + batch_size]
                data, labels = [], []

                for file in batch_files:
                    file_id = int(file.split('.')[0])
                    if file_id in id_to_genre:
                        try:
                            y, sr = librosa.load(os.path.join(folder_path, file), sr=target_sr)
                            process_audio_chunks(y, sr, data, labels, id_to_genre[file_id], shape=shape)
                        except Exception as e:
                            print(f"⚠️ Error in {file}: {e}")

                # Batch'i diske kaydet
                if data and labels:
                    data_array = np.array(data, dtype=np.float32)
                    labels_array = np.array(labels, dtype=np.int32)
                    labels_encoded_array = to_categorical(labels_array, num_classes=len(genre_classes))

                    f.create_dataset(f'data_{batch_number}', data=data_array)
                    f.create_dataset(f'labels_{batch_number}', data=labels_array)
                    f.create_dataset(f'labels_encoded_{batch_number}', data=labels_encoded_array)
                    batch_number += 1

save_path = "/content/drive/MyDrive/MUSIC GENRE CLASSIFICATION/FMA_DATA/CNN + BILSTM"
output_path = os.path.join(save_path, "processed_data.h5")

load_process_and_save_dataset(audio_path, output_path, shape=target_shape, batch_size=batch_size)

print(f"✅ Veriler başarıyla işlendi ve '{output_path}' dosyasına kaydedildi.")

Processing Folders: 100%|██████████| 156/156 [3:12:22<00:00, 73.99s/it]


✅ Veriler başarıyla işlendi ve '/content/drive/MyDrive/MUSIC GENRE CLASSIFICATION/FMA_DATA/CNN + BILSTM/processed_data.h5' dosyasına kaydedildi.


In [None]:
output_path = "/content/drive/MyDrive/MUSIC GENRE CLASSIFICATION/FMA_DATA/CNN + BILSTM/processed_data.h5"

if os.path.exists(output_path):
    print(f"✅ Dosya mevcut: {output_path}")
else:
    print(f"❌ Dosya bulunamadı: {output_path}")

with h5py.File(output_path, 'r') as f:
    print("📂 Dosya içeriği:")
    for key in f.keys():
        print(f" - {key}: {f[key].shape}, dtype: {f[key].dtype}")

✅ Dosya mevcut: /content/drive/MyDrive/MUSIC GENRE CLASSIFICATION/FMA_DATA/CNN + BILSTM/processed_data.h5
📂 Dosya içeriği:
 - data_0: (1434, 128, 128, 1), dtype: float32
 - data_1: (1422, 128, 128, 1), dtype: float32
 - data_10: (580, 128, 128, 1), dtype: float32
 - data_100: (1436, 128, 128, 1), dtype: float32
 - data_101: (1257, 128, 128, 1), dtype: float32
 - data_102: (1368, 128, 128, 1), dtype: float32
 - data_103: (1439, 128, 128, 1), dtype: float32
 - data_104: (593, 128, 128, 1), dtype: float32
 - data_105: (1444, 128, 128, 1), dtype: float32
 - data_106: (1448, 128, 128, 1), dtype: float32
 - data_107: (28, 128, 128, 1), dtype: float32
 - data_108: (1439, 128, 128, 1), dtype: float32
 - data_109: (1123, 128, 128, 1), dtype: float32
 - data_11: (1442, 128, 128, 1), dtype: float32
 - data_110: (1441, 128, 128, 1), dtype: float32
 - data_111: (213, 128, 128, 1), dtype: float32
 - data_112: (1442, 128, 128, 1), dtype: float32
 - data_113: (1441, 128, 128, 1), dtype: float32
 - dat

## **Model Definition**

In [5]:
h5_file = h5py.File('/content/drive/MyDrive/MUSIC GENRE CLASSIFICATION/FMA_DATA/CNN + BILSTM/processed_data.h5', 'r')

dataset_names = list(h5_file.keys())
batch_numbers = sorted(set(name.split('_')[1] for name in dataset_names))

split_index = int(0.8 * len(batch_numbers))
train_batches = batch_numbers[:split_index]
val_batches = batch_numbers[split_index:]

In [6]:
def h5_data_generator(h5_file, batch_list):
    while True:
        for batch_num in batch_list:
            data = h5_file[f'data_{batch_num}'][:]
            labels = h5_file[f'labels_encoded_{batch_num}'][:]
            yield data, labels

In [7]:
def se_block(input_tensor, ratio=8):
    filters = input_tensor.shape[-1]
    se = layers.GlobalAveragePooling2D()(input_tensor)
    se = layers.Dense(filters // ratio, activation='relu')(se)
    se = layers.Dense(filters, activation='sigmoid')(se)
    se = layers.Reshape((1, 1, filters))(se)
    return layers.Multiply()([input_tensor, se])

In [8]:
class AttentionLayer(layers.Layer):
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.hidden_size = input_shape[-1]
        self.dense = layers.Dense(self.hidden_size, activation='tanh')
        self.softmax = layers.Softmax(axis=1)
        super(AttentionLayer, self).build(input_shape)

    def call(self, inputs):
        scores = self.dense(inputs)
        weights = self.softmax(scores)
        context = layers.Multiply()([inputs, weights])
        return K.sum(context, axis=1)

In [9]:
def build_advanced_deep_model(input_shape=(128, 128, 1), num_classes=16):
    inputs = layers.Input(shape=input_shape)

    # --- CNN Block 1 ---
    x = layers.Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.Conv2D(32, (3, 3), activation='relu', padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = se_block(x)  # SE Block
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Dropout(0.3)(x)

    # --- CNN Block 2 ---
    x = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = se_block(x)  # SE Block
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Dropout(0.3)(x)

    # --- CNN Block 3 ---
    x = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = se_block(x)  # SE Block
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Dropout(0.4)(x)

    # --- CNN Block 4 ---
    x = layers.Conv2D(256, (3, 3), activation='relu', padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Conv2D(256, (3, 3), activation='relu', padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = se_block(x)  # SE Block
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Dropout(0.4)(x)

    # --- CNN Block 5 ---
    x = layers.Conv2D(512, (3, 3), activation='relu', padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Conv2D(512, (3, 3), activation='relu', padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = se_block(x)  # SE Block
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Dropout(0.5)(x)

    # --- Reshape ---
    x = layers.Reshape((-1, x.shape[-1]))(x)  # Dinamik (Batch, Time, Feature)

    # --- BiLSTM ---
    x = layers.Bidirectional(layers.LSTM(256, return_sequences=True))(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)
    x = layers.Dropout(0.5)(x)

    # --- Attention ---
    attention_output = AttentionLayer()(x)

    # --- GAP (Global Average Pooling) ---
    gap_output = layers.GlobalAveragePooling1D()(x)

    # --- Combine Attention + GAP ---
    combined = layers.Concatenate()([attention_output, gap_output])

    # --- Head Ensemble (Multi Dense Head) ---
    head_1 = layers.Dense(256, activation='relu')(combined)
    head_2 = layers.Dense(256, activation='relu')(combined)
    merged_heads = layers.Concatenate()([head_1, head_2])  # Ensemble birleşimi
    merged_heads = layers.Dropout(0.5)(merged_heads)

    # --- Output ---
    outputs = layers.Dense(num_classes, activation='softmax')(merged_heads)

    model = models.Model(inputs=inputs, outputs=outputs)
    return model

## **Model Compile**

In [10]:
model = build_advanced_deep_model(input_shape=(128, 128, 1), num_classes=16)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [11]:
model.summary()

## **Train Model**

In [12]:
output_path = "/content/drive/MyDrive/MUSIC GENRE CLASSIFICATION/FMA_DATA/CNN + BILSTM/processed_data.h5"

total_samples = 0
label_keys = []

with h5py.File(output_path, 'r') as f:
    for key in f.keys():
        if key.startswith('labels_') and not key.startswith('labels_encoded_'):
            label_keys.append(key)
            total_samples += f[key].shape[0]  # Her label'ın sayısı kadar örnek var

print(f"Toplam veri sayısı: {total_samples}")
print(f"Toplam batch sayısı: {len(label_keys)}")

Toplam veri sayısı: 360191
Toplam batch sayısı: 330


In [13]:
split_ratio = 0.8

num_train_samples = int(total_samples * split_ratio)
num_val_samples = total_samples - num_train_samples

print(f"Eğitim örnekleri: {num_train_samples}")
print(f"Doğrulama örnekleri: {num_val_samples}")

Eğitim örnekleri: 288152
Doğrulama örnekleri: 72039


In [14]:
def h5_data_generator(h5_file, batch_list):
    while True:
        for batch_num in batch_list:
            data = h5_file[f'data_{batch_num}'][:]
            labels = h5_file[f'labels_encoded_{batch_num}'][:]
            yield data, labels

In [15]:
output_path = "/content/drive/MyDrive/MUSIC GENRE CLASSIFICATION/FMA_DATA/CNN + BILSTM/processed_data.h5"
h5_file = h5py.File(output_path, 'r')

all_batches = sorted([int(key.split('_')[1]) for key in h5_file.keys() if key.startswith('data_')])

split_index = int(0.8 * len(all_batches))
train_batches = all_batches[:split_index]
val_batches = all_batches[split_index:]

train_generator = h5_data_generator(h5_file, train_batches)
val_generator = h5_data_generator(h5_file, val_batches)

In [None]:
batch_sizes = [8, 16, 32, 64, 128]

for b_size in batch_sizes:
    try:
        print(f"Testing batch size: {b_size}")
        # Modeli compile et
        model.compile(optimizer='adam', loss='categorical_crossentropy')
        # 1 epoch kısa deneme
        model.fit(train_generator, epochs=1, batch_size=b_size, steps_per_epoch=1)
        print(f"Batch size {b_size} çalıştı.")
    except Exception as e:
        print(f"Batch size {b_size} başarısız: {e}")


Testing batch size: 8
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 61s/step - accuracy: 0.0391 - loss: 2.8641
Batch size 8 çalıştı.
Testing batch size: 16
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 57s/step - accuracy: 0.3255 - loss: 2.3509
Batch size 16 çalıştı.
Testing batch size: 32
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 18s/step - accuracy: 0.3287 - loss: 2.3526
Batch size 32 çalıştı.
Testing batch size: 64
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 57s/step - accuracy: 0.0785 - loss: 2.4762
Batch size 64 çalıştı.
Testing batch size: 128




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 56s/step - accuracy: 0.6226 - loss: 1.7642
Batch size 128 çalıştı.


In [16]:
batch_size = 32  # Kendi modeline uygun batch_size kullan!

STEPS_PER_EPOCH = num_train_samples // batch_size
VALIDATION_STEPS = num_val_samples // batch_size

print(f"STEPS_PER_EPOCH: {STEPS_PER_EPOCH}")
print(f"VALIDATION_STEPS: {VALIDATION_STEPS}")

STEPS_PER_EPOCH: 9004
VALIDATION_STEPS: 2251


In [None]:
h5_file = h5py.File('/content/drive/MyDrive/MUSIC GENRE CLASSIFICATION/FMA_DATA/CNN + BILSTM/processed_data.h5', 'r')

num_batches = len([key for key in h5_file.keys() if key.startswith('labels_encoded_')])

print("Toplam batch sayısı:", num_batches)

all_labels = []
for i in range(num_batches):
    labels = h5_file[f'labels_encoded_{i}'][:]
    all_labels.extend(np.argmax(labels, axis=1))

h5_file.close()

# --- Class weights hesapla ---
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(all_labels),
    y=all_labels
)
class_weights = dict(enumerate(class_weights))

print("Class weights:", class_weights)

Toplam batch sayısı: 330
Class weights: {0: 21.039193925233644, 1: 2.5288628959784316, 2: 8.745896464646465, 3: 74.54283940397352, 4: 0.24712593995279653, 5: 0.6938278216112926, 6: 1.0276138905372711, 7: 0.7119975172370169, 8: 1.159930827493817, 9: 1.5325711416706378, 10: 4.062793268363111, 11: 3.0632654102598993, 12: 1.3147192372831864, 13: 0.2201893357720635, 14: 10.09503923766816, 15: 13.257913722025913}


In [17]:
# --- Cosine Annealing Learning Rate ---
initial_lr = 0.001
cosine_decay = tf.keras.optimizers.schedules.CosineDecay(
    initial_learning_rate=initial_lr,
    decay_steps=STEPS_PER_EPOCH * 15,
    alpha=0.01
)

# --- Optimizer ---
optimizer = tf.keras.optimizers.Adam(learning_rate=cosine_decay)

class_weights_tensor = tf.constant([21.039193925233644, 2.5288628959784316, 8.745896464646465, 74.54283940397352,
                                    0.24712593995279653, 0.6938278216112926, 1.0276138905372711, 0.7119975172370169,
                                    1.159930827493817, 1.5325711416706378, 4.062793268363111, 3.0632654102598993,
                                    1.3147192372831864, 0.2201893357720635, 10.09503923766816, 13.257913722025913])

# Custom loss function
def weighted_categorical_crossentropy(y_true, y_pred):
    weights = tf.reduce_sum(class_weights_tensor * y_true, axis=-1)
    unweighted_losses = tf.keras.losses.categorical_crossentropy(y_true, y_pred)
    weighted_losses = unweighted_losses * weights
    return tf.reduce_mean(weighted_losses)

# Modeli compile ederken
model.compile(
    optimizer=optimizer,
    loss=weighted_categorical_crossentropy,
    metrics=['accuracy']
)

# --- Early Stop & Model Checkpoint ---
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1
)
checkpoint = ModelCheckpoint(
    'best_model.h5',
    monitor='val_loss',
    save_best_only=True,
    verbose=1
)

# --- Eğitim ---
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=15,
    callbacks=[early_stop, checkpoint],
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_steps=VALIDATION_STEPS
)

Epoch 1/15
[1m 214/9004[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m30:35:05[0m 13s/step - accuracy: 0.0915 - loss: 3.5238

KeyboardInterrupt: 

In [None]:
history_df = pd.DataFrame(history.history)

csv_save_path = '/content/drive/MyDrive/MUSIC GENRE CLASSIFICATION/FMA_DATA/CNN + BILSTM/training_history.csv'

history_df.to_csv(csv_save_path, index=False)
print(f"Training history saved to {csv_save_path}")

## **Model Save**

In [None]:
model_save_path = '/content/drive/MyDrive/MUSIC GENRE CLASSIFICATION/FMA_DATA/CNN + BILSTM/trained_model.keras'
model.save(model_save_path)
print(f"Model saved to {model_save_path}")

In [None]:
h5_file.close()