In [1]:
# Import libraries
import os
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Input, BatchNormalization, Add
from tqdm import tqdm
import gc

from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_fscore_support

print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))
print("TensorFlow is using:", tf.test.gpu_device_name())
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPU memory growth enabled.")
    except RuntimeError as e:
        print(e)

2025-02-24 16:07:20.641364: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740442040.715503     826 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740442040.735217     826 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-24 16:07:20.905412: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Num GPUs Available: 1
TensorFlow is using: /device:GPU:0
GPU memory growth enabled.


I0000 00:00:1740442045.101832     826 gpu_process_state.cc:201] Using CUDA malloc Async allocator for GPU: 0
I0000 00:00:1740442045.104478     826 gpu_device.cc:2022] Created device /device:GPU:0 with 5564 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:06:00.0, compute capability: 8.6


In [None]:
# Data Preparation (Only Load Rare-Class Data)
def load_data(spectrogram_dir, label_dir):
    rare_classes = {6}  # Define rare classes
    spectrograms = []
    labels = []

    # Load spectrograms
    spectrogram_files = sorted(os.listdir(spectrogram_dir))
    label_files = sorted(os.listdir(label_dir))

    total_files = len(spectrogram_files)  

    for spec_file, label_file in tqdm(zip(spectrogram_files, label_files), 
                                      total=total_files, 
                                      desc="Loading Rare-Class Data", 
                                      unit="file"):

        spectrogram = np.load(os.path.join(spectrogram_dir, spec_file))  
        label = np.load(os.path.join(label_dir, label_file)).T 

        # Pad or trim labels
        T_spec = spectrogram.shape[0]  
        T_label = label.shape[0]      
        if T_label < T_spec:
            diff = T_spec - T_label
            label = np.pad(label, ((0, diff), (0, 0)), mode='constant')
        elif T_label > T_spec:
            label = label[:T_spec]

        # Filter to only include samples with rare classes
        if np.any(label[:, list(rare_classes)] > 0):  
            spectrograms.append(spectrogram)
            labels.append(label)

    # Convert to numpy arrays 
    if spectrograms:
        X = np.vstack(spectrograms)  
        Y = np.vstack(labels)  
        return X, Y
    else:
        return np.array([]), np.array([])


In [4]:
# Load the base dataset
X_train, Y_train = load_data("spectrograms_reduced", "labels_reduced")
X_train = np.expand_dims(X_train, axis=-1)  
print("Base X_train:", X_train.shape, "Base Y_train:", Y_train.shape)

Loading Rare-Class Data: 100%|██████████| 270/270 [00:02<00:00, 118.50file/s]


Base X_train: (16185, 128, 87, 1) Base Y_train: (16185, 8)


In [None]:
# Load extra rare-class data
def load_rare_data(spectrogram_dir, label_dir, max_samples=65000):
    rare_classes = {6}  # Define rare classes
    rare_spectrograms = []
    rare_labels = []

    # Get file lists
    spectrogram_files = sorted(os.listdir(spectrogram_dir))
    label_files = sorted(os.listdir(label_dir))

    total_files = len(spectrogram_files)
    total_samples = 0  

    for spec_file, label_file in tqdm(zip(spectrogram_files, label_files), 
                                      total=total_files, 
                                      desc="Loading Rare Data", 
                                      unit="file"):

        # Stop if we reach max samples
        if total_samples >= max_samples:
            break

        spectrogram = np.load(os.path.join(spectrogram_dir, spec_file))  
        label = np.load(os.path.join(label_dir, label_file)).T  

        # Pad or trim labels to match spectrogram length
        T_spec = spectrogram.shape[0]  
        T_label = label.shape[0]      
        if T_label < T_spec:
            diff = T_spec - T_label
            label = np.pad(label, ((0, diff), (0, 0)), mode='constant')
        elif T_label > T_spec:
            label = label[:T_spec]

        # Filter to only include samples with rare classes
        if np.any(label[:, list(rare_classes)] > 0): 
            rare_spectrograms.append(spectrogram)
            rare_labels.append(label)

            total_samples += T_spec

    # Convert to numpy arrays
    if rare_spectrograms:
        X_rare = np.vstack(rare_spectrograms)
        Y_rare = np.vstack(rare_labels)
        return X_rare[:max_samples], Y_rare[:max_samples] 
    else:
        return np.array([]), np.array([])


In [6]:
X_rare, Y_rare = load_rare_data("spectrograms_reduced_train", "labels_reduced_train")

if X_rare.size > 0:
    X_rare = np.expand_dims(X_rare, axis=-1)
    X_train = np.concatenate([X_train, X_rare], axis=0)
    Y_train = np.concatenate([Y_train, Y_rare], axis=0)
    
# Free up memory
del X_rare, Y_rare
gc.collect()  # Force garbage collection

print("Final X_train:", X_train.shape, "Final Y_train:", Y_train.shape)

Loading Rare Data:  75%|███████▌  | 973/1289 [00:09<00:03, 104.72file/s]


Final X_train: (81185, 128, 87, 1) Final Y_train: (81185, 8)


In [7]:
# Occurrences
class_counts = np.sum(Y_train, axis=0)
total_samples = len(Y_train)

for i, count in enumerate(class_counts):
    print(f"Class {i}: {count} occurrences ({count / total_samples:.2%} of the data)")

Class 0: 60084 occurrences (74.01% of the data)
Class 1: 64248 occurrences (79.14% of the data)
Class 2: 64670 occurrences (79.66% of the data)
Class 3: 47470 occurrences (58.47% of the data)
Class 4: 6410 occurrences (7.90% of the data)
Class 5: 9584 occurrences (11.81% of the data)
Class 6: 38562 occurrences (47.50% of the data)
Class 7: 68203 occurrences (84.01% of the data)


In [8]:
# Define CNN model
model = Sequential([
    Input(shape=(128, 87, 1)),  # Input spectrogram shape

    Conv2D(16, (3, 3), activation='relu', padding='same'),  
    MaxPooling2D((2, 2)),
    Conv2D(32, (3, 3), activation='relu', padding='same'),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu', padding='same'),
    MaxPooling2D((2, 2)),   
  
    Flatten(),
    Dense(128, activation='relu'),  
    Dropout(0.3),  
    Dense(64, activation='relu'),
    Dropout(0.2),
    
    Dense(8, activation='sigmoid')  
])

import tensorflow.keras.backend as K

# Per-class alpha values (increase for rare classes)
class_weights = np.array([0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.8, 0.25])  # Higher for classes 4 & 6

def focal_loss(alpha=class_weights, gamma=1.5):  # More aggressive gamma
    def loss(y_true, y_pred):
        y_pred = K.clip(y_pred, 1e-7, 1 - 1e-7)  # Prevent log(0)

        # Apply class-specific alpha weights
        alpha_tensor = K.constant(alpha, dtype=K.floatx())
        alpha_factor = y_true * alpha_tensor + (1 - y_true) * (1 - alpha_tensor)

        loss = -alpha_factor * (1 - y_pred) ** gamma * y_true * K.log(y_pred) \
               - (1 - alpha_factor) * y_pred ** gamma * (1 - y_true) * K.log(1 - y_pred)

        return K.mean(loss)
    return loss

# Compile the model with aggressive focal loss
model.compile(optimizer='adam', loss=focal_loss(), metrics=['binary_accuracy'])
model.summary()

I0000 00:00:1740304365.157668     606 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5564 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:06:00.0, compute capability: 8.6


In [9]:
# Train model
history = model.fit(X_train, Y_train, batch_size=128, epochs=10, verbose=1)

# Final loss & accuracy
final_loss = history.history['loss'][-1]
final_accuracy = history.history['binary_accuracy'][-1]
print(f"Final Loss: {final_loss:.4f}, Final Accuracy: {final_accuracy:.4f}")

2025-02-23 01:52:54.059336: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 3616304640 exceeds 10% of free system memory.
2025-02-23 01:53:38.929800: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 3616304640 exceeds 10% of free system memory.


Epoch 1/10


I0000 00:00:1740304444.042829    2216 service.cc:148] XLA service 0x7f13d4006050 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1740304444.043430    2216 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 3070, Compute Capability 8.6
2025-02-23 01:54:04.092176: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1740304444.290018    2216 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m  9/635[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m8s[0m 14ms/step - binary_accuracy: 0.5598 - loss: 0.6200 

I0000 00:00:1740304447.924676    2216 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m635/635[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 17ms/step - binary_accuracy: 0.7863 - loss: 0.0870
Epoch 2/10
[1m635/635[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - binary_accuracy: 0.8463 - loss: 0.0466
Epoch 3/10
[1m635/635[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - binary_accuracy: 0.8569 - loss: 0.0435
Epoch 4/10
[1m635/635[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - binary_accuracy: 0.8653 - loss: 0.0407
Epoch 5/10
[1m635/635[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - binary_accuracy: 0.8724 - loss: 0.0389
Epoch 6/10
[1m635/635[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - binary_accuracy: 0.8772 - loss: 0.0373
Epoch 7/10
[1m635/635[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - binary_accuracy: 0.8825 - loss: 0.0355
Epoch 8/10
[1m635/635[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - binary_accuracy: 0.8869 

In [10]:
# Get predictions
Y_pred = model.predict(X_train)

# Convert to binary (threshold the probabilities)
threshold = 0.5
Y_pred_binary = (Y_pred > threshold).astype(int)

# Calculate metrics
precision = precision_score(Y_train, Y_pred_binary, average='samples', zero_division=0)
recall = recall_score(Y_train, Y_pred_binary, average='samples', zero_division=0)
f1 = f1_score(Y_train, Y_pred_binary, average='samples', zero_division=0)

# Print the metrics
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

2025-02-22 20:28:28.730586: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 3393584640 exceeds 10% of free system memory.
2025-02-22 20:28:30.340236: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 3393584640 exceeds 10% of free system memory.


[1m2381/2381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step
Precision: 0.8449
Recall: 0.9046
F1-Score: 0.8647


In [10]:
X_test, Y_test = load_data("spectrograms_reduced_test", "labels_reduced_test")  
X_test = np.expand_dims(X_test, axis=-1)  

Y_pred_test = model.predict(X_test)
Y_pred_test_binary = (Y_pred_test > 0.5).astype(int)

precision_test = precision_score(Y_test, Y_pred_test_binary, average='samples', zero_division=0)
recall_test = recall_score(Y_test, Y_pred_test_binary, average='samples', zero_division=0)
f1_test = f1_score(Y_test, Y_pred_test_binary, average='samples', zero_division=0)

print(f"Test Precision: {precision_test:.4f}")
print(f"Test Recall: {recall_test:.4f}")
print(f"Test F1-Score: {f1_test:.4f}")

Loading Rare-Class Data: 100%|██████████| 151/151 [00:01<00:00, 99.26file/s] 


[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step
Test Precision: 0.8424
Test Recall: 0.8648
Test F1-Score: 0.8414


In [11]:
# Compute per-class precision, recall, and F1-score
precision_per_class, recall_per_class, f1_per_class, _ = precision_recall_fscore_support(
    Y_test, Y_pred_test_binary, average=None, zero_division=0
)

# Occurrences
class_counts = np.sum(Y_train, axis=0)
total_samples = len(Y_train)

num_classes = Y_test.shape[1]  # Number of instrument classes
for i in range(num_classes):
    print(f"Class {i}: Precision={precision_per_class[i]:.4f}, Recall={recall_per_class[i]:.4f}, "
          f"F1={f1_per_class[i]:.4f} | Occurrences={int(class_counts[i])} ({class_counts[i] / total_samples:.2%})")

Class 0: Precision=0.8388, Recall=0.9521, F1=0.8919 | Occurrences=60084 (74.01%)
Class 1: Precision=0.8280, Recall=0.9806, F1=0.8979 | Occurrences=64248 (79.14%)
Class 2: Precision=0.9617, Recall=0.9513, F1=0.9565 | Occurrences=64670 (79.66%)
Class 3: Precision=0.8583, Recall=0.8793, F1=0.8687 | Occurrences=47470 (58.47%)
Class 4: Precision=0.6978, Recall=0.1074, F1=0.1862 | Occurrences=6410 (7.90%)
Class 5: Precision=0.1818, Recall=0.0433, F1=0.0700 | Occurrences=9584 (11.81%)
Class 6: Precision=0.7641, Recall=0.7213, F1=0.7421 | Occurrences=38562 (47.50%)
Class 7: Precision=0.9820, Recall=0.9864, F1=0.9842 | Occurrences=68203 (84.01%)
