In [1]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras import layers, models
import librosa
from sklearn.metrics import classification_report


2025-11-22 12:04:38.504453: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# List every device TensorFlow can see
print("Dispositivos disponíveis:")
for device in tf.config.list_physical_devices():
    print(device)

# Report the detected GPUs; `gpus` is reused by later cells
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"\nGPUs detectadas ({len(gpus)}):")
    for gpu in gpus:
        print(gpu)

    # Query the details through the public tf.config API instead of
    # tensorflow.python.client.device_lib.list_local_devices(): that call
    # instantiates the GPU devices, which must not happen before the
    # memory-growth option is configured (done in a later cell).
    print("\nDetalhes:")
    for gpu in gpus:
        details = tf.config.experimental.get_device_details(gpu)
        device_name = details.get("device_name", "desconhecido")
        capability = details.get("compute_capability")
        # compute_capability, when reported, is a (major, minor) tuple
        capability_text = ".".join(str(part) for part in capability) if capability else "n/d"
        print(f"{gpu.name}, name: {device_name}, compute capability: {capability_text}")
else:
    print("\nNenhuma GPU detectada pelo TensorFlow. Ele está usando a CPU.")

Dispositivos disponíveis:
PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')
PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')

GPUs detectadas (1):
PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')

Detalhes:
device: 0, name: NVIDIA GeForce RTX 3050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


I0000 00:00:1763813081.902914    4329 gpu_device.cc:2020] Created device /device:GPU:0 with 1732 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


In [3]:
# Enable TensorFlow's memory-growth option on every detected GPU.
# Looping over an empty `gpus` list does nothing, so no separate
# emptiness check is required.
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Report the problem but keep the notebook running
    print(err)

# CNN

In [4]:
class CNN1D(tf.keras.Model):
    """1-D convolutional classifier operating on raw waveforms.

    Architecture:
        * pooled blocks of Conv1D -> BatchNormalization -> MaxPooling1D(4),
          with 16, 32, 64, ... filters (doubling per block);
        * a final Conv1D -> BatchNormalization without pooling;
        * GlobalAveragePooling1D;
        * Dense(ReLU) -> Dropout(0.4) -> Dense(softmax).

    Args:
        input_shape: Kept only for backward compatibility and not used.
            Passing ``input_shape`` to a layer inside a subclassed model has
            no effect and only makes Keras emit a warning; the shape is
            inferred from the first batch the model is called on.
        num_classes: Number of units of the softmax output layer.
        hidden_layers: Total number of convolutional layers (pooled blocks
            plus the final convolution) when ``hidden_layers >= 2``.
        kernel_size: Kernel size shared by every Conv1D layer.
    """

    def __init__(self, input_shape=(16000, 1), num_classes=10, hidden_layers=5, kernel_size=9):
        super(CNN1D, self).__init__()
        self.conv_layers = []
        self.bn_layers = []
        self.pool_layers = []

        # Pooled blocks. There is always at least one block (16 filters);
        # further blocks double the filter count.
        for i in range(max(hidden_layers - 1, 1)):
            filters = 16 * 2**i
            if i > 0:
                # Construction trace; the first block is intentionally not reported
                print(f"layer {i}: {filters}")
            self.conv_layers.append(layers.Conv1D(filters, kernel_size=kernel_size, activation='relu', padding='same'))
            self.bn_layers.append(layers.BatchNormalization())
            self.pool_layers.append(layers.MaxPooling1D(pool_size=4))

        # Final convolution: no pooling here, the time axis is collapsed by
        # the global average pooling that follows.
        print(f"layer {hidden_layers - 1}: {16*2**(hidden_layers - 1)}")
        self.conv_last = layers.Conv1D(16*2**(hidden_layers - 1), kernel_size=kernel_size, activation='relu', padding='same')
        self.bn_last = layers.BatchNormalization()
        self.global_pool = layers.GlobalAveragePooling1D()

        # Dense head
        print(f"layer {hidden_layers}: {16*2**(hidden_layers)}")
        self.fc1 = layers.Dense(16*2**(hidden_layers), activation='relu')
        self.drop = layers.Dropout(0.4)
        self.out = layers.Dense(num_classes, activation='softmax')

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: Batch of inputs fed to the first Conv1D layer.
            training: Forwarded to the BatchNormalization and Dropout layers.

        Returns:
            The output of the softmax layer.
        """
        x = inputs
        for conv, bn, pool in zip(self.conv_layers, self.bn_layers, self.pool_layers):
            x = pool(bn(conv(x), training=training))
        x = self.bn_last(self.conv_last(x), training=training)
        x = self.global_pool(x)
        x = self.fc1(x)
        x = self.drop(x, training=training)
        return self.out(x)


In [None]:
# Location of the UrbanSound8K dataset, relative to the notebook
DATASET_PATH = "UrbanSound8K"
AUDIO_PATH = os.path.join(DATASET_PATH, "audio")
CSV_PATH = os.path.join(DATASET_PATH, "metadata/UrbanSound8K.csv")

# Read the metadata table and show its first rows
metadata = pd.read_csv(CSV_PATH)
print(metadata.iloc[:5])

FileNotFoundError: [Errno 2] No such file or directory: 'dataset/metadata/UrbanSound8K.csv'

In [None]:
def load_audio(file_path, target_sr=16000, max_len=16000):
    """Load an audio file and force it to exactly ``max_len`` samples.

    Args:
        file_path: Path handed to ``librosa.load``.
        target_sr: Sampling rate requested from ``librosa.load``.
        max_len: Number of samples in the returned signal.

    Returns:
        The signal with a trailing axis of size 1 added. Longer signals are
        cut to the first ``max_len`` samples, shorter ones are padded at the
        end by ``np.pad`` with its default settings.
    """
    waveform, _ = librosa.load(file_path, sr=target_sr)
    missing = max_len - len(waveform)
    if missing < 0:
        waveform = waveform[:max_len]
    else:
        waveform = np.pad(waveform, (0, missing))
    # Append the channel axis -> (samples, 1)
    return waveform[..., np.newaxis]


In [None]:
# Folds held out from training: one for validation, one for testing
val_fold = 10
test_fold = 9

# Split the metadata by fold; every other fold goes to training
held_out = metadata["fold"].isin([val_fold, test_fold])
train_meta = metadata[~held_out]
val_meta = metadata[metadata["fold"] == val_fold]
test_meta = metadata[metadata["fold"] == test_fold]

def build_dataset(meta):
    """Load every clip listed in ``meta`` and return features and labels.

    Args:
        meta: DataFrame with the columns ``fold``, ``slice_file_name`` and
            ``classID``.

    Returns:
        A ``(X, y)`` tuple: ``X`` is a float32 array stacking the result of
        ``load_audio`` for each row, ``y`` is an int64 array of ``classID``
        values in the same order.
    """
    waveforms = []
    labels = []
    for _, record in meta.iterrows():
        # Clips are stored under <AUDIO_PATH>/fold<N>/<slice_file_name>
        fold_dir = f"fold{record['fold']}"
        clip_path = os.path.join(AUDIO_PATH, fold_dir, record["slice_file_name"])
        waveforms.append(load_audio(clip_path))
        labels.append(record["classID"])
    features = np.asarray(waveforms, dtype=np.float32)
    targets = np.asarray(labels, dtype=np.int64)
    return features, targets

# Load each split, announcing it first
print("Carregando dados de treino...")
X_train, y_train = build_dataset(train_meta)

print("Carregando dados de teste...")
X_test, y_test = build_dataset(test_meta)

print("Carregando dados de validação...")
X_val, y_val = build_dataset(val_meta)

# Summarise the array shapes of every split
for split_name, features, targets in (
    ("train", X_train, y_train),
    ("test", X_test, y_test),
    ("val", X_val, y_val),
):
    print(f"X_{split_name}: {features.shape}, y_{split_name}: {targets.shape}")

Carregando dados de treino...
Carregando dados de teste...
Carregando dados de validação...
X_train: (7079, 16000, 1), y_train: (7079,)
X_test: (816, 16000, 1), y_test: (816,)
X_val: (837, 16000, 1), y_val: (837,)


In [None]:
# Instantiate the model
model = CNN1D(input_shape=(16000, 1), num_classes=10)

# Compile with Adam, integer-label cross-entropy and accuracy
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=['accuracy'],
)

# Early stopping (patience 8, restoring the best weights) and
# learning-rate reduction by a factor of 0.5 (patience 4)
training_callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=8, restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(patience=4, factor=0.5),
]

# Train, validating on the held-out validation fold
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=50,
    validation_data=(X_val, y_val),
    callbacks=training_callbacks,
)


layer 1: 32
layer 2: 64
layer 3: 128
layer 4: 256
layer 5: 512


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 52ms/step - accuracy: 0.4299 - loss: 1.6819 - val_accuracy: 0.1338 - val_loss: 2.8259 - learning_rate: 0.0010
Epoch 2/50
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 32ms/step - accuracy: 0.5617 - loss: 1.2915 - val_accuracy: 0.2198 - val_loss: 3.9144 - learning_rate: 0.0010
Epoch 3/50
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 32ms/step - accuracy: 0.6288 - loss: 1.1097 - val_accuracy: 0.3990 - val_loss: 1.9367 - learning_rate: 0.0010
Epoch 4/50
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 32ms/step - accuracy: 0.6792 - loss: 0.9690 - val_accuracy: 0.5603 - val_loss: 1.4018 - learning_rate: 0.0010
Epoch 5/50
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 32ms/step - accuracy: 0.7101 - loss: 0.8827 - val_accuracy: 0.4444 - val_loss: 2.7270 - learning_rate: 0.0010
Epoch 6/50
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

<keras.src.callbacks.history.History at 0x7efce1eaff20>

In [None]:
# Class probabilities for the test fold, then the most likely class per clip
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)

# Per-class precision / recall / F1 report
report = classification_report(y_test, y_pred_classes, digits=4)
print(report)

[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step
              precision    recall  f1-score   support

           0     0.3947    0.1500    0.2174       100
           1     1.0000    0.3438    0.5116        32
           2     0.3713    0.6200    0.4644       100
           3     0.5041    0.6100    0.5520       100
           4     0.5050    0.5100    0.5075       100
           5     0.7000    0.8652    0.7739        89
           6     0.4328    0.9355    0.5918        31
           7     0.6087    0.6829    0.6437        82
           8     0.7955    0.4268    0.5556        82
           9     0.6615    0.4300    0.5212       100

    accuracy                         0.5392       816
   macro avg     0.5974    0.5574    0.5339       816
weighted avg     0.5717    0.5392    0.5247       816



In [None]:
# ...existing code...
class CNN1D_fixed(tf.keras.Model):
    """1-D convolutional classifier whose pooled blocks are Sequential models.

    Architecture:
        * pooled blocks of Conv1D -> BatchNormalization -> MaxPooling1D(4),
          with 16, 32, 64, ... filters (doubling per block);
        * a final Conv1D -> BatchNormalization without pooling;
        * GlobalAveragePooling1D;
        * Dense(ReLU) -> Dropout(0.4) -> Dense(softmax).

    Args:
        input_shape: Per-example input shape declared on the first block.
        num_classes: Number of units of the softmax output layer.
        hidden_layers: Total number of convolutional layers (pooled blocks
            plus the final convolution) when ``hidden_layers >= 2``.
        kernel_size: Kernel size shared by every Conv1D layer.
    """

    def __init__(self, input_shape=(16000,1), num_classes=10, hidden_layers=5, kernel_size=9):
        super().__init__()
        self.blocks = []
        # There is always at least one pooled block (16 filters); further
        # blocks double the filter count.
        for i in range(max(hidden_layers - 1, 1)):
            block_layers = [
                layers.Conv1D(16 * (2**i), kernel_size, padding='same', activation='relu'),
                layers.BatchNormalization(),
                layers.MaxPooling1D(pool_size=4),
            ]
            if i == 0:
                # Declare the input shape with an explicit Input entry rather
                # than the deprecated `input_shape=` layer argument, which
                # makes Keras emit a warning.
                block_layers.insert(0, layers.Input(shape=input_shape))
            self.blocks.append(tf.keras.Sequential(block_layers))

        # Final convolution: no pooling here, the time axis is collapsed by
        # the global average pooling that follows.
        final_filters = 16 * (2**(hidden_layers-1))
        self.conv_last = layers.Conv1D(final_filters, kernel_size, padding='same', activation='relu')
        self.bn_last = layers.BatchNormalization()
        self.global_pool = layers.GlobalAveragePooling1D()

        # Dense head
        fc_units = 16 * (2**hidden_layers)
        self.fc1 = layers.Dense(fc_units, activation='relu')
        self.drop = layers.Dropout(0.4)
        self.out = layers.Dense(num_classes, activation='softmax')

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: Batch of inputs fed to the first block.
            training: Forwarded to the blocks, BatchNormalization and Dropout.

        Returns:
            The output of the softmax layer.
        """
        x = inputs
        for blk in self.blocks:
            x = blk(x, training=training)
        x = self.conv_last(x)
        x = self.bn_last(x, training=training)
        x = self.global_pool(x)
        x = self.fc1(x)
        x = self.drop(x, training=training)
        return self.out(x)

class CNN2D_matched(tf.keras.Model):
    """2-D convolutional classifier with a dense head mirroring the 1-D model.

    Architecture:
        * three blocks of Conv2D(3x3) -> BatchNormalization -> MaxPooling2D(2x2)
          with ``base_filters``, ``2 * base_filters`` and ``4 * base_filters``
          filters;
        * a last block Conv2D(3x3, ``8 * base_filters``) -> BatchNormalization
          -> GlobalAveragePooling2D;
        * Dense(``32 * base_filters``, ReLU) -> Dropout(0.4) -> Dense(softmax).

    Args:
        input_shape: Per-example input shape declared on the first block.
        num_classes: Number of units of the softmax output layer.
        base_filters: Filter count of the first convolution; later layers
            use multiples of it.
    """

    def __init__(self, input_shape=(64, 157, 1), num_classes=10, base_filters=16):
        super().__init__()
        # The input shape is declared with an explicit Input entry rather than
        # the deprecated `input_shape=` layer argument, which makes Keras emit
        # a warning.
        self.conv1 = tf.keras.Sequential([
            layers.Input(shape=input_shape),
            layers.Conv2D(base_filters, (3,3), padding='same', activation='relu'),
            layers.BatchNormalization(),
            layers.MaxPooling2D((2,2))
        ])
        self.conv2 = tf.keras.Sequential([
            layers.Conv2D(base_filters*2, (3,3), padding='same', activation='relu'),
            layers.BatchNormalization(),
            layers.MaxPooling2D((2,2))
        ])
        self.conv3 = tf.keras.Sequential([
            layers.Conv2D(base_filters*4, (3,3), padding='same', activation='relu'),
            layers.BatchNormalization(),
            layers.MaxPooling2D((2,2))
        ])
        # Last block: more filters, and global pooling instead of max pooling
        self.conv_last = tf.keras.Sequential([
            layers.Conv2D(base_filters*8, (3,3), padding='same', activation='relu'),
            layers.BatchNormalization(),
            layers.GlobalAveragePooling2D()
        ])
        # Dense head sized like the 1-D model: 16 * 2**5 = 512 when base_filters=16
        fc_units = base_filters*32
        self.fc1 = layers.Dense(fc_units, activation='relu')
        self.drop = layers.Dropout(0.4)
        self.out = layers.Dense(num_classes, activation='softmax')

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: Batch of inputs fed to the first block.
            training: Forwarded to the convolutional blocks and to Dropout.

        Returns:
            The output of the softmax layer.
        """
        x = inputs
        for block in (self.conv1, self.conv2, self.conv3, self.conv_last):
            x = block(x, training=training)
        x = self.fc1(x)
        x = self.drop(x, training=training)
        return self.out(x)

# Quick utility for comparing parameter counts
def params_count(model, example_input_shape):
    """Build ``model`` for a batched input shape and return its parameter count.

    Args:
        model: Object exposing ``build(shape)`` and ``count_params()``.
        example_input_shape: Per-example shape without the batch dimension.
            Any sequence is accepted (tuple or list).

    Returns:
        The value returned by ``model.count_params()`` after building.
    """
    # Prepend an unspecified batch dimension; unpacking also accepts lists,
    # which tuple concatenation would reject.
    batched_shape = (None, *example_input_shape)
    model.build(batched_shape)
    return model.count_params()

# Example: instantiate both models and print their parameter counts
waveform_shape = (16000, 1)
# NOTE(review): the original note describes this shape as 64 mel bins x ~157
# frames for 1 s @ 16 kHz with hop ~100 — confirm the frame count against the
# actual feature extraction.
spectrogram_shape = (64, 157, 1)

cnn1 = CNN1D_fixed(input_shape=waveform_shape)
cnn2 = CNN2D_matched(input_shape=spectrogram_shape)
print("CNN1D params:", params_count(cnn1, waveform_shape))
print("CNN2D params:", params_count(cnn2, spectrogram_shape))
# ...existing code...

In [None]:
# Instantiate the Sequential-block variant of the 1-D model
model = CNN1D_fixed(input_shape=(16000, 1), num_classes=10)

# Compile with Adam, integer-label cross-entropy and accuracy
adam = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Early stopping (patience 8, restoring the best weights) and
# learning-rate reduction by a factor of 0.5 (patience 4)
early_stop = tf.keras.callbacks.EarlyStopping(patience=8, restore_best_weights=True)
lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(patience=4, factor=0.5)

# Train, validating on the held-out validation fold
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=32,
    callbacks=[early_stop, lr_schedule],
)

In [None]:
# Class probabilities for the test fold, then the most likely class per clip
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)

# Per-class precision / recall / F1 report
report = classification_report(y_test, y_pred_classes, digits=4)
print(report)