# Aprendizaje Profundo
Daniel López Gala - UO281798

Se dispone del conjunto de datos NIPS4BPLUS, el cual contiene 674 ficheros de audio con una duración total de menos de una hora. En estos audios podemos encontrar grabaciones de aproximadamente 5 segundos con cantos de pájaros realizadas en 39 localizaciones diferentes repartidas por 7 regiones de Francia y España.

In [1]:
base_path = "/content/drive/MyDrive/DeepLearning/"
#base_path = ""
DEBUG = False

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2

import torchaudio
import torchaudio.transforms as T

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torchvision.models as models

from sklearn.metrics import f1_score

!pip install scikit-multilearn
from skmultilearn.model_selection import iterative_train_test_split

Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/89.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m81.9/89.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.4/89.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


## Preprocesamiento y visualización

- Se define una función `visualize_intermediates` para crear imágenes de los pasos intermedios usados en el preprocesamiento de los audios.

- La clase `AudioPreprocessing` define los pasos para procesar la imagen. Se incluyen:
  - Resample (De 44100Hz a 22050Hz)
  - STFT (Convertir a espectrograma)
  - Normalización
  - Median clipping
  - Conectar puntos cercanos mediante filtros
  - Closing
  - Dilation
  - Median blur
  - Eliminar residuos

In [4]:
def visualize_intermediates(intermediates, sample_rate=22050, hop_length=int(512 * 0.75)):

    # Set default background color for figures to white
    plt.rcParams['figure.facecolor'] = 'white'

    for key, value in intermediates.items():
        if len(value.shape) == 2 and value.shape[1] > 2:  # This indicates a waveform
            plt.figure(figsize=(12, 4))

            # Calculate time axis in seconds for waveform
            time_axis_waveform = np.linspace(0, value.shape[1] / sample_rate, value.shape[1])

            plt.plot(time_axis_waveform, value[0].cpu().numpy())
            plt.xlabel("Time (seconds)")
            plt.title(f"{key}")
            plt.show()
            continue

        print(f"Processing {key} with shape {value.shape}")

        if value.dim() == 4 and value.shape[-1] == 2:
            complex_representation = value[0, ..., 0] + 1j * value[0, ..., 1]  # Convert to complex
            magnitude = torch.abs(complex_representation).cpu().numpy()
            phase = torch.angle(complex_representation).cpu().numpy()
        elif value.is_complex():
            magnitude = torch.abs(value).squeeze().cpu().numpy()
            phase = torch.angle(value).squeeze().cpu().numpy()
        else:
            magnitude = value.squeeze().cpu().numpy()
            phase = None

        # Calculate time axis in seconds for magnitude
        time_axis_magnitude = np.linspace(0, magnitude.shape[1] * hop_length / sample_rate, magnitude.shape[1])

        # Plot magnitude with inverted grayscale colormap
        plt.figure(figsize=(12, 4))
        plt.imshow(magnitude, cmap='gray_r', aspect='auto', origin='lower', extent=[time_axis_magnitude[0], time_axis_magnitude[-1], 0, magnitude.shape[0]])
        plt.xlabel("Time (seconds)")
        plt.title(f"{key} Magnitude")
        plt.colorbar()
        plt.show()

        # Plot phase
        if phase is not None:
            plt.figure(figsize=(12, 4))
            plt.imshow(((phase + np.pi) % (2 * np.pi) - np.pi), cmap='hsv', aspect='auto', origin='lower', vmin=-np.pi, vmax=np.pi, extent=[time_axis_magnitude[0], time_axis_magnitude[-1], 0, phase.shape[0]])
            plt.xlabel("Time (seconds)")
            plt.title(f"{key} Phase")
            plt.colorbar()
            plt.show()


In [5]:
class AudioPreprocessing(nn.Module):
    def __init__(self, debug=DEBUG, sample_rate=22050, n_fft=512, win_length=512, hop_length=int(512 * 0.75)):
        super(AudioPreprocessing, self).__init__()
        self.debug = debug
        self.sample_rate = sample_rate
        self.resampler = T.Resample(orig_freq=44100, new_freq=sample_rate)
        self.spectrogram = T.Spectrogram(n_fft=n_fft, win_length=win_length, hop_length=hop_length, power=None, window_fn=torch.hann_window, center=False, return_complex=True)
        self.time_stretch = T.TimeStretch(n_freq=n_fft // 2 + 1, fixed_rate=1.1)


    def normalize(self, spectrogram, method='max'):
        if method == 'max':
            return spectrogram / (spectrogram.max() + 1e-5)
        elif method == 'min-max':
            return (spectrogram - spectrogram.min()) / (spectrogram.max() - spectrogram.min() + 1e-5)
        elif method == 'mean-std':
            return (spectrogram - spectrogram.mean()) / (spectrogram.std() + 1e-5)
        else:
            raise ValueError(f"Unknown normalization method: {method}")

    def median_clipping(self, spectrogram, threshold=3):
        freq_median = torch.median(spectrogram, dim=2, keepdim=True)[0]
        time_median = torch.median(spectrogram, dim=1, keepdim=True)[0]
        mask = (spectrogram > (threshold * freq_median)) & (spectrogram > (threshold * time_median))
        return torch.where(mask, torch.tensor(1.0).to(spectrogram.device), spectrogram)

    def image_processing(self, spectrogram):
        img = spectrogram.squeeze(0).cpu().numpy()

        # Morphological closing to emphasize birdsong patterns
        # kernel = np.ones((3,3), np.uint8)
        # img = cv2.morphologyEx(img, cv2.MORPH_CLOSE, kernel)

        # Median blurring for noise reduction
        # img = cv2.medianBlur(img.astype(np.float32), 3)

        return torch.tensor(img).float().unsqueeze(0)

    def augment(self, waveform):
        # TODO
        return waveform

    def forward(self, waveform):
        intermediates = {}

        # Resampling
        # waveform = self.resampler(waveform)
        # if self.debug:
        #     intermediates['resampled'] = waveform

        # Data Augmentation on the waveform
        # waveform = self.augment(waveform)

        # Apply STFT and obtain complex spectrogram
        complex_spectrogram = self.spectrogram(waveform)

        # Time stretch on complex spectrogram
        stretched_complex_spectrogram = self.time_stretch(complex_spectrogram)

        # Extract magnitude from the complex spectrogram for subsequent steps
        spectrogram = torch.abs(stretched_complex_spectrogram)
        if self.debug:
            intermediates['stft'] = spectrogram

        # Normalize
        spectrogram = self.normalize(spectrogram, method='max')
        if self.debug:
            intermediates['normalized'] = spectrogram

        # Median Clipping
        # spectrogram = self.median_clipping(spectrogram, threshold=3)
        # if self.debug:
        #     intermediates['median_clipped'] = spectrogram

        # Image Processing
        # spectrogram = self.image_processing(spectrogram)
        # if self.debug:
        #     intermediates['image_processed'] = spectrogram

        return spectrogram, intermediates if self.debug else spectrogram

## Carga de datos

Se leen los audios de forma individual. Cada audio es un objeto. `BirdSongDataset` define el método `__getitem__` para obtener cada instancia del dataset.

No se tiene en cuenta en qué momento del audio suena cada pájaro, tan sólo qué pájaros suenan en cada audio. El problema se plantea como **clasificación multietiqueta**.

El método `get_class_proportions` se utiliza para comprobar que los datasets *train* y *validation* contienen la misma proporción de clases, es decir, están estratíficados.

In [6]:
class BirdSongDataset(Dataset):
    def __init__(self, df, audio_dir, class_info, transform=None):
        self.df = df
        self.audio_dir = audio_dir
        self.class_info = class_info
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        filename = self.df.iloc[idx, 0]
        audio_path = os.path.join(self.audio_dir, filename)
        waveform, sample_rate = torchaudio.load(audio_path) # Get the waveform and sample rate for the current audio

        labels = self.df[self.df['filename'] == filename] # Get all the rows for the current audio
        target = torch.zeros(len(self.class_info)) # Create a torch tensor
        for _, label in labels.iterrows(): # Iterate each bird sound label in the audio
            class_name = label['class'] # Get the class name from the CSV (Ej.: Petpet_song)
            target[self.class_info.index(class_name)] = 1.0 # Set to 1 in the position of that bird from the class_info file.

        if self.transform:
            waveform = self.transform(waveform) # Transform the waveform, where transform is AudioPreprocessing()

        return waveform, target

train_csv = pd.read_csv(f'{base_path}data/train.csv') # CSV with train audio filenames, and bird class names labels.
class_info_csv = pd.read_csv(f'{base_path}data/class_info.csv')
class_names = class_info_csv['class name'].tolist()

# Convert the labels to a binary matrix form
y = np.zeros((len(train_csv), len(class_names)))
for i, (_, row) in enumerate(train_csv.iterrows()):
    labels = row['class'].split(",")  # Classes are comma-separated
    for label in labels:
        y[i, class_names.index(label)] = 1

X_train, y_train, X_val, y_val = iterative_train_test_split(np.array(train_csv), y, test_size=0.1)

train_df = pd.DataFrame(X_train, columns=train_csv.columns)
valid_df = pd.DataFrame(X_val, columns=train_csv.columns)

transform = nn.Sequential(
    AudioPreprocessing()
)

train_dataset = BirdSongDataset(train_df, f'{base_path}data/train/', class_names, transform=transform)
valid_dataset = BirdSongDataset(valid_df, f'{base_path}data/train/', class_names, transform=transform)



In [7]:
def get_class_proportions(y, class_names):
    """
    Calculate the proportion of each class in the given binary matrix y.
    """
    proportions = {}
    total_samples = y.shape[0]

    for idx, class_name in enumerate(class_names):
        proportions[class_name] = np.sum(y[:, idx]) / total_samples

    return proportions


train_proportions = get_class_proportions(y_train, class_names)
valid_proportions = get_class_proportions(y_val, class_names)

if DEBUG:
    print("Class Proportions in Training Dataset:")
    for class_name, proportion in train_proportions.items():
        print(f"{class_name}: {proportion * 100:.2f}%")

    print("\nClass Proportions in Validation Dataset:")
    for class_name, proportion in valid_proportions.items():
        print(f"{class_name}: {proportion * 100:.2f}%")

# Comparing the differences in proportions
print("\nDifferences in Proportions (Training - Validation):")
for class_name in class_names:
    difference = train_proportions[class_name] - valid_proportions[class_name]
    print(f"{class_name}: {difference * 100:.2f}%")



Differences in Proportions (Training - Validation):
Aegcau_call: -0.04%
Alaarv_song: 0.04%
Anttri_song: 0.08%
Butbut_call: -0.07%
Carcan_call: -0.04%
Carcan_song: -0.01%
Carcar_call: 0.06%
Carcar_song: 0.09%
Cerbra_call: -0.12%
Cerbra_song: -0.09%
Cetcet_song: -0.11%
Chlchl_call: -0.07%
Cicatr_song: -0.07%
Cicorn_song: -0.02%
Cisjun_song: 0.05%
Colpal_song: -0.04%
Corcor_call: -0.07%
Denmaj_call: 0.05%
Denmaj_drum: -0.05%
Embcir_call: 0.08%
Embcir_song: 0.05%
Erirub_call: -0.09%
Erirub_song: 0.03%
Fricoe_call: 0.00%
Fricoe_song: 0.08%
Galcri_call: -0.07%
Galcri_song: 0.00%
Galthe_call: 0.05%
Galthe_song: 0.13%
Gargla_call: 0.05%
Hirrus_call: -0.09%
Jyntor_song: -0.02%
Lopcri_call: 0.05%
Loxcur_call: -0.09%
Lularb_song: 0.14%
Lusmeg_call: 0.10%
Lusmeg_song: -0.04%
Lyrple_song: 0.07%
Motcin_call: -0.07%
Musstr_call: -0.07%
Noise: 0.11%
Oriori_call: 0.07%
Oriori_song: 0.00%
Parate_call: 0.00%
Parate_song: -0.06%
Parcae_call: -0.11%
Parcae_song: -0.11%
Parmaj_call: -0.12%
Parmaj_song: 0.0

In [8]:
if DEBUG:
    sample, target = train_dataset[75]
    processed_sample, intermediates = sample

    print(processed_sample.shape)
    num_positive_labels = target.sum().item()
    print(f"Number of positive labels: {num_positive_labels}")
    visualize_intermediates(intermediates)

**Calcular la longitud máxima de las formas de onda**

Se determina la longitud máxima entre todas las formas de onda para poder rellenar (padding) o truncar los audios posteriormente, garantizando que todos tengan la misma longitud.

La función `collate_fn` se utiliza para procesar y combinar un lote (batch) de muestras en el dataloader. Asegura que todas las formas de onda tengan la misma longitud (rellenando con ceros si es necesario) y devuelve las formas de onda junto con sus objetivos (etiquetas). Para esto, necesita la longitud máxima calculada anteriormente.

In [9]:
# Calculate the global max length of waveforms in the dataset
global_max_len = max(
    max(dataset[i][0][0].shape[2] for i in range(len(dataset)))
    for dataset in [train_dataset, valid_dataset]
)

In [10]:
def collate_fn(batch):
    # Test set scenario (Does not have targets, the filename is return to have the same output shape)
    if isinstance(batch[0][1], str):
        waveforms, filenames = zip(*batch)
        # Directly pad and return, no need to stack targets
        waveforms = [torch.cat([wf[0], torch.zeros(wf[0].shape[0], wf[0].shape[1], global_max_len - wf[0].shape[2])], dim=2) for wf in waveforms]
        waveforms = torch.stack(waveforms)
        return waveforms, filenames

    # Training or validation batch
    waveforms, targets = zip(*batch)
    waveforms = [torch.cat([wf[0], torch.zeros((1, wf[0].shape[1], global_max_len - wf[0].shape[2]))], dim=2) for wf in waveforms]
    waveforms = torch.stack(waveforms)
    targets = torch.stack(targets)
    return waveforms, targets

BATCH_SIZE=32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

## Definición del modelo

- Se define una arquitectura basada en el modelo ResNet50 preentrenado.
- Se adapta la primera capa convolucional para aceptar imágenes de un solo canal (grises).
- Se elimina la última capa completamente conectada del ResNet y se agrega una clasificación personalizada para adaptar la arquitectura al problema multietiqueta.

Se utiliza una mezcla de *transfer-learning* y *fine-tuning*.

**Transferencia de aprendizaje**:

El modelo se carga y se adaptan algunas capas. Se congelan los pesos de las capas del modelo preentrenado para que no se actualicen durante el entrenamiento inicial, por lo que sólo las capas personalizadas, como la capa de clasificación, se entrenarán. Es decir, se adapta a una tarea diferente el modelo, manteniendo los pesos originales.

**Fine-tuning**:

Después de algunas épocas de entrenamiento determinadas en el código se desbloquean las capas del modelo preentrenado para que sus pesos también puedan actualizarse durante el entrenamiento

```python
if epoch == X:
    for param in model.features.parameters():
        param.requires_grad = True
```

Este fine-tuning ajusta el modelo a los datos específicos para mejorar el rendimiento, aunque causa cierto *overfitting* al sobreescribir los pesos originales con los datos de entrenamiento.




In [11]:
class ResNetMultilabel(nn.Module):
    def __init__(self, num_classes):
        super(ResNetMultilabel, self).__init__()
        # Load pre-trained resnet model
        self.resnet = models.resnet50(pretrained=True)

        # Modify the first convolutional layer to accept single-channel (grayscale) images
        self.resnet.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)

        # Remove the last fully connected layer to adapt for our task
        layers = list(self.resnet.children())[:-1]
        self.features = nn.Sequential(*layers)

        # Custom classifier for our multilabel task
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(self.resnet.fc.in_features, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

In [12]:
# Set up the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using: {device}")

# Initialize the model
model = ResNetMultilabel(num_classes=len(class_names)).to(device)

# The pre-trained layers are in the 'features' submodule
for param in model.features.parameters():
    param.requires_grad = False


Using: cuda


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 190MB/s]


## Entrenamiento

- Se utiliza BCE (Binary Cross Entropy), adecuada para problemas de clasificación multietiqueta junto a un optimizador Adam con las tasas de aprendizaje diferentes para cada fase del entrenamiento.
- Se utiliza un programador de learning rate (ReduceLROnPlateau) que disminuye la tasa de aprendizaje si la función de pérdida no mejora.

El proceso de entrenamiento se ejecuta a través de 20 épocas, y durante cada época se calcula la pérdida en entrenamiento y se ajustan los pesos del modelo, se calcula el F1 en entrenamiento, y se pasa el modelo a modo de evaluación para evaluar en el conjunto de validación, calculando tanto la pérdida como el F1 score.

Si el modelo mejora (en F1) se guarda un checkpoint de los pesos. Está implementada, aunque no se usa actualmente, una lógica de early-stop para evitar el sobreajuste.

Después de cada época se ajusta el learning rate según la evolución de la pérdida en validación.

**Búsqueda de umbral**:
- Se inicializa una lista de posibles `thresholds` de 0.1 a 0.5 en incrementos de 0.05. Estos son los umbrales para decidir si una predicción (probabilidad) del modelo es positiva o negativa.
- Para cada umbral se calcula el F1 score en entrenamiento y validación y se elige el umbral que produce el mejor F1 score en el conjunto de validación.

Esto es importante porque las salidas del modelo son valores continuos entre 0 y 1, que representan la confianza del modelo en que esa etiqueta es positiva, y es necesario decidir un umbral (`threshold`) para convertir estas salidas continuas en etiquetas binarias definitivas.

In [13]:
# Use discriminative learning rates
transfer_learning_lr = 0.001
fine_tuning_lr = 0.0005

# Definimos el número de épocas para cada fase
transfer_learning_epochs = 10
fine_tuning_epochs = 15
total_epochs = transfer_learning_epochs + fine_tuning_epochs

optimizer = optim.Adam([
    {'params': model.features.parameters(), 'lr': transfer_learning_lr / 10}, # Discriminative learning rate for pre-trained layers
    {'params': model.classifier.parameters(), 'lr': transfer_learning_lr} # Learning rate for the classifier
], weight_decay=1e-5)  # L2 regularization


criterion = nn.BCELoss()
scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=5, verbose=True)

best_val_loss = float('inf')
best_f1 = float('-inf')
epochs_no_improve = 0
n_epochs_stop = 3
early_stop = False
thresholds = np.arange(0.1, 0.3, 0.05)

for epoch in range(total_epochs):
    # Cambiar a fine-tuning
    if epoch == transfer_learning_epochs:
        for param in model.features.parameters():
            param.requires_grad = True

        # Ajustar learning rates para fine-tuning
        for param_group in optimizer.param_groups:
            if param_group['params'] == model.classifier.parameters():
                param_group['lr'] = fine_tuning_lr
            else:
                param_group['lr'] = fine_tuning_lr / 10

    # Training
    model.train()
    running_train_loss = 0.0
    all_train_preds = []
    all_train_labels = []
    for i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_train_loss += loss.item()

        # Store training predictions and true labels
        all_train_preds.extend(outputs.detach().cpu().numpy().tolist())
        all_train_labels.extend(labels.cpu().numpy().tolist())

    train_loss = running_train_loss / len(train_loader)

    # Calculate training F1 score and also find the best threshold on training data
    train_f1_scores = []
    for threshold in thresholds:
        train_f1_scores.append(f1_score(all_train_labels, np.array(all_train_preds) > threshold, average='samples'))

    # Get the best F1 score and corresponding threshold from the training data
    best_threshold_index_train = np.argmax(train_f1_scores)
    best_threshold_train = thresholds[best_threshold_index_train]
    train_best_f1 = train_f1_scores[best_threshold_index_train]

    # Validation using the threshold obtained from training data
    model.eval()
    running_val_loss = 0.0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in valid_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_val_loss += loss.item()
            # Store predictions and true labels
            all_preds.extend(outputs.cpu().numpy().tolist())
            all_labels.extend(labels.cpu().numpy().tolist())

    val_loss = running_val_loss / len(valid_loader)

    # Calculate validation F1 score using threshold from training data
    validation_f1 = f1_score(all_labels, np.array(all_preds) > best_threshold_train, average='samples')

    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Training F1: {train_best_f1:.4f}, Validation Loss: {val_loss:.4f}, Validation F1: {validation_f1:.4f} using threshold {best_threshold_train:.2f}")

    # Checkpointing
    if validation_f1 > best_f1:
        best_f1 = validation_f1
        epochs_no_improve = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        epochs_no_improve += 1

    # Early stopping
    if epochs_no_improve == n_epochs_stop:
        print('Early stopping!')
        early_stop = True
        break

    # Adjusting learning rate
    scheduler.step(-val_loss)  # Pass negative F1 score since ReduceLROnPlateau expects to minimize the metric

if early_stop:
    print("Stopped training. Loading best model weights!")
    model.load_state_dict(torch.load('best_model.pth'))

print('Finished Training')

Epoch 1, Train Loss: 0.1321, Training F1: 0.1130, Validation Loss: 0.0834, Validation F1: 0.1479 using threshold 0.10
Epoch 2, Train Loss: 0.0994, Training F1: 0.2037, Validation Loss: 0.0750, Validation F1: 0.1999 using threshold 0.10
Epoch 3, Train Loss: 0.0886, Training F1: 0.2697, Validation Loss: 0.0696, Validation F1: 0.2590 using threshold 0.10
Epoch 4, Train Loss: 0.0812, Training F1: 0.3179, Validation Loss: 0.0649, Validation F1: 0.2870 using threshold 0.15
Epoch 5, Train Loss: 0.0750, Training F1: 0.3573, Validation Loss: 0.0607, Validation F1: 0.3435 using threshold 0.15
Epoch 6, Train Loss: 0.0712, Training F1: 0.3910, Validation Loss: 0.0593, Validation F1: 0.3398 using threshold 0.15
Epoch 7, Train Loss: 0.0685, Training F1: 0.4062, Validation Loss: 0.0554, Validation F1: 0.3912 using threshold 0.15
Epoch 00007: reducing learning rate of group 0 to 5.0000e-05.
Epoch 00007: reducing learning rate of group 1 to 5.0000e-04.
Epoch 8, Train Loss: 0.0637, Training F1: 0.4487, 

## Evaluación y Predicción en el conjunto de Test

1. **Evaluación de las predicciones**:
Se pone el modelo en modo `eval()` y se itera sobre el conjunto de validación para obtener las predicciones y se calcula el F1 usando el mejor umbral.

2. **Preparación del conjunto de Test**:
Se crea la clase `BirdSongTestDataset` que lee de `test.csv`, y se crea un DataLoader para el conjunto de Test.

3. **Predicciones en el conjunto de Test**:
Se itera sobre el conjunto de test y se obtienen las predicciones del modelo para cada archivo de audio. Se binarizan usando el mejor umbral y se almacenan en un diccionario con el nombre del archivo como clave.
Las predicciones se convierten en un DataFrame de Pandas y se preparan los datos en el formato esperado, y por último se guarda el DataFrame en un archivo CSV.

In [15]:
print(f"Best threshold: {best_threshold_train}")

Best threshold: 0.25000000000000006


In [16]:
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in valid_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        preds = (outputs > best_threshold_train).float()

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

f1_macro = f1_score(all_labels, all_preds, average='samples')
print(f"F1 Score (Samples): {f1_macro}")

F1 Score (Samples): 0.7801682771314008


In [17]:
class BirdSongTestDataset(Dataset):
    def __init__(self, df, audio_dir, transform=None):
        self.df = df
        self.audio_dir = audio_dir
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        filename = self.df.iloc[idx, 0]
        #print(f"File: {filename}")
        audio_path = os.path.join(self.audio_dir, filename)
        waveform, sample_rate = torchaudio.load(audio_path)

        if self.transform:
            waveform = self.transform(waveform)

        return waveform, filename  # Return both waveform and filename to match the expected shape

test_csv = pd.read_csv(f'{base_path}data/test.csv')

test_dataset = BirdSongTestDataset(test_csv, f'{base_path}data/test/', transform=transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [18]:
if DEBUG:
  sample, _ = test_dataset[99]
  processed_sample, intermediates = sample

  print(processed_sample.shape)
  visualize_intermediates(intermediates)

In [19]:
# Make predictions on test set
model.eval()
predictions = {}
with torch.no_grad():
    for inputs, filenames in test_loader:
        inputs = inputs.to(device)
        outputs = model(inputs)
        preds = (outputs > best_threshold_train).float().cpu().numpy().astype(int)
        for fname, pred in zip(filenames, preds):
            predictions[fname] = pred

# Convert predictions to submission format
submission_df = pd.DataFrame.from_dict(predictions, orient='index', columns=class_names)
submission_df.reset_index(inplace=True)
submission_df.rename(columns={'index': 'filename'}, inplace=True)
submission_df.to_csv('submission.csv', index=False)