In [9]:
import pandas as pd
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt

train_df = pd.read_csv('data/train.csv')
train_df['duration'] = train_df['end'] - train_df['start']

In [10]:
def extract_slices(audio_file_path, labeled_segments):
    """
    Extract spectrogram slices from an audio file based on labeled segments.
    
    Parameters:
    - audio_file_path: Path to the audio file.
    - labeled_segments: DataFrame containing start, end times and labels for segments.
    
    Returns:
    - slices_list: List of extracted spectrogram slices.
    - labels_list: List of labels corresponding to the slices.
    """
    y, sr = librosa.load(audio_file_path, sr=None)
    D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
    
    slices_list = []
    labels_list = []
    
    for _, row in labeled_segments.iterrows():
        start_time = row['start']
        end_time = row['end']
        start_sample = librosa.time_to_frames(start_time, sr=sr)
        end_sample = librosa.time_to_frames(end_time, sr=sr)
        slice_ = D[:, start_sample:end_sample]
        slices_list.append(slice_)
        labels_list.append(row['class'])
    
    return slices_list, labels_list

# Test the function with one of the training audio files
sample_slices, sample_labels = extract_slices('data/train/nips4b_train001.wav', train_df[train_df['filename'] == 'nips4b_train001.wav'])
len(sample_slices), len(sample_labels)

(21, 21)

In [11]:
import os

# Path to the training audio folder
train_audio_folder = 'data/train/'

all_slices = []
all_labels = []

# Loop through all training audio files in the folder and extract slices
for audio_file in os.listdir(train_audio_folder):
    full_path = os.path.join(train_audio_folder, audio_file)
    filename = audio_file
    labeled_segments = train_df[train_df['filename'] == filename]
    slices, labels = extract_slices(full_path, labeled_segments)
    all_slices.extend(slices)
    all_labels.extend(labels)

len(all_slices), len(all_labels)


(4588, 4588)

In [12]:
# Resize and normalize

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Determine the shape of the largest slice for resizing
max_length = max(slice_.shape[1] for slice_ in all_slices)
uniform_shape = (all_slices[0].shape[0], max_length)

# Resize and normalize slices
prepared_slices = [np.pad(slice_, ((0, 0), (0, max_length - slice_.shape[1])), mode='constant') for slice_ in all_slices]
prepared_slices = [(slice_ - np.min(slice_)) / (np.max(slice_) - np.min(slice_)) for slice_ in prepared_slices]

# Label encoding
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(all_labels)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(prepared_slices, encoded_labels, test_size=0.2, random_state=42)

len(X_train), len(X_val)

  prepared_slices = [(slice_ - np.min(slice_)) / (np.max(slice_) - np.min(slice_)) for slice_ in prepared_slices]


(3670, 918)

In [13]:
# Get the shape of one of the prepared slices
slice_shape = prepared_slices[5].shape
slice_shape

(1025, 431)

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BirdSongNet(nn.Module):
    def __init__(self, num_classes=89, input_shape=(1025, 231)):
        super(BirdSongNet, self).__init__()
        
        # Compute the size after pooling operations
        self.feature_size = input_shape[0] // 4 * input_shape[1] // 4 * 32  # Divided by 4 due to two pooling layers
        
        # Convolutional Layer 1
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        
        # Convolutional Layer 2
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        
        # Fully Connected Layer 1
        self.fc1 = nn.Linear(self.feature_size, 128)
        
        # Fully Connected Layer 2 (Output Layer)
        self.fc2 = nn.Linear(128, num_classes)
    
    def forward(self, x):
        # Convolutional Layer 1 + ReLU + Max Pooling
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2)
        
        # Convolutional Layer 2 + ReLU + Max Pooling
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        
        # Flatten
        x = x.view(x.size(0), -1)
        
        # Fully Connected Layer 1 + ReLU
        x = F.relu(self.fc1(x))
        
        # Fully Connected Layer 2 + Sigmoid
        x = torch.sigmoid(self.fc2(x))
        
        return x


# Instantiate the model
model = BirdSongNet(num_classes=89, input_shape=(1025, 231))

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

print(model)

BirdSongNet(
  (conv1): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): Linear(in_features=473088, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=89, bias=True)
)


In [15]:
# Ensure you have your data in the appropriate format
# Convert the prepared slices and labels into PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train).unsqueeze(1)  # Add channel dimension
y_train_tensor = torch.FloatTensor(y_train)
X_val_tensor = torch.FloatTensor(X_val).unsqueeze(1)
y_val_tensor = torch.FloatTensor(y_val)

# Training parameters
num_epochs = 20
batch_size = 16
train_loader = torch.utils.data.DataLoader(dataset=list(zip(X_train_tensor, y_train_tensor)), batch_size=batch_size, shuffle=True)

# Training loop
for epoch in range(num_epochs):
    for i, (audio, labels) in enumerate(train_loader):
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(audio)
        
        # Compute loss
        loss = criterion(outputs, labels)
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        
        # Print loss every 10 batches
        if (i+1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}")

# Evaluate the model
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    val_outputs = model(X_val_tensor)
    val_predictions = (val_outputs > 0.5).float()
    accuracy = (val_predictions == y_val_tensor).float().mean()
    print(f"Validation Accuracy: {accuracy:.2f}")


  X_train_tensor = torch.FloatTensor(X_train).unsqueeze(1)  # Add channel dimension


RuntimeError: mat1 and mat2 shapes cannot be multiplied (16x876544 and 473088x128)