# CNN Model Architecture for Music Tagging

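This notebook implements two Convolutional Neural Network (CNN) architectures that operate on mel-spectrograms computed from raw audio: a simple CNN for genre classification and a deeper CNN for multi-label music tagging.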

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio

## Simple CNN for Genre Classification

In [None]:
class SimpleCNN(nn.Module):
    """Compact four-stage CNN that maps raw audio to genre scores.

    The waveform is converted to a dB-scaled mel-spectrogram inside the
    model, run through four conv -> batch-norm -> ReLU -> max-pool stages
    (32, 64, 128 and 256 channels), averaged over the two spectrogram axes
    and finally projected to ``n_classes`` raw scores by a two-layer head.

    Args:
        n_classes: Number of output scores produced per example.
        sample_rate: Sample rate passed to the mel-spectrogram transform.
        n_mels: Number of mel bands produced by the transform.
    """

    def __init__(self, n_classes=10, sample_rate=22050, n_mels=128):
        super().__init__()

        self.sample_rate = sample_rate
        self.n_mels = n_mels

        # Audio front end: waveform -> mel-spectrogram -> decibel scale.
        self.mel_spec = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=2048,
            hop_length=512,
            n_mels=n_mels,
        )
        self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()

        # Feature extractor. The layers are registered as conv1/bn1/pool1 ...
        # conv4/bn4/pool4, in that order, so the parameter names in the
        # state dict follow the stage number.
        widths = (1, 32, 64, 128, 256)
        for stage, (c_in, c_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"conv{stage}",
                    nn.Conv2d(c_in, c_out, kernel_size=3, padding=1))
            setattr(self, f"bn{stage}", nn.BatchNorm2d(c_out))
            setattr(self, f"pool{stage}", nn.MaxPool2d(2, 2))

        # Collapses whatever spatial extent is left to a single value per channel.
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))

        # Classifier head.
        self.fc1 = nn.Linear(256, 128)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, n_classes)

    def forward(self, x):
        """Return raw class scores for a batch of waveforms.

        Args:
            x: Waveform batch, expected as ``(batch, 1, time)``.

        Returns:
            Tensor of shape ``(batch, n_classes)``; no softmax is applied.
        """
        # (batch, 1, time) -> (batch, 1, n_mels, frames), in decibels.
        feats = self.amplitude_to_db(self.mel_spec(x))

        stages = (
            (self.conv1, self.bn1, self.pool1),
            (self.conv2, self.bn2, self.pool2),
            (self.conv3, self.bn3, self.pool3),
            (self.conv4, self.bn4, self.pool4),
        )
        for conv, norm, pool in stages:
            feats = pool(F.relu(norm(conv(feats))))

        # One averaged value per channel, flattened to (batch, channels).
        pooled = self.global_pool(feats)
        flat = pooled.view(pooled.size(0), -1)

        hidden = self.dropout(F.relu(self.fc1(flat)))
        return self.fc2(hidden)

## Deep CNN for Music Tagging (Multi-label)

In [None]:
class DeepCNN(nn.Module):
    """Deeper CNN producing one raw score per tag (multi-label tagging).

    The waveform is converted to a dB-scaled mel-spectrogram inside the
    model and passed through four double-convolution blocks (64, 128, 256
    and 512 channels). The result is averaged over the two spectrogram
    axes and fed to a three-layer fully connected head.

    Args:
        n_classes: Number of tag scores produced per example.
        sample_rate: Sample rate passed to the mel-spectrogram transform.
        n_mels: Number of mel bands produced by the transform.
    """

    def __init__(self, n_classes=50, sample_rate=22050, n_mels=128):
        super().__init__()

        self.sample_rate = sample_rate
        self.n_mels = n_mels

        # Audio front end: waveform -> mel-spectrogram -> decibel scale.
        self.mel_spec = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=2048,
            hop_length=512,
            n_mels=n_mels,
        )
        self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()

        # Feature extractor: each block's output width is the next block's input width.
        widths = (1, 64, 128, 256, 512)
        self.conv_blocks = nn.ModuleList([
            self._make_conv_block(c_in, c_out)
            for c_in, c_out in zip(widths, widths[1:])
        ])

        # Collapses whatever spatial extent is left to a single value per channel.
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))

        # Classifier head.
        self.fc1 = nn.Linear(512, 256)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, 128)
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(128, n_classes)

    def _make_conv_block(self, in_channels, out_channels):
        """Build two 3x3 conv/batch-norm/ReLU layers followed by a 2x2 max-pool.

        The first convolution maps ``in_channels`` to ``out_channels``; the
        second keeps the width at ``out_channels``.
        """
        layers = []
        for c_in in (in_channels, out_channels):
            layers.append(nn.Conv2d(c_in, out_channels, kernel_size=3, padding=1))
            layers.append(nn.BatchNorm2d(out_channels))
            layers.append(nn.ReLU(inplace=True))
        layers.append(nn.MaxPool2d(2, 2))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return raw tag scores for a batch of waveforms.

        Args:
            x: Waveform batch, expected as ``(batch, 1, time)``.

        Returns:
            Tensor of shape ``(batch, n_classes)``; no sigmoid is applied.
        """
        # (batch, 1, time) -> (batch, 1, n_mels, frames), in decibels.
        feats = self.amplitude_to_db(self.mel_spec(x))

        for block in self.conv_blocks:
            feats = block(feats)

        # One averaged value per channel, flattened to (batch, channels).
        pooled = self.global_pool(feats)
        hidden = pooled.view(pooled.size(0), -1)

        # Two hidden layers, each followed by ReLU and dropout, then the output layer.
        for linear, drop in ((self.fc1, self.dropout1), (self.fc2, self.dropout2)):
            hidden = drop(F.relu(linear(hidden)))
        return self.fc3(hidden)

## Test Model Creation

In [None]:
# Smoke-test SimpleCNN: build it, report its size, and run one random batch.
model_cnn = SimpleCNN(n_classes=10)
print("SimpleCNN:")
print(model_cnn)
print(f"\nNumber of parameters: {sum(p.numel() for p in model_cnn.parameters())}")

# Random stand-in for a batch of mono waveforms, shaped (batch, 1, samples).
batch_size = 4
sample_rate = 22050
duration = 30  # multiplied by sample_rate below to get the number of samples
x = torch.randn(batch_size, 1, sample_rate * duration)
# Only the output shape is inspected here, so disable autograd: there is no
# reason to keep every intermediate activation of this large batch alive.
with torch.no_grad():
    output = model_cnn(x)
print(f"\nInput shape: {x.shape}")
print(f"Output shape: {output.shape}")

In [None]:
# Smoke-test DeepCNN: build it, report its size, and run one random batch.
model_deep = DeepCNN(n_classes=50)
print("DeepCNN:")
print(model_deep)
print(f"\nNumber of parameters: {sum(p.numel() for p in model_deep.parameters())}")

# Reuses the random batch `x` created in the SimpleCNN test cell, so that
# cell has to be run first.
# Only the output shape is inspected here, so disable autograd: there is no
# reason to keep every intermediate activation of this large batch alive.
with torch.no_grad():
    output = model_deep(x)
print(f"\nInput shape: {x.shape}")
print(f"Output shape: {output.shape}")

## Save and Load Model

In [None]:
def save_model(model, path):
    """Write ``model.state_dict()`` to ``path`` with ``torch.save``.

    Args:
        model: Module whose state dict is saved.
        path: Destination. When it is a filesystem path (``str`` or
            ``os.PathLike``), missing parent directories are created first;
            any other object is handed to ``torch.save`` unchanged.
    """
    import os

    # torch.save does not create directories, so a target such as
    # '../models/simple_cnn.pth' would fail when '../models' does not exist.
    if isinstance(path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(path))
        if parent:
            os.makedirs(parent, exist_ok=True)

    torch.save(model.state_dict(), path)
    print(f"Model saved to {path}")

def load_model(model, path):
    """Load a saved state dict from ``path`` into ``model`` and return it.

    Args:
        model: Module that receives the weights; it is modified in place.
        path: Location of a checkpoint written with ``torch.save``.

    Returns:
        The same ``model`` object that was passed in.
    """
    # Deserialize onto the CPU so that a checkpoint saved from a GPU can be
    # read on a machine without one; load_state_dict then copies the values
    # into the model's existing parameters.
    # NOTE(review): torch.load unpickles the file — only load checkpoints
    # from trusted sources, or pass weights_only=True where the installed
    # PyTorch version supports it.
    state_dict = torch.load(path, map_location="cpu")
    model.load_state_dict(state_dict)
    print(f"Model loaded from {path}")
    return model

# Example usage:
# save_model(model_cnn, '../models/simple_cnn.pth')
# model_cnn = load_model(SimpleCNN(n_classes=10), '../models/simple_cnn.pth')