In [1]:
import librosa
import os
import torch
# Release cached MPS (Apple GPU) memory left over from a previous run.
# Guarded because empty_cache() fails when PyTorch has no usable MPS backend,
# which would stop the notebook at its very first cell on other machines.
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

In [2]:
# set path
# The data directory defaults to the original location but can be overridden
# with the MUSIC_TOOLS_DATA environment variable, so the notebook also runs
# when the repository is checked out somewhere else.
path = os.environ.get('MUSIC_TOOLS_DATA', '/Users/cam/Code/repos/music_tools/data')
# Change into the data directory so files can be opened by bare filename.
os.chdir(path)
print('current directory: ', os.getcwd())

current directory:  /Users/cam/Code/repos/music_tools/data


In [None]:
# Audio file -> mel spectrogram -> model-ready tensor

# Decode the track into a waveform `y` and its sample rate `sr`, then compute
# a mel spectrogram with 128 mel bands.
y, sr = librosa.load(
    "King Gizzard & The Lizard Wizard - Live at Field of Vision '25 - 01 Gamma Knife (Live at Field of Vision '25).mp3"
)
spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)

# Wrap the NumPy array as a tensor, cast to float32 (the dtype neural nets
# usually expect).
melspec_tensor = torch.from_numpy(spec).to(torch.float32)

# Prepend batch and channel axes, as most Conv2D layers require.
# Resulting shape: [1, 1, 128, time_steps]
spec_tensor = melspec_tensor[None, None]

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class AudioClassifier(nn.Module):
    """Small CNN that maps a single-channel spectrogram to per-class scores.

    Pipeline: Conv2d(1 -> 32, 3x3, padding 1) -> ReLU -> MaxPool2d(2, 2)
    -> flatten -> one linear layer with ``num_classes`` outputs. No softmax
    is applied; ``forward`` returns the raw linear outputs.

    Args:
        num_classes: Number of outputs of the final linear layer.
        n_mels: Height (number of mel bands) of the input spectrogram.
            Only used when ``time_steps`` is given.
        time_steps: Width (number of frames) of the input spectrogram.
            When ``None`` (the default) the final layer is an
            ``nn.LazyLinear`` whose input size is inferred on the first
            forward pass, so the class no longer depends on a notebook
            global being defined. Run one forward pass before handing
            ``model.parameters()`` to an optimizer in that case.

    Note:
        The linear layer is sized for one specific input height/width, so
        every input passed to the same model must share those dimensions.
    """

    def __init__(self, num_classes, n_mels=128, time_steps=None):
        super().__init__()
        # Conv layer: looks for local patterns in the spectrogram
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        # 2x2 max-pool with stride 2 halves (floor) both spatial dimensions
        self.pool = nn.MaxPool2d(2, 2)
        # Flattening and Linear layers to output the final prediction
        if time_steps is None:
            # Input size unknown up front: let PyTorch infer it lazily.
            self.fc1 = nn.LazyLinear(num_classes)
        else:
            # 32 channels * pooled height * pooled width
            flat_features = 32 * (n_mels // 2) * (time_steps // 2)
            self.fc1 = nn.Linear(flat_features, num_classes)

    def forward(self, x):
        """Return scores of shape [batch, num_classes].

        Args:
            x: Tensor of shape [batch, 1, n_mels, time_steps].
        """
        x = self.pool(F.relu(self.conv1(x)))
        # Keep the batch dimension, flatten channels/height/width together
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        return x

In [None]:
# Number of outputs of the classifier (e.g. one per artist to identify).
# Kept at 1 as in the original code: the final layer holds
# in_features * num_classes weights, and in_features grows with the length of
# the track, so raise this with care when feeding a whole song.
NUM_CLASSES = 1
model = AudioClassifier(num_classes=NUM_CLASSES)

# Pass the 4-D spec_tensor ([1, 1, 128, time_steps]) through the model.
# The 2-D melspec_tensor cannot be used here: Conv2d only accepts 3-D
# (unbatched) or 4-D (batched) input.
output = model(spec_tensor)

print(f"Output shape: {output.shape}") # Should be [1, NUM_CLASSES]