In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import pretty_midi
import librosa
import librosa.display
import gc
from sklearn.preprocessing import StandardScaler
import warnings


import torch
from torch.utils.data import DataLoader, Dataset


from sklearn.preprocessing import StandardScaler

from Preprocessing import *
from ExtractGenre import *

import DatasetLoader as DL

In [2]:
# Dataset root; DataCNN walks its sub-directories and looks each name up in GenreMapping.
InputPath = os.path.realpath('YAMF/genres_original')

# Genre name -> integer class label. The label is the position of the name in this tuple.
GenreMapping = {
   name: label
   for label, name in enumerate((
      'metal', 'disco', 'classical', 'hiphop', 'jazz',
      'country', 'pop', 'blues', 'reggae', 'rock',
   ))
}

In [2]:
def NormalizeSpectrogram(X):
   """Min-max scale an array to the range [0, 1].

   Parameters
   ----------
   X : numpy.ndarray
      Non-empty array (here: one spectrogram segment).

   Returns
   -------
   numpy.ndarray
      Array of the same shape where the smallest input value maps to 0 and
      the largest to 1. A constant input (max == min) maps to all zeros
      instead of NaN.
   """
   X_min = X.min()
   X_max = X.max()
   shifted = X - X_min
   span = X_max - X_min

   if span == 0:
      # Constant segment (e.g. silence clipped to the dB floor): the plain
      # formula would compute 0/0 and fill the training sample with NaN.
      return np.zeros_like(shifted, dtype=np.result_type(shifted, 0.0))

   return shifted / span


def _LoadMelDb(SongPath, sr = 22050, n_mels = 128):
   """Load one audio file and return its mel spectrogram in decibels.

   The returned array is indexed as [mel_bin, frame]; values are expressed
   relative to the loudest bin of the clip (ref=np.max).
   """
   y, sr = librosa.load(SongPath, sr=sr)
   mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
   # melspectrogram returns a *power* spectrogram by default, so the matching
   # conversion is power_to_db; amplitude_to_db would treat the values as
   # magnitudes and square them again before taking the log.
   return librosa.power_to_db(mel_spec, ref=np.max)


def DataCNN(InputPath = os.path.realpath('YAMF/genres_original'), length = 128,
            n_train_songs = 80, crops_per_song = 50, seed = None):
   """Build fixed-width, min-max normalised mel-spectrogram segments per genre.

   Every sub-directory of ``InputPath`` whose name is a key of ``GenreMapping``
   is treated as one genre. Files are visited in sorted order so the
   train/validation split is the same on every run and every machine.

   Parameters
   ----------
   InputPath : str
      Directory containing one sub-directory per genre.
   length : int
      Width, in spectrogram frames, of every produced segment.
   n_train_songs : int
      The first ``n_train_songs`` entries of each genre directory feed the
      training set; the remaining entries feed the validation set.
   crops_per_song : int
      Number of random windows sampled from each training song.
   seed : int or None
      Seed for the random training crops. ``None`` keeps using the global
      ``np.random`` state, exactly as before.

   Returns
   -------
   list
      ``[TrainDataList, ValDataList]``; each is a list of
      ``(normalised_segment, genre_label)`` tuples.

   Notes
   -----
   Files that cannot be decoded, and training files shorter than ``length``
   frames, are skipped; the number of skipped files is printed at the end.
   """
   # Same global effect as before, but installed once instead of once per song.
   warnings.filterwarnings('ignore')

   rng = np.random if seed is None else np.random.RandomState(seed)

   numErr = 0
   TrainDataList, ValDataList = [], []

   for genreDir in tqdm(sorted(os.listdir(InputPath))):

      DirPath = os.path.join(InputPath, genreDir)

      # Skip plain files and stray folders (e.g. .ipynb_checkpoints) that are
      # not genres instead of failing with a KeyError.
      if not os.path.isdir(DirPath) or genreDir not in GenreMapping:
         continue

      genre = GenreMapping[genreDir]

      for songIdx, song in enumerate(sorted(os.listdir(DirPath)), start=1):
         SongPath = os.path.join(DirPath, song)

         try:
            S_db = _LoadMelDb(SongPath)
         except Exception:
            # Best effort: an unreadable file must not abort the whole run.
            # Ctrl-C (KeyboardInterrupt) is deliberately NOT caught here.
            numErr += 1
            continue

         numFrames = S_db.shape[1]

         if songIdx <= n_train_songs:
            # Train data: random crops of `length` frames.
            lastStart = numFrames - length
            if lastStart < 0:
               # Clip is shorter than a single window; nothing to crop.
               numErr += 1
               continue

            for _ in range(crops_per_song):
               # randint's upper bound is exclusive; +1 keeps the final window reachable.
               start = rng.randint(0, lastStart + 1)
               X = S_db[:, start:start + length]
               TrainDataList.append((NormalizeSpectrogram(X), genre))

         else:
            # Validation data: consecutive non-overlapping windows.
            for i in range(numFrames // length):
               X = S_db[:, i * length:(i + 1) * length]
               ValDataList.append((NormalizeSpectrogram(X), genre))

   if numErr:
      print(f"DataCNN: skipped {numErr} file(s) that could not be loaded or were shorter than {length} frames")

   return [TrainDataList, ValDataList]


In [None]:
#DataList = DataCNN()

100%|██████████| 11/11 [00:21<00:00,  1.96s/it]


In [None]:
# with open('YAMF/test.pkl', 'wb') as f:
#    pickle.dump(DataList, f)

In [3]:
from torch.utils.data import Dataset
import torch

class GenreDataset(Dataset):
    """Map-style dataset over the pickled spectrogram segments.

    The pickle at ``path`` must hold a two-element sequence
    ``[train_items, val_items]`` where every item is a
    ``(spectrogram, label)`` pair, as produced by ``DataCNN``.

    Parameters
    ----------
    path : str
        Location of the pickle file.
    Train : bool
        ``True`` selects the first (training) split, ``False`` the second
        (validation) split.
    transform : callable or None
        Optional function applied to each spectrogram tensor (after the
        channel axis has been added) before it is returned.
    """

    def __init__(self, path='YAMF/test.pkl', Train = True, transform=None):

        # SECURITY: pickle.load can execute arbitrary code. Only open files
        # that were written by this project.
        with open(path, 'rb') as f:
            TD = pickle.load(f)

        split = TD[0] if Train else TD[1]

        # Fix the dtypes once here: float32 inputs for the network, int64
        # labels because CrossEntropyLoss requires Long targets (NumPy's
        # default integer is 32-bit on some platforms).
        self.X = np.array([item[0] for item in split], dtype=np.float32)
        self.Y = np.array([item[1] for item in split], dtype=np.int64)
        self.transform = transform

        # The raw pickle content is no longer needed; release it eagerly.
        del TD, split
        gc.collect()

    def __len__(self):
        """Return the number of samples in the selected split."""
        return len(self.Y)

    def __getitem__(self, idx):
        """Return ``(x, y)``: x is the spectrogram with a leading channel axis
        (float32), y is the class label (int64)."""

        xTensor = torch.tensor(self.X[idx], dtype=torch.float32).unsqueeze(0)
        yTensor = torch.tensor(self.Y[idx], dtype=torch.long)

        if self.transform is not None:
            xTensor = self.transform(xTensor)

        return xTensor, yTensor

In [4]:
# Build both splits from the cached pickle and wrap them in mini-batch loaders.
BATCH_SIZE = 32

trainData = GenreDataset(Train = True)
valData = GenreDataset(Train = False)

# Only the training set is shuffled. Validation metrics do not depend on
# sample order, so shuffling there only adds work and makes per-batch
# inspection non-reproducible.
trainLoader = DataLoader(trainData, batch_size = BATCH_SIZE, shuffle=True, num_workers=0)
valLoader = DataLoader(valData, batch_size = BATCH_SIZE, shuffle=False, num_workers=0)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GenreCNN(nn.Module):
    """Small CNN classifier for single-channel 2-D inputs.

    ``features`` stacks three identical stages (3x3 conv -> batch norm ->
    ReLU -> 2x2 max pool) with 32, 64 and 128 output channels.
    ``classifier`` average-pools the feature map to 4x4, flattens it and maps
    it through a 256-unit hidden layer with dropout to ``n_classes`` logits.
    """

    def __init__(self, n_classes=10):
        super().__init__()

        # Channel widths seen by consecutive conv stages: 1 -> 32 -> 64 -> 128.
        widths = (1, 32, 64, 128)
        stages = []
        for in_ch, out_ch in zip(widths[:-1], widths[1:]):
            stages += [
                nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(),
                nn.MaxPool2d(2),
            ]
        self.features = nn.Sequential(*stages)

        # The adaptive pool fixes the spatial size, so the first linear layer
        # always sees 4 * 4 * 128 = 2048 inputs regardless of the input size.
        pooled_side = 4
        flat_dim = pooled_side * pooled_side * widths[-1]
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((pooled_side, pooled_side)),
            nn.Flatten(),
            nn.Linear(flat_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, n_classes),
        )

    def forward(self, x):
        """Return the class logits for a batch ``x`` of shape (N, 1, H, W)."""
        return self.classifier(self.features(x))

In [None]:
from torch.optim import Adam
from torch.nn import CrossEntropyLoss


from torch.optim import Adam
from torch.nn import CrossEntropyLoss

# ---- Model, optimiser and schedule ------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
model = GenreCNN()
opt = Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
# Halves the learning rate once the validation loss has not improved for
# 3 epochs. It only acts when scheduler.step(val_loss) is called every epoch.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, patience=3, factor=0.5)
loss_fn = CrossEntropyLoss()
model.to(device)

print(device)

epochs = 30

# Per-epoch history, one entry appended per completed epoch.
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

for epoch in range(epochs):
    print(f"Epoch: {epoch+1}")

    # Training phase
    model.train()
    train_loss = 0
    train_correct = 0
    train_total = 0

    for batch_x, batch_y in tqdm(trainLoader):
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        y_pred = model(batch_x)
        loss = loss_fn(y_pred, batch_y)

        opt.zero_grad()
        loss.backward()
        opt.step()

        # Weight each batch by its size so the epoch loss is a true
        # per-sample mean (the last batch is usually smaller), matching how
        # the validation loss is computed below.
        batch_size = batch_y.size(0)
        train_loss += loss.item() * batch_size

        predicted = y_pred.argmax(dim=1)
        train_total += batch_size
        train_correct += (predicted == batch_y).sum().item()

    # max(..., 1) avoids a ZeroDivisionError on an empty training loader.
    avg_train_loss = train_loss / max(train_total, 1)
    train_acc = train_correct / max(train_total, 1)

    # Validation phase
    model.eval()
    with torch.no_grad():
        predictions = []
        true = []
        for batch_x, batch_y in tqdm(valLoader):
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            y_pred = model(batch_x)
            predictions.append(y_pred)
            true.append(batch_y)

        # Score the whole validation set at once: per-sample mean loss and accuracy.
        predictions = torch.cat(predictions, dim=0)
        true = torch.cat(true, dim=0)
        val_loss = loss_fn(predictions, true)
        predicted_classes = torch.argmax(predictions, dim=1)
        val_acc = (predicted_classes == true).float().mean()

    # Without this call the scheduler never changes the learning rate.
    scheduler.step(val_loss.item())

    # Store metrics
    train_losses.append(avg_train_loss)
    val_losses.append(val_loss.item())
    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc.item())

    print(f"Train Loss: {avg_train_loss:.4f}, Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

Epoch: 1


100%|██████████| 1249/1249 [01:31<00:00, 13.61it/s]
100%|██████████| 63/63 [00:01<00:00, 54.52it/s]


loss: 1.4736919403076172, accuracy: 0.5879999995231628
Epoch: 2


 10%|▉         | 124/1249 [00:08<01:20, 14.05it/s]


KeyboardInterrupt: 