In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import pretty_midi
import librosa
import librosa.display
import gc
from sklearn.preprocessing import StandardScaler
import warnings



import torch
from torch.utils.data import DataLoader, Dataset


from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import umap

from Preprocessing import *
from ExtractGenre import *

import DatasetLoader as DL

In [53]:
def CleaningData(InputPath = os.path.realpath('clean_midi'), LogFolder = os.path.realpath('LogFolder'), FolderName = 'LogFolder'):
    """Run the corrupted-file check on every '.mid' file found under InputPath.

    Parameters
    ----------
    InputPath : str
        Root directory that is walked recursively.
    LogFolder : str
        Passed through unchanged as the last argument of Func_CorruptedFile.
    FolderName : str
        Directory created (if missing) before the walk starts.

    Returns
    -------
    None
        The function is used for its side effects only.

    Notes
    -----
    Each file path is handed to Func_CorruptedFile twice (as its first and
    second argument), so whatever that helper writes goes back to the same
    location. NOTE(review): the helper is defined outside this notebook;
    confirm what it does with LogFolder and whether FolderName and LogFolder
    are expected to point at the same directory.
    """
    os.makedirs(FolderName, exist_ok=True)

    for folder, _subfolders, entries in tqdm(os.walk(InputPath)):
        for entry in entries:
            # Only MIDI files are checked; everything else is ignored.
            if not entry.endswith('.mid'):
                continue

            midiPath = os.path.join(folder, entry)
            # The return value is not needed: nothing happens after the check.
            Func_CorruptedFile(midiPath, midiPath, entry, LogFolder)

In [None]:
# Root folder of the complete MIDI collection (resolved to an absolute path).
InputPath = os.path.realpath('CompleteMIDI')

# Cleaning pass is disabled here; the progress line saved with this cell shows a ~1h12m run.
#CleaningData(InputPath, LogFolder='CompleteLogFolder', FolderName='CompleteLogFolder')

46332it [1:11:42, 10.77it/s]


In [57]:
def SongID(InputPath, clsPath = "SongID-Genre.cls", seed = None):
    """Build a {(songID, genre): midi_path} lookup for a MIDI collection.

    Every directory under InputPath that contains at least one '.mid' file
    is treated as one song: the directory name is the song id and one of its
    MIDI files is picked at random. Song ids are then joined with the genre
    file; songs without a genre are dropped.

    Parameters
    ----------
    InputPath : str
        Root directory that is walked recursively.
    clsPath : str, optional
        Path of the genre file. Each useful line holds exactly two
        whitespace-separated tokens, "<songID> <genre>"; any other line is
        ignored. Defaults to the file name that used to be hard-coded.
    seed : int or None, optional
        When given, the per-song file pick uses a dedicated
        np.random.default_rng(seed) and directories/files are visited in
        sorted order, so the result is reproducible. When None (default)
        the global NumPy random state is used, as before.

    Returns
    -------
    dict
        Maps (songID, genre) tuples to the chosen MIDI file path.

    Raises
    ------
    OSError
        If clsPath cannot be opened.
    """
    # Read the genre table first so a missing file fails before the walk.
    genre_Dict = {}
    with open(clsPath, "r") as f:
        for line in f:
            parts = line.split()
            if len(parts) == 2:
                song_id, genre = parts
                genre_Dict[song_id] = genre

    rng = None if seed is None else np.random.default_rng(seed)

    # {SongID: Path_to_Song}
    SongID_Dict = {}
    for dirpath, dirnames, filenames in tqdm(os.walk(InputPath)):
        # In-place sort makes os.walk descend in a deterministic order.
        dirnames.sort()

        midFile = sorted(name for name in filenames if name.endswith('.mid'))
        if not midFile:
            continue

        if rng is None:
            pick = np.random.randint(0, len(midFile))
        else:
            pick = int(rng.integers(0, len(midFile)))

        SongID_Dict[os.path.basename(dirpath)] = os.path.join(dirpath, midFile[pick])

    # Keep only the songs that have a genre label.
    SongGenre_Dict = {}
    for songID, path in SongID_Dict.items():
        Genre = genre_Dict.get(songID)
        if Genre:
            SongGenre_Dict[(songID, Genre)] = path

    return SongGenre_Dict

In [58]:
# Root folder of the complete MIDI collection (same folder name as in the cleaning cell).
InputPath = os.path.realpath('CompleteMIDI')

# Build the {(songID, genre): midi_path} lookup that ToFeedCNN samples from.
SongGenre_Dict = SongID(InputPath)

46332it [00:03, 11821.37it/s]


In [70]:
def _MidiToMelDb(path, sf2Path, sampleRate, nMels):
    """Synthesize one MIDI file and return its dB-scaled mel spectrogram, or None on any failure."""
    try:
        with warnings.catch_warnings():
            # Promote RuntimeWarnings to errors so numerically broken songs are rejected.
            warnings.filterwarnings("error", category=RuntimeWarning)
            midi = pretty_midi.PrettyMIDI(path)
            audio = midi.fluidsynth(fs=sampleRate, sf2_path=sf2Path)

            if audio is None or len(audio) == 0:
                raise ValueError("Invalid audio synthesized from MIDI.")

            S = librosa.feature.melspectrogram(y=audio, sr=sampleRate, n_mels=nMels)
            return librosa.power_to_db(S, ref=np.max)
    except Exception:
        # RuntimeWarning is an Exception subclass, so one handler covers both cases.
        return None


def ToFeedCNN(SongGenre_Dict, numSample = 1000, window = 256, numAugment = 15,
              sf2Path = "FluidR3_GM/FluidR3_GM.sf2", replace = True):
    """Turn randomly drawn songs into (mel-spectrogram window, genre) samples.

    Each drawn song is synthesized to audio at 16 kHz, converted to a
    128-band dB-scaled mel spectrogram, and cut into random windows of
    `window` frames. Songs whose genre is 'Electronic' or 'Pop_Rock'
    contribute one window; every other genre contributes `numAugment`
    windows.

    Parameters
    ----------
    SongGenre_Dict : dict
        Maps (songID, genre) tuples to MIDI file paths.
    numSample : int, optional
        Number of songs to draw.
    window : int, optional
        Number of spectrogram frames per sample (default 256).
    numAugment : int, optional
        Windows taken per song for the genres that are augmented (default 15).
    sf2Path : str, optional
        SoundFont passed to the synthesizer.
    replace : bool, optional
        True (default) draws songs with replacement, as before, so the same
        song can be drawn several times. False draws distinct songs, capped
        at the number of available songs.

    Returns
    -------
    list of (numpy.ndarray, str)
        Samples of shape (128, window) paired with the genre name.

    Notes
    -----
    The number of rejected songs is printed before returning. A song is
    rejected when loading/synthesis fails or when its spectrogram has fewer
    than `window` frames; the latter used to abort the whole run with a
    ValueError.
    """
    sampleRate = 16000
    nMels = 128
    singleWindowGenres = ('Electronic', 'Pop_Rock')

    numErr = 0
    samples = []

    keys = list(SongGenre_Dict)
    if not keys:
        # np.random.choice cannot draw from an empty population.
        print(numErr)
        return samples

    drawCount = numSample if replace else min(numSample, len(keys))
    drawn = np.random.choice(len(keys), drawCount, replace=replace)

    for keyIdx in tqdm(drawn):
        key = keys[keyIdx]
        genre = key[1]

        S_dB = _MidiToMelDb(SongGenre_Dict[key], sf2Path, sampleRate, nMels)
        if S_dB is None:
            numErr += 1
            continue

        # Largest valid start offset; negative means the song is too short.
        lastStart = S_dB.shape[1] - window
        if lastStart < 0:
            numErr += 1
            continue

        # Data augmenting: under-represented genres get several random windows.
        numWindows = 1 if genre in singleWindowGenres else numAugment
        for _ in range(numWindows):
            # randint's upper bound is exclusive, hence the +1.
            start = np.random.randint(0, lastStart + 1)
            # Copy so the sample does not keep the whole spectrogram alive.
            samples.append((S_dB[:, start:start + window].copy(), genre))

    print(numErr)

    return samples


In [None]:
# Draw 3000 dictionary entries at random and convert them into (spectrogram window, genre) samples.
Datas = ToFeedCNN(SongGenre_Dict, numSample = 3000)

In [72]:
# Create the output folder first so the ToFeedCNN result is not lost to a FileNotFoundError.
os.makedirs('GenreTrainDataset', exist_ok=True)

# File names starting with 'test' are the ones CNNPreprocessing picks up from this folder.
with open('GenreTrainDataset/test.pkl', 'wb') as f:
   pickle.dump(Datas, f)

In [73]:
def _IsUsableSample(song, numFrames):
    """Return True when `song` is an (array, label) pair whose array is 2-D with `numFrames` columns."""
    try:
        X = song[0]
        song[1]
    except (TypeError, IndexError, KeyError):
        return False
    return isinstance(X, np.ndarray) and X.ndim == 2 and X.shape[1] == numFrames


def CNNPreprocessing(GenreTrainingPath):
    """Load pickled samples, standardize them and encode genres as integers.

    Every file in GenreTrainingPath whose name starts with 'test' is
    unpickled and concatenated (in sorted file-name order). Entries that are
    not an (array, genre) pair with a 2-D array of 256 columns are dropped.
    A StandardScaler is fitted on the row-wise stack of all remaining arrays
    and applied to each of them; genres are replaced by their id in
    GenreMapping, and entries with an unknown genre are dropped.

    Parameters
    ----------
    GenreTrainingPath : str
        Folder containing the pickled sample lists.

    Returns
    -------
    list of (numpy.ndarray, int)
        Scaled arrays (same shape as the inputs) with their genre id.
        Empty when no usable sample is found.

    Notes
    -----
    * pickle.load can execute arbitrary code; only point this at folders
      you produced yourself.
    * NOTE(review): because the arrays are stacked row-wise, the scaler
      treats each of the 256 column positions as a feature. Confirm this is
      the intended normalization axis.
    * A one-line summary is printed when entries are skipped.
    """
    numFrames = 256

    # Maps each genre into a number
    GenreMapping = {
        'Vocal': 0, 'Pop_Rock': 1, 'Latin': 2, 'Electronic': 3, 'Country': 4, 'Reggae': 5, 'Rap': 6,
        'RnB': 7, 'Jazz': 8, 'Folk': 9, 'Religious': 10, 'Classical': 11, 'Easy_Listening': 12,
        'International': 13,
    }

    # Load the files in the folder (sorted: os.listdir order is arbitrary)
    GenreTrainingDS = []
    for file in sorted(os.listdir(GenreTrainingPath)):
        if not file.startswith('test'):
            continue
        with open(os.path.join(GenreTrainingPath, file), 'rb') as f:
            GenreTrainingDS.extend(pickle.load(f))

    # First pass: keep the entries the scaler can be fitted on
    valid_songs = [song for song in GenreTrainingDS if _IsUsableSample(song, numFrames)]
    badShape = len(GenreTrainingDS) - len(valid_songs)

    del GenreTrainingDS
    gc.collect()

    if not valid_songs:
        # np.vstack raises on an empty list; an empty dataset is the honest answer.
        if badShape:
            print(f"CNNPreprocessing: skipped {badShape} malformed entries, 0 with unknown genre")
        return []

    all_features = np.vstack([song[0] for song in valid_songs])  # shape (N * rows, 256)
    scaler = StandardScaler().fit(all_features)
    del all_features

    # Second pass: scale using the fitted scaler and encode the genre
    TrainDataset = []
    unknownGenre = 0
    for song in valid_songs:
        try:
            Y = GenreMapping[song[1]]
        except (KeyError, TypeError):
            # Genre missing from the mapping (or not hashable): cannot be labelled.
            unknownGenre += 1
            continue
        TrainDataset.append((scaler.transform(song[0]), Y))

    if badShape or unknownGenre:
        print(f"CNNPreprocessing: skipped {badShape} malformed entries, {unknownGenre} with unknown genre")

    return TrainDataset

In [74]:
# Folder holding the pickled sample lists (the 'test.pkl' dump above is written here).
GenreTrainingPath = os.path.realpath('GenreTrainDataset')
# Standardize the spectrogram windows and replace genre names by integer ids.
TrainDataset = CNNPreprocessing(GenreTrainingPath)

In [None]:
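# One-off export, left disabled. GenreDataset below reads 'GenreTrainDataset/CNNDataset.pkl',
# so uncomment and run this once after CNNPreprocessing if that file does not exist yet.
# with open('GenreTrainDataset/CNNDataset.pkl', 'wb') as f:
#    pickle.dump(TrainDataset, f)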

In [2]:
from torch.utils.data import Dataset
import torch

class GenreDataset(Dataset):
    """Torch dataset over a pickled list of (spectrogram, genre_id) pairs.

    The pickle is split sequentially: the first `n_train` entries form the
    training split and the following `n_val` entries form the validation
    split. When the file holds fewer entries than requested, the split
    simply contains what is available.

    Parameters
    ----------
    path : str
        Pickle file containing a list of (array, int) pairs.
    Train : bool
        True selects the training split, False the validation split.
    transform : callable or None
        Optional function applied to the (1, H, W) float32 feature tensor
        in __getitem__. It used to be accepted but silently ignored.
    n_train : int
        Size of the training split (default 10_000, the previous fixed value).
    n_val : int
        Size of the validation split (default 3_000, the previous fixed value).

    Raises
    ------
    ValueError
        If the selected split would be empty.

    Notes
    -----
    pickle.load can execute arbitrary code; only load files you produced.
    """

    def __init__(self, path='GenreTrainDataset/CNNDataset.pkl', Train = True, transform=None,
                 n_train=10_000, n_val=3_000):
        with open(path, 'rb') as f:
            TD = pickle.load(f)

        if Train:
            split = TD[:n_train]
        else:
            split = TD[n_train:n_train + n_val]

        if len(split) == 0:
            splitName = 'training' if Train else 'validation'
            raise ValueError(
                f"{path} holds {len(TD)} samples, which leaves the {splitName} split empty "
                f"(n_train={n_train}, n_val={n_val})."
            )

        # float32 halves the memory and is the dtype __getitem__ returns anyway.
        self.X = np.asarray([sample[0] for sample in split], dtype=np.float32)
        # Explicit int64: the default integer width is platform dependent.
        self.Y = np.asarray([sample[1] for sample in split], dtype=np.int64)
        self.transform = transform

        del TD, split
        gc.collect()

    def __len__(self):
        return len(self.Y)

    def __getitem__(self, idx):
        # Add the channel axis expected by Conv2d: (H, W) -> (1, H, W).
        xTensor = torch.tensor(self.X[idx], dtype=torch.float32).unsqueeze(0)
        if self.transform is not None:
            xTensor = self.transform(xTensor)

        # CrossEntropyLoss requires integer (long) class targets.
        yTensor = torch.tensor(self.Y[idx], dtype=torch.long)
        return xTensor, yTensor

In [3]:
# Both splits are read from the same pickle; Train selects which part is kept.
trainData = GenreDataset(path='GenreTrainDataset/CNNDataset.pkl', Train = True)
valData = GenreDataset(path='GenreTrainDataset/CNNDataset.pkl', Train = False)

# Shuffle only the training batches; validation metrics do not depend on order,
# and a fixed order keeps per-batch debugging reproducible.
trainLoader = DataLoader(trainData, batch_size = 32, shuffle=True, num_workers=0)
valLoader = DataLoader(valData, batch_size = 32, shuffle=False, num_workers=0)

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GenreCNN(nn.Module):
    """Four-stage CNN classifier for single-channel spectrogram inputs.

    Each stage is Conv2d(3x3, padding 1) -> ReLU -> 2x2 max-pool, with
    16, 32, 64 and 128 output channels. Dropout (p=0.3) is applied to the
    last feature map, which is then flattened and fed to one linear layer.

    Parameters
    ----------
    n_classes : int
        Number of output logits.
    input_shape : tuple
        (channels, height, width) of one input sample; used only to size
        the linear layer.
    """

    def __init__(self, n_classes=12, input_shape=(1, 128, 256)):
        super().__init__()
        conv_opts = dict(kernel_size=3, padding=1)
        self.conv1 = nn.Conv2d(1, 16, **conv_opts)
        self.conv2 = nn.Conv2d(16, 32, **conv_opts)
        self.conv3 = nn.Conv2d(32, 64, **conv_opts)
        self.conv4 = nn.Conv2d(64, 128, **conv_opts)
        self.pool = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(0.3)

        # Size the classifier from a dry run through the conv stages.
        self._to_linear = self._get_conv_output_size(input_shape)
        self.fc = nn.Linear(self._to_linear, n_classes)

    def _conv_stack(self, x):
        """Apply the four conv -> ReLU -> pool stages in order."""
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            x = self.pool(F.relu(conv(x)))
        return x

    def _get_conv_output_size(self, shape):
        """Return the flattened feature count produced for one sample of `shape`."""
        with torch.no_grad():
            probe = torch.zeros(1, *shape)
            return self._conv_stack(probe).view(1, -1).size(1)

    def forward(self, x):
        """Map a (batch, 1, H, W) tensor to (batch, n_classes) logits."""
        feats = self.dropout(self._conv_stack(x))
        flat = feats.view(feats.size(0), -1)
        return self.fc(flat)

In [1]:
class GenreCNNSimple(nn.Module):
    """Three-stage CNN with batch norm and a two-layer classifier head.

    `features` holds three Conv2d(3x3, padding 1) -> BatchNorm2d -> ReLU ->
    MaxPool2d(2) stages with 32, 64 and 128 channels. `classifier` averages
    the feature map down to 4x4, flattens it, and applies
    Linear(2048, 256) -> ReLU -> Dropout(0.3) -> Linear(256, n_classes).

    Parameters
    ----------
    n_classes : int
        Number of output logits.
    """

    def __init__(self, n_classes=10):
        super().__init__()

        # (in_channels, out_channels) for each convolutional stage.
        stage_channels = ((1, 32), (32, 64), (64, 128))
        stages = []
        for c_in, c_out in stage_channels:
            stages.extend([
                nn.Conv2d(c_in, c_out, kernel_size=3, padding=1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(),
                nn.MaxPool2d(2),
            ])
        self.features = nn.Sequential(*stages)

        # The adaptive pool fixes the map at 4x4, so the head sees 4*4*128 = 2048 values.
        head = [
            nn.AdaptiveAvgPool2d((4, 4)),
            nn.Flatten(),
            nn.Linear(4*4*128, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, n_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Map a (batch, 1, H, W) tensor to (batch, n_classes) logits."""
        # Pooling and flattening happen inside the classifier head.
        return self.classifier(self.features(x))

NameError: name 'nn' is not defined

In [None]:
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

# One logit per genre id: GenreMapping in CNNPreprocessing emits ids 0..13.
# The model default (10) is too small for those labels.
N_CLASSES = 14

# Fail early with a readable message instead of an out-of-bounds target error mid-epoch.
max_label = int(max(trainData.Y.max(), valData.Y.max()))
if max_label >= N_CLASSES:
    raise ValueError(f"Label {max_label} does not fit a model with {N_CLASSES} outputs.")

device = "cuda" if torch.cuda.is_available() else "cpu"
model = GenreCNNSimple(n_classes=N_CLASSES).to(device)
opt = Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
# Halves the learning rate after 3 epochs without validation-loss improvement.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, patience=3, factor=0.5)
loss_fn = CrossEntropyLoss()

print(device)

epochs = 30

# Per-epoch history, one entry appended per epoch
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

for epoch in range(epochs):
    print(f"Epoch: {epoch+1}")

    # Training phase
    model.train()
    train_loss = 0
    train_correct = 0
    train_total = 0

    for batch_x, batch_y in tqdm(trainLoader):
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        y_pred = model(batch_x)
        loss = loss_fn(y_pred, batch_y)

        opt.zero_grad()
        loss.backward()
        opt.step()

        train_loss += loss.item()

        # Running training accuracy (argmax over the class dimension)
        predicted = y_pred.argmax(dim=1)
        train_total += batch_y.size(0)
        train_correct += (predicted == batch_y).sum().item()

    # Epoch averages: mean of per-batch losses, and overall accuracy
    avg_train_loss = train_loss / len(trainLoader)
    train_acc = train_correct / train_total

    # Validation phase: collect all logits, then score the whole split at once
    model.eval()
    with torch.no_grad():
        predictions = []
        true = []
        for batch_x, batch_y in tqdm(valLoader):
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            predictions.append(model(batch_x))
            true.append(batch_y)

        predictions = torch.cat(predictions, dim=0)
        true = torch.cat(true, dim=0)
        val_loss = loss_fn(predictions, true)
        predicted_classes = torch.argmax(predictions, dim=1)
        val_acc = (predicted_classes == true).float().mean()

    # ReduceLROnPlateau only acts when it is fed the monitored metric each epoch.
    scheduler.step(val_loss.item())

    # Store metrics
    train_losses.append(avg_train_loss)
    val_losses.append(val_loss.item())
    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc.item())

    print(f"Train Loss: {avg_train_loss:.4f}, Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")