In [1]:
import torch
import torch, librosa
import librosa.display
import os
import random
import opendatasets as od
from torchvision import datasets, transforms
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

In [2]:
def resize_img(path):
    img = Image.open(path)
    img_cropped = img.crop(box = (54, 35, 390, 253))
    img_resized = img_cropped.resize((round(img_cropped.size[0]*0.5), round(img_cropped.size[1]*0.5)))
    img_resized.save(path)
    img.close()
    img_cropped.close()
    img_resized.close()
    return

In [3]:
def wav_to_jpg(path, name, music_type):
    # Convert one file .wav to .jpg
    
    if not os.path.exists(path):
        print('Music file not found')
        return -1
    if os.path.exists(f"images_sound/{music_type}/{name[:-4]}.jpg"):
        return 0
    y, sr = librosa.load(path)
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000, power=3.9)
    fig, ax = plt.subplots()
    S_db = librosa.power_to_db(S, ref=np.max)
    img = librosa.display.specshow(S_db, sr=sr, fmax=8000, ax=ax)
    if not os.path.exists(f"images_sound/{music_type}"):
        os.mkdir(f"images_sound/{music_type}")
    plt.savefig(f"images_sound/{music_type}/{name[:-4]}.jpg")
    plt.close('all')
    resize_img(f"images_sound/{music_type}/{name[:-4]}.jpg")
    return 0

In [4]:
def convert_all_music(path_data):
    # Convert all file .wav to .jpg
    # Risk of crashing
    
    label_dict = {}
    index_label = 0

    if not os.path.exists(path_data):
        print('Data folder not found')
        return -1
    if not os.path.exists("images_sound"):
        os.mkdir("images_sound")

    for music_type in os.listdir(path_data):
        d = os.path.join(path_data, music_type)
        if os.path.isdir(d):
            label_dict[music_type] = index_label
            index_label += 1
            for music in os.listdir(d):
                entire_path = os.path.join(d, music)
                wav_to_jpg(entire_path, music, music_type)

    return label_dict

In [5]:
def img_to_tensor(entire_path):
    img = Image.open(entire_path)
    resize = transforms.functional.resize(img, size=[200, 200])
    transform = transforms.Compose([transforms.Normalize([0.5], [0.5])])
    to_tensor = transforms.ToTensor()
    new_img = transform(to_tensor(resize))
    return new_img

In [6]:
def create_all_tensor(path_data, batch_size, label_dict):
    my_tensor = []
    all_label = []
    i = 0

    for k in range(0, 1000):
        my_tensor.append(torch.empty(3, 200, 200))
    for music_type in os.listdir(path_data):
        d = os.path.join(path_data, music_type)
        if os.path.isdir(d):
            for music in os.listdir(d):
                entire_path = os.path.join(d, music)
                my_tensor[i] = img_to_tensor(entire_path)
                all_label.append(label_dict[music_type])
                i += 1
    return my_tensor, all_label

In [7]:
def shuffle_2_list(first_list, second_list):
    # Shuffle with the same way the list of tensor
    zip_list = list(zip(first_list, second_list))
    random.shuffle(zip_list)
    first_list, second_list = zip(*zip_list)
    return first_list, second_list

In [8]:
# username: gabriel404guietdupre
# key: 97d328e7f389ab6055fc7b9dd41327ea

dataset = "https://www.kaggle.com/datasets/andradaolteanu/gtzan-dataset-music-genre-classification"
od.download(dataset)
if os.path.exists("gtzan-dataset-music-genre-classification/Data/genres_original/jazz/jazz.00054.wav"):
    os.remove('gtzan-dataset-music-genre-classification/Data/genres_original/jazz/jazz.00054.wav')


Skipping, found downloaded files in "./gtzan-dataset-music-genre-classification" (use force=True to force download)


In [9]:
batch_size = 32
num_epochs = 20
lr = 0.01

# Convert all music to image and create a dict of all song's type
label_dict = convert_all_music('gtzan-dataset-music-genre-classification/Data/genres_original')

# Create 2 tensors, my_tensor with all images in tensor with a batch size, and all_label of each image in order
my_tensor, all_label = create_all_tensor('images_sound', batch_size, label_dict)
my_tensor, all_label = shuffle_2_list(my_tensor, all_label)

train_loader = []
train_label = []
test_loader = []
test_label = []
j = 0

In [10]:
j = 0
i = 0
list_batch_tensor = []
list_batch_label = []
for k in range (0, int(len(my_tensor) / batch_size)):
    list_batch_tensor.append(torch.empty(batch_size, 3, 200, 200))
    list_batch_label.append(torch.empty(batch_size, dtype=torch.long))
for l, tensor in enumerate(my_tensor):
    if (i >= int(len(my_tensor) / batch_size)):
        break
    list_batch_tensor[i][j] = my_tensor[l]
    list_batch_label[i][j] = all_label[l]
    if (j == batch_size - 1):
        j = 0
        i += 1
    else:
        j += 1

In [11]:
j = 0
for i in range (len(list_batch_tensor)):
    if i >= (80 * len(list_batch_tensor)) / 100:
        test_loader.append(torch.empty(batch_size, 3, 200, 200))
        test_loader[j] = list_batch_tensor[i]
        test_label.append(torch.empty(batch_size, dtype=torch.long))
        test_label[j] = list_batch_label[i]
        j += 1
    else:
        train_loader.append(torch.empty(batch_size, 3, 200, 200))
        train_loader[i] = list_batch_tensor[i]
        train_label.append(torch.empty(batch_size, dtype=torch.long))
        train_label[i] = list_batch_label[i]

In [18]:
class Net(nn.Module):
  def __init__(self):
    """Intitalize neural net layers"""
    super(Net, self).__init__()
    self.conv1 = nn.Conv2d(in_channels=3, out_channels=8, kernel_size=3, stride=1, padding=0)
    self.conv2 = nn.Conv2d(in_channels=8, out_channels=16, kernel_size=3, stride=1, padding=0)
    self.conv3 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=0)
    self.conv4 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=0)
    self.conv5 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=0)
    self.conv6 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=0)
    self.fc1 = nn.Linear(in_features=6400, out_features=2000)
    self.fc2 = nn.Linear(in_features=2000, out_features=500)
    self.fc3 = nn.Linear(in_features=500, out_features=100)
    self.fc4 = nn.Linear(in_features=100, out_features=10)

    self.batchnorm1 = nn.BatchNorm2d(num_features=8)
    self.batchnorm2 = nn.BatchNorm2d(num_features=16)
    self.batchnorm3 = nn.BatchNorm2d(num_features=32)
    self.batchnorm4 = nn.BatchNorm2d(num_features=64)
    self.batchnorm5 = nn.BatchNorm2d(num_features=128)
    self.batchnorm6 = nn.BatchNorm2d(num_features=256)

    self.dropout = nn.Dropout(p=0.3, inplace=False)

  def forward(self, x):
    # Conv layer 1.
    x = self.conv1(x)
    x = F.relu(x)
    x = self.batchnorm1(x)
    x = F.max_pool2d(x, kernel_size=2)

    # Conv layer 2.
    x = self.conv2(x)
    x = F.relu(x)
    x = self.batchnorm2(x)
    x = F.max_pool2d(x, kernel_size=2)

    # Conv layer 3.
    x = self.conv3(x)
    x = self.batchnorm3(x)
    x = F.relu(x)
    x = F.max_pool2d(x, kernel_size=2)

    # Conv layer 4.
    x = self.conv4(x)
    x = F.relu(x)
    x = self.batchnorm4(x)
    x = F.max_pool2d(x, kernel_size=2)

    # Fully connected layer 1.
    x = torch.flatten(x, 1)
    x = self.dropout(x)
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    x = F.relu(self.fc3(x))
    x = self.fc4(x)
    #x = F.softmax(x)

    return x

In [19]:
model = Net()

# loss function
criterion = nn.CrossEntropyLoss(reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

#training loop
for epoch in range(num_epochs):
    for i, batch in enumerate(train_loader):
        #forward
        outputs = model(batch)
        optimizer.zero_grad()
        loss = criterion(outputs, train_label[i])
        #backwards
        loss.backward()
        optimizer.step()
    print(f'epoch {epoch + 1} / {num_epochs}, loss = {loss.item():.4f}')
print("End of training")

epoch 1 / 20, loss = 2.2785
epoch 2 / 20, loss = 2.2128
epoch 3 / 20, loss = 2.1528
epoch 4 / 20, loss = 2.0729
epoch 5 / 20, loss = 1.9231
epoch 6 / 20, loss = 1.8157
epoch 7 / 20, loss = 1.7522
epoch 8 / 20, loss = 1.5939
epoch 9 / 20, loss = 1.4755
epoch 10 / 20, loss = 1.3298
epoch 11 / 20, loss = 1.1547
epoch 12 / 20, loss = 1.0483
epoch 13 / 20, loss = 0.8310
epoch 14 / 20, loss = 0.7403
epoch 15 / 20, loss = 0.6751
epoch 16 / 20, loss = 0.6383
epoch 17 / 20, loss = 0.4145
epoch 18 / 20, loss = 0.3040
epoch 19 / 20, loss = 0.2555
epoch 20 / 20, loss = 0.1920
End of training


In [20]:
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    n_class_correct = [0 for i in range(10)]
    n_class_samples = [0 for i in range(10)]
    for i, images in enumerate(test_loader):
        outputs = model(images)
        
        _, predicted = torch.max(outputs.data, 1)
        n_samples += test_label[i].size(0)
        n_correct += (predicted == test_label[i]).sum().item()

        for j in range(batch_size):
            label = test_label[i][j]
            pred = predicted[j]
            if (label == pred):
                n_class_correct[label] += 1
            n_class_samples[label] += 1
            
    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network on the {n_samples} test images: {acc} %')


Accuracy of the network on the 192 test images: 57.291666666666664 %
