In [1]:
import cv2
import torch
from torch import nn
from d2l import torch as d2l
from torch.utils.data import DataLoader
import os
import numpy as np

In [2]:
class_dict = {'blues' : 0,
              'classical': 1,
              'country': 2,
              'disco' : 3,
              'hiphop' : 4,
              'jazz' : 5,
              'metal' : 6,
              'pop' : 7,
              'reggae': 8,
              'rock' : 9    
}

In [3]:
def collect_data():
    data = []
    # Collect all images from the 10 genres
    for genre in os.listdir("../Data/images_original"):
        for image in os.listdir("../Data/images_original/" + genre):
            image_path = "../Data/images_original/" + genre + "/" + image
            grayscale_img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            grayscale_img = grayscale_img[35:-36:, 54:-43]
            grayscale_img = torch.tensor(grayscale_img).to(dtype = torch.float32).clone().detach().requires_grad_(True)  #.requires_grad(True)
            #grayscale_img = torch.tensor(grayscale_img, requires_grad=True)
            # Splitting each image vertically into 5 different parts
            for i in range(5):
                # The data loader adds a dimension so I've removed a dimension from the image and class
                data.append((grayscale_img[ : ,  (67*i):67*(i+1)].reshape(1,217,67), torch.tensor(class_dict.get(genre)))) #.reshape(1)))
            
    return data         


In [4]:
data = collect_data()

In [5]:
len(data)

4995

In [6]:
import random

In [7]:
data = random.sample(data, 4000)

In [8]:
len(data)

4000

In [9]:
train_size = int(0.8 * len(data))
test_size = len(data) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(data, [train_size, test_size])

train_dataset[2][0].shape

torch.Size([1, 217, 67])

In [10]:
len(train_dataset)

3200

In [11]:
# Get data in dataloader format
train_loader = DataLoader(train_dataset, batch_size=25, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=25, shuffle=False)

# Metadata

In [12]:
csv_path = "C:/Users/bbste/Documents/LSE/ST311/ST311-Group-Project/Data/features_30_sec.csv"

In [13]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [14]:
raw_df = pd.read_csv(csv_path)

In [15]:
del raw_df['filename']
del raw_df['length']

In [16]:
raw_df['label'] = raw_df['label'].map(class_dict)

In [17]:
df = StandardScaler().fit_transform(raw_df.iloc[:, 0:-1])

In [18]:
df = pd.DataFrame(df, columns = raw_df.columns.tolist()[:-1])

In [19]:
df['label'] = raw_df['label']

In [20]:
metadata = [(torch.tensor(list(df.loc[i][:-1])).clone().detach().requires_grad_(True).reshape(1,1,-1), torch.tensor(df['label'][i], dtype=torch.long)) for i in range(len(df))]

In [21]:
train_size = int(0.8 * len(metadata))
test_size = len(metadata) - train_size
metadata_train_dataset, metadata_test_dataset = torch.utils.data.random_split(metadata, [train_size, test_size])

In [22]:
# Get data in dataloader format
metadata_train_loader = DataLoader(metadata_train_dataset, batch_size=25, shuffle=True)
metadata_test_loader = DataLoader(metadata_test_dataset, batch_size=25, shuffle=False)

# Prototype

In [29]:
import torch
import torch.nn as nn
import torch.optim as optim

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(27705, 25)             #(32 * 54 * 17 + 57, 128)
        self.relu3 = nn.ReLU()
        #self.fc2 = nn.Linear(512, 64)
        #self.sig = nn.Sigmoid()
        #self.fc3 = nn.Linear(64, 25)
        #self.sig3 = nn.Sigmoid()
        self.fc4 = nn.Linear(25, 10)
        

    def forward(self, x_image, x_metadata):
        x_image = self.pool1(self.relu1(self.conv1(x_image)))
        x_image = self.pool2(self.relu2(self.conv2(x_image)))
        x_image = x_image.view(x_image.size(0), -1)
        x_metadata = x_metadata.view(x_metadata.size(0), -1)
        #print(x_metadata.shape)
        x = torch.cat((x_image, x_metadata), dim=1)
        #print(x.shape)
        x = self.relu3(self.fc1(x))
        #x = self.fc2(self.sig(x))
        #x = self.fc3(self.sig(x))
        x = self.fc4(x)
        
        return x



# Initialize the model, loss function, and optimizer
model = CNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Set up the dataloaders
trainloader_image = train_loader
testloader_image = test_loader
trainloader_metadata = metadata_train_loader
testloader_metadata = metadata_test_loader

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    print(epoch)
    running_loss = 0.0
    for i, ((images, labels), (metadata, _)) in enumerate(zip(trainloader_image, trainloader_metadata)):
        optimizer.zero_grad()
        outputs = model(images, metadata)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if (i+1) % 100 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                  .format(epoch+1, num_epochs, i+1, len(trainloader_image), running_loss/100))
            running_loss = 0.0

# Evaluate the model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for (images, labels), (metadata, _) in zip(testloader_image, testloader_metadata):
        #print(images.shape)
        outputs = model(images, metadata)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    print('Accuracy of the network on the test images: %d %%' % (100 * correct / total))


0
1
2
3
4
5
6
7
8
9
Accuracy of the network on the test images: 45 %
