In [21]:
!pip install torchvision
!pip install d2l

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting d2l
  Downloading d2l-0.17.6-py3-none-any.whl (112 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/112.6 KB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas==1.2.4
  Downloading pandas-1.2.4-cp39-cp39-manylinux1_x86_64.whl (9.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m80.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting matplotlib==3.5.1
  Downloading matplotlib-3.5.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl (11.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m85.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests==2.25.1
  Downloading requests-2.25.1-py2.py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [22]:
import csv
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
import numpy as np
import matplotlib.pyplot as plt
import cv2
import os
from torch.utils.data import DataLoader
from torch.autograd import Variable
from d2l import torch as d2l

In [3]:
!git clone "https://github.com/BSteiner1/ST311-Group-Project/"

Cloning into 'ST311-Group-Project'...
remote: Enumerating objects: 2131, done.[K
remote: Counting objects: 100% (133/133), done.[K
remote: Compressing objects: 100% (98/98), done.[K
remote: Total 2131 (delta 53), reused 106 (delta 29), pack-reused 1998[K
Receiving objects: 100% (2131/2131), 1.19 GiB | 17.96 MiB/s, done.
Resolving deltas: 100% (53/53), done.
Updating files: 100% (2011/2011), done.


### Data Loading

In [3]:
class_dict = {'blues' : 0,
              'classical': 1,
              'country': 2,
              'disco' : 3,
              'hiphop' : 4,
              'jazz' : 5,
              'metal' : 6,
              'pop' : 7,
              'reggae': 8,
              'rock' : 9    
}



In [113]:
def collect_data():
    data = []
    # Collect all images from the 10 genres
    for genre in os.listdir("ST311-Group-Project/Data/images_original"):
        for image in os.listdir("ST311-Group-Project/Data/images_original/" + genre):
            image_path = "ST311-Group-Project/Data/images_original/" + genre + "/" + image
            grayscale_img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            grayscale_img = grayscale_img[35:-36:, 54:-43]
            grayscale_img = torch.tensor(grayscale_img).to(dtype = torch.float32).clone().detach().requires_grad_(True)  #.requires_grad(True)
            #grayscale_img = torch.tensor(grayscale_img, requires_grad=True)
            # Splitting each image vertically into 5 different parts
            for i in range(5):
                # The data loader adds a dimension so I've removed a dimension from the image and class
                data.append((grayscale_img[ : ,  (67*i):67*(i+1)].reshape(1,217,67), torch.tensor(class_dict.get(genre)))) #.reshape(1)))
            
            #data.append((grayscale_img.reshape(1, 217, 335), torch.tensor(class_dict.get(genre))))
    return data     

data = collect_data()

In [114]:
train_size = int(0.8 * len(data))
test_size = len(data) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(data, [train_size, test_size])

In [115]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

### RNN (Test Acc. 34.33)

In [140]:
'''
Parameters
'''
batch_size = 20
n_iters = 3000
num_epochs = n_iters / (len(train_dataset) / batch_size)
num_epochs = int(num_epochs)

'''
Net
'''

class RNNModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(RNNModel, self).__init__()

        #Hidden dimensions
        self.hidden_dim = hidden_dim

        #Number of hidden layers
        self.layer_dim = layer_dim

        #RNN Layer 
        self.rnn = nn.GRU(input_dim, hidden_dim, layer_dim, batch_first=True, bidirectional = True)

        #Linear Layer
        self.fc = nn.Linear(hidden_dim*2, output_dim)

    def forward(self, x):

        #Initialize hidden state with zeros
        h0 = torch.zeros(self.layer_dim*2, x.size(0), self.hidden_dim).requires_grad_()

        #RNN Foward Step
        out, hn = self.rnn(x, h0.detach())

        #Clips off final hidden state 
        out = self.fc(out[:, -1, :]) 

        return out

In [141]:
input_dim = 67
hidden_dim = 16
layer_dim = 2
output_dim = 10

In [142]:
rnn_model = RNNModel(input_dim, hidden_dim, layer_dim, output_dim)
rnn_criterion = nn.CrossEntropyLoss()

rnn_learning_rate = 0.01

optimizer = torch.optim.Adam(rnn_model.parameters(), lr=rnn_learning_rate)  

#### Trainer

In [143]:
# Number of steps to unroll
seq_dim = 217  

iter = 0
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        rnn_model.train()

        #Load spectrograms as tensors with gradient accumulation abilities
        images = images.view(-1, seq_dim, input_dim).requires_grad_()

        #Clear gradients
        optimizer.zero_grad()

        #Forward pass 
        outputs = rnn_model(images)

        #Calculate Loss
        loss = rnn_criterion(outputs, labels)

        #Backward Pass
        loss.backward()

        #Update
        optimizer.step()

        iter += 1

        if iter % 500 == 0:
            rnn_model.eval()

            #Calculate Accuracy

            correct = 0
            total = 0

            # Iterate through test dataset
            for images, labels in test_loader:

                #Load spectrograms as tensors
                images = images.view(-1, seq_dim, input_dim)

                # Forward pass 
                outputs = rnn_model(images)

                #Get predictions
                _, predicted = torch.max(outputs.data, 1)

                #Total number of genres
                total += labels.size(0)

                #Total correct predictions
                correct += (predicted == labels).sum()

            accuracy = 100 * correct / total

            # Print Loss
            print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iter, loss.item(), accuracy))


Iteration: 500. Loss: 1.8392529487609863. Accuracy: 31.231231689453125
Iteration: 1000. Loss: 1.7490439414978027. Accuracy: 33.03303146362305
Iteration: 1500. Loss: 1.4500503540039062. Accuracy: 33.733734130859375
Iteration: 2000. Loss: 1.5536307096481323. Accuracy: 34.834835052490234
Iteration: 2500. Loss: 1.6321288347244263. Accuracy: 34.23423385620117
Iteration: 3000. Loss: 1.5003257989883423. Accuracy: 34.33433532714844
Iteration: 3500. Loss: 2.2110650539398193. Accuracy: 24.12412452697754


### Next Steps 
- Mess around with hyperparameters
- Implement Pooling
- embedding?