# Music Genre Recognition on GTZAN

In [20]:
# from google.colab import drive
# drive.mount('/content/drive')
# !git clone https://github.com/ConcoLab/genrecog.git
# %cd genrecog
# print("The repository is cloned from github")
# !gdown --id 1ZkJwOQPGR_okWNAPbJ8_6YtDOCog5fg3
# !gdown --id 1gPI8Jd94jCniZLHC2-KLVHPw0HlfNvFx
# !ls
# !wget -O train.npz -c "https://users.encs.concordia.ca/~a_hraf/index.php" -P "/dataset/npz_files/"
# !wget -O train.npz -c "https://users.encs.concordia.ca/~a_hraf/train.npz" -P /dataset/npz_files/
# !wget -O test.npz -c "https://users.encs.concordia.ca/~a_hraf/test.npz" -P ./dataset/npz_files/

In [None]:
# Mount Google Drive at /content/drive (Colab only) so the files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%%capture
%cd drive/MyDrive/genrecog/
%pip install speechbrain

In [None]:
# Essential Libraries
from genrecog.preprocess.preprocessor import Preprocessor
from genrecog.nnet.CNN import Conv1d, VanillaConv1d, VanillaConv2d
from genrecog.nnet.RNN import VanillaRNN, LSTM, GRU
from genrecog.tools.trainer import CNNFbankTrainer, RNNFbankTrainer, SklearnTrainer, KmeansTrainer
import torch
from torch.utils.data import TensorDataset, DataLoader 
import datetime
from genrecog.preprocess.feature import Feature
from IPython.display import Audio
import matplotlib.pyplot as plt



# Fix the PyTorch RNG seed so that runs are repeatable.
torch.manual_seed(0)

# Pick the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
# Locations of the pre-built .npz archives, relative to the project folder.
train_npz_path = 'dataset/npz_files/train.npz'
test_npz_path = 'dataset/npz_files/test.npz'

# One Preprocessor per split; the (misspelled) variable names are used by later cells.
train_preprcessor = Preprocessor(train_npz_path)
test_preprcessor = Preprocessor(test_npz_path)


# Processed Data
The following code shows that we are actually using a set of music clips that have been converted to waveforms. We then use the Fast Fourier Transform (FFT), through the FBank feature extractor available in the speechbrain library, to extract the features.

In [None]:
# Load the test split as shuffled NumPy arrays; it is only used here to demonstrate
# what the raw samples and their features look like.
X_test, y_test = test_preprcessor.as_shuffled_numpy()

## Listen to a Part of a Music Sample
Each sample in X_test is a music clip that you can listen to; every clip is 7 seconds long.

In [None]:
# Render an audio player for the first test sample; 22050 is passed as the sampling rate.
Audio(X_test[0], rate=22050)

You can also plot the waveform of each music sample.

In [None]:
# Plot the raw values of the first test sample against their index.
plt.plot(X_test[0])

Now it is time to extract the features from the samples. As you can see, each sample is turned into a two-dimensional array containing FFT-based features.

In [None]:
# Convert the raw test samples into FBank features and report the array shapes
# before and after the conversion.
feature_maker = Feature()
print("Shape of the music waves:", X_test.shape)
X_test_features = feature_maker.numpy_fbank_features(X_test)
print("Resulted sample features shape:", X_test_features.shape)


And finally we can plot the features:

In [None]:
# Show the FBank features of the first test sample as an image.
# The matrix is transposed for display, to go with the 'time' (x) and 'channel' (y) labels below.
plt.imshow(
    X_test_features[0].T,
    cmap='viridis',
    interpolation='nearest',
    aspect='auto',
    # matplotlib accepts only 'upper' or 'lower' for origin; 'lower' places
    # index [0, 0] in the bottom-left corner of the image.
    origin='lower',
)
plt.title('Transformed music sample using FBank')
plt.ylabel('channel')
plt.xlabel('time');  # the trailing semicolon hides the text repr of the last call

# 1. Neural Network Data Preparation

To train the neural network models, we need to define the DataLoaders. We prepared three DataLoaders, for training, validation, and testing, which lets us reuse the same data pipeline for both the RNN and CNN models in the later steps.

**It is important to note that we set the size of batches to 400 and you might need to change it due to RAM constraints.**

In [None]:
batch_size = 400  # If your RAM cannot handle this, you can replace it with 100
validation_size = 400  # number of training-file samples held out for validation

# Load both splits as shuffled torch tensors.
X, y = train_preprcessor.as_shuffled_torch()
X_test, y_test = test_preprcessor.as_shuffled_torch()

# Move the tensors to the selected device once, up front.
dataset = TensorDataset(X.to(device), y.to(device))

# Derive the training size from the data instead of hard-coding it, so the split does not
# raise when the archive holds a different number of samples (3600 samples -> 400 / 3200).
train_size = len(dataset) - validation_size
validation_dataset, train_dataset = torch.utils.data.random_split(
    dataset, (validation_size, train_size)
)
test_dataset = TensorDataset(X_test.to(device), y_test.to(device))

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
validation_dataloader = DataLoader(validation_dataset, shuffle=True, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size)

Also, we used 100 as the number of epochs for all of the trained models before checking for overfitting. You can set it to 100 here if you would like to compare our stated results with this notebook. The information for every epoch is shown after each training run.

In [None]:
# Number of training epochs passed to every neural-network trainer below.
num_epochs=10 # You can change it to 100 to get closer to the reported results

# 2. Recurrent Neural Network

We decided to train several RNN-based models to see which of them works best. To do so, we fixed the following hyperparameters and changed only the recurrent core. The layers used are VanillaRNN, LSTM, and GRU. For each of them we tried two variations: one takes the last hidden layer as the output of the forward function, and the other takes the average of all hidden layers as the output.

In [None]:
# Hyperparameters shared by every RNN variant trained below.
hidden_size = 128  # passed as hidden_size to each RNN model
num_layers = 5  # passed as num_layers to each RNN model
input_size = 40  # presumably the number of FBank channels per frame (features are flattened as 702*40 later) - TODO confirm
output_dim = 10  # passed as output_dim; the K-means section lists ten genre labels (0-9)
time_sequence = 702  # presumably the number of frames per sample (see 702*40 later) - TODO confirm
lr = 0.001  # learning rate handed to the Adam optimizer

## 2.1. VanillaRNN Using the Last Layer of the Hidden Layers

In [None]:
%reset_selective -f model
%reset_selective -f trainer
%reset_selective -f optimizer
%reset_selective -f loss
loss = torch.nn.CrossEntropyLoss()
model = VanillaRNN(
    input_size=input_size, 
    time_sequence=time_sequence,
    hidden_size=hidden_size, 
    num_layers=num_layers, 
    output_dim=output_dim,
    use_mean=False
    ).to(device)
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
print(model)
trainer = RNNFbankTrainer(
    model=model, 
    optimizer=optimizer, 
    loss=loss, 
    train_dataloader=train_dataloader, 
    validation_dataloader=validation_dataloader, 
    num_epochs=num_epochs)
trainer.train()
trainer.plot_loss("training and validation loss")
trainer.plot_accuracies("training and validation accuracy")
trainer.plot_confusion_matrix(test_dataloader, 'confusion matrix')
y_pred, y_eval, loss, accuracy = trainer.eval(test_dataloader)
print("accuracy: ", accuracy * 100)

## 2.2. VanillaRNN Using the Mean Value of All Hidden Layers

In [None]:
%reset_selective -f model
%reset_selective -f trainer
%reset_selective -f optimizer
%reset_selective -f loss
loss = torch.nn.CrossEntropyLoss()
model = VanillaRNN(
    input_size=input_size, 
    time_sequence=time_sequence,
    hidden_size=hidden_size, 
    num_layers=num_layers, 
    output_dim=output_dim,
    use_mean=True
    ).to(device)
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
print(model)
trainer = RNNFbankTrainer(
    model=model, 
    optimizer=optimizer, 
    loss=loss, 
    train_dataloader=train_dataloader, 
    validation_dataloader=validation_dataloader, 
    num_epochs=num_epochs)
trainer.train()
trainer.plot_loss("training and validation loss")
trainer.plot_accuracies("training and validation accuracy")
trainer.plot_confusion_matrix(test_dataloader, 'confusion matrix')
y_pred, y_eval, loss, accuracy = trainer.eval(test_dataloader)
print("accuracy: ", accuracy * 100)

## 2.3. LSTM Using the Last Layer of the Hidden Layers

In [None]:
%reset_selective -f model
%reset_selective -f trainer
%reset_selective -f optimizer
%reset_selective -f loss
loss = torch.nn.CrossEntropyLoss()
model = LSTM(
    input_size=input_size, 
    time_sequence=time_sequence,
    hidden_size=hidden_size, 
    num_layers=num_layers, 
    output_dim=output_dim,
    use_mean=False
    ).to(device)
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
print(model)
trainer = RNNFbankTrainer(
    model=model, 
    optimizer=optimizer, 
    loss=loss, 
    train_dataloader=train_dataloader, 
    validation_dataloader=validation_dataloader, 
    num_epochs=num_epochs)
trainer.train()
trainer.plot_loss("training and validation loss")
trainer.plot_accuracies("training and validation accuracy")
trainer.plot_confusion_matrix(test_dataloader, 'confusion matrix')
y_pred, y_eval, loss, accuracy = trainer.eval(test_dataloader)
print("accuracy: ", accuracy * 100)

## 2.4. LSTM Using the Mean Value of All Hidden Layers

In [None]:
%reset_selective -f model
%reset_selective -f trainer
%reset_selective -f optimizer
%reset_selective -f loss
loss = torch.nn.CrossEntropyLoss()
model = LSTM(
    input_size=input_size, 
    time_sequence=time_sequence,
    hidden_size=hidden_size, 
    num_layers=num_layers, 
    output_dim=output_dim,
    use_mean=True
    ).to(device)
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
print(model)
trainer = RNNFbankTrainer(
    model=model, 
    optimizer=optimizer, 
    loss=loss, 
    train_dataloader=train_dataloader, 
    validation_dataloader=validation_dataloader, 
    num_epochs=num_epochs)
trainer.train()
trainer.plot_loss("training and validation loss")
trainer.plot_accuracies("training and validation accuracy")
trainer.plot_confusion_matrix(test_dataloader, 'confusion matrix')
y_pred, y_eval, loss, accuracy = trainer.eval(test_dataloader)
print("accuracy: ", accuracy * 100)

## 2.5. GRU Using the Last Layer of the Hidden Layers

In [None]:
%reset_selective -f model
%reset_selective -f trainer
%reset_selective -f optimizer
%reset_selective -f loss
loss = torch.nn.CrossEntropyLoss()
model = GRU(
    input_size=input_size, 
    time_sequence=time_sequence,
    hidden_size=hidden_size, 
    num_layers=num_layers, 
    output_dim=output_dim,
    use_mean=False
    ).to(device)
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
print(model)
trainer = RNNFbankTrainer(
    model=model, 
    optimizer=optimizer, 
    loss=loss, 
    train_dataloader=train_dataloader, 
    validation_dataloader=validation_dataloader, 
    num_epochs=num_epochs)
trainer.train()
trainer.plot_loss("training and validation loss")
trainer.plot_accuracies("training and validation accuracy")
trainer.plot_confusion_matrix(test_dataloader, 'confusion matrix')
y_pred, y_eval, loss, accuracy = trainer.eval(test_dataloader)
print("accuracy: ", accuracy * 100)

## 2.6. GRU Using the Mean Value of All Hidden Layers

In [None]:
%reset_selective -f model
%reset_selective -f trainer
%reset_selective -f optimizer
%reset_selective -f loss
loss = torch.nn.CrossEntropyLoss()
model = GRU(
    input_size=input_size, 
    time_sequence=time_sequence,
    hidden_size=hidden_size, 
    num_layers=num_layers, 
    output_dim=output_dim,
    use_mean=True
    ).to(device)
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
print(model)
trainer = RNNFbankTrainer(
    model=model, 
    optimizer=optimizer, 
    loss=loss, 
    train_dataloader=train_dataloader, 
    validation_dataloader=validation_dataloader, 
    num_epochs=num_epochs)
trainer.train()
trainer.plot_loss("training and validation loss")
trainer.plot_accuracies("training and validation accuracy")
trainer.plot_confusion_matrix(test_dataloader, 'confusion matrix')
y_pred, y_eval, loss, accuracy = trainer.eval(test_dataloader)
print("accuracy: ", accuracy * 100)

# 3. Convolutional Neural Network
After training RNN classifiers for our project, we decided to train CNNs to compare our results against. We trained three CNN models, VanillaConv1d, Conv1d, and VanillaConv2d, each more complex than the previous one in terms of architecture. We also tried to keep the setup close to the RNN one so that the results are comparable. For this reason, we used a learning rate of 0.001 for all CNN models.


In [None]:
# Learning rate handed to the Adam optimizer of every CNN model below.
lr = 0.001

## 3.1. VanillaConv1d 

In [None]:
%reset_selective -f model
%reset_selective -f trainer
%reset_selective -f optimizer
%reset_selective -f loss
model = VanillaConv1d().to(device)
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
print(model)
trainer = CNNFbankTrainer(
    model=model, 
    optimizer=optimizer, 
    loss=loss, 
    train_dataloader=train_dataloader, 
    validation_dataloader=validation_dataloader, 
    num_epochs=num_epochs)
trainer.train()
trainer.plot_loss("training and validation loss")
trainer.plot_accuracies("training and validation accuracy")
trainer.plot_confusion_matrix(test_dataloader, 'confusion matrix')
y_pred, y_eval, loss, accuracy = trainer.eval(test_dataloader)
print("accuracy: ", accuracy * 100)

## 3.2. Conv1d

In [None]:
%reset_selective -f model
%reset_selective -f trainer
%reset_selective -f optimizer
%reset_selective -f loss
loss = torch.nn.CrossEntropyLoss()
model = Conv1d().to(device)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
trainer = CNNFbankTrainer(
    model=model, 
    optimizer=optimizer, 
    loss=loss, 
    train_dataloader=train_dataloader, 
    validation_dataloader=validation_dataloader, 
    num_epochs=num_epochs)
trainer.train()
trainer.plot_loss("training and validation loss")
trainer.plot_accuracies("training and validation accuracy")
trainer.plot_confusion_matrix(test_dataloader, 'confusion matrix')
y_pred, y_eval, loss, accuracy = trainer.eval(test_dataloader)
print("accuracy: ", accuracy * 100)

## 3.3. VanillaConv2d

In [None]:
%reset_selective -f model
%reset_selective -f trainer
%reset_selective -f optimizer
%reset_selective -f loss
loss = torch.nn.CrossEntropyLoss()
model = VanillaConv2d().to(device)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
trainer = CNNFbankTrainer(
    model=model, 
    optimizer=optimizer, 
    loss=loss, 
    train_dataloader=train_dataloader, 
    validation_dataloader=validation_dataloader, 
    num_epochs=num_epochs)
trainer.train()
trainer.plot_loss("training and validation loss")
trainer.plot_accuracies("training and validation accuracy")
trainer.plot_confusion_matrix(test_dataloader, 'confusion matrix')
y_pred, y_eval, loss, accuracy = trainer.eval(test_dataloader)
print("accuracy: ", accuracy * 100)

In [None]:
# Free memory: remove the last neural-network experiment's objects from the namespace
# before moving on to the sklearn and K-means sections. The -f flag skips the
# confirmation prompt.
%reset_selective -f model
%reset_selective -f trainer
%reset_selective -f optimizer
%reset_selective -f loss

# 4. Multiple Model Training
In this part, we have written a class that is responsible for training several `sklearn` models. We can pass a dictionary of models with different parameters to the class, and it trains them all one by one. This was helpful for hyperparameter search.




## 4.1. Dataset as Numpy
Since `sklearn` only works with NumPy arrays, we need to redefine the features as NumPy arrays instead of torch Tensors. Also, in the previous parts (NN), the feature extraction was done while loading the dataset in the DataLoader to save memory. Here we need to convert the NumPy arrays to FBank features first and then use them to train the models.

In [None]:
# Reload both splits as shuffled NumPy arrays (sklearn does not take torch tensors).
X, y = train_preprcessor.as_shuffled_numpy()
X_test, y_test = test_preprcessor.as_shuffled_numpy()

feature_maker = Feature()

# Flatten each sample's feature matrix into a single row. Keeping the sample axis fixed
# and letting NumPy infer the row length gives the same result as reshape(-1, 702*40)
# for 702x40 features, but cannot silently mix samples if the feature shape changes.
X_features = feature_maker.numpy_fbank_features(X)
X_features = X_features.reshape(X_features.shape[0], -1)
X_test_features = feature_maker.numpy_fbank_features(X_test)
X_test_features = X_test_features.reshape(X_test_features.shape[0], -1)

## 4.2. Defining Models and Parameters

The following dictionary shows how easily we can create various models based on their names and parameters. It is possible to have, e.g., multiple SVMs with different hyperparameters at the same time.

In [None]:
# Model registry handed to SklearnTrainer. Each entry is keyed by a label and holds:
#   "name"       - the model kind (presumably resolved by SklearnTrainer; verify there)
#   "parameters" - keyword arguments for that model
# The same kind may appear several times with different parameters (see the two SVMs).
models = dict(
    mlp=dict(
        name="mlp",
        parameters=dict(
            hidden_layer_sizes=(128, 128, 128, 128, 128),
            solver="adam",
            max_iter=100,
            early_stopping=True,
        ),
    ),
    svm_ovo=dict(
        name="svm",
        parameters=dict(decision_function_shape="ovo"),
    ),
    svm_ovr=dict(
        name="svm",
        parameters=dict(decision_function_shape="ovr"),
    ),
    decision_tree=dict(name="decision_tree", parameters=dict()),
    random_forest=dict(name="random_forest", parameters=dict()),
)

## Training the Defined Models

We can pass the models as a parameter to the defined class and then set three more options. We can ask it to use PCA with a specific number of components, and we can apply MinMaxScaler normalization if needed. This helps us investigate the effect of both, in addition to the hyperparameter search.

In [None]:
# Train every model in `models` on the flattened training features, with normalization
# enabled and PCA enabled with pca_size=200.
trainer = SklearnTrainer(models=models, use_norm=True, use_pca=True, pca_size=200)
trainer.train(X_features, y)

## Evaluating the Models

In the last step, we can evaluate the model and retrieve the information for each model such as accuracy data.

In [None]:
# Evaluate the trained models on the flattened test features and keep the returned result.
evaluations = trainer.eval(X_test_features, y_test)

Also, we can generate the classification report for all models at the same time and compare them. Don't forget to evaluate first and then pull this information.

In [None]:
# Produce the classification report for the models; the trailing semicolon keeps the
# notebook from echoing the call's return value.
trainer.classification_report();

Moreover, we will have access to confusion matrix for all models upon request.

In [None]:
# Plot the confusion matrices held by the sklearn trainer.
trainer.plot_confusion_matrix()

# 5. KMeans Clustering
We used KMeans as part of our project to see whether we can define clusters properly within the samples, and to check whether each cluster can be assigned to a set of genres. More details are available in the report.


In [None]:
# Genre labels handed to the K-means trainer: all ten of them, 0 through 9.
trained_genres = list(range(10))

# Fit on the flattened training features, then predict, score and plot on the test split.
model = KmeansTrainer(trained_genres)
model.train(X_features, y)
y_pred_features = model.eval(X_test_features, y_test)
model.accuracy_score(X_test_features, y_test)
model.plot_adjusted_matrix(X_test_features, y_test)