In [7]:
# Mount Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive')

FOLDERNAME = "COMPSCI 682/compsci-682-project"
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Now that we've mounted your Drive, this ensures that
# the Python interpreter of the Colab VM can load
# python files from within it.
import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))
# %cd /content/drive/My\ Drive/$FOLDERNAME/datasets
# !wget https://raw.githubusercontent.com/coreyker/dnn-mgr/master/gtzan/train_filtered.txt
# !wget https://raw.githubusercontent.com/coreyker/dnn-mgr/master/gtzan/valid_filtered.txt
# !wget https://raw.githubusercontent.com/coreyker/dnn-mgr/master/gtzan/test_filtered.txt
%cd /content/drive/My\ Drive/$FOLDERNAME

KeyboardInterrupt: ignored

In [None]:
# Force UTF-8 as the preferred encoding for the rest of the session.
# The replacement keeps the optional `do_setlocale` parameter of the real
# `locale.getpreferredencoding`, so callers that pass it (for example
# `locale.getpreferredencoding(False)`) keep working instead of raising
# a TypeError.
import locale


def _utf8_preferred_encoding(do_setlocale=True):
    """Return "UTF-8" regardless of the locale configured on the machine."""
    return "UTF-8"


locale.getpreferredencoding = _utf8_preferred_encoding

In [None]:
from src.configuration import *
from src.dataset import *
from src.networks import *
from src.utils import *
import src.preprocess as pp

In [None]:
# Pick the tensor dtype and the compute device: CUDA when it is both
# enabled through USE_GPU and actually available, otherwise the CPU.
dtype = torch.float32
if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

In [None]:
# Note that one song in the GTZAN dataset contains data in an unknown/corrupt
# format. Delete it (if it is still present) BEFORE extracting features, so
# that a first-time run never hands the corrupt file to the extractor.
corrupt_track = "datasets/genres/jazz/jazz.00054.wav"
if os.path.exists(corrupt_track):
    os.remove(corrupt_track)

# Preprocess the dataset to extract and save features. The extraction is
# skipped when the features folder is already populated.
if pp.are_features_extracted("datasets/features"):
    print("Features already extracted. Proceed further...")
else:
    print("Features absent. Processing...")
    pp.generate_spectrograms(
        data_path="datasets/genres",
        save_path="datasets/features"
    )
    print("Features extracted. Proceed further...")

In [None]:

# Report the size of each dataset and the label set.
for caption, value in (
    ("Number of audio instances: ", len(audio_dataset)),  # Expect 999
    ("Number of melspectrogram instances: ", len(melspectrogram_dataset)),  # Expect 999
    ("Number of class labels: ", len(CLASS_LABELS)),  # Expect 10
    ("Class labels: ", CLASS_LABELS),
):
    print(caption, value)

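# What does this mean? $661794 \approx 30 \cdot 22050$ (duration in seconds × sample rate in Hz, i.e. roughly 30 s of audio sampled at 22,050 samples per second; note that the sample rate is not the bitrate)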

# What does each number in the tensor mean?
Each number in a PyTorch waveform tensor is the amplitude of the signal at one sample, i.e. at one specific point in time, so the tensor as a whole is the sequence of amplitudes over the duration of the clip. Waveforms are typically used to represent audio signals, but the same representation can also be used for other types of signals, such as video signals or sensor data.

# What is the minimum-maximum range for each value in the tensor?


In [None]:
# Show the first dataset item in four views on a 2x2 grid:
# waveform, spectrogram, mel spectrogram and MFCC.
fig, axs = plt.subplots(2, 2, figsize=(16, 8))
print_stats(audio_dataset[0])
plot_waveform(audio_dataset[0], title="Blues-00000 Waveform", ax=axs[0, 0])

# (extra keyword arguments, panel title, grid position) per spectrogram view.
# The item is indexed again for every call, exactly once per plot.
spectrogram_views = (
    ({}, "Blues-00000 Spectrogram", (0, 1)),
    ({"type": "melspectrogram"}, "Blues-00000 Melspectrogram", (1, 0)),
    ({"type": "mfcc"}, "Blues-00000 MFCC", (1, 1)),
)
for extra_kwargs, panel_title, (row, col) in spectrogram_views:
    plot_spectrogram(audio_dataset[0], **extra_kwargs, title=panel_title, ax=axs[row, col])

fig.tight_layout()

In [None]:
# Melspectrogram: display the "melspectrogram" feature.
# NOTE(review): random_feature_display is defined outside this notebook;
# presumably it shows a randomly chosen sample and the second argument is
# the display title - confirm against its definition.
random_feature_display("melspectrogram", "Mel Spectrogram")

In [None]:
# Waveplot: display the "waveplot" feature.
# NOTE(review): random_feature_display is defined outside this notebook;
# presumably it shows a randomly chosen sample and the second argument is
# the display title - confirm against its definition.
random_feature_display("waveplot", "Waveform Plot")

In [None]:
# Spectrogram: display the "spectrogram" feature.
# NOTE(review): random_feature_display is defined outside this notebook;
# presumably it shows a randomly chosen sample and the second argument is
# the display title - confirm against its definition.
random_feature_display("spectrogram", "Spectrogram")

In [None]:
# MFCC: display the "mfcc" feature.
# NOTE(review): random_feature_display is defined outside this notebook;
# presumably it shows a randomly chosen sample and the second argument is
# the display title - confirm against its definition.
random_feature_display("mfcc", "Mel Frequency Cepstral Coefficients (MFCC)")

In [None]:
# Build the train / valid / test loaders, and pull one batch from the
# train and test loaders to inspect what they yield.
train_loader = get_dataloader(split='train', data_path="datasets/")
iter_train_loader = iter(train_loader)
train_wav, train_genre = next(iter_train_loader)

valid_loader = get_dataloader(split='valid', data_path="datasets/")
test_loader = get_dataloader(split='test', data_path="datasets/")
iter_test_loader = iter(test_loader)
test_wav, test_genre = next(iter_test_loader)

# Report the batch shapes and the labels of the sampled training batch.
print('training data shape: ' + str(train_wav.shape))
print('validation/test data shape: ' + str(test_wav.shape))
print(train_genre)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Train the CNN with Adam and cross-entropy, validate after every epoch, and
# checkpoint the weights whenever the validation loss reaches a new minimum.

# Honour the USE_GPU switch here as well, so this cell agrees with the
# device selection made earlier in the notebook.
device = torch.device('cuda:0' if USE_GPU and torch.cuda.is_available() else 'cpu')
cnn = ConvolutionalNeuralNetwork().to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn.parameters(), lr=0.001)
valid_losses = []  # mean validation loss of every finished epoch
num_epochs = 30

for epoch in range(num_epochs):
    losses = []

    # Train
    cnn.train()
    for wav, genre_index in train_loader:
        wav = wav.to(device)
        genre_index = genre_index.to(device)

        # Forward
        out = cnn(wav)
        loss = loss_function(out, genre_index)

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    print('Epoch: [%d/%d], Train loss: %.4f' % (epoch+1, num_epochs, np.mean(losses)))

    # Validation
    cnn.eval()
    y_true = []
    y_pred = []
    losses = []
    # No gradients are needed for evaluation; disabling them avoids building
    # an autograd graph for every validation batch.
    with torch.no_grad():
        for wav, genre_index in valid_loader:
            wav = wav.to(device)
            genre_index = genre_index.to(device)

            # Validation batches are 3-D (b, c, t): flatten the first two
            # dimensions for the forward pass, then average the logits over
            # dimension c to aggregate the chunk-level predictions.
            b, c, t = wav.size()
            logits = cnn(wav.view(-1, t))
            logits = logits.view(b, c, -1).mean(dim=1)
            loss = loss_function(logits, genre_index)
            losses.append(loss.item())
            _, pred = torch.max(logits, 1)

            # append labels and predictions
            y_true.extend(genre_index.tolist())
            y_pred.extend(pred.tolist())
    accuracy = accuracy_score(y_true, y_pred)
    valid_loss = np.mean(losses)
    print('Epoch: [%d/%d], Valid loss: %.4f, Valid accuracy: %.4f' % (epoch+1, num_epochs, valid_loss, accuracy))

    # Save model: np.argmin returns the first minimum, so the checkpoint is
    # rewritten only when this epoch is strictly better than all earlier ones.
    valid_losses.append(valid_loss.item())
    if np.argmin(valid_losses) == epoch:
        # Report the 1-based epoch number, matching the messages above.
        print('Saving the best model at %d epochs!' % (epoch + 1))
        torch.save(cnn.state_dict(), 'best_model.ckpt')

In [None]:
# Load the best model. map_location remaps the stored tensors onto the
# current device, so a checkpoint written on a GPU runtime can still be
# loaded when only the CPU is available.
S = torch.load('best_model.ckpt', map_location=device)
cnn.load_state_dict(S)
print('loaded!')

# Run evaluation on the test split, collecting labels and predictions.
cnn.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for wav, genre_index in test_loader:
        wav = wav.to(device)
        genre_index = genre_index.to(device)

        # Test batches are 3-D (b, c, t): flatten the first two dimensions
        # for the forward pass, then average the logits over dimension c to
        # aggregate the chunk-level predictions.
        b, c, t = wav.size()
        logits = cnn(wav.view(-1, t))
        logits = logits.view(b, c, -1).mean(dim=1)
        _, pred = torch.max(logits, 1)

        # append labels and predictions
        y_true.extend(genre_index.tolist())
        y_pred.extend(pred.tolist())

In [None]:
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix

# Summarise the test-set results: overall accuracy plus a confusion-matrix
# heatmap with one row per true class and one column per predicted class.
accuracy = accuracy_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)

# Use CLASS_LABELS, the label collection the rest of the notebook uses, for
# the tick labels.
# NOTE(review): assumes iterating CLASS_LABELS yields the class names in
# label-index order - confirm against its definition.
ax = sns.heatmap(
    cm,
    annot=True,
    fmt="d",  # the cells are integer counts
    xticklabels=list(CLASS_LABELS),
    yticklabels=list(CLASS_LABELS),
    cmap="YlGn",
)
ax.set(xlabel="Predicted label", ylabel="True label", title="Test confusion matrix")
print('Accuracy: %.4f' % accuracy)

In [None]:
# Print the architecture, then push one training batch through the model and
# render the graph built from its output to "CNN_Model_GTZAN" as a PNG.
print(cnn)
wav, _ = next(iter(train_loader))
wav = wav.to(device)
yhat = cnn(wav)
graph = make_dot(yhat, params=dict(cnn.named_parameters()))
graph.render("CNN_Model_GTZAN", format="png")