In [None]:
# Extract zip files
import zipfile
with zipfile.ZipFile('./acousticbrainz.zip', 'r') as zip:
    zip.extractall('extracted')

In [6]:
# Extract subfiles and move everything around to be more convenient
import tarfile
from os import listdir, mkdir, replace
import bz2

mkdir('./tsvs')
for filename in listdir('./extracted'):
    if '.tar.bz2' in filename:
      tarfile.open('./extracted/' + filename).extractall('./train' if 'train' in filename else './validation')
    elif 'tsv.bz2' in filename:
      data = bz2.BZ2File('./extracted/' + filename).read()
      open('./tsvs/' + filename[:-4], 'wb').write(data)
    
for folder in listdir('./train/acousticbrainz-mediaeval-train'):
    for filename in listdir('./train/acousticbrainz-mediaeval-train/' + folder):
        replace('./train/acousticbrainz-mediaeval-train/' + folder + '/' + filename, './train/' + filename)

for folder in listdir('./validation/acousticbrainz-mediaeval-validation'):
    for filename in listdir('./validation/acousticbrainz-mediaeval-validation/' + folder):
        replace('./validation/acousticbrainz-mediaeval-validation/' + folder + '/' + filename, './validation/' + filename)

In [1]:
# Go through the data and preprocess it in into a usable form
import json
from os import listdir
import numpy as np

X_train = []
y_train = []
X_validation = []
y_validation = []

all_genres = set()

def process_tsv(file_path, validation=False):
    with open(file_path) as tsv:
        tsv.readline()
        while True:
            line = tsv.readline()
            if not line:
                break
            entries = line.split('\t')
            
            genres = [genre for genre in entries[2:] if genre != '' and genre != '\n']
            for genre in genres:
                all_genres.add(genre)
            if not validation:
                y_train.append(genres)
            else:
                y_validation.append(genres)

            try:
                with open(('./train/' if not validation else './validation/') + entries[0] + '.json') as file:
                    song = json.load(file)
                    if not validation:
                        length = float(song['metadata']['audio_properties']['length'])
                        X_train.append(length)
                    else:
                        length = float(song['metadata']['audio_properties']['length'])
                        X_validation.append(length)
            except:
                print('failed to load' + entries[0])

process_tsv('./tsvs/acousticbrainz-mediaeval-discogs-train.tsv')
process_tsv('./tsvs/acousticbrainz-mediaeval-lastfm-train.tsv')
process_tsv('./tsvs/acousticbrainz-mediaeval-tagtraum-train.tsv')
process_tsv('./tsvs/acousticbrainz-mediaeval-discogs-validation.tsv', validation=True)
process_tsv('./tsvs/acousticbrainz-mediaeval-lastfm-validation.tsv', validation=True)
process_tsv('./tsvs/acousticbrainz-mediaeval-tagtraum-validation.tsv', validation=True)

all_genres = list(all_genres) # contains all the genres and their indices
for i, datapoint in enumerate(y_train):
    processed_datapoint = []
    for genre in all_genres:
        processed_datapoint.append(int(genre in datapoint))
    y_train[i] = processed_datapoint

for i, datapoint in enumerate(y_validation):
    processed_datapoint = []
    for genre in all_genres:
        processed_datapoint.append(int(genre in datapoint))
    y_validation[i] = processed_datapoint

X_train = np.asarray(X_train).astype(np.float64)
X_validation = np.asarray(X_validation).astype(np.float64)

failed to load60444e32-3a4c-4323-a3af-4057f36e2c85


In [2]:
# Define the model and give a summary
from keras import layers
from keras.models import Model

length = layers.Input(shape=(1,))
model = Model(length, layers.Dense(1, activation='relu')(length))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

2022-04-04 01:31:27.150267: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-04 01:31:27.188710: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-04 01:31:27.188876: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-04 01:31:27.190421: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 dense (Dense)               (None, 1)                 2         
                                                                 
Total params: 2
Trainable params: 2
Non-trainable params: 0
_________________________________________________________________


2022-04-04 01:31:27.190915: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-04 01:31:27.191053: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-04 01:31:27.191169: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-04 01:31:27.668311: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-04 01:31:27.668508: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from S

In [None]:
# Train and save the model 
model.fit(X_train, y_train, batch_size=32, epochs=1, verbose=1, validation_data=(X_validation, y_validation))
score = model.evaluate(X_validation, y_validation, verbose=1)
print('Validation loss and accuracy:', score)
model.save('trained_model')