In [1]:
import os
import functools
import pickle
import sys
from typing import Iterator, Generator

# import music21
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tqdm import tqdm
%matplotlib inline

In [2]:
# Load data
# Only the length of score_names is used in the cells below: it determines how
# many per-score files (./data/<i>.npy and ./data/<i>_meta.npy) are read.
score_names = np.load('./data/score_names.npy')

In [3]:
# Read the note array of every score, in index order.
# Files are named ./data/<index>.npy, with index running over len(score_names).
scores = []
for score_index in tqdm(range(len(score_names))):
    scores.append(np.load('./data/{}.npy'.format(score_index)))

100%|██████████| 5930/5930 [00:02<00:00, 2845.69it/s]


In [4]:
# Read the metadata array of every score, in the same index order as `scores`.
# Files are named ./data/<index>_meta.npy.
meta = []
for meta_index in tqdm(range(len(score_names))):
    meta.append(np.load('./data/{}_meta.npy'.format(meta_index)))

100%|██████████| 5930/5930 [00:04<00:00, 1444.04it/s]


In [5]:
# Pitch range over the whole corpus. Non-positive entries are excluded from the
# minimum (make_targets encodes them as rests).
highest_per_score = [t.max() for t in scores]
lowest_per_score = [t[t > 0].min() for t in scores]
max_pitch = np.max(highest_per_score)
min_pitch = np.min(lowest_per_score)
# Number of distinct pitch classes between the two extremes, inclusive.
n_notes = int(max_pitch - min_pitch) + 1

In [6]:
def make_targets(score, voice):
    """
    Build the one-hot note targets for a single voice.

    Returns an array with one row per timestep (score.shape[1] rows) and
    n_notes + 1 columns. Column 0 marks a rest (any value that is not > 0);
    column int(note - min_pitch) + 1 marks a sounding note.
    """
    n_steps = score.shape[1]
    y = np.zeros((n_steps, n_notes + 1))  # all possible notes plus rest

    voice_notes = np.asarray(score[voice])
    is_sounding = voice_notes > 0
    timesteps = np.arange(voice_notes.shape[0])

    # Rests go to column 0.
    y[timesteps[~is_sounding], 0] = 1
    # Sounding notes go to their pitch column; astype(int) truncates like int().
    pitch_columns = (voice_notes[is_sounding] - min_pitch).astype(int) + 1
    y[timesteps[is_sounding], pitch_columns] = 1
    return y

In [7]:
# hyperparameters
# Number of consecutive windows fed to the LSTM per sample; passed as
# sequence_steps when building inputs and used as the model's sequence length.
SEQUENCE_STEPS = 3
# Timesteps per window in the model's note input shape (same value as the
# conv_window_size default of the input-sequence builders).
window_size = 32
# The note output layer has n_features + 1 units.
# NOTE(review): make_targets emits n_notes + 1 columns, so this presumably has
# to equal the data-derived n_notes — confirm when the dataset changes.
n_features = 58
# Number of timesteps per batch slice yielded by data_gen.
BATCH_LENGTH = 128

In [8]:
# Positions along the last axis of a meta array: the slur flag is read from
# index 0, and everything from index 2 onwards is used as beat information.
idx_slur, idx_beat = 0, 2


def make_targets_slur(meta, voice):
    """
    Build the slur targets for one voice.

    Takes the slur channel of `voice` at every timestep and returns it as a
    column vector (one row per timestep, a single column).
    """
    slur_track = meta[voice, :, idx_slur]
    return np.reshape(slur_track, (-1, 1))

In [9]:
def make_padded(score, window_size, max_voices=6):
    """
    Zero-pad a (voices, timesteps) array for sliding-window extraction.

    window_size - 1 zero columns are prepended so that the first window ends
    on the first real timestep. Unless max_voices is None, zero rows are also
    appended so the result has exactly max_voices rows.
    """
    if max_voices is None:
        rows = score
    else:
        missing_voices = max_voices - score.shape[0]
        silent_voices = np.zeros((missing_voices, score.shape[1]))
        rows = np.concatenate((score, silent_voices), axis=0)

    # Leading padding has as many rows as the (possibly extended) score.
    lead_in = np.zeros((rows.shape[0], window_size - 1))
    return np.concatenate((lead_in, rows), axis=1)


In [10]:
def make_input_sequence(score, voice, sequence_steps=16, conv_window_size=32):
    """
    Build the note input windows for one voice.

    For every timestep of `score`, gathers the sequence_steps * conv_window_size
    most recent timesteps of all (padded) voices, zeroes the current timestep
    of `voice` (the value to be predicted), and scales by max_pitch.

    Returns an array of shape
    (timesteps, sequence_steps, conv_window_size, padded voices, 1).
    """
    span = sequence_steps * conv_window_size
    padded_score = make_padded(score, span)

    # Row r of the index grid lists the padded positions covered by window r.
    n_windows = padded_score.shape[1] - (span - 1)
    window_starts = np.arange(n_windows)[:, None]
    offsets = np.arange(span)[None, :]
    # Integer-array indexing copies, so masking below leaves padded_score intact.
    windows = padded_score.T[window_starts + offsets][..., None]

    # Now, mask out the target values
    windows[:, -1, voice, :] = 0

    out_shape = (score.shape[1], -1, conv_window_size, padded_score.shape[0], 1)
    return windows.reshape(out_shape) / max_pitch

In [11]:
def make_input_sequence_slur(meta, voice, sequence_steps=16, conv_window_size=32):
    """
    Build the slur input windows for one voice.

    Same windowing as the note inputs, applied to the slur channel of `meta`:
    for every timestep, the sequence_steps * conv_window_size most recent slur
    values of all (padded) voices, with the current timestep of `voice` zeroed.

    Returns an array of shape
    (timesteps, sequence_steps, conv_window_size, padded voices, 1).
    """
    span = sequence_steps * conv_window_size
    slur_channel = meta[:, :, idx_slur]
    padded_meta = make_padded(slur_channel, span)

    # Row r of the index grid lists the padded positions covered by window r.
    n_windows = padded_meta.shape[1] - (span - 1)
    window_starts = np.arange(n_windows)[:, None]
    offsets = np.arange(span)[None, :]
    # Integer-array indexing copies, so masking below leaves padded_meta intact.
    windows = padded_meta.T[window_starts + offsets][..., None]

    # Now, mask out the target values
    windows[:, -1, voice, :] = 0

    out_shape = (meta.shape[1], -1, conv_window_size, padded_meta.shape[0], 1)
    return windows.reshape(out_shape)

In [12]:
def make_input_sequence_beat(meta, voice=0, window_size=SEQUENCE_STEPS):
    """
    Build sliding windows over the beat channels of one voice.

    For every timestep, returns the last `window_size` timesteps of
    meta[voice, :, idx_beat:], zero-padded at the start. Singleton axes are
    squeezed out of the result.
    """
    lead_in = np.zeros((meta.shape[0], window_size - 1, meta.shape[2]))
    padded = np.concatenate((lead_in, meta), axis=1)

    window_starts = np.arange(meta.shape[1])[:, None]
    offsets = np.arange(window_size)[None, :]

    beat_track = padded[voice, :, idx_beat:]
    # A trailing unit axis would be removed by squeeze anyway, so none is added.
    return np.squeeze(beat_track[window_starts + offsets])

In [13]:
# Seed NumPy's global RNG so that anything below that draws random numbers
# without its own seed gives the same result after Restart & Run All.
np.random.seed(25)

In [14]:
# Hold out 5% of the scores for validation.
# The seed is passed explicitly so the split does not depend on the current
# state of NumPy's global RNG: re-running this cell on its own must not move
# scores between train and validation, otherwise weights reloaded from a
# previous run would be validated on scores they were trained on.
scores_train, scores_valid, meta_train, meta_valid = train_test_split(
    scores, meta, test_size=0.05, random_state=25
)

In [15]:
# Sanity check: beat windows for voice 0 of one validation score, using the
# default window_size (SEQUENCE_STEPS).
np.shape(np.squeeze(make_input_sequence_beat(meta_valid[2], 0)))

(336, 3, 16)

In [16]:
# Sanity check: note and slur windows stack along the trailing channel axis.
# Both builders are called with their own defaults (sequence_steps=16) here,
# not with SEQUENCE_STEPS.
np.concatenate(
    (
        make_input_sequence(scores_valid[2], 0),
        make_input_sequence_slur(meta_valid[2], 0),
    ),
    axis=-1,
).shape

(336, 16, 32, 6, 2)

In [17]:
def cycle(iterable):
    """
    Yield the items of `iterable` over and over without caching them.

    `iterable` may be either

    * a re-iterable object (list, tuple, or any object whose ``__iter__``
      starts a fresh pass each time), or
    * a zero-argument callable that returns a fresh iterable for every pass
      (use this to restart a generator function).

    A one-shot iterator such as a generator object can only be traversed
    once. When a pass produces no items (empty input or an exhausted
    iterator) the generator ends, instead of looping forever without ever
    yielding, which would hang the consumer.
    """
    if callable(iterable) and not hasattr(iterable, '__iter__'):
        start_pass = iterable
    else:
        def start_pass():
            return iterable

    while True:
        produced_any = False
        for item in start_pass():
            produced_any = True
            yield item
        if not produced_any:
            # Another pass would busy-loop with nothing to yield.
            return

In [18]:
def data_gen(scores, meta):
    """
    Yield model batches for every voice of every score in `scores`.

    `scores` and `meta` are parallel sequences (one note array and one
    metadata array per score). For each voice the full input/target arrays
    are built once and then sliced into chunks of BATCH_LENGTH timesteps;
    the last chunk of a voice may be shorter.

    Each yielded item is ``(inputs, targets)`` with
    ``inputs = [notes_slurs, beats]`` and
    ``targets = [note_targets, slur_targets]``.
    """
    # Iterate over the arguments, not the global training lists, so that the
    # generator built from the validation split really yields validation data.
    for score, score_meta in zip(scores, meta):
        for voice in range(score.shape[0]):
            # inputs
            notes = make_input_sequence(score, voice, sequence_steps=SEQUENCE_STEPS)
            slurs = make_input_sequence_slur(score_meta, voice, sequence_steps=SEQUENCE_STEPS)
            # Notes and slurs become the two channels of the conv input.
            notes_slurs = np.concatenate((notes, slurs), axis=4)
            beats = make_input_sequence_beat(score_meta, voice)
            # targets
            note_targets = make_targets(score, voice)
            slur_targets = make_targets_slur(score_meta, voice)
            for i in range(0, score.shape[1], BATCH_LENGTH):
                yield (
                    [
                        notes_slurs[i:i+BATCH_LENGTH],
                        beats[i:i+BATCH_LENGTH],
                    ],
                    [
                        note_targets[i:i+BATCH_LENGTH],
                        slur_targets[i:i+BATCH_LENGTH],
                    ]
                )
            

In [19]:
class _RestartableBatches:
    """Re-iterable wrapper: every ``iter()`` call starts a new data_gen pass."""

    def __init__(self, scores, meta):
        self.scores = scores
        self.meta = meta

    def __iter__(self):
        return data_gen(self.scores, self.meta)


# cycle() re-iterates its argument on every pass. A generator object is empty
# after the first pass, so handing data_gen(...) to cycle() directly would stop
# producing batches once the data has been seen once. The wrapper makes each
# pass start from a fresh generator.
train_gen = cycle(_RestartableBatches(scores_train, meta_train))
valid_gen = cycle(_RestartableBatches(scores_valid, meta_valid))

In [20]:
# train_gen = cycle(
#     ([
#         make_input_sequence(score, voice, sequence_steps=SEQUENCE_STEPS),
#         make_input_sequence_slur(meta[:, i:i+BATCH_LENGTH, :], voice, sequence_steps=SEQUENCE_STEPS),
#         make_input_sequence_beat(meta[:, i:i+BATCH_LENGTH, :], voice),
#     ], [
#         make_targets(score[:, i:i+BATCH_LENGTH], voice),
#         make_targets_slur(meta[:, i:i+BATCH_LENGTH, :], voice)
#     ])
#     for score, meta in zip(scores_train, meta_train)
#     for voice in range(score.shape[0])
#     for i in range(0, score.shape[1], BATCH_LENGTH)
# )

In [21]:
# valid_gen = cycle(
#     ([
#         make_input_sequence(score[:, i:i+BATCH_LENGTH], voice, sequence_steps=SEQUENCE_STEPS),
#         make_input_sequence_slur(meta[:, i:i+BATCH_LENGTH, :], voice, sequence_steps=SEQUENCE_STEPS),
#         make_input_sequence_beat(meta[:, i:i+BATCH_LENGTH, :], voice),
#     ], [
#         make_targets(score[:, i:i+BATCH_LENGTH], voice),
#         make_targets_slur(meta[:, i:i+BATCH_LENGTH, :], voice)
#     ])
#     for score, meta in zip(scores_valid, meta_valid)
#     for voice in range(score.shape[0])
#     for i in range(0, score.shape[1], BATCH_LENGTH)
# )

In [22]:
# Sanity check: slur windows for voice 0 of the first validation score, built
# with the function defaults (sequence_steps=16, conv_window_size=32).
np.shape(make_input_sequence_slur(meta_valid[0], 0))

(880, 16, 32, 6, 1)

In [23]:
from keras.models import Sequential, Model
from keras.layers import BatchNormalization, Conv2D, TimeDistributed, Input, Activation, Flatten, LSTM, CuDNNLSTM, ConvLSTM2D, Dense, Dropout, MaxPooling2D, GlobalAveragePooling2D, GlobalAveragePooling3D
from keras.activations import relu
import keras.callbacks
from keras.layers import add, concatenate

Using TensorFlow backend.
  return f(*args, **kwds)


In [24]:
def residual_block(input_layer, nb_filters):
    """
    Two-convolution residual unit.

    Main path: 3x3 conv -> batch norm -> ReLU -> 3x3 conv -> batch norm.
    Shortcut: 1x1 conv of the input, giving it nb_filters channels.
    The two paths are summed and passed through a final ReLU.
    """
    # Layer creation order (shortcut first) determines Keras' auto-numbered
    # layer names, so keep the statements in this order.
    shortcut = Conv2D(nb_filters, kernel_size=(1, 1))(input_layer)

    x = Conv2D(nb_filters, kernel_size=(3, 3), padding="same")(input_layer)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)

    x = Conv2D(nb_filters, kernel_size=(3, 3), padding="same")(x)
    x = BatchNormalization()(x)

    merged = add([shortcut, x])
    return Activation("relu")(merged)

In [25]:
def make_convnet(input_layer):
    """
    Small residual conv net that reduces one input window to a feature vector.

    residual block (32 filters) -> 2x2 max pooling with stride 2 ->
    residual block (64 filters) -> global average pooling.
    """
    features = residual_block(input_layer, 32)
    features = MaxPooling2D(pool_size=2, strides=(2, 2), padding="same")(features)
    features = residual_block(features, 64)
    return GlobalAveragePooling2D()(features)

In [26]:
# Make the input networks
# Per-window note network. Input is one window of shape
# (window_size timesteps, 6 voices, 2 channels): 6 is the make_padded default
# for max_voices, and the 2 channels are the note and slur windows that
# data_gen concatenates on the last axis.
input_notes = Input(shape=(window_size, 6, 2))
notes_convnet = make_convnet(input_notes)

notes_model = Model(inputs=[input_notes], outputs=[notes_convnet])

# Per-timestep beat network: a single dense layer over the 16 beat features
# (the last axis of make_input_sequence_beat's output in the check above).
input_beats = (Input(shape=(16,)))
beats_net = Dense(32, activation='relu')(input_beats)
beats_model = Model(inputs=[input_beats], outputs=[beats_net])


# Make the sequence/timedistributed versions of them
# TimeDistributed applies the same sub-model to each of the SEQUENCE_STEPS
# steps, giving one feature vector per step.
input_notes_sequence = Input(shape=(SEQUENCE_STEPS, window_size, 6, 2))
notes = TimeDistributed(notes_model)(input_notes_sequence)

input_beats_sequence = Input(shape=(SEQUENCE_STEPS, 16))
beats = TimeDistributed(beats_model)(input_beats_sequence)

# Note and beat features are joined per step before the recurrent layers.
lstm_input = concatenate([notes, beats])

# Two stacked LSTMs; only the second one collapses the sequence to one vector.
lstm_1 = LSTM(64, dropout=0.1, return_sequences=True)(lstm_input)
lstm_2 = LSTM(64, dropout=0.1)(lstm_1)

# (No separate beat branch here: beat information already enters through the
# concatenation that feeds the LSTMs.)


# make outputs
# Two heads on the shared LSTM state: a softmax over note classes plus rest,
# and a sigmoid for the slur flag.
# NOTE(review): make_targets produces n_notes + 1 columns, so n_features
# presumably has to equal n_notes for the note head to match — confirm.
output_notes = Dense(n_features + 1, activation='softmax')(lstm_2)
output_slurs = Dense(1, activation='sigmoid')(lstm_2)

# Input/output order here must match the order of the lists yielded by data_gen.
model = Model(
    inputs=[input_notes_sequence, input_beats_sequence],
    outputs=[output_notes, output_slurs]
)

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 3, 32, 6, 2)  0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 3, 16)        0                                            
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, 3, 64)        68256       input_3[0][0]                    
__________________________________________________________________________________________________
time_distributed_2 (TimeDistrib (None, 3, 32)        544         input_4[0][0]                    
__________________________________________________________________________________________________
concatenat

In [27]:
def top3_acc(y_true, y_pred, k=3):
    """Top-k categorical accuracy with k defaulting to 3.

    Defined as a named function so Keras reports it as 'top3_acc'.
    """
    return keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k=k)



In [28]:
# The two losses pair up with the model outputs in order (notes, then slurs).
# The flat metrics list is applied to every output, so the training log also
# shows a top3_acc value for the slur head.
model.compile(
    optimizer='adam',
    loss=['categorical_crossentropy', 'binary_crossentropy'],
    metrics=['accuracy', top3_acc],
)

In [29]:
# Make sure the checkpoint directory exists before training starts. Older
# Keras releases do not create missing directories, and the failure would only
# surface when the first checkpoint is written, i.e. after a full epoch.
os.makedirs('./models', exist_ok=True)

# Keep only the best model seen so far (save_best_only) in a single file.
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath='./models/model-multi.hdf5',
    verbose=1,
    save_best_only=True,
)

In [30]:
# Approximate number of batches in one pass over the training scores
# (voices * timesteps // BATCH_LENGTH, summed over scores).
# Built-in sum is used because calling np.sum on a generator expression is
# deprecated NumPy behaviour.
# NOTE(review): data_gen slices every voice separately, so it yields
# ceil(timesteps / BATCH_LENGTH) batches per voice; this floor-based figure can
# be slightly lower than the real batch count.
steps_per_epoch = sum(score.shape[0] * score.shape[1] // BATCH_LENGTH for score in scores_train)
steps_per_epoch

152825

In [31]:
# Approximate number of batches in one pass over the validation scores
# (voices * timesteps // BATCH_LENGTH, summed over scores).
# Built-in sum is used because calling np.sum on a generator expression is
# deprecated NumPy behaviour.
# NOTE(review): data_gen slices every voice separately, so it yields
# ceil(timesteps / BATCH_LENGTH) batches per voice; this floor-based figure can
# be slightly lower than the real batch count.
validation_steps = sum(score.shape[0] * score.shape[1] // BATCH_LENGTH for score in scores_valid)
validation_steps

7685

In [None]:
# Warm-start from the checkpoint file written by `checkpointer`, if there is
# one. Only OSError (e.g. the file is missing) is treated as "start from
# scratch"; any other error still propagates.
try:
    model.load_weights('./models/model-multi.hdf5')
except OSError:
    print('no weights found')

In [None]:
# Train for 5 epochs of steps_per_epoch batches each.
# Validation after each epoch uses a fixed 500 batches from valid_gen rather
# than the validation_steps value computed above, so successive epochs are
# validated on different stretches of the validation stream.
# checkpointer writes the model whenever its monitored quantity improves.
history = model.fit_generator(
    train_gen,
    steps_per_epoch=steps_per_epoch,
    epochs=5,
    validation_data=valid_gen,
    validation_steps=500,
    callbacks=[checkpointer]
)

Epoch 1/5
   785/152825 [..............................] - ETA: 2:46:34 - loss: 1.0250 - dense_2_loss: 0.8894 - dense_3_loss: 0.1357 - dense_2_acc: 0.7056 - dense_2_top3_acc: 0.9055 - dense_3_acc: 0.9436 - dense_3_top3_acc: 1.0000

In [None]:
# Evaluate on validation_steps batches from valid_gen.
# NOTE(review): valid_gen is the same generator object used during fitting, so
# if training has run it continues from wherever that left off rather than
# from the first validation score.
model.evaluate_generator(valid_gen, steps=validation_steps)

In [None]:
# Peek at the raw slur channel (last-axis index 0, i.e. idx_slur) of voice 0
# in the first validation score.
meta_valid[0][0, :, 0]