In [1]:
import os
import functools
import pickle
import sys
from typing import Iterator, Generator

# import music21
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tqdm import tqdm
%matplotlib inline

In [2]:
# Load data
# score_names is only used here for its length: it determines how many
# per-score files (./data/<i>.npy and ./data/<i>_meta.npy) are read in the following cells.
score_names = np.load('./data/score_names.npy')

In [3]:
# One array per score, read from ./data/<i>.npy for i in range(len(score_names)).
# Downstream code indexes each array as score[voice] and treats score.shape[1]
# as the number of timesteps, i.e. the layout is (voices, timesteps).
scores = [
    (np.load('./data/{}.npy'.format(i)))
    for i in tqdm(range(len(score_names)))
]

100%|██████████| 5930/5930 [00:14<00:00, 416.85it/s]


In [6]:
# One metadata array per score, read from ./data/<i>_meta.npy in the same index order as `scores`.
# Downstream code indexes it as meta[voice, timestep, feature].
meta = [
    (np.load('./data/{}_meta.npy'.format(i)))
    for i in tqdm(range(len(score_names)))
]

100%|██████████| 5930/5930 [00:17<00:00, 331.62it/s]


In [138]:
# Pitch range over the whole corpus. Values <= 0 are left out of the minimum
# because make_targets encodes them as rests rather than as pitches.
max_pitch = np.max([np.max(t) for t in scores])
min_pitch = np.min([np.min(t[t > 0]) for t in scores])
# Number of distinct pitch values in [min_pitch, max_pitch].
n_notes = int(max_pitch - min_pitch) + 1
# Positions along the last axis of each meta array:
idx_slur = 0  # slur value, used as a target by make_targets_slur
idx_beat = 2  # start of the beat features; make_input_beat takes everything from here on

In [151]:
# hyperparameters
SEQUENCE_STEPS = 4  # Number of consecutive windows per sample; 4 * 16 = 64 timesteps of context
window_size = 16  # Timesteps per window; per the original note a measure is 16 timesteps, so this is one measure
n_features = n_notes + 1  # one class per pitch plus one for rests (same width as the make_targets output)

In [99]:
def make_targets(score, voice):
    """
    Build the one-hot note targets for one voice of a score.

    Returns an array of shape (score.shape[1], n_notes + 1) with exactly one 1
    per row: column 0 marks a rest (value <= 0), and column
    int(note - min_pitch) + 1 marks a sounding pitch.
    """
    n_steps = score.shape[1]
    targets = np.zeros((n_steps, n_notes + 1))  # shape: n timesteps X m features

    voice_notes = np.asarray(score[voice])
    sounding = voice_notes > 0

    # Column 0 (rest) is the default; only sounding timesteps get a pitch column.
    columns = np.zeros(n_steps, dtype=int)
    columns[sounding] = (voice_notes[sounding] - min_pitch).astype(int) + 1

    targets[np.arange(n_steps), columns] = 1
    return targets

In [160]:
def make_targets_slur(meta, voice):
    """
    Return the slur target track for one voice.

    This is the `idx_slur` entry of the metadata at every timestep of `voice`,
    i.e. a 1-D array with one value per timestep.
    """
    voice_meta = meta[voice]
    return voice_meta[:, idx_slur]

In [101]:
def make_padded(score, window_size, max_voices=None):
    """
    Left-pad a (voices, timesteps) array with zeros along the time axis.

    `window_size - 1` zero columns are prepended so that the first full window
    ends on the first real timestep. The result is always float because the
    padding is a float zero array.

    `max_voices` is accepted but currently unused: the voice axis is not padded.
    """
    n_voices = score.shape[0]
    lead_in = np.zeros((n_voices, window_size - 1))
    return np.concatenate([lead_in, score], axis=1)


In [163]:
def make_input_beat(meta, voice):
    """
    Return the beat features for one voice: every metadata feature from
    index `idx_beat` onwards, for each timestep (shape: timesteps X features).
    """
    voice_meta = meta[voice]
    return voice_meta[:, idx_beat:]

In [164]:
def make_input_sequence(score, meta, voice, sequence_steps=16, conv_window_size=32):
    """
    Build the windowed model input for one voice of a score.

    For every timestep t, a look-back window of
    `sequence_steps * conv_window_size` timesteps ending at t is extracted
    (zero-padded before the start of the piece) for all voices.

    Returns an array of shape
    (timesteps, sequence_steps, conv_window_size, voices, 2), where the last
    axis holds [pitch / max_pitch, meta channel 0 (the slur channel)].
    The final timestep of each window is zeroed for `voice` in both channels,
    since that is the value being predicted.
    """
    total_window = sequence_steps * conv_window_size
    n_timesteps = score.shape[1]

    # Row t lists the padded-time indices t .. t + total_window - 1.
    window_index = np.arange(n_timesteps)[:, None] + np.arange(total_window)[None, :]

    # Notes channel, scaled by the corpus-wide maximum pitch.
    notes = make_padded(score, total_window)
    notes_channel = notes.T[window_index, :, None] / max_pitch

    # Slur channel (feature 0 of the metadata), windowed the same way.
    slurs = make_padded(meta[:, :, 0], total_window)
    slurs_channel = slurs.T[window_index, :, None]

    # (timesteps, total_window, voices, 2)
    stacked = np.concatenate((notes_channel, slurs_channel), axis=3)

    # Hide the value to be predicted: last step of every window, target voice only.
    stacked[:, -1, voice, :] = 0

    return stacked.reshape((n_timesteps, -1, conv_window_size, notes.shape[0], 2))

In [103]:
# Seed NumPy's global random number generator so later draws from it are repeatable.
np.random.seed(25)

In [104]:
# Hold out 10% of the scores (with their matching metadata) for validation.
# random_state is passed explicitly, using the same value as the np.random.seed
# cell, so re-running this cell on its own always yields the same split instead
# of depending on whatever state the global RNG happens to be in.
scores_train, scores_valid, meta_train, meta_valid = train_test_split(scores, meta, test_size=0.1, random_state=25)

In [105]:
# Sanity check: shape of the first validation score, (voices, timesteps).
scores_valid[0].shape

(5, 880)

In [106]:
def cycle(iterable):
    """
    Yield the items of `iterable` repeatedly, starting a new pass each time
    the previous one finishes. Items are not cached between passes.

    `iterable` may be:
      * a re-iterable object (list, tuple, or anything whose `__iter__`
        returns a fresh iterator) - it is re-iterated for every pass;
      * a zero-argument callable returning a fresh iterable - it is called
        once per pass, which allows lazily generated data to be repeated.

    If a pass produces no items (an empty source, or a one-shot iterator such
    as a generator that was exhausted by the previous pass), the generator
    stops instead of spinning forever without yielding anything.
    """
    while True:
        source = iterable() if callable(iterable) else iterable
        produced_any = False
        for item in source:
            produced_any = True
            yield item
        if not produced_any:
            # Nothing left to repeat; looping again would busy-wait forever.
            return

In [166]:
class _TrainSamples:
    """
    Re-iterable source of training samples.

    Every iteration walks each (score, voice) pair of the training split and
    yields `([note_windows, beat_features], [note_targets, slur_targets])`.
    A new generator is created on each `__iter__` call so that `cycle` can
    start another pass; a bare generator expression is exhausted after one
    pass, which would leave nothing to yield after the first epoch.
    """

    def __iter__(self):
        return (
            (
                [
                    make_input_sequence(score, score_meta, voice, sequence_steps=SEQUENCE_STEPS, conv_window_size=window_size),
                    make_input_beat(score_meta, voice)
                ],
                [
                    make_targets(score, voice),
                    make_targets_slur(score_meta, voice)
                ]
            )
            for score, score_meta in zip(scores_train, meta_train)
            for voice in range(score.shape[0])
        )


# Endless stream of training samples, one (score, voice) pair per item.
train_gen = cycle(_TrainSamples())

In [167]:
# Inspect the beat features of voice 0 of the first score.
make_input_beat(meta[0], 0)

array([[ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  1.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  1., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  1.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.]])

In [159]:
# Shape of the first metadata array: (voices, timesteps, features).
meta[0].shape

(5, 720, 18)

In [156]:
class _ValidSamples:
    """
    Re-iterable source of validation samples.

    Every iteration walks each (score, voice) pair of the validation split and
    yields `([note_windows, beat_features], [note_targets, slur_targets])`.
    A new generator is created on each `__iter__` call so that `cycle` can
    start another pass; a bare generator expression is exhausted after one
    pass, which would leave nothing to yield for later validation runs.
    """

    def __iter__(self):
        return (
            (
                [
                    # conv_window_size must match the training generator and the
                    # model's input shape; the function default (32) does not.
                    make_input_sequence(score, score_meta, voice, sequence_steps=SEQUENCE_STEPS, conv_window_size=window_size),
                    make_input_beat(score_meta, voice)
                ],
                [
                    make_targets(score, voice),
                    make_targets_slur(score_meta, voice)
                ]
            )
            for score, score_meta in zip(scores_valid, meta_valid)
            for voice in range(score.shape[0])
        )


# Endless stream of validation samples, one (score, voice) pair per item.
valid_gen = cycle(_ValidSamples())

In [109]:
from keras.models import Sequential, Model
from keras import layers
from keras.activations import relu
import keras.callbacks

Using TensorFlow backend.


In [143]:
# Convolutional-recurrent branch over the note/slur windows.
# Input axes (batch axis omitted): (sequence steps, window_size, voices, 2 channels),
# which is the layout produced by make_input_sequence; the number of steps and
# the number of voices are left variable (None).
notes_model = Sequential()
notes_model.add(layers.SpatialDropout3D(0.1, input_shape=(None, window_size, None, 2)))
notes_model.add(layers.ConvLSTM2D(32, 3, return_sequences=True, padding='same'))
notes_model.add(layers.MaxPool3D(2, 2))
# No return_sequences here, so only the output for the last sequence step is kept.
notes_model.add(layers.ConvLSTM2D(64, 3, padding='same'))
notes_model.add(layers.GlobalAveragePooling2D())
notes_model.add(layers.Dropout(0.1))

# Beat features, one row per sample. 16 is the width of meta[..., idx_beat:]
# (the meta arrays have 18 features on the last axis and idx_beat is 2).
beats_input = layers.Input(shape=(16,))

# Join the pooled convolutional features with the beat features.
features = layers.concatenate([notes_model.output, beats_input])

fc_1 = layers.Dense(100, activation='relu')(features)

# Two heads sharing fc_1: a softmax over rest + pitches (same width as the
# make_targets output) and a single sigmoid unit for the slur target.
output_notes = layers.Dense(n_notes + 1, activation='softmax')(fc_1)
output_slur = layers.Dense(1, activation='sigmoid')(fc_1)

# Input/output order must match the lists yielded by the data generators:
# inputs [note windows, beat features], outputs [note targets, slur targets].
model = Model(inputs=[notes_model.input, beats_input], outputs=[output_notes, output_slur])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
spatial_dropout3d_13_input (Inpu (None, None, 16, None 0                                            
____________________________________________________________________________________________________
spatial_dropout3d_13 (SpatialDro (None, None, 16, None 0           spatial_dropout3d_13_input[0][0] 
____________________________________________________________________________________________________
conv_lst_m2d_25 (ConvLSTM2D)     (None, None, 16, None 39296       spatial_dropout3d_13[0][0]       
____________________________________________________________________________________________________
max_pooling3d_13 (MaxPooling3D)  (None, None, 8, None, 0           conv_lst_m2d_25[0][0]            
___________________________________________________________________________________________

In [144]:
# Top-3 categorical accuracy, built by fixing k=3 on the Keras metric function.
top3_acc = functools.partial(keras.metrics.top_k_categorical_accuracy, k=3)

# functools.partial objects have no __name__; set one so the metric has a
# label (it shows up as "top3_acc" in the training log).
top3_acc.__name__ = 'top3_acc'


In [145]:
# Losses are given in output order: categorical cross-entropy for the note
# head, binary cross-entropy for the slur head.
# NOTE(review): the metrics list is applied to both outputs, so top3_acc is
# also reported for the single-unit slur head (the training log shows it as a
# constant 1.0000 there) - that number carries no information.
model.compile('adam', loss=['categorical_crossentropy', 'binary_crossentropy'], metrics=['accuracy', top3_acc])

In [146]:
# Save the model to ./models/model6.hdf5 during training, keeping only the best
# version seen so far (save_best_only=True).
# NOTE(review): the monitored quantity is left at the Keras default (presumably
# val_loss - confirm), and the ./models directory is assumed to exist.
checkpointer = keras.callbacks.ModelCheckpoint(filepath='./models/model6.hdf5', verbose=1, save_best_only=True)

In [147]:
# One step per (score, voice) pair in the training split, so an epoch is one
# full pass over the training generator.
# The built-in sum() is used because calling np.sum() on a generator is deprecated.
steps_per_epoch = sum(score.shape[0] for score in scores_train)
steps_per_epoch

25420

In [148]:
# One step per (score, voice) pair in the validation split, so validation
# covers exactly one full pass over the validation generator.
# The built-in sum() is used because calling np.sum() on a generator is deprecated.
validation_steps = sum(score.shape[0] for score in scores_valid)
validation_steps

2835

In [149]:
# Resume from the checkpoint written by the ModelCheckpoint callback, if there is one;
# otherwise keep the freshly initialised weights.
try:
    model.load_weights('./models/model6.hdf5')
except OSError:
    # The checkpoint file could not be opened - presumably it does not exist yet.
    print('no weights found')

no weights found


In [168]:
# Train from the endless generators. Each generator item is one whole voice of
# one score (all of its timesteps), and steps_per_epoch / validation_steps equal
# the number of (score, voice) pairs, so every epoch and every validation run
# is one full pass over the corresponding split.
history = model.fit_generator(
    train_gen,
    steps_per_epoch=steps_per_epoch,
    epochs=5,
    validation_data=valid_gen,
    validation_steps=validation_steps,
    callbacks=[checkpointer]  # writes the best model to disk as training progresses
)

Epoch 1/5
    3/25420 [..............................] - ETA: 705033s - loss: 4.7630 - dense_12_loss: 4.0615 - dense_13_loss: 0.7015 - dense_12_acc: 0.0397 - dense_12_top3_acc: 0.0750 - dense_13_acc: 0.4738 - dense_13_top3_acc: 1.0000

KeyboardInterrupt: 

In [None]:
# Evaluate on the validation split: validation_steps items, i.e. one per (score, voice) pair.
model.evaluate_generator(valid_gen, steps=validation_steps)