In [8]:
import os
import functools
import pickle
import sys
from typing import Iterator, Generator

# import music21
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tqdm import tqdm
%matplotlib inline

In [9]:
# Load data
# Only the length of `score_names` is used below, to enumerate the per-score
# files `./data/<i>.npy` and `./data/<i>_meta.npy` (presumably one name per score).
score_names = np.load('./data/score_names.npy')

In [30]:
# One array per score, read from ./data/<position>.npy where <position> is the
# score's index in `score_names`.
scores = [
    np.load('./data/{}.npy'.format(score_id))
    for score_id in tqdm(range(len(score_names)))
]

100%|██████████| 5930/5930 [00:09<00:00, 631.28it/s]


In [4]:
# One metadata array per score, read from ./data/<position>_meta.npy and kept
# in the same order as `scores`.
meta = [
    np.load('./data/{}_meta.npy'.format(score_id))
    for score_id in tqdm(range(len(score_names)))
]

100%|██████████| 5930/5930 [08:36<00:00, 11.48it/s]


In [31]:
# Pitch range over the whole corpus. Values <= 0 are treated as rests by
# make_targets, so they are excluded from the minimum.
max_pitch = np.max([np.max(t) for t in scores])
# A score without any sounding note has an empty `t[t > 0]`, and np.min of an
# empty array raises ValueError, so such scores are skipped here.
min_pitch = np.min([np.min(t[t > 0]) for t in scores if np.any(t > 0)])
# Number of distinct pitch classes between min_pitch and max_pitch, inclusive.
n_notes = int(max_pitch - min_pitch) + 1
# Positions along the last axis of a meta array:
idx_slur = 0  # feature read by make_targets_slur
idx_beat = 2  # first of the features returned by make_input_beat

In [32]:
# hyperparameters
# NOTE(review): constant naming mixes UPPER_CASE and lower_case; both names are
# referenced by later cells, so they are left unchanged here.
SEQUENCE_STEPS = 4  # The number of windows to look at; 4 * 32 means we look at an 8-bar window
window_size = 32  # each measure is 16, so 32 is a two-measure window
# Same width as the one-hot target built in make_targets (n_notes pitches + 1 rest class).
n_features = n_notes + 1

In [33]:
def make_targets(score, voice):
    """
    Build the one-hot note targets for one voice of a score.

    Parameters
    ----------
    score : array indexed as ``score[voice, timestep]``
        Pitch values; any value <= 0 is treated as a rest.
    voice : int
        Row of ``score`` to encode.

    Returns
    -------
    np.ndarray of shape ``(score.shape[1], n_notes + 1)``
        Column 0 marks a rest; column ``k >= 1`` marks the pitch
        ``min_pitch + k - 1``.
    """
    n_output_features = n_notes + 1
    pitches = np.asarray(score[voice])
    sounding = pitches > 0

    # Class 0 is reserved for the rest, so sounding notes are shifted up by one.
    classes = np.zeros(pitches.shape[0], dtype=int)
    classes[sounding] = (pitches[sounding] - min_pitch).astype(int) + 1

    y = np.zeros((score.shape[1], n_output_features))  # shape: n timesteps X m features
    y[np.arange(pitches.shape[0]), classes] = 1
    return y

In [34]:
def make_targets_slur(meta, voice):
    """
    Return the slur target track for one voice.

    Selects feature ``idx_slur`` of ``meta[voice]`` at every position along
    the second axis of ``meta``.
    """
    slur_track = meta[voice, :, idx_slur]
    return slur_track

In [35]:
def make_padded(score, window_size, max_voices=None):
    """
    Left-pad a 2-D array with zeros along its second axis.

    ``window_size - 1`` zero columns are prepended, so that the first window
    of length ``window_size`` ends on the first original column. The number of
    rows is left unchanged; ``max_voices`` is accepted but not used.
    """
    n_rows = score.shape[0]
    left_pad = np.zeros((n_rows, window_size - 1))
    return np.concatenate((left_pad, score), axis=1)


In [36]:
def make_input_beat(meta, voice):
    """Return, for every timestep of ``meta[voice]``, the features from index ``idx_beat`` onwards."""
    beat_features = slice(idx_beat, None)
    return meta[voice, :, beat_features]

In [326]:
def make_input_sequence(score, meta, voice, sequence_steps=16, conv_window_size=32):
    """
    Make an input sequence for a particular voice.

    For every timestep ``t`` of ``score`` this builds the window of the
    ``sequence_steps * conv_window_size`` most recent timesteps ending at ``t``
    (zero-padded on the left). Each voice contributes two channels: its pitch
    divided by ``max_pitch`` and its ``idx_slur`` feature from ``meta``. The
    last timestep of every window is zeroed for ``voice`` so that the value
    being predicted is not visible in the input.

    Returns
    -------
    np.ndarray of shape
    ``(score.shape[1], sequence_steps, conv_window_size, score.shape[0], 2)``.
    """
    window_size = sequence_steps * conv_window_size
    n_timesteps = score.shape[1]

    # First, do the notes channel
    padded_score = make_padded(score, window_size) / max_pitch

    # Now, the slurs channel (same feature that make_targets_slur predicts)
    padded_meta = make_padded(meta[:, :, idx_slur], window_size)

    # Stack them together: (voices, padded timesteps, 2)
    stacked = np.stack((padded_score, padded_meta), axis=-1)

    # Row t of the indexer lists the padded positions t .. t + window_size - 1,
    # i.e. the window that ends on original timestep t.
    indexer = np.arange(window_size)[None, :] + np.arange(n_timesteps)[:, None]

    # Make the sliding windows: (timesteps, window_size, voices, 2).
    # Integer-array indexing returns a copy, so the masking below does not
    # write back into `score` or `meta`.
    sequence = stacked.swapaxes(0, 1)[indexer, :, :]

    # Now, mask out the target values
    sequence[:, -1, voice, :] = 0

    return sequence.reshape((n_timesteps, -1, conv_window_size, score.shape[0], 2))

In [None]:
# Seed NumPy's global RNG, which the np.random.randint calls further down draw from.
# Must be run before those cells for the sampling to be repeatable.
np.random.seed(25)

In [313]:
# Hold out 10% of the scores for validation. `scores` and `meta` are split together
# so each score keeps its metadata. random_state is passed explicitly (same value as
# the np.random.seed cell) so the split does not depend on which cells ran before.
scores_train, scores_valid, meta_train, meta_valid = train_test_split(
    scores, meta, test_size=0.1, random_state=25
)

In [314]:
# NOTE(review): make_input_sequence_2 is not defined in any visible cell; this cell
# fails on a fresh kernel unless that definition is restored.
make_input_sequence_2(scores_valid[0], meta_valid[0], 0).shape

(880, 16, 32, 5, 2)

In [315]:
# Sanity check that both implementations produce the same output shape.
# NOTE(review): make_input_sequence_2 is not defined in any visible cell; only the
# shapes are compared here, not the values.
make_input_sequence(scores_valid[0], meta_valid[0], 0).shape == make_input_sequence_2(scores_valid[0], meta_valid[0], 0).shape

True

In [41]:
def cycle(iterable):
    """
    Yield the items of ``iterable`` over and over without caching them.

    ``iterable`` is re-iterated from the start on every pass, so it should be
    a re-iterable container (list, tuple, ...). If a pass produces no items
    at all (an empty container, or a one-shot iterator/generator that has
    already been consumed), the generator stops instead of spinning forever
    without yielding anything.
    """
    while True:
        produced_any = False
        for item in iterable:
            produced_any = True
            yield item
        if not produced_any:
            # Nothing left to repeat; ending here avoids an infinite busy loop.
            return

In [42]:
def _endless_train_samples():
    """
    Yield ``(inputs, targets)`` for every (score, voice) pair of the training
    set, starting over from the first score once all pairs have been produced.

    This is a generator *function* so that each pass re-creates the iteration;
    a generator expression can only be consumed once and cannot be repeated.
    Stops if the training set yields no pairs at all.
    """
    while True:
        produced_any = False
        for score, meta in zip(scores_train, meta_train):
            for voice in range(score.shape[0]):
                produced_any = True
                yield (
                    [
                        make_input_sequence(score, meta, voice, sequence_steps=SEQUENCE_STEPS, conv_window_size=window_size),
                        make_input_beat(meta, voice)
                    ],
                    [
                        make_targets(score, voice),
                        make_targets_slur(meta, voice)
                    ]
                )
        if not produced_any:
            return


train_gen = _endless_train_samples()

In [77]:
# Shape of one metadata array. From how meta is indexed elsewhere in this notebook
# (meta[voice, :, feature]) the axes are presumably (voice, timestep, feature).
meta[0].shape

(5, 720, 18)

In [107]:
# One randomly drawn voice index per validation score (drawn from the global NumPy RNG).
voice_sample_valid = [
    np.random.randint(n_voices)
    for n_voices in (s.shape[0] for s in scores_valid)
]

In [4]:
from keras.utils import Sequence

In [330]:
class BatchSequence(Sequence):
    """
    Keras ``Sequence`` whose items are whole (score, voice) pairs.

    Item ``i`` is built on demand from ``self.indices[i]``; all timesteps of the
    selected voice are returned together as one batch.
    """

    def __init__(self, scores, meta, subsample_voices=False):
        """
        Parameters
        ----------
        scores, meta : sequences of arrays, aligned by position.
        subsample_voices : bool
            If True, keep one voice per score, drawn with ``np.random.randint``
            at construction time. Otherwise keep every voice of every score.
        """
        self.scores = scores
        self.meta = meta
        if not subsample_voices:
            pairs = [
                (score_idx, voice_idx)
                for score_idx, score in enumerate(scores)
                for voice_idx in range(score.shape[0])
            ]
        else:
            # Take one randomly sampled voice for each score
            pairs = [
                (score_idx, np.random.randint(score.shape[0]))
                for score_idx, score in enumerate(scores)
            ]
        self.indices = pairs

    def __len__(self):
        """Number of (score, voice) pairs served by this sequence."""
        return len(self.indices)

    def __getitem__(self, idx):
        """Return ``([note_windows, beat_features], [note_targets, slur_targets])`` for pair ``idx``."""
        score_idx, voice = self.indices[idx]
        score, meta = self.scores[score_idx], self.meta[score_idx]
        inputs = [
            make_input_sequence(score, meta, voice, sequence_steps=SEQUENCE_STEPS, conv_window_size=window_size),
            make_input_beat(meta, voice),
        ]
        targets = [
            make_targets(score, voice),
            make_targets_slur(meta, voice),
        ]
        return (inputs, targets)

In [334]:
# Validation data: one randomly sampled voice per held-out score.
valid_sequence = BatchSequence(scores_valid, meta_valid, subsample_voices=True)

In [335]:
# Training data: every voice of every training score.
train_sequence = BatchSequence(scores_train, meta_train)

In [336]:
from keras.models import Sequential, Model
from keras import layers
from keras.activations import relu
import keras.callbacks

In [337]:
# Convolutional-recurrent branch over the note windows. Per sample the input is
# (sequence step, window_size, voices, 2 channels); the step and voice axes are
# left unspecified (None).
notes_model = Sequential()
notes_model.add(layers.ConvLSTM2D(32, 3, return_sequences=True, padding='same', input_shape=(None, window_size, None, 2)))
notes_model.add(layers.MaxPool3D(2, 2))
notes_model.add(layers.ConvLSTM2D(64, 3, padding='same'))
notes_model.add(layers.GlobalAveragePooling2D())

# Second input: the beat features returned by make_input_beat.
# NOTE(review): 16 is hard-coded; it equals the 18 meta features shown by
# meta[0].shape minus idx_beat (2) — confirm if the meta layout changes.
beats_input = layers.Input(shape=(16,))

features = layers.concatenate([notes_model.output, beats_input])

fc_1 = layers.Dense(100, activation='relu')(features)
dropout = layers.Dropout(0.1)(fc_1)

# Notes head: one class per pitch plus the rest class (same width as make_targets).
output_notes = layers.Dense(n_notes + 1, activation='softmax')(dropout)
# NOTE(review): the slur head is fed from fc_1 directly and so bypasses the dropout
# applied to the notes head — confirm this is intended.
output_slur = layers.Dense(1, activation='sigmoid')(fc_1)

model = Model(inputs=[notes_model.input, beats_input], outputs=[output_notes, output_slur])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
conv_lst_m2d_3_input (InputLayer (None, None, 32, None 0                                            
____________________________________________________________________________________________________
conv_lst_m2d_3 (ConvLSTM2D)      (None, None, 32, None 39296       conv_lst_m2d_3_input[0][0]       
____________________________________________________________________________________________________
max_pooling3d_2 (MaxPooling3D)   (None, None, 16, None 0           conv_lst_m2d_3[0][0]             
____________________________________________________________________________________________________
conv_lst_m2d_4 (ConvLSTM2D)      (None, 16, None, 64)  221440      max_pooling3d_2[0][0]            
___________________________________________________________________________________________

In [338]:
def top3_acc(y_true, y_pred):
    """Top-3 categorical accuracy; a named function so the metric is reported as 'top3_acc'."""
    return keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k=3)


In [339]:
# Losses are given in output order: categorical cross-entropy for the notes head,
# binary cross-entropy for the slur head.
# NOTE(review): a flat metrics list is presumably applied to every output, so
# top3_acc would also be reported for the single-unit slur head, where it carries
# no information — confirm, and consider a per-output metrics dict.
model.compile('adam', loss=['categorical_crossentropy', 'binary_crossentropy'], metrics=['accuracy', top3_acc])

In [340]:
# Checkpoint callback writing to ./models/model6.hdf5; with save_best_only=True the
# file is only overwritten when the callback's monitored quantity improves.
checkpointer = keras.callbacks.ModelCheckpoint(filepath='./models/model6.hdf5', verbose=1, save_best_only=True)

In [341]:
# Total number of (score, voice) pairs in the training set.
# The built-in sum() is used because calling np.sum() on a generator is deprecated.
steps_per_epoch = sum(score.shape[0] for score in scores_train)
steps_per_epoch

25420

In [342]:
# Total number of voices across the validation scores.
# The built-in sum() is used because calling np.sum() on a generator is deprecated.
# NOTE(review): valid_sequence is built with subsample_voices=True, so it holds one
# item per score rather than one per voice; the fit_generator call uses
# len(valid_sequence), not this value.
validation_steps = sum(score.shape[0] for score in scores_valid)
validation_steps

2835

In [343]:
# Resume from previously saved weights when available; otherwise keep the freshly
# initialised model and continue.
# NOTE(review): OSError presumably also covers an unreadable or corrupt file, which
# would be reported with the same 'no weights found' message.
try:
    model.load_weights('./models/model6.hdf5')
except OSError:
    print('no weights found')

In [None]:
# Train for 5 epochs. Each step consumes one item of train_sequence (one voice of one
# score); step counts come from the sequences themselves rather than from the
# steps_per_epoch / validation_steps variables computed earlier.
# Items are produced by 4 worker processes with up to 100 queued items.
history = model.fit_generator(
    train_sequence,
    steps_per_epoch=len(train_sequence),
    epochs=5,
    validation_data=valid_sequence,
    validation_steps=len(valid_sequence),
    callbacks=[checkpointer],
    use_multiprocessing=True,
    max_queue_size=100,
    workers=4
)

Epoch 1/5


In [70]:
# Predict a single step (steps=1) of valid_sequence and time the call.
# The model has two outputs (notes, slur), so `pred` presumably holds one array per output.
%time pred = model.predict_generator(valid_sequence, steps=1, use_multiprocessing=True, workers=4)

CPU times: user 16.8 s, sys: 10.2 s, total: 27 s
Wall time: 1min 6s


In [60]:
# Save the current weights.
# NOTE(review): this is the same path the ModelCheckpoint callback writes to, so the
# best checkpoint is overwritten by whatever the model holds now — confirm intended.
model.save_weights('./models/model6.hdf5')

In [93]:
# NOTE(review): `valid` is not defined in any visible cell; this cell fails on a
# fresh kernel unless that definition is restored.
np.array(valid).shape

(593, 2, 2)

In [82]:
# (score index, voice index) behind the first validation item.
valid_sequence.indices[0]

(0, 2)

In [97]:
# Most likely note class per timestep for the predicted item.
# `pred` comes from predict_generator on a two-output model, so pred[0] is the notes
# output with shape (timesteps, n_notes + 1). Indexing one level deeper (pred[0][0])
# gives a single 1-D row, on which argmax(axis=1) fails.
notes_pred = np.argmax(pred[0], axis=1)

In [99]:
# Ground-truth note class per timestep for the first validation item:
# valid_sequence[0] -> (inputs, targets); [1] -> targets; [0] -> one-hot notes.
notes = np.argmax(valid_sequence[0][1][0], axis=1)

In [103]:
# Decode class indices back to pitches. make_targets maps a rest to class 0 and a
# pitch p to class (p - min_pitch + 1), so every class >= 1 decodes to
# class + min_pitch - 1. The comparison must be `> 0`: with `> 1`, class 1
# (pitch == min_pitch) would be left as 1 instead of being decoded.
notes + (notes > 0) * (min_pitch - 1)

array([ 64.,  64.,  64.,  64.,  64.,  64.,  64.,  64.,  64.,  64.,  64.,
        64.,  64.,  64.,  64.,  64.,  66.,  66.,  66.,  66.,  66.,  66.,
        66.,  66.,  67.,  67.,  67.,  67.,  71.,  71.,  71.,  71.,  71.,
        71.,  71.,  71.,  69.,  69.,  69.,  69.,  67.,  67.,  67.,  67.,
        66.,  66.,  66.,  66.,  64.,  64.,  64.,  64.,  67.,  67.,  67.,
        67.,  67.,  67.,  67.,  67.,  66.,  66.,  66.,  66.,  64.,  64.,
        64.,  64.,  62.,  62.,  62.,  62.,  64.,  64.,  64.,  64.,  64.,
        64.,  66.,  66.,  67.,  67.,  67.,  67.,  67.,  67.,  67.,  67.,
        60.,  60.,  60.,  60.,  60.,  60.,  60.,  60.,  62.,  62.,  62.,
        62.,  62.,  62.,  62.,  62.,  64.,  64.,  64.,  64.,  64.,  64.,
        62.,  62.,  60.,  60.,  60.,  60.,  62.,  62.,  62.,  62.,  64.,
        64.,  64.,  64.,  64.,  64.,  64.,  64.,  64.,  64.,  64.,  64.,
        57.,  57.,  57.,  57.,  59.,  59.,  59.,  59.,  59.,  59.,  59.,
        59.,  62.,  62.,  62.,  62.,  64.,  64.,  6

In [108]:
# Round-trip check: the decoded targets of the first validation item should equal
# the original voice (all zeros expected). The (score, voice) pair is read from
# valid_sequence.indices rather than hard-coded, because the voice is sampled randomly.
# Classes >= 1 decode to class + min_pitch - 1 (class 0 is the rest), hence `notes > 0`.
check_score_idx, check_voice_idx = valid_sequence.indices[0]
scores_valid[check_score_idx][check_voice_idx] - (notes + (notes > 0) * (min_pitch - 1))

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0