In [2]:
import os
import pickle
import sys
from typing import Iterator, Generator

import music21
import numpy as np
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split
from tqdm import tqdm
%matplotlib inline

In [3]:
# Load data
# score_names.npy is read as a NumPy array; its length is used by the following
# cell as the number of per-score files to load.
score_names = np.load('./data/score_names.npy')

In [4]:
# Read one array per score from ./data/<index>.npy, for index 0 .. len(score_names) - 1.
# Presumably scores[k] belongs to score_names[k] -- TODO confirm against how the files were written.
scores = [
    np.load('./data/{}.npy'.format(score_idx))
    for score_idx in tqdm(range(len(score_names)))
]

100%|██████████| 5930/5930 [00:10<00:00, 587.01it/s]


In [5]:
# Pitch range over the whole corpus.
max_pitch = np.max([np.max(t) for t in scores])
# Only strictly positive values count towards the minimum (make_targets likewise
# ignores values <= 0). A score without any positive entry is skipped, because
# np.min raises ValueError on an empty selection.
min_pitch = np.min([np.min(t[t > 0]) for t in scores if np.any(t > 0)])
# Number of pitch slots from the lowest to the highest pitch, both inclusive.
n_notes = int(max_pitch - min_pitch) + 1

In [6]:
def make_targets(score, voice):
    """
    Build the one-hot pitch targets for one voice of a score.

    Returns an array of shape (n timesteps, n_notes). Row ``i`` holds a single
    1 in column ``int(note - min_pitch)`` when the voice's value at timestep
    ``i`` is positive; rows whose value is not positive stay all zero.
    ``n_notes`` and ``min_pitch`` are read from module level.
    """
    voice_notes = score[voice]
    y = np.zeros((score.shape[1], n_notes))  # shape: n timesteps X m features
    has_note = voice_notes > 0
    # astype(int) truncates toward zero, exactly like int() on a scalar
    pitch_columns = (voice_notes[has_note] - min_pitch).astype(int)
    # the boolean mask picks the rows, pitch_columns the matching column per row
    y[has_note, pitch_columns] = 1
    return y

In [7]:
def make_targets_meta(meta, voice):
    """
    Build the metadata targets (rest and slur) for one voice.

    Copies ``meta[voice, :, idx_rest]`` and ``meta[voice, :, idx_slur]`` into
    the columns with the same indices of a zero array of shape
    (meta.shape[1], 2). ``idx_rest`` and ``idx_slur`` are module-level names
    that are not defined in the visible part of this notebook; because the
    output has only two columns they presumably have to be 0 and 1 -- TODO
    confirm.
    """
    y_meta = np.zeros((meta.shape[1], 2))  # 2 meta features: slur, rest
    for column in (idx_rest, idx_slur):
        y_meta[:, column] = meta[voice, :, column]
    return y_meta

In [8]:
def make_padded(score, window_size):
    """
    Left-pad a 2-D score with zeros along its second axis.

    ``window_size - 1`` zero columns are prepended so that the first sliding
    window of length ``window_size`` ends on the first real timestep.
    """
    leading_zeros = np.zeros((score.shape[0], window_size - 1))
    return np.concatenate([leading_zeros, score], axis=1)


In [9]:
def make_padded_meta(meta, window_size):
    """
    Left-pad a 3-D metadata array with zeros along its second axis.

    Prepends ``window_size - 1`` all-zero steps on axis 1; the sizes of axis 0
    and axis 2 are kept.
    """
    leading_zeros = np.zeros((meta.shape[0], window_size - 1, meta.shape[2]))
    return np.concatenate([leading_zeros, meta], axis=1)

In [36]:
def make_input_sequence(score, voice, window_size=32):
    """
    Build the sliding-window model input for one voice of a score.

    Returns an array of shape
    (score.shape[1], 1, window_size, score.shape[0], 1): one window per
    timestep, each ending on that timestep and zero-padded on the left. The
    entry of ``voice`` at the last step of every window is set to 0, since
    that is the value to be predicted.
    """
    n_voices = score.shape[0]
    n_steps = score.shape[1]
    padded = make_padded(score, window_size)

    # window t covers padded timesteps t .. t + window_size - 1
    window_starts = np.arange(n_steps).reshape(-1, 1)
    window_offsets = np.arange(window_size).reshape(1, -1)
    # integer-array indexing returns a fresh array, so masking below cannot touch `score`
    windows = padded.T[window_starts + window_offsets]  # (timesteps, window, voices)

    # Now, mask out the target values
    windows[:, -1, voice] = 0

    return windows.reshape((n_steps, 1, window_size, n_voices, 1))

In [37]:
def make_input_sequence_meta(meta, voice, window_size=32):
    """
    Build the sliding-window metadata input for one voice.

    ``meta`` is indexed as (voice, timestep, meta feature). The result has
    shape (timesteps, window_size, voices, meta features): one window per
    timestep, each ending on that timestep and zero-padded on the left. The
    metadata of ``voice`` at the last step of every window is set to 0, since
    that is the value to be predicted.
    """
    padded_meta = make_padded_meta(meta, window_size)

    padding_size = window_size - 1
    # One window per original timestep; the count is derived from padded_meta itself.
    n_windows = padded_meta.shape[1] - padding_size
    indexer = np.arange(window_size)[None, :] + np.arange(n_windows)[:, None]
    # Move time to the front so the integer index picks windows of timesteps.
    meta_sequence = np.swapaxes(padded_meta, 0, 1)[indexer, :, :]

    # Now, mask out the target values
    meta_sequence[:, -1, voice, :] = 0

    return meta_sequence

In [38]:
# Seed NumPy's global RNG so the train/validation split below is repeatable
# (train_test_split is called without random_state, so it draws from this RNG).
np.random.seed(1)

In [39]:
# Hold out 10% of the scores for validation. The split is over whole scores
# (list elements), not over individual timesteps.
scores_train, scores_valid = train_test_split(scores, test_size=0.1)

In [40]:
# Sanity check: a score array is laid out as (voices, timesteps).
scores_valid[0].shape

(4, 208)

In [41]:
# Sanity check: expect (timesteps, 1, window_size, voices, 1) with the default window_size of 32.
make_input_sequence(scores_valid[0], 0).shape

(208, 1, 32, 4, 1)

In [44]:
# One (inputs, targets) pair per (score, voice) combination of the training set.
# This is a plain generator expression: it can be iterated only once and ends
# after the last voice of the last training score.
train_gen = (
    (make_input_sequence(train_score, voice_idx), make_targets(train_score, voice_idx))
    for train_score in scores_train
    for voice_idx in range(train_score.shape[0])
)

In [45]:
# One (inputs, targets) pair per (score, voice) combination of the validation set.
# This is a plain generator expression: it can be iterated only once and ends
# after the last voice of the last validation score.
valid_gen = (
    (make_input_sequence(valid_score, voice_idx), make_targets(valid_score, voice_idx))
    for valid_score in scores_valid
    for voice_idx in range(valid_score.shape[0])
)

In [116]:
# NOTE(review): y_valid is not assigned in any cell visible here, and this cell's
# execution count (116) is out of sequence with its neighbours -- it will likely
# raise NameError on Restart & Run All. TODO confirm where y_valid comes from, or remove.
y_valid[0].shape

(1, 208, 58)

In [52]:
from keras.models import Sequential, Model
from keras.layers import Conv2D, TimeDistributed, Input, Flatten, LSTM, ConvLSTM2D, Dense, Dropout, MaxPool3D, GlobalAveragePooling2D, GlobalAveragePooling1D
import keras.callbacks

In [53]:
# Length of the sliding window the model sees; this must match the window_size
# used by make_input_sequence (whose default is also 32).
window_size = 32
# Width of the softmax output. Derived from the data instead of hard-coding 58,
# so it always equals the number of columns make_targets produces.
n_features = n_notes

In [58]:
# Each sample is shaped (1, window_size, voices, 1); the voices axis is left as
# None so its size is not fixed by the model. GlobalAveragePooling2D collapses
# the remaining spatial axes before the dense layers, and the final softmax has
# one unit per output feature.
model = Sequential([
    ConvLSTM2D(
        32, 3, padding='same', activation='relu', return_sequences=True,
        input_shape=(1, window_size, None, 1),
    ),
    ConvLSTM2D(64, 3, strides=2, padding='same', activation='relu'),
    # Dropout(0.2),
    GlobalAveragePooling2D(),
    Dense(100, activation='relu'),
    Dense(n_features, activation='softmax'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv_lst_m2d_7 (ConvLSTM2D)  (None, 1, 32, None, 32)   38144     
_________________________________________________________________
conv_lst_m2d_8 (ConvLSTM2D)  (None, 16, None, 64)      221440    
_________________________________________________________________
global_average_pooling2d_3 ( (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 100)               6500      
_________________________________________________________________
dense_3 (Dense)              (None, 58)                5858      
Total params: 271,942
Trainable params: 271,942
Non-trainable params: 0
_________________________________________________________________


In [59]:
# Adam optimizer, categorical cross-entropy against the one-hot rows from make_targets.
# NOTE(review): make_targets leaves a row all-zero when the note is <= 0; an all-zero
# target adds nothing to this loss, while the accuracy metric presumably still counts
# those steps -- TODO confirm this is intended.
model.compile('adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for a single epoch, drawing batches from train_gen and validating on valid_gen.
# NOTE(review): both generators yield one batch per (score, voice) pair, whereas the
# step counts below are the number of scores, so one "epoch" appears to consume only
# part of each generator -- TODO confirm this is intended.
# NOTE(review): train_gen and valid_gen are single-pass generator expressions; raising
# epochs (or the step counts) far enough would exhaust them. Keras documents that
# fit_generator expects a generator that loops over its data indefinitely.
model.fit_generator(
    train_gen,
    steps_per_epoch=len(scores_train),
    epochs=1,
    validation_data=valid_gen,
    validation_steps=len(scores_valid),
#     callbacks=[keras.callbacks.ProgbarLogger()]
)

Epoch 1/1
  74/5337 [..............................] - ETA: 107128s - loss: 2.9616 - acc: 0.0623