In [None]:
#https://stackoverflow.com/questions/50915634/multilayer-seq2seq-model-with-lstm-in-keras

In [1]:
import os
import pretty_midi
import numpy as np
import math
import random
import matplotlib.pyplot as plt

In [15]:
from tensorflow import keras

In [14]:
from __future__ import print_function

from keras.models import Model
from keras.models import load_model

from keras.layers import Input, Bidirectional, LSTM, Concatenate, Dense

In [3]:
import pretty_midi
import argparse
import librosa
import os
import sys

In [4]:
workdir = "C:\\Users\\toend\\Documents\\ITU\\Thesis"
pathToSave = os.path.join(workdir, "MIDIdata\\RESULTS")
songLengthInSeconds = 30
frequency = 100
START_OF_SEQUENCE_VALUE = 101
END_OF_SEQUENCE_VALUE = 102

In [5]:
def reshape(piano_roll):
    h, w = piano_roll.shape
    slices = []
    for i in range(w):
        columnSlice = piano_roll[:,i]
        columnSlice = np.asarray(columnSlice)
        slices.append(columnSlice)
    return np.asarray(slices)

In [6]:
def createPianoRoll(piano):
    endTime = math.ceil(piano.get_end_time())
    #print(endTime)
    notes = piano.notes
    #adding a pad to avoid index out of bounds
    width = (endTime*frequency)
    piano_roll = np.zeros((128, width))
    for note in notes:
        pitch = note.pitch
        #NORMALIZING
        velocity = note.velocity/100
        start = int(round(note.start * frequency))
        end = int(round(note.end * frequency))
        for i in range(start-1, end):
            piano_roll[pitch][i] = velocity
    return piano_roll

In [7]:
def loadData(path):
    song = []
    with os.scandir(path) as entries:
        for entry in entries:
            if entry.is_file() and not ".ini" in entry.name:
                filename = entry.name
                #print(filename)
                midi_file = entry.path
                pm = pretty_midi.PrettyMIDI(midi_file)
                piano = pm.instruments[0]
                piano_roll = createPianoRoll(piano)
                piano_roll = reshape(piano_roll)
                song.append(piano_roll)
    return np.array(song)

In [8]:
def padPianoRollToCorrectShape(pianoRoll):
    result = np.zeros((3000,128))
    result[:pianoRoll.shape[0],:pianoRoll.shape[1]] = pianoRoll
    return result

In [9]:
def padPianoRollWithStartAndEndSymbols(pianoRoll):
    start = np.array([START_OF_SEQUENCE_VALUE for i in range(128)])
    X = np.insert(pianoRoll, 0, start, axis=0)
    end = np.array([[END_OF_SEQUENCE_VALUE for i in range(128)]])
    X = np.append(X, end, axis=0)
    return X

In [10]:
#LOADING THE DATA FROM ALL 4 CATEGORIES


path = os.path.join(workdir, "MIDIdata\\TRAINING\\ArousedNegative")
X = loadData(path)
X1 = []
for pianoRoll in X:
    pianoRoll = padPianoRollToCorrectShape(pianoRoll)
    #pianoRoll = padPianoRollWithStartAndEndSymbols(pianoRoll)
    X1.append(pianoRoll)


path = os.path.join(workdir, "MIDIdata\\TRAINING\\ArousedPositive")
X = loadData(path)
X2 = []
for pianoRoll in X:
    pianoRoll = padPianoRollToCorrectShape(pianoRoll)
    #pianoRoll = padPianoRollWithStartAndEndSymbols(pianoRoll)
    X2.append(pianoRoll)


path = os.path.join(workdir, "MIDIdata\\TRAINING\\CalmNegative")
X = loadData(path)
X3 = []
for pianoRoll in X:
    pianoRoll = padPianoRollToCorrectShape(pianoRoll)
    #pianoRoll = padPianoRollWithStartAndEndSymbols(pianoRoll)
    X3.append(pianoRoll)

path = os.path.join(workdir, "MIDIdata\\TRAINING\\CalmPositive")
X = loadData(path)
X4 = []
for pianoRoll in X:
    pianoRoll = padPianoRollToCorrectShape(pianoRoll)
    #pianoRoll = padPianoRollWithStartAndEndSymbols(pianoRoll)
    X4.append(pianoRoll)


print(len(X1))
#print(np.array(X1)[0][0])
#print(np.array(X1)[0][3001])
print(len(X2))
print(len(X3))
print(len(X4))

150
179
112
69


In [11]:
encoderInputRolls = np.array(X1[:150])
decoderInputRolls = np.array(X2[:150])
decoderTargetRolls = np.array(X2[:150])
#print(decoderTargetRolls[0][0])

In [12]:
#SHIFTING TARGET DATA SO THAT IT STAYS ONE TIMESTEP AHEAD

#SHOULD START AND END TOKEN BE DELETED FOR ENCODER? !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

#DELETING END TOKEN FOR DECODER INPUT
#decoderInputRolls = np.delete(decoderInputRolls, 3001 , 1)

#DELETING START TOKEN FOR TARGET
#decoderTargetRolls = np.delete(decoderTargetRolls, 0, 1)


print(encoderInputRolls.shape)
print(decoderInputRolls.shape)
print(decoderTargetRolls.shape)
#print(decoderTargetRolls[0][0])

(150, 3000, 128)
(150, 3000, 128)
(150, 3000, 128)


In [17]:
batch_size = 30  # Batch size for training. Usually 64
epochs = 150  # Number of epochs to train for.
latent_dim = 100  # Latent dimensionality of the encoding space.
pianoNodes = 128  # For input shape
learning_rate = 0.03  #No clue, default is 0.01
decay = learning_rate/epochs  #https://www.pyimagesearch.com/2019/07/22/keras-learning-rate-schedules-and-decay/

In [18]:
#Learning Rate
#https://machinelearningmastery.com/understand-the-dynamics-of-learning-rate-on-deep-learning-neural-networks/
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-2,
    decay_steps=10000,
    decay_rate=0.9)

optimizer = keras.optimizers.SGD(learning_rate=lr_schedule)

In [20]:
#Defining an input sequence and process it.
encoder_inputs = Input(shape=(None, pianoNodes))
#Defining the two layers and the output
encoder_outputs, forward_h1, forward_c1, backward_h1, backward_c1  = Bidirectional(LSTM(latent_dim, return_state=True, return_sequences=True))(encoder_inputs)
#Defining the output of the two layers
_, forward_h2, forward_c2, backward_h2, backward_c2 = Bidirectional(LSTM(latent_dim, return_state=True))(encoder_outputs)

# Discarding `encoder_outputs` and only keep the states.
# Is concatenating the right choice?
state_h1 = Concatenate()([forward_h1, backward_h1])
state_c1 = Concatenate()([forward_c1, backward_c1])
state_h2 = Concatenate()([forward_h2, backward_h2])
state_c2 = Concatenate()([forward_c2, backward_c2])
encoder_states = [state_h1, state_c1, state_h2, state_c2]

In [22]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, pianoNodes))
#Defining the two layers
decoder_lstm1 = LSTM(latent_dim*2, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm1(decoder_inputs, initial_state= [state_h1, state_c1])

decoder_lstm2 = LSTM(latent_dim*2, return_sequences=True, return_state=True)
final, dh2, dc2 = decoder_lstm2(decoder_outputs, initial_state= [state_h2, state_c2])

#CONSIDER ACTIVATION FUNCTION --> SIGMOID MIGHT BE BETTER FOR NOW
decoder_dense = Dense(pianoNodes, activation='sigmoid')
decoder_outputs = decoder_dense(final)

In [23]:
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Run training
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit([encoderInputRolls, decoderInputRolls], decoderTargetRolls,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.1)
# Save model
model.save('2LayeredBLSTMs2s.h5')

Train on 135 samples, validate on 15 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150


Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150


Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


In [24]:
# load model
model = load_model('2LayeredBLSTMs2s.h5')
# summarize model.
model.summary()


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, None, 128)    0                                            
__________________________________________________________________________________________________
bidirectional_3 (Bidirectional) [(None, None, 200),  183200      input_2[0][0]                    
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, None, 128)    0                                            
__________________________________________________________________________________________________
concatenate_5 (Concatenate)     (None, 200)          0           bidirectional_3[0][1]            
                                                                 bidirectional_3[0][3]      

In [25]:
# Next: inference mode (sampling).
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h1 = Input(shape=(latent_dim*2,))
decoder_state_input_c1 = Input(shape=(latent_dim*2,))
decoder_state_input_h2 = Input(shape=(latent_dim*2,))
decoder_state_input_c2 = Input(shape=(latent_dim*2,))

decoder_states_inputs = [decoder_state_input_h1, decoder_state_input_c1,
                         decoder_state_input_h2, decoder_state_input_c2]

decoder_outputs, state_h1, state_c1 = decoder_lstm1(decoder_inputs, initial_state=decoder_states_inputs[:2])
final, state_h2, state_c2 = decoder_lstm2(decoder_outputs, initial_state=decoder_states_inputs[-2:])


decoder_states = [state_h1, state_c2, state_h2, state_c2]

decoder_outputs = decoder_dense(final)

decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [26]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    
    # Generate empty target sequence of length 1.
    # Populate the first character of target sequence with the start character.
    #start = np.array([START_OF_SEQUENCE_VALUE for i in range(128)])
    
    start = np.zeros(128)
    target_seq = np.reshape(start, (1, 1, 128))
    
    #Normalizing the START_OF_SEQUENCE_VALUE from 101 to 1.01
    #target_seq = target_seq/100

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_pianoRoll = np.empty((1,1,128))
    while not stop_condition:
        output_tokens, h1, c1, h1, c1 = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a single node
        #sampled_node = np.argmax(output_tokens[0, -1, :])
        
        #Add all nodes
        addedPianoRoll = np.reshape(output_tokens[0, -1, :], (1,1,128))
        decoded_pianoRoll = np.append(decoded_pianoRoll, addedPianoRoll , axis=0)

        # Exit condition: either hit max length
        # or find stop character ---> sampled_node == 102
        if (len(decoded_pianoRoll) > 3000):
            stop_condition = True

        # Update the target sequence (of length 1).
        #target_seq = np.zeros((1, 1, 128))
        #target_seq[0, 0, sampled_token_index] = 1.
        
        target_seq = np.reshape(output_tokens[0, -1, :], (1,1,128))
        
        #Readjusting values to avoid vanishing values
        target_seq = convertProbabilitiesToActualPianoRoll(target_seq)
        
        # Update states
        states_value = [h1, c1, h1, c1]

    return decoded_pianoRoll

In [27]:
def piano_roll_to_pretty_midi(piano_roll, fs=100, program=0):
    '''Convert a Piano Roll array into a PrettyMidi object
     with a single instrument.
    Parameters
    ----------
    piano_roll : np.ndarray, shape=(128,frames), dtype=int
        Piano roll of one instrument
    fs : int
        Sampling frequency of the columns, i.e. each column is spaced apart
        by ``1./fs`` seconds.
    program : int
        The program number of the instrument.
    Returns
    -------
    midi_object : pretty_midi.PrettyMIDI
        A pretty_midi.PrettyMIDI class instance describing
        the piano roll.
    '''
    notes, frames = piano_roll.shape
    pm = pretty_midi.PrettyMIDI()
    instrument = pretty_midi.Instrument(program=program)

    # pad 1 column of zeros so we can acknowledge inital and ending events
    piano_roll = np.pad(piano_roll, [(0, 0), (1, 1)], 'constant')

    # use changes in velocities to find note on / note off events
    velocity_changes = np.nonzero(np.diff(piano_roll).T)

    # keep track on velocities and note on times
    prev_velocities = np.zeros(notes, dtype=int)
    note_on_time = np.zeros(notes)

    for time, note in zip(*velocity_changes):
        # use time + 1 because of padding above
        velocity = piano_roll[note, time + 1]
        time = time / fs
        if velocity > 0:
            if prev_velocities[note] == 0:
                note_on_time[note] = time
                prev_velocities[note] = velocity
        else:
            pm_note = pretty_midi.Note(
                velocity=prev_velocities[note],
                pitch=note,
                start=note_on_time[note],
                end=time)
            instrument.notes.append(pm_note)
            prev_velocities[note] = 0
    pm.instruments.append(instrument)
    return pm

In [28]:
#Decide cutoff value for node-on/off --- 0.09?
def convertProbabilitiesToActualPianoRoll(pianoRoll):
    pianoRoll = pianoRoll *1.5
    #print(decodedPianoRoll[0])
    pianoRoll = np.where(pianoRoll<0.99, pianoRoll, 0.90)
    #Everything above 17%?
    pianoRoll = np.where(pianoRoll>0.80, pianoRoll, 0)
    #print(pianoRoll[0])
    
    #pianoRoll = np.where(pianoRoll==0, pianoRoll, 80)
    #print(pianoRoll[0])
    #print(pianoRoll.shape)
    return pianoRoll

In [29]:
pianoRolls = np.empty((1, 3000, 128))
for seq_index in range(3):
    # Take one sequence (part of the training set) for trying out decoding.
    print('----------------')
    input_seq = encoderInputRolls[seq_index: seq_index + 1]
    print(input_seq.shape) # (1, 3002, 128)
    decoded_pianoRoll = decode_sequence(input_seq)
    print(decoded_pianoRoll.shape) # (3001, 1 , 128) --> can be changed 
    
    #Reshaping to get rid of 3rd dimension
    decoded_pianoRoll = np.reshape(decoded_pianoRoll, (decoded_pianoRoll.shape[0], 128))
    print(decoded_pianoRoll.shape)
    
    #deleting start of sequence
    decoded_pianoRoll = np.delete(decoded_pianoRoll, 0, 0)
    #print(decoded_pianoRoll.shape)
    
    #converting to actual pianoRoll with only chosen node-on/off
    pianoRoll = convertProbabilitiesToActualPianoRoll(decoded_pianoRoll)
    
    #converting probabilities 0-1 to velocity values 0-100
    pianoRoll = pianoRoll*100
    
    #Converting velocity values to values you can actually hear (from 1.5 to something like 60)
    #decoded_pianoRoll = decoded_pianoRoll*60
    
    
    #Inspecting values generated by the model
    #for i in range(200):
    #    print(pianoRoll[i])
    
    #Reshaping to become an actual pianoRoll
    pianoRoll = reshape(pianoRoll)
    print(pianoRoll.shape)
    
    #Flipping nodes around
    #pianoRoll = np.flip(pianoRoll, 0)
    
    #saving to midi
    pm = piano_roll_to_pretty_midi(pianoRoll, frequency, 0)
    pm.write(pathToSave + "\\BLSTM" + str(seq_index) + ".mid")
    
    #Adding pianoroll to list of all pianorolls
    #decodedPianoRolls = np.append(decodedPianoRolls, decoded_pianoRoll, axis=0)
    #pianoRolls = np.append(pianoRolls, pianoRoll, axis=0)

----------------
(1, 3000, 128)
(3001, 1, 128)
(3001, 128)
(128, 3000)
----------------
(1, 3000, 128)
(3001, 1, 128)
(3001, 128)
(128, 3000)
----------------
(1, 3000, 128)
(3001, 1, 128)
(3001, 128)
(128, 3000)


In [30]:
for i in range(200):
    print(pianoRoll[:,i])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [31]:
for i in range(200):
    print(decoded_pianoRoll[i])

[9.16719437e-05 2.39908695e-04 1.73956156e-04 1.41113997e-04
 1.76250935e-04 2.85446644e-04 5.68628311e-05 2.03073025e-04
 5.08964062e-04 2.63720751e-04 1.61916018e-04 1.77949667e-04
 1.25676394e-04 5.98728657e-05 2.49773264e-04 6.62326813e-04
 1.27226114e-04 8.18967819e-05 1.27971172e-04 1.66505575e-04
 2.20507383e-04 1.27017498e-04 1.70201063e-04 2.87950039e-04
 1.26263499e-03 3.93837690e-04 1.40750408e-03 3.66240740e-04
 2.09426880e-03 3.82933021e-03 3.80367041e-03 6.23056293e-03
 3.26403975e-03 7.05081224e-03 4.20263410e-03 6.28107786e-03
 7.70279765e-03 3.32441926e-03 8.06763768e-03 5.45269251e-03
 9.15920734e-03 9.41213965e-03 1.45350397e-02 2.15345621e-02
 1.07532442e-02 1.75439417e-02 1.44267976e-02 1.35226846e-02
 2.05058753e-02 1.68275237e-02 2.50170231e-02 2.36437023e-02
 2.22393572e-02 2.37343609e-02 1.57251358e-02 2.79541612e-02
 2.68454850e-02 2.83117890e-02 2.35763192e-02 3.14463675e-02
 3.72833014e-02 3.25360000e-02 3.48744690e-02 2.90248990e-02
 4.35786545e-02 4.518598