# Building a Generative Music Model

In [1]:
import os

import numpy as np
import tensorflow as tf
from tqdm import tqdm

import utils

In [None]:
# Load the vectorized songs. The array holds Python objects (per-song sequences
# of differing length), hence allow_pickle=True -- only load files you trust,
# since unpickling can execute arbitrary code.
songs_path = 'processed_data/vectorized_classical_songs2.npy'
songs = np.load(songs_path, allow_pickle=True)

In [None]:
# Column 0 of each row holds the song's pitch sequence; its length is the note count.
notes_per_song = [len(pitches) for pitches in songs[:, 0]]

print(f'# of songs in dataset: {len(songs):,}')
print(f'avg # of notes per song: {np.mean(notes_per_song):.0f}')

# of songs in dataset: 3,013
avg # of notes per song: 2664


In [None]:
# Dimensions of the loaded array: one row per song, two columns (pitches, durations).
songs.shape

(3013, 2)

## Create Training Sequences

In [None]:
# Arguments handed to utils.get_sequences for both the pitch and the duration stream.
sequence_kwargs = dict(input_length=32, output_length=1, offset=1, shift=1)

# Per-song input (X) and target (Y) arrays, collected separately for pitches (_p)
# and durations (_d) and concatenated once at the end.
Xs_p, Xs_d = [], []
Ys_p, Ys_d = [], []

for song_pitches, song_durs in tqdm(songs):
    pitch_inputs, pitch_targets = utils.get_sequences(song_pitches, **sequence_kwargs)
    dur_inputs, dur_targets = utils.get_sequences(song_durs, **sequence_kwargs)

    # A 1-D result cannot be stacked with the 2-D (n_sequences, input_length) arrays,
    # so such songs are dropped (presumably songs too short to yield one full window).
    if pitch_inputs.ndim == 1:
        continue

    Xs_p.append(pitch_inputs)
    Xs_d.append(dur_inputs)
    Ys_p.append(pitch_targets)
    Ys_d.append(dur_targets)

# Stack every song's sequences into one array per stream.
X_p, X_d = np.concatenate(Xs_p), np.concatenate(Xs_d)
Y_p, Y_d = np.concatenate(Ys_p), np.concatenate(Ys_d)

100%|██████████| 3013/3013 [15:34:21<00:00, 18.61s/it]


In [None]:
# Report input and target shapes, with a blank line between the two groups.
print(
    f'X_p shape: {X_p.shape}',
    f'X_d shape: {X_d.shape}',
    '',
    f'Y_p shape: {Y_p.shape}',
    f'Y_d shape: {Y_d.shape}',
    sep='\n',
)

X_p shape: (7928979, 32)
X_d shape: (7928979, 32)

Y_p shape: (7928979, 1)
Y_d shape: (7928979, 1)


In [None]:
# Cache the sequences so the slow extraction step above need not be repeated.
# Each keyword stores a (pitch, duration) pair as a single stacked array.
sequences_path = 'processed_data/classical_songs_sequences.npz'
np.savez_compressed(
    sequences_path,
    X=(X_p, X_d),
    Y=(Y_p, Y_d),
)

## Modeling

### Load data

In [2]:
# Load the cached training sequences. For a .npz archive np.load returns a lazy
# NpzFile that keeps the file open, so it is used as a context manager to close
# the handle once both arrays have been read into memory.
# (mmap_mode is omitted: it has no effect on .npz archives.)
with np.load('processed_data/classical_songs_sequences.npz') as data:
    X = data['X'] # (X_p, X_d)
    Y = data['Y'] # (Y_p, Y_d)

# Index 0 is the pitch stream, index 1 the duration stream.
X_p, X_d = X[0], X[1]
Y_p, Y_d = Y[0], Y[1]

In [3]:
# X_p is (n_sequences, notes_per_sequence).
n_sequences, notes_per_sequence = X_p.shape
print(f'there are {n_sequences:,} sequences of {notes_per_sequence} notes for training')

there are 7,928,979 sequences of 32 notes for training


In [4]:
# One-hot encode the targets for the categorical cross-entropy loss.
# The length-1 axis of each target array is squeezed away first; since
# num_classes is not given, the class count is inferred from the largest label present.
Y_p_ohe = tf.keras.utils.to_categorical(np.squeeze(Y_p))
Y_d_ohe = tf.keras.utils.to_categorical(np.squeeze(Y_d))

### Model1

In [5]:
from tensorflow.keras.layers import Input, Embedding, Concatenate, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [6]:
# constants: number of distinct pitch / duration ids, and the input window length
n_pitches, n_durs = 12, 5
seq_len = X_p.shape[1]

# hyperparameters: embedding width and layer sizes for model1
embed_dim = 12
lstm1_n_units, dense1_n_units = 512, 256

In [7]:
# Define input layers: one variable-length sequence of integer ids per feature
pitch_input = Input(shape=(None, ), name='pitch_input')
dur_input = Input(shape=(None, ), name='dur_input')

# Define embedding layers: every id is mapped to an embed_dim-dimensional vector
pitch_embedding = Embedding(n_pitches, embed_dim, name='pitch_emb')(pitch_input)
dur_embedding = Embedding(n_durs, embed_dim, name='dur_emb')(dur_input)

# Merge the embeddings along the feature axis so that each timestep carries both
# the pitch and the duration of the same note: (batch, time, 2 * embed_dim).
# Concatenating on axis=1 (time) would instead append the whole duration sequence
# after the whole pitch sequence, so the LSTM would never see a note's pitch and
# duration together.
pitch_dur = Concatenate(axis=-1, name='merge_pitch_dur')([pitch_embedding, dur_embedding])

# Define LSTM layer: only the final hidden state is returned
lstm1 = LSTM(lstm1_n_units, name='lstm1')(pitch_dur)

# Define dense layer shared by both output heads (no activation, i.e. linear)
dense1 = Dense(dense1_n_units, name='dense1')(lstm1)

# Define output layers: one softmax distribution over pitches, one over durations
pitch_output = Dense(n_pitches, activation='softmax', name='pitch_output')(dense1)
dur_output = Dense(n_durs, activation='softmax', name='dur_output')(dense1)

# Define model: two inputs (pitch, duration) -> two outputs (next pitch, next duration)
model1 = tf.keras.Model(inputs=[pitch_input, dur_input], outputs=[pitch_output, dur_output], name='model1')

In [8]:
# Print the layer-by-layer architecture and parameter counts of model1.
model1.summary()

Model: "model1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
pitch_input (InputLayer)        [(None, None)]       0                                            
__________________________________________________________________________________________________
dur_input (InputLayer)          [(None, None)]       0                                            
__________________________________________________________________________________________________
pitch_emb (Embedding)           (None, None, 12)     144         pitch_input[0][0]                
__________________________________________________________________________________________________
dur_emb (Embedding)             (None, None, 12)     60          dur_input[0][0]                  
_____________________________________________________________________________________________

In [12]:
# Adam with its default settings; the same loss and metric apply to both output heads.
opt = Adam()
model1.compile(
    loss='categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

In [13]:
# The dataset is very large, so steps_per_epoch is reduced: each epoch runs
# 8000 batches of 64 sequences rather than a full pass over the data.
# TODO: currently the train-val split results in data leakage, that's okay for now, but a split on the song-level would solve that
history = model1.fit(
    x=[X_p, X_d],
    y=[Y_p_ohe, Y_d_ohe],
    batch_size=64,
    steps_per_epoch=8000,
    epochs=5,
    validation_split=0.01,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [14]:
# Persist the trained model under models/, which is where the generation section
# loads it from ('models/model1_1.h5'). Saving to the working directory left that
# load pointing at a file this notebook never wrote.
os.makedirs('models', exist_ok=True)
model1.save('models/model1_1.h5')

### Model2: Add second LSTM layer


In [16]:
# constants: number of distinct pitch / duration ids, and the input window length
n_pitches, n_durs = 12, 5
seq_len = X_p.shape[1]

# hyperparameters: embedding width and layer sizes for the two-LSTM model2
embed_dim = 12
lstm1_n_units, lstm2_n_units = 128, 64
dense1_n_units = 32

In [17]:
# Define input layers: one variable-length sequence of integer ids per feature
pitch_input = Input(shape=(None, ), name='pitch_input')
dur_input = Input(shape=(None, ), name='dur_input')

# Define embedding layers: every id is mapped to an embed_dim-dimensional vector
pitch_embedding = Embedding(n_pitches, embed_dim, name='pitch_emb')(pitch_input)
dur_embedding = Embedding(n_durs, embed_dim, name='dur_emb')(dur_input)

# Merge the embeddings along the feature axis so that each timestep carries both
# the pitch and the duration of the same note: (batch, time, 2 * embed_dim).
# Concatenating on axis=1 (time) would instead append the whole duration sequence
# after the whole pitch sequence, so the LSTMs would never see a note's pitch and
# duration together.
pitch_dur = Concatenate(axis=-1, name='merge_pitch_dur')([pitch_embedding, dur_embedding])

# Define LSTM layers: the first returns the full sequence so the second can
# consume it; the second returns only its final hidden state
lstm1 = LSTM(lstm1_n_units, name='lstm1', return_sequences=True)(pitch_dur)
lstm2 = LSTM(lstm2_n_units, name='lstm2')(lstm1)

# Define dense layer shared by both output heads (no activation, i.e. linear)
dense1 = Dense(dense1_n_units, name='dense1')(lstm2)

# Define output layers: one softmax distribution over pitches, one over durations
pitch_output = Dense(n_pitches, activation='softmax', name='pitch_output')(dense1)
dur_output = Dense(n_durs, activation='softmax', name='dur_output')(dense1)

# Define model: two inputs (pitch, duration) -> two outputs (next pitch, next duration)
model2 = tf.keras.Model(inputs=[pitch_input, dur_input], outputs=[pitch_output, dur_output], name='model2')

In [18]:
# Print the layer-by-layer architecture and parameter counts of model2.
model2.summary()

Model: "model2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
pitch_input (InputLayer)        [(None, None)]       0                                            
__________________________________________________________________________________________________
dur_input (InputLayer)          [(None, None)]       0                                            
__________________________________________________________________________________________________
pitch_emb (Embedding)           (None, None, 12)     144         pitch_input[0][0]                
__________________________________________________________________________________________________
dur_emb (Embedding)             (None, None, 12)     60          dur_input[0][0]                  
_____________________________________________________________________________________________

In [19]:
# Adam with its default settings; the same loss and metric apply to both output heads.
opt = Adam()
model2.compile(
    loss='categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

In [20]:
# The dataset is very large, so steps_per_epoch is reduced: each epoch runs
# 8000 batches of 64 sequences rather than a full pass over the data.
# TODO: currently the train-val split results in data leakage, that's okay for now, but a split on the song-level would solve that
history = model2.fit(
    x=[X_p, X_d],
    y=[Y_p_ohe, Y_d_ohe],
    batch_size=64,
    steps_per_epoch=8000,
    epochs=5,
    validation_split=0.01,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [21]:
# Persist the trained model2 as an HDF5 file in the working directory.
# NOTE(review): model1 is reloaded from 'models/' in the generation section --
# confirm whether saved models are meant to live there too.
model2.save('model2.h5')

# Generate Some Music

To generate music (a sequence of notes), provide a seed of pitches and durations, then specify the number of notes to generate, the maximum seed length used when predicting each note, and the temperature used when sampling notes from the model's predicted probability distribution.

In [None]:
# Reload the trained model1 from disk so generation can run without retraining.
model1 = tf.keras.models.load_model('models/model1_1.h5')

In [None]:
# Generate 64 notes with model1. The two [0] arguments are the pitch seed and the
# duration seed; at most the last 16 notes are used as context for each
# prediction, and sampling uses a temperature of 1.25.
gen_seq = utils.generate_sequence(
    model1,
    [0],
    [0],
    n_notes=64,
    seed_len=16,
    temp=1.25,
)

In [None]:
# Split the generated sequence into its pitch and duration components.
gen_p, gen_d = gen_seq # unpack pitches and durations

In [None]:
# Turn the generated pitches and durations into a stream object that can be
# played back and exported below.
stream = utils.generate_stream(gen_p, gen_d) # generate a stream from the sequence of notes

In [None]:
# Play the generated music in real time through music21's StreamPlayer
utils.music21.midi.realtime.StreamPlayer(stream).play()

In [None]:
# Export the generated stream as a MIDI file under generated_music/model1/.
utils.stream_to_midi(stream, 'generated_music/model1/0') # save stream to midi file