In [1]:
import pretty_midi
import os
import numpy as np
import heapq
import pickle

# See https://jazz-soft.net/demo/GeneralMidi.html for which instrument each number represents
#instruments = [0, 6, 40, 41, 42, 43, 45, 60, 68, 70, 71, 73]

# MIDI defines 128 distinct pitches (0-127)
num_notes = 128

# Time advances in steps of min_time_shift seconds (10 ms). With
# num_time_shifts distinct step counts, a single TIME_SHIFT message can move
# the network forward by anything from 10 ms up to 1 s, inclusive.
min_time_shift = 0.01
num_time_shifts = 100

# Size of the message vocabulary: one NOTE_ON and one NOTE_OFF per pitch,
# plus one TIME_SHIFT per step count
message_dim = num_notes + num_notes + num_time_shifts

# quantize_time_shift: converts a duration in seconds into the index of its
# time-shift bin in our reduced representation. The duration is rounded to a
# whole number of min_time_shift steps; bin k stands for a shift of k + 1
# steps, so a duration that rounds to zero steps yields -1.
# ARGUMENTS
# time_shift: the number of seconds to shift
# RETURN: the zero-based bin index (not clamped: it may be -1, or
# num_time_shifts and above for long shifts)
def quantize_time_shift(time_shift):
    steps = np.round(time_shift / min_time_shift)
    return int(steps) - 1

# note_on_event: maps a pitch to the index of its NOTE_ON message. NOTE_ON
# messages sit at the start of the message vocabulary, so the index is the
# MIDI number itself.
# ARGUMENTS
# note: the MIDI number for the note to be played
# RETURN: the index for the NOTE_ON message
def note_on_event(note):
    index = note
    return index

# note_off_event: maps a pitch to the index of its NOTE_OFF message. NOTE_OFF
# messages follow the num_notes NOTE_ON messages in the message vocabulary.
# ARGUMENTS
# note: the MIDI number for the note to be turned off
# RETURN: the index for the NOTE_OFF message
def note_off_event(note):
    offset = num_notes  # skip past the NOTE_ON range
    return offset + note

# time_shift_event: maps a quantized time shift to the index of its TIME_SHIFT
# message. TIME_SHIFT messages follow the NOTE_ON and NOTE_OFF ranges in the
# message vocabulary.
# ARGUMENTS
# time_shift: the quantized time shift (a bin index in
# [0, num_time_shifts), as produced by quantize_time_shift)
# RETURN: the index for the TIME_SHIFT message (an integer, not a one-hot
# vector)
def time_shift_event(time_shift):
    # Only bins that exist in the vocabulary may be encoded
    assert 0 <= time_shift < num_time_shifts
    offset = 2*num_notes  # skip past the NOTE_ON and NOTE_OFF ranges
    return offset + time_shift

# append_time_shift: appends TIME_SHIFT events covering time_shift seconds to
# the data list, and appends the time at which each of those shifts starts to
# the times list. A shift longer than the largest bin
# (num_time_shifts*min_time_shift seconds) is split into as many maximum-length
# shifts as needed plus one remainder shift. A shift that rounds to less than
# one min_time_shift step appends nothing.
# ARGUMENTS
# data: time shift events will be appended to this list
# times: times of events will be appended to this list
# time: time of first event
# time_shift: amount of time to shift, in seconds
def append_time_shift(data, times, time, time_shift):
    # Bin index k stands for a shift of k + 1 steps of min_time_shift seconds
    time_shift = quantize_time_shift(time_shift)

    # Split large time shifts into multiple small time shifts
    while time_shift >= num_time_shifts:
        data.append(time_shift_event(num_time_shifts - 1))
        times.append(time)
        time += min_time_shift*num_time_shifts
        # The event just emitted covers num_time_shifts steps, so the bin index
        # of what remains drops by exactly that many. (Subtracting only
        # num_time_shifts - 1 would add a spurious 10 ms per split.)
        time_shift -= num_time_shifts

    # A negative index means nothing (or less than one step) is left to shift
    if time_shift >= 0:
        data.append(time_shift_event(time_shift))
        times.append(time)
    
base_path = 'musicnet_midis/'

# Make sure the output directories exist before anything is saved into them
for split_dir in ('train', 'test'):
    os.makedirs(split_dir, exist_ok=True)


# _emit_next_note_off: pops the earliest pending note-off from the queue and
# appends the time shift leading up to it followed by the NOTE_OFF message.
# ARGUMENTS
# events: message list for the current instrument (appended to)
# event_times: message time list for the current instrument (appended to)
# time: current time in seconds
# off_queue: heap of (off_time, pitch) tuples; must not be empty
# RETURN: the new current time, i.e. the off time of the note just closed
def _emit_next_note_off(events, event_times, time, off_queue):
    off_time, pitch = heapq.heappop(off_queue)
    append_time_shift(events, event_times, time, off_time - time)
    events.append(note_off_event(pitch))
    event_times.append(off_time)
    return off_time


fnum = 0 # Which file are we writing currently?

data_fnames = [] # Save file name corresponding to each numpy array
for composer in os.listdir(base_path):
    if not os.path.isdir(base_path + composer):
        # Only composer folders hold MIDI files; ignore stray entries
        continue

    print('Starting ' + composer)
    for fname in os.listdir(base_path + composer):
        try:
            mid = pretty_midi.PrettyMIDI(base_path + composer + '/' + fname)
        except Exception as err:
            # There are 7 files that cause an IO error, both with mido and pretty_midi. Haven't looked into why.
            # Skipping stays best-effort, but KeyboardInterrupt/SystemExit are no longer swallowed.
            print('Skipping ' + composer + '/' + fname + ': ' + repr(err))
            continue

        if not mid.instruments:
            # Nothing to encode, and max() below would fail on an empty list
            print('Skipping ' + composer + '/' + fname + ': no instruments')
            continue

        # Store data in a 2d numpy array. Dimension 0 indexes the instruments.
        # Dimension 1 contains a sequence of messages for each instrument. Each sequence is padded
        # with -1.
        data = [[] for _ in mid.instruments]

        # Store message times for each instrument. Specifically, we have a 2d numpy array, as above,
        # but containing message times instead of messages. Again, padded with -1
        times = [[] for _ in mid.instruments]

        for i, instrument in enumerate(mid.instruments):
            # Priority queue of notes to turn off and the times to turn them off.
            # Specifically, this is a heap of tuples of the form (off_time, pitch),
            # where the first element is always the next note to turn off
            off_queue = []

            time = 0

            # NOTE(review): the event times below only move forward if
            # instrument.notes is ordered by start time - confirm that
            # pretty_midi guarantees this, or sort the notes first.
            for n, note in enumerate(instrument.notes):
                # Fixes bug in 'Haydn/2104_op64n5_1.mid' where notes 674 and 675 are the same note
                if n > 0:
                    previous = instrument.notes[n - 1]
                    if note.pitch == previous.pitch and note.start == previous.start:
                        continue

                # Turn off every note that ends before this one starts
                while off_queue and note.start > off_queue[0][0]:
                    time = _emit_next_note_off(data[i], times[i], time, off_queue)

                # Time shift until we reach the start of this note
                append_time_shift(data[i], times[i], time, note.start - time)
                time = note.start

                data[i].append(note_on_event(note.pitch))
                times[i].append(time)

                # Add this note to the queue of notes needing to be turned off
                heapq.heappush(off_queue, (note.end, note.pitch))

            # No more notes left. Flush the off queue. This runs after the loop
            # (not on its last iteration) so that it still happens when the
            # final note is a skipped duplicate.
            while off_queue:
                time = _emit_next_note_off(data[i], times[i], time, off_queue)

            data[i] = np.array(data[i])
            times[i] = np.array(times[i])

            assert(len(times[i]) == len(data[i]))

        max_len = max([len(seq) for seq in data])

        data_np = -np.ones((len(data), max_len), dtype='long')
        for i, seq in enumerate(data):
            data_np[i, :len(seq)] = np.array(seq)

        times_np = -np.ones((len(data), max_len), dtype='float')
        for i, seq in enumerate(times):
            times_np[i, :len(seq)] = np.array(seq)

        # Every fifth file goes to the test split, the rest to train
        if fnum%5 == 4:
            prefix = 'test'
        else:
            prefix = 'train'

        np.save(prefix + '/recording' + str(fnum) + '.npy', data_np)
        np.save(prefix + '/times' + str(fnum) + '.npy', times_np)

        # Also save a numpy array containing the MIDI number for each instrument
        instruments = np.array([instrument.program for instrument in mid.instruments])
        np.save(prefix + '/instruments' + str(fnum) + '.npy', instruments)

        data_fnames.append(composer + '/' + fname)
        fnum += 1

# Index i of this list is the source file of recording<i>.npy
with open("preprocessed_data_fnames.p", "wb") as fnames_file:
    pickle.dump(data_fnames, fnames_file)

Starting Cambini
Starting Schubert
Starting Bach




Starting Faure
Starting Dvorak
Starting Beethoven
Starting Mozart
Starting Ravel
Starting Brahms
Starting Haydn
