In [1]:
import os
import pretty_midi
import joblib
import glob
import numpy as np
from nltk import word_tokenize, sent_tokenize

In [2]:
# Root folder of the MIDI corpus: <current working directory>/Data/Classical_Piano.
current_data = os.path.join(os.getcwd(), "Data", "Classical_Piano")
# Number of timesteps per input window. After changing it, re-run every cell below.
max_seq_length = 100

Generating the data

In [3]:
def slice_piano_roll(piano_roll):
    """Split a piano roll into one binary vector per timestep.

    Parameters
    ----------
    piano_roll : array-like of shape (rows, timesteps)
        Two-dimensional roll. Any non-zero entry (f.ex. 47) is treated as
        "on" so the network only ever sees 0/1 values.

    Returns
    -------
    list of numpy.ndarray
        One float64 array of length ``rows`` per timestep, in column order,
        holding 1.0 where the source entry was non-zero and 0.0 elsewhere.
        A roll with zero timesteps yields an empty list.

    Raises
    ------
    ValueError
        If ``piano_roll`` is not two-dimensional.
    """
    roll = np.asarray(piano_roll)
    # Unpacking doubles as the dimensionality check (ValueError unless 2-D).
    _, _ = roll.shape
    # Binarize the whole roll in one vectorized pass instead of rebuilding a
    # growing buffer column by column.
    binary = np.where(roll == 0, 0.0, 1.0)
    # Transpose so that iterating yields timesteps; a plain list is returned
    # because callers slice and concatenate the result with list semantics.
    return list(np.ascontiguousarray(binary.T))

In [4]:
def reshape(piano_roll):
    """Reshape a sequence of timesteps into an array of shape (timesteps, 128, 1).

    ``piano_roll`` must hold exactly ``len(piano_roll) * 128`` values,
    otherwise ``np.reshape`` raises ``ValueError``.
    """
    n_timesteps = len(piano_roll)
    target_shape = (n_timesteps, 128, 1)
    return np.reshape(piano_roll, target_shape)

In [5]:
def compute_pianoRoll(midi_file):
    """Load one MIDI file and return its first instrument as per-timestep slices.

    Parameters
    ----------
    midi_file : str
        Path handed to ``pretty_midi.PrettyMIDI``.

    Returns
    -------
    list or None
        The result of ``slice_piano_roll`` for the first instrument's piano
        roll, or ``None`` when loading or converting the file fails for any
        reason (the failure is reported on stdout together with the path).
    """
    try:
        midi = pretty_midi.PrettyMIDI(midi_file)
        # Only the first instrument is used -- presumably the piano part;
        # TODO confirm for files that contain several instruments.
        piano_midi = midi.instruments[0]
        # Splitting each track up into 0.2sec "windows" (5 samples per second).
        piano_roll = piano_midi.get_piano_roll(fs=5)
        return slice_piano_roll(piano_roll)
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort the
        # whole batch. Name the file so the failure can be traced afterwards.
        print("Skipping {}: {}".format(midi_file, e))
        return None

In [6]:
def generate_sliced_pianoRolls(n_jobs=10):
    """Convert every ``*.MID`` file one folder below ``current_data`` into sliced piano rolls.

    Parameters
    ----------
    n_jobs : int, optional
        Number of parallel workers passed to ``joblib.Parallel`` (default 10).

    Returns
    -------
    list
        One entry per file that ``compute_pianoRoll`` converted successfully,
        ordered by file path. Files for which it returned ``None`` are dropped.
    """
    # glob yields matches in arbitrary, filesystem-dependent order; sorting
    # keeps the resulting dataset order reproducible between runs.
    # NOTE(review): on case-sensitive filesystems '*.MID' will not match
    # lower-case '.mid' files -- confirm the corpus uses upper-case names.
    midi_files = sorted(glob.glob(os.path.join(current_data, '*', '*.MID')))
    results = joblib.Parallel(n_jobs=n_jobs, verbose=1)(
        joblib.delayed(compute_pianoRoll)(midi_file) for midi_file in midi_files
    )
    return [roll for roll in results if roll is not None]

In [7]:
def get_chunks_and_slices(piano, n, dataX, dataY):
    """Append sliding-window training pairs from every track to ``dataX`` / ``dataY``.

    For each track, leading timesteps are skipped until the first one that
    contains any non-zero value. From there on, every window of ``n``
    consecutive timesteps becomes an input and the timestep right after the
    window becomes its target.

    Parameters
    ----------
    piano : iterable
        Tracks; each track is a sliceable sequence of timesteps (a list of
        arrays or a NumPy array whose first axis is time).
    n : int
        Window length in timesteps.
    dataX : list
        Receives one ``track[i:i + n]`` window per sample (mutated in place).
    dataY : list
        Receives the matching ``track[i + n]`` target (mutated in place).

    Returns
    -------
    None
    """
    for track in piano:
        # Windows may start at 0 .. len(track) - n - 1, so that a target
        # timestep always exists after the window. Every window therefore
        # already has exactly n timesteps and needs no padding.
        stop = len(track) - n
        # Trimming the empty timesteps at the beginning of the sequence.
        first_active = next(
            (i for i in range(stop) if np.any(track[i])),
            None,
        )
        if first_active is None:
            # Track is silent throughout, or too short to yield a window.
            continue
        for start in range(first_active, stop):
            dataX.append(track[start:start + n])
            dataY.append(track[start + n])

In [8]:
def generate_data(dataX, dataY):
    """Fill ``dataX`` and ``dataY`` in place with training windows from the MIDI corpus.

    Builds the sliced piano rolls with ``generate_sliced_pianoRolls`` and cuts
    them into windows of ``max_seq_length`` timesteps via
    ``get_chunks_and_slices``. Nothing is returned.
    """
    sliced_rolls = generate_sliced_pianoRolls()
    # NOTE: a fit_data_to_model(dataX, 128, max_seq_length) post-processing
    # step was sketched here earlier but is disabled.
    get_chunks_and_slices(sliced_rolls, max_seq_length, dataX, dataY)

In [11]:
def get_seq_length() -> int:
    """Return the module-level ``max_seq_length`` setting (window length in timesteps)."""
    return max_seq_length