In [1]:
import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf
import keras
import pickle
from keras.utils import np_utils
from tensorflow.keras.layers import InputLayer, Dense, LSTM, Dropout, Activation
from tensorflow.keras.models import Sequential

from music21 import converter, instrument, note, chord, stream
import glob

from keras.utils import np_utils

FONTSIZE=18
plt.rcParams['figure.figsize']=(10,6)
plt.rcParams['font.size']=FONTSIZE

Using TensorFlow backend.


# Load in MIDI files and convert them to notes

In [4]:
filenames=glob.glob('../MIDI_files/Classical_Archives_The_Greats/Chopin/*.mid')

filename='../MIDI_files/Classical_Archives_The_Greats/Chopin/Prelude n01 op28 \'\'Reunion\'\'.mid'

notes=[]

#this has length num_songs and gives the number of notes in each, this will be useful for partitioning training data
notes_per_song=[]

for i in range(len(filenames)):
    
    file=filenames[i]
    
    print(file)
    
    midi=converter.parse(file)
    
    parts = instrument.partitionByInstrument(midi)

    if parts: # file has instrument parts
        notes_to_parse = parts.parts[0].recurse()
        
    else: # file has notes in a flat structure
        notes_to_parse = midi.flat.notes 
    
    notes_temp=0
    for element in notes_to_parse:
        
        if isinstance(element, note.Note):
            notes.append(str(element.pitch))
            notes_temp+=1
            
        elif isinstance(element, chord.Chord):
            notes.append('.'.join(str(n) for n in element.normalOrder))
            notes_temp+=1
            
    notes_per_song.append(notes_temp)



../MIDI_files/Classical_Archives_The_Greats/Chopin/Etude op10 n04 Drchew.mid
../MIDI_files/Classical_Archives_The_Greats/Chopin/Piano Concerto n2 2mov.mid
../MIDI_files/Classical_Archives_The_Greats/Chopin/Waltz op64 n1.mid
../MIDI_files/Classical_Archives_The_Greats/Chopin/Piano Concerto n1 op11 1mov.mid
../MIDI_files/Classical_Archives_The_Greats/Chopin/Nocturne op48 n2.mid
../MIDI_files/Classical_Archives_The_Greats/Chopin/Prelude n17 op28 ''Scene On the Place of Notre Dame.mid
../MIDI_files/Classical_Archives_The_Greats/Chopin/Nocturne op37 n2.mid
../MIDI_files/Classical_Archives_The_Greats/Chopin/Etude op10 n08.mid
../MIDI_files/Classical_Archives_The_Greats/Chopin/Prelude n09 op28 ''Vision''.mid
../MIDI_files/Classical_Archives_The_Greats/Chopin/Prelude n10 op28 ''The Night Moth''.mid
../MIDI_files/Classical_Archives_The_Greats/Chopin/Sonata op35 n1 .mid
../MIDI_files/Classical_Archives_The_Greats/Chopin/Etude op10 n01.mid
../MIDI_files/Classical_Archives_The_Greats/Chopin/Sonata

In [5]:
print(len(notes))
print(sum(notes_per_song))
print(notes_per_song) #what we can actually do here is partition by song
#training data will have roughly 75% of songs and validation data will have remaining songs

num_songs=len(notes_per_song)

134557
134557
[1619, 2109, 1092, 7005, 942, 1079, 890, 1817, 282, 252, 3354, 1276, 1546, 197, 2785, 1227, 914, 1021, 2324, 848, 889, 3319, 691, 3475, 2456, 2750, 806, 3580, 1058, 1636, 789, 1919, 256, 471, 2677, 866, 1550, 153, 458, 2389, 271, 372, 1007, 1676, 1410, 2628, 1264, 1510, 1203, 968, 505, 434, 499, 789, 306, 4601, 583, 438, 949, 702, 86, 493, 1353, 803, 695, 4285, 1074, 727, 2661, 4788, 758, 471, 3000, 120, 1756, 628, 1691, 1263, 2561, 954, 1477, 681, 1532, 3649, 1323, 345, 1327, 588, 334, 1263, 740, 1247, 1555, 574, 873]


# Remove songs that have less than sequence_length notes

In [6]:
sequence_length=100


#remove songs that have less than sequence_length notes
remove_song_idx=[]
for i in range(num_songs):
    if notes_per_song[i]<sequence_length:
        remove_song_idx.append(i)

print(remove_song_idx)




[60]


In [7]:
remove_note_idx=[]
num_note=0

for i in range(num_songs):
    
    num_notes_in_song=notes_per_song[i]

    
    for j in range(num_notes_in_song):
        
        if i in remove_song_idx:
            remove_note_idx.append(num_note)
            num_note+=1
        else:
            num_note+=1
            
print(len(remove_note_idx))
        

86


In [8]:
notes_per_song=np.delete(notes_per_song,remove_song_idx)
notes=np.delete(notes,remove_note_idx)
num_songs=len(notes_per_song)

In [9]:
print(notes_per_song)
print(len(notes))

[1619 2109 1092 7005  942 1079  890 1817  282  252 3354 1276 1546  197
 2785 1227  914 1021 2324  848  889 3319  691 3475 2456 2750  806 3580
 1058 1636  789 1919  256  471 2677  866 1550  153  458 2389  271  372
 1007 1676 1410 2628 1264 1510 1203  968  505  434  499  789  306 4601
  583  438  949  702  493 1353  803  695 4285 1074  727 2661 4788  758
  471 3000  120 1756  628 1691 1263 2561  954 1477  681 1532 3649 1323
  345 1327  588  334 1263  740 1247 1555  574  873]
134471


# Create ins and outs

In [10]:
# Now what we have to do is create a bunch of sequences of constant length
# we also have to map the notes to integers and then map those integers to categorical variables 
sequence_length=100

#create a list of all pitchnames
pitchnames=set(item for item in notes)

n_notes=len(pitchnames)

#create a dict that maps pitchnames to integers
note_to_int={} #keys are pitchnames, values are integers
count=-1
for i in pitchnames:
    count+=1
    note_to_int[i]=count
    
#calculate total number of sequences
num_sequences=0
for i in range(num_songs):
    num_notes_in_song=notes_per_song[i]
    for j in range(num_notes_in_song-sequence_length):
        num_sequences+=1
    
ins=np.zeros((num_sequences,sequence_length))
outs=[]

#the input is a sequence of sequence_length notes
#the output is the next note after that sequence

note_count=0
sequences_per_song=[]

for i in range(num_songs):
    
    num_notes_in_song=notes_per_song[i]
    sequences_per_song.append(num_notes_in_song-sequence_length)

    
    #here we're grouping it by song, there will not be a single sequence that has part of one song and part of another
    for j in range(num_notes_in_song-sequence_length):
        
        sequence_in=notes[note_count:note_count+sequence_length]
        sequence_out=notes[note_count+sequence_length]
        
        ins_temp=[note_to_int[i] for i in sequence_in]
        
        ins[note_count,:]=ins_temp
        outs.append(note_to_int[sequence_out])
        
        note_count+=1
        
#ins=np.reshape(ins,(num_sequences, sequence_length, 1)) #reshape to keras ready shape
#ins = ins / float(n_notes) #scale to 0-1
#outs=np_utils.to_categorical(outs) #transform outs to a categorical
    

print(sequences_per_song)
print(ins.shape)
print(len(outs))


[1519, 2009, 992, 6905, 842, 979, 790, 1717, 182, 152, 3254, 1176, 1446, 97, 2685, 1127, 814, 921, 2224, 748, 789, 3219, 591, 3375, 2356, 2650, 706, 3480, 958, 1536, 689, 1819, 156, 371, 2577, 766, 1450, 53, 358, 2289, 171, 272, 907, 1576, 1310, 2528, 1164, 1410, 1103, 868, 405, 334, 399, 689, 206, 4501, 483, 338, 849, 602, 393, 1253, 703, 595, 4185, 974, 627, 2561, 4688, 658, 371, 2900, 20, 1656, 528, 1591, 1163, 2461, 854, 1377, 581, 1432, 3549, 1223, 245, 1227, 488, 234, 1163, 640, 1147, 1455, 474, 773]
(125071, 100)
125071


# Partition Training and Validation Data

In [11]:
# Ok now we need to define our training and validation sets
# The real challenge for our model is going to be the fact that the validation sequences are from songs that are not included in the training set
# If we can get the model to not overfit we'll have accomplished something pretty impressive

songs_all=np.arange(len(notes_per_song))
idx_train=np.random.choice(songs_all,size=int(len(notes_per_song)*0.75),replace=False) #index of songs used for training
idx_validation=[] #index of songs used for validation
for i in songs_all:
    if i not in idx_train:
        idx_validation.append(i)
        
        
#calculate number of sequences for training and validation
num_sequences_train=0
for i in idx_train:
    num_sequences_train+=sequences_per_song[i]
num_sequences_validation=sum(sequences_per_song)-num_sequences_train

print("Number of Training Sequences is "+str(num_sequences_train))
print("Number of Validation Sequences is "+str(num_sequences_validation))
        
ins_train=np.zeros((num_sequences_train,sequence_length))
outs_train=[]
ins_validation=np.zeros((num_sequences_validation,sequence_length))
outs_validation=[]
        
idx_sequence=-1
idx_train_sequence=-1
idx_validation_sequence=-1
for i in range(num_songs):
    
    sequences_in_song=sequences_per_song[i]
    
    for j in range(sequences_in_song):
        idx_sequence+=1
        
        if i in idx_train:
            idx_train_sequence+=1
            ins_train[idx_train_sequence,:]=ins[idx_sequence,:]
            outs_train.append(outs[idx_sequence])
            
            
        elif i in idx_validation:
            idx_validation_sequence+=1
            ins_validation[idx_validation_sequence,:]=ins[idx_sequence,:]
            outs_validation.append(outs[idx_sequence])
    

ins_train=np.reshape(ins_train,(num_sequences_train,sequence_length,1)) #reshape to keras ready shape
ins_validation=np.reshape(ins_validation,(num_sequences_validation,sequence_length,1))

ins_train = ins_train / float(n_notes) #scale to 0-1
ins_validation=ins_validation/float(n_notes)

outs_train=np_utils.to_categorical(outs_train) #transform outs to a categorical
outs_validation=np_utils.to_categorical(outs_validation)

print("ins_train shape: "+str(ins_train.shape))
print("outs_train shape: "+str(outs_train.shape))
print("ins_validation shape: "+str(ins_validation.shape))
print("outs_validation shape: "+str(outs_validation.shape))

Number of Training Sequences is 100049
Number of Validation Sequences is 25022
ins_train shape: (100049, 100, 1)
outs_train shape: (100049, 467)
ins_validation shape: (25022, 100, 1)
outs_validation shape: (25022, 467)


In [12]:
training_data={'ins_train':ins_train,'outs_train':outs_train,'ins_validation':ins_validation,'outs_validation':outs_validation}

fp=open("training_data.pkl",'wb')
pickle.dump(training_data,fp)
fp.close()