In [1]:
import numpy as np 
import os 
import pandas as pd 
import music21

## Data preprocessing

In [48]:
composer_list = ['Bach', 'Beethoven', 'Chopin', 'Mozart']
data = []

In [49]:
def extract_note_features(midi_file):
    midi = music21.converter.parse(midi_file)
    
    note_tuples = []
    for thisNote in midi.flatten().getElementsByClass(['Note', 'Rest', 'Chord']):
        if thisNote.isNote:
            pitch = thisNote.pitch.midi
        elif thisNote.isRest:
            # Use 0 for rest
            pitch = -1
        elif thisNote.isChord:
            combined_pitch = 0
            for pitch in thisNote.pitches:
                combined_pitch += pitch.midi
            pitch = combined_pitch
        # Multiplying by 4 to scale
        note_tuples.append((pitch, round(float(thisNote.beat) * 4, 2), round(float(thisNote.duration.quarterLength) * 4, 2)))
    
    return note_tuples

In [50]:
for composer in composer_list:
    for dirname, _, filenames in os.walk(f'./data/{composer}/'):
        for filename in filenames:
            if filename.endswith('.mid'):
                midi_path = os.path.join(dirname, filename)
                note_tuples = extract_note_features(midi_path)
                data.append({
                    'midi_file': filename,
                    'composer': composer,
                    'path': dirname,
                    'note_tuple': note_tuples,
                })



In [51]:
df = pd.DataFrame(data)

In [52]:
df.head()

Unnamed: 0,midi_file,composer,path,note_tuple
0,Bwv0997 Partita for Lute 1mov.mid,Bach,./data/Bach/,"[(-1, 4.0, 2.0), (48, 4.0, 4.0), (72, 6.0, 1.0..."
1,Bwv0535 Prelude and Fugue.mid,Bach,./data/Bach/,"[(-1, 4.0, 16.0), (55, 4.0, 1.0), (-1, 4.0, 16..."
2,Bwv0806 English Suite n1 05mov.mid,Bach,./data/Bach/,"[(69, 4.0, 2.0), (-1, 4.0, 2.0), (69, 5.0, 9.3..."
3,Bwv0998 Prelude Fugue Allegro for Lute 3mov.mid,Bach,./data/Bach/,"[(-1, 4.0, 1.0), (39, 4.0, 4.0), (-1, 4.0, 6.0..."
4,Jesu Joy of Man Desiring.mid,Bach,./data/Bach/,"[(-1, 4.0, 1.33), (-1, 4.0, 1.33), (-1, 4.0, 4..."


In [53]:
print(df['note_tuple'][0])

[(-1, 4.0, 2.0), (48, 4.0, 4.0), (72, 6.0, 1.0), (74, 7.0, 1.0), (75, 8.0, 2.0), (-1, 8.0, 8.0), (79, 10.0, 2.0), (83, 12.0, 2.0), (84, 14.0, 2.0), (-1, 16.0, 2.0), (46, 16.0, 4.0), (84, 18.0, 2.0), (-1, 4.0, 2.0), (44, 4.0, 4.0), (72, 6.0, 1.0), (74, 7.0, 1.0), (75, 8.0, 2.0), (-1, 8.0, 8.0), (79, 10.0, 2.0), (83, 12.0, 2.0), (84, 14.0, 2.0), (-1, 16.0, 2.0), (43, 16.0, 4.0), (84, 18.0, 2.0), (-1, 4.0, 2.0), (41, 4.0, 4.0), (72, 6.0, 1.0), (74, 7.0, 1.0), (75, 8.0, 2.0), (-1, 8.0, 8.0), (79, 10.0, 2.0), (83, 12.0, 2.0), (84, 14.0, 2.0), (-1, 16.0, 2.0), (39, 16.0, 4.0), (84, 18.0, 2.0), (-1, 4.0, 1.0), (41, 4.0, 4.0), (82, 5.0, 1.0), (80, 6.0, 1.0), (79, 7.0, 1.0), (77, 8.0, 1.0), (-1, 8.0, 4.0), (75, 9.0, 1.0), (74, 10.0, 1.0), (72, 11.0, 1.0), (71, 12.0, 1.0), (43, 12.0, 4.0), (72, 13.0, 1.0), (74, 14.0, 1.0), (68, 15.0, 1.0), (67, 16.0, 2.0), (-1, 16.0, 4.0), (77, 18.0, 2.0), (75, 4.0, 1.0), (36, 4.0, 4.0), (74, 5.0, 1.0), (72, 6.0, 2.0), (-1, 8.0, 1.0), (51, 8.0, 4.0), (71, 9.0, 1

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1530 entries, 0 to 1529
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   midi_file   1530 non-null   object
 1   composer    1530 non-null   object
 2   path        1530 non-null   object
 3   note_tuple  1530 non-null   object
dtypes: object(4)
memory usage: 47.9+ KB


In [55]:
df.to_csv('./data/preprocessed_tuple_with_midi.csv', index=False)