In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import sys
sys.path.insert(0, '../../')
from src.encode_data import *
from src.midi_data import *
from src.data_sources import process_all, arr2csv
from src.midi_transform import *
from src.fastai_data import *

In [3]:
import traceback
import time

## Standardize and reformat raw MIDI files before encoding to text
- Transform key to C major
- Remove unused instruments
- Combine multiple tracks with the same instrument into a single part
- Melody, Piano, String

### Load MIDI data

In [4]:
# Dataset release tag; version_path (data/midi/<version>) is the root used by the cells below.
version = 'v15'
data_path = Path('data') / 'midi'
version_path = data_path.joinpath(version)

In [5]:
import pandas as pd

In [6]:
# Active target: the 'piano_duet' output folder. duet_only=True makes
# transform_midi skip files whose piano-track count is not 1 or 2.
# To switch off the duet filter: out_dir, duet_only = 'midi_encode', False
out_dir, duet_only = 'piano_duet', True

In [7]:
# Input metadata CSV and output CSV for this run; the output folder is created up front.
source_dir = 'midi_sources'
source_csv = version_path.joinpath('metadata', source_dir + '.csv')
out_csv = version_path.joinpath(out_dir, out_dir + '.csv')
out_csv.parent.mkdir(exist_ok=True, parents=True)
(source_csv, out_csv)

(PosixPath('data/midi/v15/metadata/midi_sources.csv'),
 PosixPath('data/midi/v15/piano_duet/piano_duet.csv'))

In [8]:
# Filtering thresholds forwarded to compress_midi_file.
# (Disabled settings: num_comps = 2 -> note, duration; max_dur = 128)
min_variation = 3  # minimum number of different midi notes played
cutoff = 5         # max instruments

### Encoding MIDI to NumPy

In [9]:
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the warning this cell printed is
# presumably the mixed-type DtypeWarning (only its tail is visible) - confirm.
df = pd.read_csv(source_csv, low_memory=False)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ht_time_signature,ht_offset,midi,section,parts,ht_bpm,title,midi_title,artist,song_url,genres,source,ht_key,md5,mxl,ht_mode
0,4.0,0.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,chorus,"intro,chorus",128.0,yu-gi-oh-theme-song,yu-gi-oh3,wayne-sharpe,https://www.hooktheory.com/theorytab/view/wayn...,,hooktheory,C,bf1f29e5ff84e3e93e37fb873bfb590e,,1.0
1,3.0,0.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,intro,"intro,chorus",85.0,yu-gi-oh-theme-song,yu-gi-oh,wayne-sharpe,https://www.hooktheory.com/theorytab/view/wayn...,,hooktheory,C,055f80ad67f64edb14a85ca8fbfe8c29,,1.0
2,4.0,-5.0,midi_sources/hooktheory/pianoroll/w/what-a-day...,chorus,chorus,96.0,kiefer,kiefer,what-a-day,https://www.hooktheory.com/theorytab/view/what...,Jazz,hooktheory,D,197f96f5d181f6ce1e2c5ab04ac1ff87,,6.0
3,4.0,-5.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,pre-chorus,"verse,pre-chorus,chorus",152.0,senbonzakura,senbonzakura - pre-Pre-Chorus,whiteflame,https://www.hooktheory.com/theorytab/view/whit...,"J-Pop,Pop",hooktheory,D,9e7ce13a35f1314423a9a6d5a5287a4a,,6.0
4,4.0,-5.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,verse,"verse,pre-chorus,chorus",152.0,senbonzakura,Senbonzakura,whiteflame,https://www.hooktheory.com/theorytab/view/whit...,"J-Pop,Pop",hooktheory,D,d5aaf79d0989222f1362f9f46c540a27,,6.0


In [10]:
# One plain dict per metadata row; process_metadata reads these with .get()/.copy().
all_records = df.to_dict('records')
len(all_records)

197182

In [11]:
def process_metadata(metadata):
    """Encode the midi file referenced by one metadata record and cache it as .npy.

    Args:
        metadata: dict-like record. Its 'midi' entry is a path relative to
            `version_path`.

    Returns:
        None when the record has no string 'midi' entry. Otherwise a copy of
        `metadata`; a 'numpy' entry (output path relative to `version_path`)
        is added only when the encoded file exists on disk, i.e. it was
        already cached or has just been written. The input record is never
        modified.
    """
    # Part 1. Compress tracks/instruments
    # Records without a usable midi path are dropped outright.
    midi_rel = metadata.get('midi')
    if not isinstance(midi_rel, str): return None

    result = metadata.copy()
    input_path = version_path/midi_rel
    if not input_path.exists():
        print('Input path does not exist:', input_path, metadata)
        return result

    # Mirror the source tree under `out_dir`, swapping the extension for .npy.
    out_file = Path(str(input_path).replace(f'/{source_dir}/', f'/{out_dir}/'))
    out_file = out_file.with_suffix('.npy')

    # Encode only when no cached output exists yet.
    if not out_file.exists():
        npenc = transform_midi(input_path)
        if npenc is None: return result
        # Create the folder only once there is something to store, so rejected
        # files do not leave empty directories behind.
        out_file.parent.mkdir(parents=True, exist_ok=True)
        np.save(out_file, npenc)

    result['numpy'] = str(out_file.relative_to(version_path))
    return result

In [12]:
def transform_midi(midi_file):
    """Convert one midi file into its numpy encoding, or return None if rejected.

    A file is rejected (None is returned) when:
      * `duet_only` is set and `num_piano_tracks` is not 1 or 2, or raises;
      * `compress_midi_file` raises or returns a falsy value;
      * `stream2chordarr` raises;
      * the encoded sequence has fewer than 32 items;
      * any value in column 1 of the encoding is >= DUR_RANGE;
      * any value in column 0 above VALTSEP lies outside
        [PIANO_RANGE[0], PIANO_RANGE[1]).

    Args:
        midi_file: path of the midi file to encode.

    Returns:
        The array produced by `seq2npenc`, or None.

    Note:
        `file2stream` is not wrapped in a try block, so its exceptions
        propagate to the caller.
    """
    input_path = midi_file

    if duet_only:
        try:
            if num_piano_tracks(input_path) not in [1, 2]: return None
        except Exception: return None

    # Part 1. Remove non note tracks and standardize instruments
    try: input_file = compress_midi_file(input_path, min_variation=min_variation, cutoff=cutoff)
    except Exception as e:
        print('Error parsing midi', input_path, e)
        return None
    if not input_file: return None

    # Part 2. Compress rests and long notes
    stream = file2stream(input_file)
    try:
        # max_dur = quarter_len * sample_freq (4). 128 = 8 bars
        chordarr = stream2chordarr(stream, max_dur=DUR_RANGE-2, flat=False)
    except Exception as e:
        print('Could not encode to chordarr:', input_path, e)
        return None

    # Trim rests first, then shorten the remaining ones.
    chordarr = shorten_chordarr_rests(trim_chordarr_rests(chordarr))

    # Part 3. Chord array to numpy
    seq = chordarr2seq(chordarr)
    if len(seq) < 32:
        print('Sequence too short:', len(seq), input_path)
        return None

    npenc = seq2npenc(seq)
    if (npenc[..., 1] >= DUR_RANGE).any():
        print(f'npenc exceeds max {DUR_RANGE} duration:', input_path)
        return None

    # https://en.wikipedia.org/wiki/Scientific_pitch_notation - 88 key range - 21 = A0, 108 = C8
    # Values at or below VALTSEP are exempt from the range check
    # (presumably separator tokens - TODO confirm).
    notes = npenc[..., 0]
    out_of_range = (notes > VALTSEP) & ((notes < PIANO_RANGE[0]) | (notes >= PIANO_RANGE[1]))
    if out_of_range.any():
        # Report the bounds actually enforced rather than hard-coded numbers.
        print(f'npenc out of piano note range [{PIANO_RANGE[0]}, {PIANO_RANGE[1]}):', input_path)
        return None

    return npenc

In [13]:
def try_process_metadata(metadata):
    """Best-effort wrapper around process_metadata.

    Returns whatever process_metadata returns; if it raises, a one-line
    diagnostic is printed and None is returned instead.
    """
    try:
        return process_metadata(metadata)
    except Exception as e:
        # One short line per failure keeps dropped records traceable without
        # dumping a full traceback for each of them.
        midi = metadata.get('midi') if isinstance(metadata, dict) else None
        print('Error processing metadata:', midi, f'{type(e).__name__}: {e}')
        return None

In [14]:
# Sanity check: run the un-wrapped process_metadata on 10 randomly chosen
# records, so an exception surfaces here rather than being caught by
# try_process_metadata.
import random
for sample_record in random.sample(all_records, k=10):
    process_metadata(sample_record)

In [15]:
def timeout_func(data, seconds):
    """Timeout callback handed to process_all: print the limit and the record's midi path."""
    midi_path = data.get('midi')
    print(f"Timeout: {seconds} {midi_path}")

In [16]:
# Run the exception-swallowing wrapper over every record. timeout_func is
# invoked with the record and the limit when the limit is hit
# (presumably a per-record limit in seconds - confirm in src.data_sources).
processed = process_all(
    try_process_metadata,
    all_records,
    timeout_func=timeout_func,
    timeout=500,
)

Sequence too short: 25 data/midi/v15/midi_sources/hooktheory/pianoroll/y/yes/heart-of-the-sunrise/intro_key_original.mid


Sequence too short: 29 data/midi/v15/midi_sources/hooktheory/pianoroll/j/jeff-liu/steven-and-the-crystal-gems/outro_key_original.mid
Sequence too short: 29 data/midi/v15/midi_sources/hooktheory/pianoroll/j/jessica-simpson/part-of-your-world/bridge_key_original.mid
Sequence too short: 25 data/midi/v15/midi_sources/hooktheory/pianoroll/j/jamiroquai/canned-heat/intro_key_original.mid
Sequence too short: 25 data/midi/v15/midi_sources/hooktheory/pianoroll/l/lildeucedeuce/mine-turtle/intro_key_original.mid
Sequence too short: 29 data/midi/v15/midi_sources/hooktheory/pianoroll/v/vertical-horizon/im-still-here/chorus-lead-out_key_original.mid
Sequence too short: 17 data/midi/v15/midi_sources/hooktheory/pianoroll/f/frankie-valli/the-night/intro_key_original.mid
Sequence too short: 21 data/midi/v15/midi_sources/hooktheory/pianoroll/d/david-bowie/starman/pre-chorus_key_original.mid
Sequence too short: 25 data/midi/v15/midi_sources/hooktheory/pianoroll/n/nobuo-uematsu/final-fantasy-vi---dancing-ma

Sequence too short: 25 data/midi/v15/midi_sources/from_mxl/musescore/data/1435276.mid
Timeout: 500 midi_sources/from_mxl/classical_archives/9/kv334.mid
Timeout: 500 midi_sources/from_mxl/classical_archives/9/brop24.mid
Sequence too short: 29 data/midi/v15/midi_sources/from_mxl/musescore/data/3910721.mid
npenc exceeds max 130 duration: data/midi/v15/midi_sources/from_mxl/musescore/data/5371535.mid
npenc exceeds max 130 duration: data/midi/v15/midi_sources/from_mxl/wikifonia/Mongo Santamaria - Afro Blue.mid
npenc exceeds max 130 duration: data/midi/v15/midi_sources/from_mxl/wikifonia/Ricardo & Chantal, You are Cordially invited to share in our special day on the 1st January 2011 - The Wedding Song.mid
npenc exceeds max 130 duration: data/midi/v15/midi_sources/from_mxl/wikifonia/Joseph Zawinul - Birdland.mid
Could not encode to chordarr: data/midi/v15/midi_sources/lmd_clean/Jennifer Lopez/Let's Get Loud.1.mid index 2087 is out of bounds for axis 0 with size 2087
Timeout: 500 midi_sources/

Timeout: 500 midi_sources/130k_reddit/M/M/mthm26d.mid
Timeout: 500 midi_sources/130k_reddit/M/M/mthm26b.mid
Timeout: 500 midi_sources/130k_reddit/M/M/Medley2.mid
npenc exceeds max 130 duration: data/midi/v15/midi_sources/130k_reddit/Metal_Rock_wolverine-metalmidi.wen.ru_MIDIRip/Darkthrone/Darkthrone - Earth's Last Picture.mid
Sequence too short: 27 data/midi/v15/midi_sources/130k_reddit/Classical Archives - The Greats (MIDI)/Classical Piano Midis/Varios - Título desconocido/p_z/schumann.mid
Sequence too short: 23 data/midi/v15/midi_sources/130k_reddit/Z/Z/z1secret.mid
npenc exceeds max 130 duration: data/midi/v15/midi_sources/130k_reddit/C/C/conga03.mid
npenc exceeds max 130 duration: data/midi/v15/midi_sources/130k_reddit/C/C/coldplay-violet_hill.mid
npenc exceeds max 130 duration: data/midi/v15/midi_sources/130k_reddit/C/C/Cave.mid
npenc exceeds max 130 duration: data/midi/v15/midi_sources/130k_reddit/C/C/calypsox.mid
Timeout: 500 midi_sources/130k_reddit/Classical Archives - The Gr

Timeout: 500 midi_sources/130k_reddit/H/H/hndl_var.mid
Sequence too short: 17 data/midi/v15/midi_sources/130k_reddit/T/T/tutugan.mid
Sequence too short: 9 data/midi/v15/midi_sources/130k_reddit/T/T/The Legend of Zelda - Uncover Secret.mid
Sequence too short: 21 data/midi/v15/midi_sources/130k_reddit/T/T/The Legend of Zelda - Ganon Appears and is Defeated.mid
Sequence too short: 25 data/midi/v15/midi_sources/130k_reddit/T/T/The Legend of Zelda The Wind Waker - Earth Gods Lyric Baton.mid
Sequence too short: 9 data/midi/v15/midi_sources/130k_reddit/T/T/The Legend of Zelda Ocarina of Time - Achievement.mid
Sequence too short: 25 data/midi/v15/midi_sources/130k_reddit/T/T/The Legend of Zelda - Flute.mid
Sequence too short: 31 data/midi/v15/midi_sources/130k_reddit/2/292.mid
Sequence too short: 17 data/midi/v15/midi_sources/130k_reddit/R/R/Rehab - Mrkrstft.mid
npenc exceeds max 130 duration: data/midi/v15/midi_sources/130k_reddit/R/R/RENDEZII.MID
npenc exceeds max 130 duration: data/midi/v15

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [17]:
# Write the processed records to out_csv, then show how many there are.
arr2csv(processed, out_csv)
len(processed)

196514

In [18]:
# Reload the CSV that was just written to verify it round-trips.
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the warning this cell printed is
# presumably the mixed-type DtypeWarning (only its tail is visible) - confirm.
df = pd.read_csv(out_csv, low_memory=False)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,song_url,section,numpy,ht_mode,midi,title,ht_time_signature,mxl,ht_offset,ht_bpm,ht_key,md5,midi_title,artist,genres,parts,source
0,https://www.hooktheory.com/theorytab/view/wayn...,chorus,piano_duet/hooktheory/pianoroll/w/wayne-sharpe...,1.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,yu-gi-oh-theme-song,4.0,,0.0,128.0,C,bf1f29e5ff84e3e93e37fb873bfb590e,yu-gi-oh3,wayne-sharpe,,"intro,chorus",hooktheory
1,https://www.hooktheory.com/theorytab/view/wayn...,intro,piano_duet/hooktheory/pianoroll/w/wayne-sharpe...,1.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,yu-gi-oh-theme-song,3.0,,0.0,85.0,C,055f80ad67f64edb14a85ca8fbfe8c29,yu-gi-oh,wayne-sharpe,,"intro,chorus",hooktheory
2,https://www.hooktheory.com/theorytab/view/what...,chorus,piano_duet/hooktheory/pianoroll/w/what-a-day/k...,6.0,midi_sources/hooktheory/pianoroll/w/what-a-day...,kiefer,4.0,,-5.0,96.0,D,197f96f5d181f6ce1e2c5ab04ac1ff87,kiefer,what-a-day,Jazz,chorus,hooktheory
3,https://www.hooktheory.com/theorytab/view/whit...,pre-chorus,piano_duet/hooktheory/pianoroll/w/whiteflame/s...,6.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,senbonzakura,4.0,,-5.0,152.0,D,9e7ce13a35f1314423a9a6d5a5287a4a,senbonzakura - pre-Pre-Chorus,whiteflame,"J-Pop,Pop","verse,pre-chorus,chorus",hooktheory
4,https://www.hooktheory.com/theorytab/view/whit...,verse,piano_duet/hooktheory/pianoroll/w/whiteflame/s...,6.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,senbonzakura,4.0,,-5.0,152.0,D,d5aaf79d0989222f1362f9f46c540a27,Senbonzakura,whiteflame,"J-Pop,Pop","verse,pre-chorus,chorus",hooktheory


In [20]:
# Count the rows whose 'numpy' column holds a path string, i.e. rows for which
# an encoded .npy file was recorded.
sum(isinstance(entry, str) for entry in df['numpy'].values)

112947