In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
os.chdir('../../../')

In [3]:
from musicautobot.numpy_encode import *
from musicautobot.utils.file_processing import *
from musicautobot.utils.midifile import *
from musicautobot.config import *
from musicautobot.music_transformer import *
from musicautobot.multitask_transformer import *

In [4]:
import traceback
import time

## Standardize and reformat raw midi files before encoding to text
- Transform key to C major
- Remove unused instruments
- Combine multiple tracks with the same instrument into a single part
- Melody, Piano, String

### Load midi data

In [5]:
version = 'v20'
data_path = Path('data/midi')
version_path = data_path/version

In [6]:
import pandas as pd

In [7]:
out_dir = 's2s_encode'

In [8]:
source_dir = 'midi_sources'
source_csv = version_path/'metadata'/f'{source_dir}.csv'
out_csv = version_path/out_dir/f'{out_dir}.csv'
out_csv.parent.mkdir(parents=True, exist_ok=True)
source_csv, out_csv

(PosixPath('data/midi/v20/metadata/midi_sources.csv'),
 PosixPath('data/midi/v20/s2s_encode/s2s_encode.csv'))

In [9]:
# num_comps = 2 # note, duration
min_variation = 3 # minimum number of different midi notes played
# max_dur = 128

### Encoding midi to numpy

In [10]:
df = pd.read_csv(source_csv); df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,title,midi,source,parts,mxl,ht_time_signature,midi_title,ht_mode,md5,genres,ht_bpm,ht_key,artist,song_url,section
0,yu-gi-oh-theme-song,midi_sources/hooktheory/pianoroll/w/wayne-shar...,hooktheory,"intro,chorus",,4.0,yu-gi-oh3,1.0,bf1f29e5ff84e3e93e37fb873bfb590e,,128.0,C,wayne-sharpe,https://www.hooktheory.com/theorytab/view/wayn...,chorus
1,yu-gi-oh-theme-song,midi_sources/hooktheory/pianoroll/w/wayne-shar...,hooktheory,"intro,chorus",,3.0,yu-gi-oh,1.0,055f80ad67f64edb14a85ca8fbfe8c29,,85.0,C,wayne-sharpe,https://www.hooktheory.com/theorytab/view/wayn...,intro
2,kiefer,midi_sources/hooktheory/pianoroll/w/what-a-day...,hooktheory,chorus,,4.0,kiefer,6.0,197f96f5d181f6ce1e2c5ab04ac1ff87,Jazz,96.0,D,what-a-day,https://www.hooktheory.com/theorytab/view/what...,chorus
3,senbonzakura,midi_sources/hooktheory/pianoroll/w/whiteflame...,hooktheory,"verse,pre-chorus,chorus",,4.0,senbonzakura - pre-Pre-Chorus,6.0,9e7ce13a35f1314423a9a6d5a5287a4a,"J-Pop,Pop",152.0,D,whiteflame,https://www.hooktheory.com/theorytab/view/whit...,pre-chorus
4,senbonzakura,midi_sources/hooktheory/pianoroll/w/whiteflame...,hooktheory,"verse,pre-chorus,chorus",,4.0,Senbonzakura,6.0,d5aaf79d0989222f1362f9f46c540a27,"J-Pop,Pop",152.0,D,whiteflame,https://www.hooktheory.com/theorytab/view/whit...,verse


In [11]:
all_records = df.to_dict(orient='records'); len(all_records)

198757

In [12]:
def process_metadata(metadata):
    result = metadata.copy()
    
    # Part 1. Compress tracks/instruments
    if not isinstance(metadata.get('midi'), str): return None
    
    input_path = version_path/metadata['midi']
    extension = input_path.suffix.lower()
    if not input_path.exists(): 
        print('Input path does not exist:', input_path, metadata)
        return result
    
    # Get outfile and check if it exists
    out_file = Path(str(input_path).replace(f'/{source_dir}/', f'/{out_dir}/'))
    out_file = out_file.with_suffix('.npy')
    out_file.parent.mkdir(parents=True, exist_ok=True)
    if out_file.exists(): 
        result['numpy'] = str(out_file.relative_to(version_path))
        return result
    
#     if result['source'] not in ['hooktheory', 'hooktheory_c']: return result
    
    npenc = transform_midi(input_path)
    if npenc is None: return result
    np.save(out_file, npenc)
    result['numpy'] = str(out_file.relative_to(version_path))
    return result

In [13]:
def transform_midi(midi_file):
    input_path = midi_file
    
    try: 
        if num_piano_tracks(input_path) not in [1, 2]: return None
        input_file = compress_midi_file(input_path, min_variation=min_variation, cutoff=2, supported_types=set([Track.PIANO])) # remove non note tracks and standardize instruments
        if not input_file: return None
    except Exception as e:
        if 'badly form' in str(e): return None # ignore badly formatted midi errors
        if 'out of range' in str(e): return None # ignore badly formatted midi errors
        print('Error parsing midi', input_path, e)
        return None
        
    # Part 2. Compress rests and long notes
    stream = file2stream(input_file) # 1.
    try:
        chordarr = stream2chordarr(stream) # 2. max_dur = quarter_len * sample_freq (4). 128 = 8 bars
    except Exception as e:
        print('Could not encode to chordarr:', input_path, e)
#         print(traceback.format_exc())
        return None
    
    chord_trim = trim_chordarr_rests(chordarr)
    chord_short = shorten_chordarr_rests(chord_trim)
    delta_trim = chord_trim.shape[0] - chord_short.shape[0]
#     if delta_trim > 300: 
#         print(f'Removed {delta_trim} rests from {input_path}. Skipping song')
#         return None
    chordarr = chord_short
    
    # Only 2 piano parts allowed
    _,num_parts,_ = chordarr.shape
    if num_parts != 2: return None
    
    # Individual parts must have notes
    parts = [part_enc(chordarr, i) for i in range(num_parts)]
    for p in parts: 
        if not is_valid_npenc(p, min_notes=8, input_path=input_path): return None
        
    # order by melody > chords
    p1, p2 = parts
    m, c = (p1, p2) if avg_pitch(p1) > avg_pitch(p2) else (p2, p1) # Assuming melody has higher pitch
    
    return np.array([m, c])

In [14]:
# transform_midi(piano_file)
midi_mxl_file = version_path/'midi_sources/from_mxl/musescore/data/49143.mid'
input_file = midi_mxl_file
stream = file2stream(input_file) # 1.
chordarr = stream2chordarr(stream)

In [15]:
chordarr.shape

(1021, 2, 128)

In [16]:
transform_midi(midi_mxl_file).shape

(2,)

In [17]:
def try_process_metadata(metadata):
    try:
        return process_metadata(metadata)
    except Exception:
#         print(traceback.format_exc())
        return None

In [18]:
# # sanity check
import random
for r in random.sample(all_records, 10):
    process_metadata(r)

In [19]:
def timeout_func(data, seconds):
    print("Timeout:", seconds, data.get('midi'))

In [20]:
processed = process_all(try_process_metadata, all_records, timeout=300, timeout_func=timeout_func)

Sequence too short: 6 data/midi/v20/midi_sources/hooktheory/pianoroll/w/willie-nelson/you-were-always-on-my-mind/verse_key_original.mid


Sequence too short: 4 data/midi/v20/midi_sources/hooktheory/pianoroll/y/yasunori-mitsuda/black-omen/instrumental_key_original.mid
Sequence too short: 7 data/midi/v20/midi_sources/hooktheory/pianoroll/y/yo-la-tengo/ohm/verse_key_original.mid
Sequence too short: 3 data/midi/v20/midi_sources/hooktheory/pianoroll/y/yes/heart-of-the-sunrise/intro_key_original.mid
Sequence too short: 5 data/midi/v20/midi_sources/hooktheory/pianoroll/j/janet-jackson/nasty/intro_key_original.mid
Sequence too short: 3 data/midi/v20/midi_sources/hooktheory/pianoroll/j/jose-gonzales/step-out/intro_key_original.mid
Sequence too short: 7 data/midi/v20/midi_sources/hooktheory/pianoroll/j/jon-lajoie/the-best-song/chorus_key_original.mid
Sequence too short: 3 data/midi/v20/midi_sources/hooktheory/pianoroll/j/john-powell/how-to-train-your-dragon---test-drive/intro_key_original.mid
Sequence too short: 3 data/midi/v20/midi_sources/hooktheory/pianoroll/j/jay-hardway---mike-hawkins/freedom/chorus_key_original.mid
Sequence 

Sequence too short: 7 data/midi/v20/midi_sources/hooktheory/pianoroll/s/steve-jablonsky/lone-survivor/verse_key_original.mid
Sequence too short: 7 data/midi/v20/midi_sources/hooktheory/pianoroll/s/say-anything/slowly-through-a-vector/chorus_key_original.mid
Sequence too short: 7 data/midi/v20/midi_sources/hooktheory/pianoroll/s/skrillex/breakn-a-sweat/verse_key_original.mid
Sequence too short: 7 data/midi/v20/midi_sources/hooktheory/pianoroll/c/coldplay/hurts-like-heaven/solo_key_original.mid
Sequence too short: 7 data/midi/v20/midi_sources/hooktheory/pianoroll/c/childish-gambino/me-and-your-mama/intro_key_original.mid
Sequence too short: 7 data/midi/v20/midi_sources/hooktheory/pianoroll/c/c418/acid/verse_key_original.mid
Sequence too short: 4 data/midi/v20/midi_sources/hooktheory/pianoroll/c/creedence-clearwater-revival/born-on-the-bayou/intro_key_original.mid
Sequence too short: 3 data/midi/v20/midi_sources/hooktheory/pianoroll/c/cysmix/aumetra-the-witch/intro_key_original.mid
Sequen

Sequence too short: 7 data/midi/v20/midi_sources/hooktheory/pianoroll/t/the-crystal-method/starting-over/verse_key_original.mid
Sequence too short: 5 data/midi/v20/midi_sources/hooktheory/pianoroll/t/the-dispatch/how-deep-the-fathers-love-for-us/intro_key_original.mid
Sequence too short: 7 data/midi/v20/midi_sources/hooktheory/pianoroll/t/the-pussycat-dolls/buttons/chorus_key_original.mid
Sequence too short: 4 data/midi/v20/midi_sources/hooktheory/pianoroll/t/the-beatles/if-i-needed-someone/intro_key_original.mid
Sequence too short: 4 data/midi/v20/midi_sources/hooktheory/pianoroll/t/the-beatles/ticket-to-ride/outro_key_original.mid
Sequence too short: 4 data/midi/v20/midi_sources/hooktheory/pianoroll/t/the-beatles/you-cant-do-that/intro_key_original.mid
Sequence too short: 4 data/midi/v20/midi_sources/hooktheory/pianoroll/t/the-lorax/let-it-grow/intro_key_original.mid
npenc exceeds max 161 duration: 192 data/midi/v20/midi_sources/hooktheory/pianoroll/t/the-supremes/the-happening/verse

npenc exceeds max 161 duration: 272 data/midi/v20/midi_sources/hooktheory/pianoroll/l/la-roux/quicksand/bridge_key_cmajor.mid
Sequence too short: 7 data/midi/v20/midi_sources/hooktheory/pianoroll/l/lcd-soundsystem/dance-yrself-clean/solo-1_key_cmajor.mid
Sequence too short: 7 data/midi/v20/midi_sources/hooktheory/pianoroll/v/village-people/ymca/pre-chorus_key_cmajor.mid
Sequence too short: 7 data/midi/v20/midi_sources/hooktheory/pianoroll/v/van-morrison/days-like-this/intro_key_cmajor.mid
Sequence too short: 5 data/midi/v20/midi_sources/hooktheory/pianoroll/f/fort-minor/welcome/intro_key_cmajor.mid
Sequence too short: 7 data/midi/v20/midi_sources/hooktheory/pianoroll/b/boards-of-canada/dayvan-cowboy/instrumental_key_cmajor.mid
npenc exceeds max 161 duration: 216 data/midi/v20/midi_sources/hooktheory/pianoroll/b/biffy-clyro/all-the-way-down/pre-chorus-and-chorus_key_cmajor.mid
Sequence too short: 3 data/midi/v20/midi_sources/hooktheory/pianoroll/b/big-wild/venice-venture/instrumental_ke

npenc exceeds max 161 duration: 362 data/midi/v20/midi_sources/hooktheory/pianoroll/i/inxs/never-tear-us-apart/verse-and-pre-chorus_key_cmajor.mid
Sequence too short: 4 data/midi/v20/midi_sources/hooktheory/pianoroll/i/ikke-huftgold/dicke-titten-kartoffelsalat/intro_key_cmajor.mid
Sequence too short: 7 data/midi/v20/midi_sources/hooktheory/pianoroll/i/ilan-bluestone-and-jerome-isma-ae/tension/pre-chorus_key_cmajor.mid
Sequence too short: 3 data/midi/v20/midi_sources/hooktheory/pianoroll/h/homestuck/skies-of-skaia/chorus-lead-out_key_cmajor.mid
Sequence too short: 7 data/midi/v20/midi_sources/hooktheory/pianoroll/h/homestuck/gilded-sands/intro-and-verse_key_cmajor.mid
Sequence too short: 3 data/midi/v20/midi_sources/hooktheory/pianoroll/h/homestuck/carbon-nadsat---cuestick-genius/intro_key_cmajor.mid
Sequence too short: 3 data/midi/v20/midi_sources/hooktheory/pianoroll/h/homestuck/unite-synchronization/chorus_key_cmajor.mid
Sequence too short: 3 data/midi/v20/midi_sources/hooktheory/pia

Sequence too short: 7 data/midi/v20/midi_sources/hooktheory/pianoroll/m/massive-attack/dissolved-girl/intro-and-verse_key_cmajor.mid
Sequence too short: 7 data/midi/v20/midi_sources/hooktheory/pianoroll/m/microsoft/windows-xp-startup-sound/instrumental_key_cmajor.mid
Sequence too short: 3 data/midi/v20/midi_sources/hooktheory/pianoroll/m/meghan-trainor/no/pre-chorus_key_cmajor.mid
Sequence too short: 3 data/midi/v20/midi_sources/hooktheory/pianoroll/m/masashi-hamauzu/battle-results/pre-chorus-and-chorus_key_cmajor.mid
Sequence too short: 3 data/midi/v20/midi_sources/hooktheory/pianoroll/a/a-lovely-war/autumn-leaves-us-blue/intro_key_cmajor.mid
npenc exceeds max 161 duration: 192 data/midi/v20/midi_sources/hooktheory/pianoroll/a/american-football/never-meant/intro_key_cmajor.mid
Sequence too short: 7 data/midi/v20/midi_sources/hooktheory/pianoroll/a/avicii-vs-nicky-romero/i-could-be-the-one/intro_key_cmajor.mid
Sequence too short: 3 data/midi/v20/midi_sources/hooktheory/pianoroll/a/anal

In [21]:
arr2csv(processed, out_csv); len(processed)

197919

In [22]:
df = pd.read_csv(out_csv); df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,song_url,midi,mxl,section,genres,title,ht_time_signature,artist,ht_bpm,source,ht_key,md5,midi_title,numpy,ht_mode,parts
0,https://www.hooktheory.com/theorytab/view/wayn...,midi_sources/hooktheory/pianoroll/w/wayne-shar...,,chorus,,yu-gi-oh-theme-song,4.0,wayne-sharpe,128.0,hooktheory,C,bf1f29e5ff84e3e93e37fb873bfb590e,yu-gi-oh3,s2s_encode/hooktheory/pianoroll/w/wayne-sharpe...,1.0,"intro,chorus"
1,https://www.hooktheory.com/theorytab/view/wayn...,midi_sources/hooktheory/pianoroll/w/wayne-shar...,,intro,,yu-gi-oh-theme-song,3.0,wayne-sharpe,85.0,hooktheory,C,055f80ad67f64edb14a85ca8fbfe8c29,yu-gi-oh,,1.0,"intro,chorus"
2,https://www.hooktheory.com/theorytab/view/what...,midi_sources/hooktheory/pianoroll/w/what-a-day...,,chorus,Jazz,kiefer,4.0,what-a-day,96.0,hooktheory,D,197f96f5d181f6ce1e2c5ab04ac1ff87,kiefer,s2s_encode/hooktheory/pianoroll/w/what-a-day/k...,6.0,chorus
3,https://www.hooktheory.com/theorytab/view/whit...,midi_sources/hooktheory/pianoroll/w/whiteflame...,,pre-chorus,"J-Pop,Pop",senbonzakura,4.0,whiteflame,152.0,hooktheory,D,9e7ce13a35f1314423a9a6d5a5287a4a,senbonzakura - pre-Pre-Chorus,s2s_encode/hooktheory/pianoroll/w/whiteflame/s...,6.0,"verse,pre-chorus,chorus"
4,https://www.hooktheory.com/theorytab/view/whit...,midi_sources/hooktheory/pianoroll/w/whiteflame...,,verse,"J-Pop,Pop",senbonzakura,4.0,whiteflame,152.0,hooktheory,D,d5aaf79d0989222f1362f9f46c540a27,Senbonzakura,s2s_encode/hooktheory/pianoroll/w/whiteflame/s...,6.0,"verse,pre-chorus,chorus"


In [23]:
len([f for f in df.numpy.values if isinstance(f, str)])

59539

In [24]:
from collections import Counter

In [25]:
df[df.numpy.notnull()]

Unnamed: 0,song_url,midi,mxl,section,genres,title,ht_time_signature,artist,ht_bpm,source,ht_key,md5,midi_title,numpy,ht_mode,parts
0,https://www.hooktheory.com/theorytab/view/wayn...,midi_sources/hooktheory/pianoroll/w/wayne-shar...,,chorus,,yu-gi-oh-theme-song,4.0,wayne-sharpe,128.0,hooktheory,C,bf1f29e5ff84e3e93e37fb873bfb590e,yu-gi-oh3,s2s_encode/hooktheory/pianoroll/w/wayne-sharpe...,1.0,"intro,chorus"
2,https://www.hooktheory.com/theorytab/view/what...,midi_sources/hooktheory/pianoroll/w/what-a-day...,,chorus,Jazz,kiefer,4.0,what-a-day,96.0,hooktheory,D,197f96f5d181f6ce1e2c5ab04ac1ff87,kiefer,s2s_encode/hooktheory/pianoroll/w/what-a-day/k...,6.0,chorus
3,https://www.hooktheory.com/theorytab/view/whit...,midi_sources/hooktheory/pianoroll/w/whiteflame...,,pre-chorus,"J-Pop,Pop",senbonzakura,4.0,whiteflame,152.0,hooktheory,D,9e7ce13a35f1314423a9a6d5a5287a4a,senbonzakura - pre-Pre-Chorus,s2s_encode/hooktheory/pianoroll/w/whiteflame/s...,6.0,"verse,pre-chorus,chorus"
4,https://www.hooktheory.com/theorytab/view/whit...,midi_sources/hooktheory/pianoroll/w/whiteflame...,,verse,"J-Pop,Pop",senbonzakura,4.0,whiteflame,152.0,hooktheory,D,d5aaf79d0989222f1362f9f46c540a27,Senbonzakura,s2s_encode/hooktheory/pianoroll/w/whiteflame/s...,6.0,"verse,pre-chorus,chorus"
5,https://www.hooktheory.com/theorytab/view/whit...,midi_sources/hooktheory/pianoroll/w/whiteflame...,,chorus,"J-Pop,Pop",senbonzakura,4.0,whiteflame,152.0,hooktheory,D,e0c189ee753b30c4758d85211f13c189,Senbonzakura,s2s_encode/hooktheory/pianoroll/w/whiteflame/s...,6.0,"verse,pre-chorus,chorus"
6,https://www.hooktheory.com/theorytab/view/wham...,midi_sources/hooktheory/pianoroll/w/wham/last-...,,verse,Holiday,last-christmas,4.0,wham,108.0,hooktheory,Db,38e38402443506e326b76536e8e327a0,Last Christmas Verse,s2s_encode/hooktheory/pianoroll/w/wham/last-ch...,1.0,"intro,verse,chorus"
7,https://www.hooktheory.com/theorytab/view/wham...,midi_sources/hooktheory/pianoroll/w/wham/last-...,,chorus,Holiday,last-christmas,4.0,wham,108.0,hooktheory,Db,75d0251177c8c1fa9a02821299fa5ba8,Last Christmas Chorus,s2s_encode/hooktheory/pianoroll/w/wham/last-ch...,1.0,"intro,verse,chorus"
8,https://www.hooktheory.com/theorytab/view/wham...,midi_sources/hooktheory/pianoroll/w/wham/last-...,,intro,Holiday,last-christmas,4.0,wham,108.0,hooktheory,Db,83d2a800f40aeca07e30e4718cda8fe5,Last Christmas Intro,s2s_encode/hooktheory/pianoroll/w/wham/last-ch...,1.0,"intro,verse,chorus"
9,https://www.hooktheory.com/theorytab/view/wham...,midi_sources/hooktheory/pianoroll/w/wham/freed...,,chorus,,freedom,4.0,wham,128.0,hooktheory,C,60fa29cfec107df27b053cf9708823d5,Freedom Chorus,s2s_encode/hooktheory/pianoroll/w/wham/freedom...,1.0,chorus
11,https://www.hooktheory.com/theorytab/view/wg-s...,midi_sources/hooktheory/pianoroll/w/wg-snuffy-...,,instrumental,,west-wing-suite,4.0,wg-snuffy-walden,86.0,hooktheory,G,a856dff6c54398544c217104d047abe0,snuffy,s2s_encode/hooktheory/pianoroll/w/wg-snuffy-wa...,1.0,instrumental


In [26]:
Counter(df[df.numpy.notnull()].source.values)

Counter({'hooktheory': 18312,
         'freemidi': 394,
         'midiworld': 345,
         'ecomp': 2380,
         'cprato': 165,
         'classical_piano': 318,
         'classical_archives': 3654,
         'musescore': 8688,
         'wikifonia': 36,
         'lmd': 925,
         'reddit': 5878,
         'hooktheory_c': 17817})

In [27]:
len(df[df.numpy.notnull()].source.values)

59539

In [28]:
Counter(df[df.numpy.notnull()].source.values)

Counter({'hooktheory': 18311,
         'freemidi': 394,
         'midiworld': 345,
         'ecomp': 2380,
         'cprato': 165,
         'classical_piano': 318,
         'classical_archives': 3654,
         'musescore': 8688,
         'wikifonia': 36,
         'lmd': 925,
         'reddit': 5878,
         'hooktheory_c': 18445})

In [29]:
Counter(df.source.values)

Counter({'hooktheory': 20544,
         'freemidi': 5168,
         'midiworld': 4109,
         'ecomp': 2533,
         'cprato': 312,
         'classical_piano': 328,
         'classical_archives': 14647,
         'musescore': 10936,
         'wikifonia': 6346,
         'lmd': 13568,
         'reddit': 98683,
         'hooktheory_c': 20745})

## Convert to hooktheory databunch

In [30]:
out_path = version_path/out_dir

In [31]:
csv = pd.read_csv(out_csv); csv.head()

Unnamed: 0,song_url,midi,mxl,section,genres,title,ht_time_signature,artist,ht_bpm,source,ht_key,md5,midi_title,numpy,ht_mode,parts
0,https://www.hooktheory.com/theorytab/view/wayn...,midi_sources/hooktheory/pianoroll/w/wayne-shar...,,chorus,,yu-gi-oh-theme-song,4.0,wayne-sharpe,128.0,hooktheory,C,bf1f29e5ff84e3e93e37fb873bfb590e,yu-gi-oh3,s2s_encode/hooktheory/pianoroll/w/wayne-sharpe...,1.0,"intro,chorus"
1,https://www.hooktheory.com/theorytab/view/wayn...,midi_sources/hooktheory/pianoroll/w/wayne-shar...,,intro,,yu-gi-oh-theme-song,3.0,wayne-sharpe,85.0,hooktheory,C,055f80ad67f64edb14a85ca8fbfe8c29,yu-gi-oh,,1.0,"intro,chorus"
2,https://www.hooktheory.com/theorytab/view/what...,midi_sources/hooktheory/pianoroll/w/what-a-day...,,chorus,Jazz,kiefer,4.0,what-a-day,96.0,hooktheory,D,197f96f5d181f6ce1e2c5ab04ac1ff87,kiefer,s2s_encode/hooktheory/pianoroll/w/what-a-day/k...,6.0,chorus
3,https://www.hooktheory.com/theorytab/view/whit...,midi_sources/hooktheory/pianoroll/w/whiteflame...,,pre-chorus,"J-Pop,Pop",senbonzakura,4.0,whiteflame,152.0,hooktheory,D,9e7ce13a35f1314423a9a6d5a5287a4a,senbonzakura - pre-Pre-Chorus,s2s_encode/hooktheory/pianoroll/w/whiteflame/s...,6.0,"verse,pre-chorus,chorus"
4,https://www.hooktheory.com/theorytab/view/whit...,midi_sources/hooktheory/pianoroll/w/whiteflame...,,verse,"J-Pop,Pop",senbonzakura,4.0,whiteflame,152.0,hooktheory,D,d5aaf79d0989222f1362f9f46c540a27,Senbonzakura,s2s_encode/hooktheory/pianoroll/w/whiteflame/s...,6.0,"verse,pre-chorus,chorus"


In [32]:
def get_files(csv):
    files = csv['numpy']
    flist = [Path(version_path/f) for f in files.values if isinstance(f, str)]
    flist = [f for f in flist if f.exists()]
    return flist

In [42]:
def create_databunch(files, data_save_name, path=out_path):
    save_file = path/data_save_name
    if save_file.exists():
        data = load_data(path, data_save_name)
    else:
        save_file.parent.mkdir(exist_ok=True, parents=True)
        vocab = MusicVocab.create()
        processors = [S2SFileProcessor(), S2SPartsProcessor()]

        data = MusicDataBunch.from_files(files, path, processors=processors, 
                                          preloader_cls=S2SPreloader, list_cls=S2SItemList)
        data.save(data_save_name)
    return data

In [43]:
hook_csv = csv.loc[csv.source.isin(['hooktheory'])]
hook_files = get_files(hook_csv); len(hook_files)
hook_data = create_databunch(hook_files, 'cached/hook.pkl')

18311

In [45]:
hook_csv = csv.loc[csv.source.isin(['hooktheory_c'])]
hook_files = get_files(hook_csv); len(hook_files)
hook_data = create_databunch(hook_files, 'cached/hook_c.pkl')

In [46]:
lq_csv = csv.loc[csv.source.isin(['reddit', 'classical_piano', 'ecomp', 'midiworld', 'freemidi', 'lmd', 'cprato', 'wikifonia', 'classical_archives'])]
lq_files = get_files(lq_csv); len(lq_files)
lq_data = create_databunch(lq_files, 'cached/lq.pkl')

In [47]:
hq_csv = csv.loc[csv.source.isin(['hooktheory', 'musescore'])]
hq_files = get_files(hq_csv); len(hq_files)
hq_data = create_databunch(hq_files, 'cached/hq.pkl')

In [48]:
len(hook_data.train_dl.dl.dataset)

15708

In [49]:
all_files = get_files(csv); len(all_files)
all_data = create_databunch(all_files, 'cached/all.pkl')

In [50]:
import random
sample_data = create_databunch(random.sample(all_files, 1000), 'cached/sample.pkl')

## Load data

In [45]:
load_data =  load_data(path=out_path, cache_name='tmp/hook_c', preloader_cls=S2SPreloader)

FileNotFoundError: [Errno 2] No such file or directory: 'data/midi/v20/s2s_encode/data_save.pkl'

In [None]:
load_data =  MusicDataBunch.load(path=out_path, cache_name='tmp/hook_c', preloader_cls=S2SPreloader)

In [None]:
load_data.one_batch()

In [None]:
# ps = [S2SFileProcessor()]

# single_tfm = partial(to_single_stream, vocab=vocab)
# data = (MusicItemList(items=hook_files[:100], path=out_path, processor=ps, tfms=[single_tfm])
#         .split_by_rand_pct(0.01, seed=6)
#         .label_const(label_cls=LMLabelList))
# data.x._bunch = MusicDataBunch

In [78]:
# data.x.tfms = [single_tfm]

In [None]:
data = data.databunch(bs=4, preloader_cls=S2SPreloader, train_tfms=[single_tfm])

In [80]:
out = data.train_dl.dl.dataset[0]

In [None]:
data.one_batch()