In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from encode_data import *
from midi_data import *

In [3]:
from tqdm import tqdm
import pandas as pd
from data_sources import process_parallel, transform_csv_row

In [4]:
from collections import Counter

## Encode music21 stream to text representation 

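This notebook uses a full-component format:
- measure separators, instruments, and separated octaves
- Format: note, octave, action type, instrument
- Note representation: `nG# o4 t1 i0`

**Note:** the configuration cell below sets `num_comps = 2`, so only the (note, duration) components are encoded in this run.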

### Load midi data

In [5]:
# Dataset root and the versioned subdirectory that later paths are built from.
version = 'v8'
data_path = Path('data') / 'midi'
version_path = data_path.joinpath(version)

In [6]:
# Name of the CSV column (and of the directory under `version_path`) holding the input paths.
source_dir = 'midi_npz'

# Number of components per encoded note.
num_comps = 2 # 2:(note,dur), 3:(note,dur,oct), 4:(note,dur,oct,inst)
# Derived from `num_comps` so the output folder name cannot drift from the encoding in use.
out_dir = f'midi_encode/np/shortdur_{num_comps}comp'

# Index CSV that is read as input, and the CSV the merged results are written to.
source_csv = version_path/source_dir/f'{source_dir}.csv'
out_csv = version_path/out_dir/'midi_encode.csv'

In [7]:
# Load the source index CSV and preview its first rows.
df = pd.read_csv(source_csv)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,inferred_offset,song_url,instruments,ht_mode,midi_title,title,seconds,midi,inferred_key,quarter_length,...,ht_key,ht_time_signature,bpm,section,parts,genres,mxl,midi_transform,midi_npz,midi_npz_timesteps
0,0.0,https://www.hooktheory.com/theorytab/view/wayn...,Piano,1.0,yu-gi-oh,yu-gi-oh-theme-song,25.411765,midi_sources/hooktheory/pianoroll/w/wayne-shar...,C major,36.0,...,C,3.0,85.0,intro,"intro,chorus",,,midi_transform/hooktheory/pianoroll/w/wayne-sh...,midi_npz/hooktheory/pianoroll/w/wayne-sharpe/y...,145.0
1,0.0,https://www.hooktheory.com/theorytab/view/wayn...,"Piano,Piano",1.0,yu-gi-oh3,yu-gi-oh-theme-song,15.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,C major,32.0,...,C,4.0,128.0,chorus,"intro,chorus",,,midi_transform/hooktheory/pianoroll/w/wayne-sh...,midi_npz/hooktheory/pianoroll/w/wayne-sharpe/y...,129.0
2,5.0,https://www.hooktheory.com/theorytab/view/what...,"Piano,Piano",1.0,kiefer,kiefer,10.0,midi_sources/hooktheory/pianoroll/w/what-a-day...,E minor,16.0,...,C,4.0,96.0,chorus,chorus,Jazz,,midi_transform/hooktheory/pianoroll/w/what-a-d...,midi_npz/hooktheory/pianoroll/w/what-a-day/kie...,65.0
3,,https://www.hooktheory.com/theorytab/view/weez...,,1.0,My New Song,beverly-hills,,midi_sources/hooktheory/pianoroll/w/weezer/bev...,,,...,C,4.0,,intro-and-verse,intro-and-verse,,,,,
4,0.0,https://www.hooktheory.com/theorytab/view/weez...,"Piano,Piano",1.0,Weezer - Fall Together,fall-together-,10.322581,midi_sources/hooktheory/pianoroll/w/weezer/fal...,A minor,16.0,...,C,4.0,93.0,chorus,chorus,Rock,,midi_transform/hooktheory/pianoroll/w/weezer/f...,midi_npz/hooktheory/pianoroll/w/weezer/fall-to...,65.0


In [8]:
# Keep only the rows that have a value in the `source_dir` column.
df_filtered = df[df[source_dir].notna()]
df_filtered.shape

(162688, 25)

In [9]:
# Keep every row that is not from hooktheory, plus hooktheory rows whose
# `ht_time_signature` equals 4.0.
df_filter = (df_filtered['source'] != 'hooktheory') | (df_filtered['ht_time_signature'] == 4.0)
df_filtered = df_filtered[df_filter]
df_filtered.shape

(160965, 25)

In [10]:
# Discard rows whose `<source_dir>_timesteps` value is not below 20,000.
df_filtered = df_filtered[df_filtered[f'{source_dir}_timesteps'] < 2e4]
df_filtered.shape

(160946, 25)

In [11]:
# Inspect the input paths that remain after filtering.
df_filtered[source_dir].values

array(['midi_npz/hooktheory/pianoroll/w/wayne-sharpe/yu-gi-oh-theme-song/chorus_key_cmajor.npz',
       'midi_npz/hooktheory/pianoroll/w/what-a-day/kiefer/chorus_key_cmajor.npz',
       'midi_npz/hooktheory/pianoroll/w/weezer/fall-together-/chorus_key_cmajor.npz',
       'midi_npz/hooktheory/pianoroll/w/wavves/afraid-of-heights/intro_key_cmajor.npz', ...,
       'midi_npz/130k_reddit/R/R/R.CONIFF.Medley.npz', 'midi_npz/130k_reddit/R/R/rubbersoul.npz',
       'midi_npz/130k_reddit/2/24preludescl.npz', 'midi_npz/130k_reddit/R/R/Rock-Around-Medley-(Medley).npz'],
      dtype=object)

In [12]:
def transform_func(file, out_file, row):
    """Encode one chord-array file and optionally save the encoding.

    Args:
        file: Input path handed to `load_chordarr`.
        out_file: Destination handed to `np.save`. Pass `None` to skip
            writing (useful for a dry run that only inspects the result).
        row: Unused in this function; kept so the three-argument call
            signature stays intact.

    Returns:
        The array produced by `seq2npenc`, or `None` when the chord array
        contains a value greater than 128 (the file is reported and skipped).
    """
    chordarr = load_chordarr(file)
    # Files with any value above 128 are reported and left unencoded.
    if (chordarr > 128).any():
        print('Song exceeds max 128 duration:', file)
        return None
    npenc = seq2npenc(chordarr2seq(chordarr), num_comps=num_comps)
    # np.save raises on a None target, so only write when a destination is given.
    if out_file is not None:
        np.save(out_file, npenc)
    return npenc

In [13]:
# fp = Path('data/midi/v8')/df_filtered[source_dir].values[0]
# transform_func(fp, None, None).shape

In [14]:
from functools import partial

# Bind the fixed keyword arguments of `transform_csv_row` up front.
parallel_func = partial(
    transform_csv_row,
    transform_func=transform_func,
    base_path=version_path,
    source_dir=source_dir,
    out_dir=out_dir,
    out_extension='.npy',
)

In [15]:
# for r in df_filtered.iterrows():
#     parallel_func(r)
#     break

In [None]:
# Apply the bound transform to every filtered row; `total` is the number of rows.
encoded_files = process_parallel(
    parallel_func,
    df_filtered.iterrows(),
    total=len(df_filtered),
)

In [None]:
# One-column frame named after `out_dir`: values of the results, indexed by their keys.
tdf = pd.DataFrame(
    {out_dir: [*encoded_files.values()]},
    index=[*encoded_files.keys()],
)

In [None]:
# Outer-join the results onto the unfiltered frame by index, then show all three shapes.
merged_df = df.join(tdf, how='outer')
(tdf.shape, df.shape, merged_df.shape)

In [None]:
# `to_csv` does not create missing directories, so make sure the target folder exists.
out_csv.parent.mkdir(parents=True, exist_ok=True)
# Write the merged frame without its index column, then preview it.
merged_df.to_csv(out_csv, index=False)
merged_df.head()