In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
import sys
sys.path.insert(0, '../../')
from src.encode_data import *
from src.midi_data import *
from src.data_sources import process_all, arr2csv
from src.midi_transform import *
from src.fastai_data import *

In [4]:
import traceback
import time

## Standardize and reformat raw midi files before encoding to text
- Transform key to C major
- Remove unused instruments
- Combine multiple tracks with the same instrument into a single part
- Melody, Piano, String

### Load midi data

In [5]:
# Dataset version; version_path (data/midi/<version>) is the root for the paths built below
version = 'v15'
data_path = Path('data/midi')
version_path = data_path/version

In [6]:
import pandas as pd

In [7]:
# Output configuration. The commented-out pair is the alternative setting
# (write to 'midi_encode' with the duet filter off).
# out_dir = 'midi_encode'
# duet_only = False
out_dir = 's2s_encode'  # output folder name under version_path; also the stem of the output csv
duet_only = True  # when True, transform_midi rejects files unless num_piano_tracks(...) == 2

In [8]:
# Metadata csv that is read (source_csv) and csv that is written after encoding (out_csv)
source_dir = 'midi_sources'
source_csv = version_path/'metadata'/f'{source_dir}.csv'
out_csv = version_path/out_dir/f'{out_dir}.csv'
out_csv.parent.mkdir(parents=True, exist_ok=True)  # make sure the output folder exists
source_csv, out_csv

(PosixPath('data/midi/v15/metadata/midi_sources.csv'),
 PosixPath('data/midi/v15/s2s_encode/s2s_encode.csv'))

In [9]:
# Thresholds forwarded to compress_midi_file inside transform_midi
# num_comps = 2 # note, duration
cutoff = 5 # max instruments
min_variation = 3 # minimum number of different midi notes played
# max_dur = 128

### Encoding midi to numpy

In [9]:
df = pd.read_csv(source_csv); df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ht_time_signature,ht_offset,midi,section,parts,ht_bpm,title,midi_title,artist,song_url,genres,source,ht_key,md5,mxl,ht_mode
0,4.0,0.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,chorus,"intro,chorus",128.0,yu-gi-oh-theme-song,yu-gi-oh3,wayne-sharpe,https://www.hooktheory.com/theorytab/view/wayn...,,hooktheory,C,bf1f29e5ff84e3e93e37fb873bfb590e,,1.0
1,3.0,0.0,midi_sources/hooktheory/pianoroll/w/wayne-shar...,intro,"intro,chorus",85.0,yu-gi-oh-theme-song,yu-gi-oh,wayne-sharpe,https://www.hooktheory.com/theorytab/view/wayn...,,hooktheory,C,055f80ad67f64edb14a85ca8fbfe8c29,,1.0
2,4.0,-5.0,midi_sources/hooktheory/pianoroll/w/what-a-day...,chorus,chorus,96.0,kiefer,kiefer,what-a-day,https://www.hooktheory.com/theorytab/view/what...,Jazz,hooktheory,D,197f96f5d181f6ce1e2c5ab04ac1ff87,,6.0
3,4.0,-5.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,pre-chorus,"verse,pre-chorus,chorus",152.0,senbonzakura,senbonzakura - pre-Pre-Chorus,whiteflame,https://www.hooktheory.com/theorytab/view/whit...,"J-Pop,Pop",hooktheory,D,9e7ce13a35f1314423a9a6d5a5287a4a,,6.0
4,4.0,-5.0,midi_sources/hooktheory/pianoroll/w/whiteflame...,verse,"verse,pre-chorus,chorus",152.0,senbonzakura,Senbonzakura,whiteflame,https://www.hooktheory.com/theorytab/view/whit...,"J-Pop,Pop",hooktheory,D,d5aaf79d0989222f1362f9f46c540a27,,6.0


In [10]:
all_records = df.to_dict(orient='records'); len(all_records)

197182

In [11]:
# input_file = version_path/all_records[0]['midi']

# stream = file2stream(input_file)
# chordarr = stream2chordarr(stream, max_dur=DUR_RANGE-2, flat=False)

# _,num_inst,_ = chordarr.shape

# parts = [part_enc(chordarr, i) for i in range(num_inst)]

# parts = sorted(parts, key=avg_pitch)

In [12]:
def avg_pitch(npenc):
    "Mean of the strictly positive entries in column 0 of `npenc` (the note column, going by the name)."
    first_col = npenc[:, 0]
    positive = first_col > 0
    return first_col[positive].mean()

In [13]:
def part_enc(chordarr, part):
    "Encode a single part (index `part` on axis 1 of `chordarr`) via `chordarr2seq` then `seq2npenc`."
    # Range slice keeps axis 1 (length 1), so the result is still 3-dimensional
    single_part = chordarr[:, part:part+1, :]
    # Part 3. Chord array to numpy
    return seq2npenc(chordarr2seq(single_part))

In [14]:
def process_metadata(metadata):
    """Encode the midi file referenced by one metadata record and note where it was saved.

    Returns None when `metadata['midi']` is missing or is not a string (e.g. NaN from the csv).
    Otherwise returns a copy of `metadata`. The copy gains a `numpy` key (path of the `.npy`
    file relative to `version_path`) when an encoding already exists on disk or was just
    written. If the midi file does not exist or `transform_midi` returns None, the copy is
    returned without a `numpy` key.
    """
    # Part 1. Compress tracks/instruments
    if not isinstance(metadata.get('midi'), str): return None
    result = metadata.copy()

    input_path = version_path/metadata['midi']
    if not input_path.exists():
        print('Input path does not exist:', input_path, metadata)
        return result

    # Get outfile and check if it exists: same relative path, with the source folder
    # swapped for the output folder and a .npy suffix
    out_file = Path(str(input_path).replace(f'/{source_dir}/', f'/{out_dir}/'))
    out_file = out_file.with_suffix('.npy')
    if out_file.exists():
        # Already encoded on a previous run; do not redo the work
        result['numpy'] = str(out_file.relative_to(version_path))
        return result

    npenc = transform_midi(input_path)
    if npenc is None: return result

    # Create the folder only once there is something to save, so that rejected
    # files do not leave empty directories behind
    out_file.parent.mkdir(parents=True, exist_ok=True)
    np.save(out_file, npenc)
    result['numpy'] = str(out_file.relative_to(version_path))
    return result

In [18]:
def transform_midi(midi_file):
    """Convert one midi file into a two-part numpy encoding, or return None if it is rejected.

    Pipeline: optional duet filter (`num_piano_tracks`) -> `compress_midi_file` ->
    `file2stream` -> `stream2chordarr` -> `trim_chordarr_rests` / `shorten_chordarr_rests`
    -> one `part_enc` call per instrument.

    Returns `np.array(parts)` for files that end up with exactly two instruments.
    Returns None when:
      - `duet_only` is set and the file does not have exactly 2 piano tracks (or counting fails),
      - `compress_midi_file` raises or returns a falsy value,
      - `stream2chordarr` raises,
      - shortening rests removed more than 300 steps,
      - the chord array does not have exactly 2 instruments,
      - the two encoded parts together have fewer than 32 rows.
    """
    input_path = midi_file

    if duet_only:
        try:
            if num_piano_tracks(input_path) != 2: return None
        except Exception: return None

    try: input_file = compress_midi_file(input_path, min_variation=min_variation, cutoff=cutoff) # remove non note tracks and standardize instruments
    except Exception as e:
        print('Error parsing midi', input_path, e)
        return None
    if not input_file: return None

    # Part 2. Compress rests and long notes
    stream = file2stream(input_file) # 1.
    try:
        chordarr = stream2chordarr(stream, max_dur=DUR_RANGE-2, flat=False) # 2. max_dur = quarter_len * sample_freq (4). 128 = 8 bars
    except Exception as e:
        print('Could not encode to chordarr:', input_path, e)
#         print(traceback.format_exc())
        return None

    chord_trim = trim_chordarr_rests(chordarr)
    chord_short = shorten_chordarr_rests(chord_trim)
    # Number of time steps dropped by shortening; a large value means the song is mostly silence
    delta_trim = chord_trim.shape[0] - chord_short.shape[0]
    if delta_trim > 300:
        print(f'Removed {delta_trim} rests from {input_path}. Skipping song')
        return None
    chordarr = chord_short

    _,num_inst,_ = chordarr.shape
    if num_inst != 2: return None

    parts = [part_enc(chordarr, i) for i in range(num_inst)]

    # Part 3. Chord array to numpy
    # The original message printed len(seq), but no `seq` exists in this scope, so short
    # files raised NameError instead of being skipped. Report the combined length instead.
    total_len = sum(p.shape[0] for p in parts)
    if total_len < 32:
        print('Sequence too short:', total_len, input_path)
        return None

    # NOTE(review): when the two parts differ in length this builds a ragged object array;
    # newer NumPy releases refuse that unless dtype=object is passed. Confirm against the
    # NumPy version this notebook runs on.
    return np.array(parts)

In [19]:
def try_process_metadata(metadata):
    """Best-effort wrapper around `process_metadata` for bulk runs.

    Returns whatever `process_metadata` returns, or None if it raises. A failure is
    reported on a single line (midi path, exception type and message) instead of being
    dropped silently, so that programming errors in the pipeline remain visible.
    """
    try:
        return process_metadata(metadata)
    except Exception as e:
        # Keep the handler itself from raising if the record is not a dict
        midi = metadata.get('midi') if isinstance(metadata, dict) else None
        print('Error processing record:', midi, type(e).__name__, e)
#         print(traceback.format_exc())
        return None

In [20]:
# # sanity check
# Run the unwrapped process_metadata on 10 random records so that any exception surfaces
# here rather than being swallowed by try_process_metadata. The sample is not seeded,
# so a different set of records is checked on every run.
import random
for r in random.sample(all_records, 10):
    process_metadata(r)

In [21]:
def timeout_func(data, seconds):
    "Print the time limit and the `midi` entry of a record; passed to `process_all` as `timeout_func`."
    midi_entry = data.get('midi')
    print("Timeout:", seconds, midi_entry)

In [22]:
# Encode every record with process_all (imported from src.data_sources).
# try_process_metadata returns None for records that raise; timeout_func only prints
# the record's midi path together with the timeout value.
processed = process_all(try_process_metadata, all_records, timeout=300, timeout_func=timeout_func)

Removed 320 rests from data/midi/v15/midi_sources/freemidi/genre-dance-eletric/Fatboy Slim - Right Here Right Now.mid. Skipping song
Removed 456 rests from data/midi/v15/midi_sources/freemidi/genre-pop/Sade - Siempre Hay Esperanza.mid. Skipping song
Removed 320 rests from data/midi/v15/midi_sources/freemidi/genre-pop/Shakira - Waka Waka.mid. Skipping song
Removed 544 rests from data/midi/v15/midi_sources/midiworld/named_midi/Tag_Team_-_Whoomp_There_It_Is.mid. Skipping song
Removed 512 rests from data/midi/v15/midi_sources/midiworld/named_midi/Aerosmith_-_Falling_in_Love_Is_Hard_on_my_Knees.mid. Skipping song
Could not encode to chordarr: data/midi/v15/midi_sources/midiworld/named_midi/Howlin_Wolf_-_Little_Red_Rooster.mid index 1147 is out of bounds for axis 0 with size 1147
Could not encode to chordarr: data/midi/v15/midi_sources/midiworld/named_midi/Jelly_Roll_Morton_-_Honky_Tonk_Blues.mid index 2219 is out of bounds for axis 0 with size 2219
Removed 2064 rests from data/midi/v15/midi

Removed 532 rests from data/midi/v15/midi_sources/from_mxl/musescore/data/1481516.mid. Skipping song
Timeout: 300 midi_sources/from_mxl/musescore/data/1188146.mid
Timeout: 300 midi_sources/from_mxl/musescore/data/2638111.mid
Removed 368 rests from data/midi/v15/midi_sources/lmd_clean/The Prodigy/Breath.1.mid. Skipping song
Timeout: 300 midi_sources/from_mxl/musescore/data/2279591.mid
Timeout: 300 midi_sources/from_mxl/musescore/data/2430311.mid
Removed 632 rests from data/midi/v15/midi_sources/lmd_clean/The Beatles/Tomorrow Never Knows.2.mid. Skipping song
Removed 888 rests from data/midi/v15/midi_sources/lmd_clean/Carmina Burana/O Fortuna.mid. Skipping song
Removed 532 rests from data/midi/v15/midi_sources/from_mxl/musescore/data/58146.mid. Skipping song
Removed 580 rests from data/midi/v15/midi_sources/lmd_clean/Genesis/Supper's Ready.mid. Skipping song
Removed 416 rests from data/midi/v15/midi_sources/lmd_clean/Black Sabbath/War Pigs.1.mid. Skipping song
Timeout: 300 midi_sources/fr

Could not encode to chordarr: data/midi/v15/midi_sources/130k_reddit/S/S/s2jazz.mid index 455 is out of bounds for axis 0 with size 455
Removed 660 rests from data/midi/v15/midi_sources/130k_reddit/S/S/shirin.mid. Skipping song
Removed 336 rests from data/midi/v15/midi_sources/130k_reddit/S/S/sayonara3.mid. Skipping song
Removed 304 rests from data/midi/v15/midi_sources/130k_reddit/S/S/shakira-waka_waka_version_with_melody.mid. Skipping song
Could not encode to chordarr: data/midi/v15/midi_sources/130k_reddit/S/S/spfc.mid 
Removed 696 rests from data/midi/v15/midi_sources/130k_reddit/S/S/sing06.mid. Skipping song
Timeout: 300 midi_sources/130k_reddit/AMERICANA_FOLK_www.pdmusic.org_MIDIRip/heinrich/aph20tdmaaoa.mid
Removed 744 rests from data/midi/v15/midi_sources/130k_reddit/1/118UNMEX.MID. Skipping song
Removed 460 rests from data/midi/v15/midi_sources/130k_reddit/1/1radarlv.mid. Skipping song
Removed 364 rests from data/midi/v15/midi_sources/130k_reddit/T/T/The_Relic.mid. Skipping so

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [23]:
arr2csv(processed, out_csv); len(processed)

196393

In [11]:
df = pd.read_csv(out_csv); df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,source,ht_mode,title,artist,ht_time_signature,ht_offset,ht_bpm,ht_key,mxl,midi,genres,numpy,song_url,section,md5,parts,midi_title
0,hooktheory,1.0,yu-gi-oh-theme-song,wayne-sharpe,4.0,0.0,128.0,C,,midi_sources/hooktheory/pianoroll/w/wayne-shar...,,s2s_encode/hooktheory/pianoroll/w/wayne-sharpe...,https://www.hooktheory.com/theorytab/view/wayn...,chorus,bf1f29e5ff84e3e93e37fb873bfb590e,"intro,chorus",yu-gi-oh3
1,hooktheory,1.0,yu-gi-oh-theme-song,wayne-sharpe,3.0,0.0,85.0,C,,midi_sources/hooktheory/pianoroll/w/wayne-shar...,,,https://www.hooktheory.com/theorytab/view/wayn...,intro,055f80ad67f64edb14a85ca8fbfe8c29,"intro,chorus",yu-gi-oh
2,hooktheory,6.0,kiefer,what-a-day,4.0,-5.0,96.0,D,,midi_sources/hooktheory/pianoroll/w/what-a-day...,Jazz,s2s_encode/hooktheory/pianoroll/w/what-a-day/k...,https://www.hooktheory.com/theorytab/view/what...,chorus,197f96f5d181f6ce1e2c5ab04ac1ff87,chorus,kiefer
3,hooktheory,6.0,senbonzakura,whiteflame,4.0,-5.0,152.0,D,,midi_sources/hooktheory/pianoroll/w/whiteflame...,"J-Pop,Pop",s2s_encode/hooktheory/pianoroll/w/whiteflame/s...,https://www.hooktheory.com/theorytab/view/whit...,pre-chorus,9e7ce13a35f1314423a9a6d5a5287a4a,"verse,pre-chorus,chorus",senbonzakura - pre-Pre-Chorus
4,hooktheory,6.0,senbonzakura,whiteflame,4.0,-5.0,152.0,D,,midi_sources/hooktheory/pianoroll/w/whiteflame...,"J-Pop,Pop",s2s_encode/hooktheory/pianoroll/w/whiteflame/s...,https://www.hooktheory.com/theorytab/view/whit...,verse,d5aaf79d0989222f1362f9f46c540a27,"verse,pre-chorus,chorus",Senbonzakura


In [12]:
len([f for f in df.numpy.values if isinstance(f, str)])

39619

In [13]:
from collections import Counter

In [14]:
df[df.numpy.notnull()]

Unnamed: 0,source,ht_mode,title,artist,ht_time_signature,ht_offset,ht_bpm,ht_key,mxl,midi,genres,numpy,song_url,section,md5,parts,midi_title
0,hooktheory,1.0,yu-gi-oh-theme-song,wayne-sharpe,4.0,0.0,128.0,C,,midi_sources/hooktheory/pianoroll/w/wayne-shar...,,s2s_encode/hooktheory/pianoroll/w/wayne-sharpe...,https://www.hooktheory.com/theorytab/view/wayn...,chorus,bf1f29e5ff84e3e93e37fb873bfb590e,"intro,chorus",yu-gi-oh3
2,hooktheory,6.0,kiefer,what-a-day,4.0,-5.0,96.0,D,,midi_sources/hooktheory/pianoroll/w/what-a-day...,Jazz,s2s_encode/hooktheory/pianoroll/w/what-a-day/k...,https://www.hooktheory.com/theorytab/view/what...,chorus,197f96f5d181f6ce1e2c5ab04ac1ff87,chorus,kiefer
3,hooktheory,6.0,senbonzakura,whiteflame,4.0,-5.0,152.0,D,,midi_sources/hooktheory/pianoroll/w/whiteflame...,"J-Pop,Pop",s2s_encode/hooktheory/pianoroll/w/whiteflame/s...,https://www.hooktheory.com/theorytab/view/whit...,pre-chorus,9e7ce13a35f1314423a9a6d5a5287a4a,"verse,pre-chorus,chorus",senbonzakura - pre-Pre-Chorus
4,hooktheory,6.0,senbonzakura,whiteflame,4.0,-5.0,152.0,D,,midi_sources/hooktheory/pianoroll/w/whiteflame...,"J-Pop,Pop",s2s_encode/hooktheory/pianoroll/w/whiteflame/s...,https://www.hooktheory.com/theorytab/view/whit...,verse,d5aaf79d0989222f1362f9f46c540a27,"verse,pre-chorus,chorus",Senbonzakura
5,hooktheory,6.0,senbonzakura,whiteflame,4.0,-5.0,152.0,D,,midi_sources/hooktheory/pianoroll/w/whiteflame...,"J-Pop,Pop",s2s_encode/hooktheory/pianoroll/w/whiteflame/s...,https://www.hooktheory.com/theorytab/view/whit...,chorus,e0c189ee753b30c4758d85211f13c189,"verse,pre-chorus,chorus",Senbonzakura
6,hooktheory,1.0,last-christmas,wham,4.0,-1.0,108.0,Db,,midi_sources/hooktheory/pianoroll/w/wham/last-...,Holiday,s2s_encode/hooktheory/pianoroll/w/wham/last-ch...,https://www.hooktheory.com/theorytab/view/wham...,verse,38e38402443506e326b76536e8e327a0,"intro,verse,chorus",Last Christmas Verse
7,hooktheory,1.0,last-christmas,wham,4.0,-1.0,108.0,Db,,midi_sources/hooktheory/pianoroll/w/wham/last-...,Holiday,s2s_encode/hooktheory/pianoroll/w/wham/last-ch...,https://www.hooktheory.com/theorytab/view/wham...,chorus,75d0251177c8c1fa9a02821299fa5ba8,"intro,verse,chorus",Last Christmas Chorus
8,hooktheory,1.0,last-christmas,wham,4.0,-1.0,108.0,Db,,midi_sources/hooktheory/pianoroll/w/wham/last-...,Holiday,s2s_encode/hooktheory/pianoroll/w/wham/last-ch...,https://www.hooktheory.com/theorytab/view/wham...,intro,83d2a800f40aeca07e30e4718cda8fe5,"intro,verse,chorus",Last Christmas Intro
9,hooktheory,1.0,freedom,wham,4.0,0.0,128.0,C,,midi_sources/hooktheory/pianoroll/w/wham/freed...,,s2s_encode/hooktheory/pianoroll/w/wham/freedom...,https://www.hooktheory.com/theorytab/view/wham...,chorus,60fa29cfec107df27b053cf9708823d5,chorus,Freedom Chorus
11,hooktheory,1.0,west-wing-suite,wg-snuffy-walden,4.0,5.0,86.0,G,,midi_sources/hooktheory/pianoroll/w/wg-snuffy-...,,s2s_encode/hooktheory/pianoroll/w/wg-snuffy-wa...,https://www.hooktheory.com/theorytab/view/wg-s...,instrumental,a856dff6c54398544c217104d047abe0,instrumental,snuffy


In [15]:
Counter(df[df.numpy.notnull()].source.values)

Counter({'hooktheory': 17807,
         'freemidi': 24,
         'midiworld': 49,
         'ecomp': 1,
         'classical_archives': 75,
         'musescore': 65,
         'wikifonia': 2,
         'lmd': 85,
         'reddit': 3574,
         'hooktheory_c': 17937})

## TODO: instead of compressing the chord array, we can just separate it out into different parts

## Scratch notebook for separating MusicXML parts

In [31]:
piano_file = version_path/'midi_sources/musescore/data/49143.mxl'

In [34]:
score = music21.converter.parse(piano_file)

In [36]:
list(score.parts)

[<music21.stream.PartStaff P1-Staff1>, <music21.stream.PartStaff P1-Staff2>]

In [41]:
score.show('midi')

In [44]:
score.parts[0].flat.show('midi')

In [45]:
score.parts[1].flat.show('midi')

## Convert to hooktheory databunch

In [11]:
def get_files(csv):
    "Paths from the `numpy` column of `csv`, joined onto `version_path`, keeping only those that exist."
    existing = []
    for entry in csv['numpy'].values:
        # Non-string entries (e.g. NaN where no encoding was written) are skipped
        if not isinstance(entry, str): continue
        candidate = Path(version_path/entry)
        if candidate.exists(): existing.append(candidate)
    return existing

In [12]:
class S2SFileProcessor(PreProcessor):
    "`PreProcessor` that loads each file with `np.load` and drops items that fail the shape/length checks."
    def process_one(self,item):
        "Return the loaded array, or None unless it has shape (2,) and both entries have length <= 1024."
        # NOTE: allow_pickle=True unpickles stored objects; only load files written by this pipeline
        loaded = np.load(item, allow_pickle=True)
        if loaded.shape != (2,): return None
        if any(len(part) > 1024 for part in loaded): return None
        return loaded

    def process(self, ds:Collection):
        "Replace `ds.items` with the loaded arrays, leaving out every item that `process_one` rejected."
        candidates = (self.process_one(path) for path in ds.items)
        ds.items = [arr for arr in candidates if arr is not None]

In [17]:
class S2SPreloader(Callback):
    """Indexable wrapper that turns each dataset item into a pair of fixed-length token arrays.

    `dataset[k]` must yield `(item, label)` where `item` unpacks into two encodings `(x, y)`.
    `x` is prefixed with the `MSEQ` token and `y` with the `CSEQ` token, each followed by the
    token returned by `avg_tempo` for that part. Both are then padded/cut to `bptt`.
    """
    def __init__(self, dataset:LabelList, bptt:int=512, **kwargs):
        # NOTE: `vocab` comes from the surrounding (notebook) namespace, not from the
        # arguments; extra keyword arguments are accepted and ignored.
        self.dataset,self.bptt = dataset,bptt
        self.vocab = vocab
        self.single_tfm = partial(to_single_stream, vocab=vocab)

    def _to_fixed_len(self, part, seq_token, min_pad=0):
        "Convert `part` with `single_tfm` using `[seq_token, tempo]` as start sequence, then pad/cut to `bptt`."
        meta = np.array([self.vocab.stoi[seq_token], self.vocab.stoi[avg_tempo(part)]])
        seq = self.single_tfm(part, start_seq=meta)
        n_pad = max(min_pad, self.bptt-len(seq))
        # Pad index is read from self.vocab (the original mixed self.vocab with the global vocab)
        return np.pad(seq, (0,n_pad), 'constant', constant_values=self.vocab.pad_idx)[:self.bptt]

    def __getitem__(self, k:int):
        item,_ = self.dataset[k]
        x,y = item

        x = self._to_fixed_len(x, MSEQ)
        # y keeps the original minimum padding of 1 (`y_offset`). Since the result is cut
        # to `bptt` right afterwards, this currently gives the same length as for x.
        y_offset = 1
        y = self._to_fixed_len(y, CSEQ, min_pad=y_offset)
        return x, y

    def __len__(self):
        return len(self.dataset)

In [18]:
def create_databunch(files, cache_name, batch_size=32, load_cached=False):
    "Load the databunch cached under `out_path`/`cache_name` when asked for and present; otherwise build and save it."
    # A cache counts as present when its train_ids.npy file exists
    if load_cached and (out_path/f'{cache_name}/train_ids.npy').exists():
        return MusicDataBunch.load(out_path, bs=batch_size, cache_name=cache_name)

    item_list = MusicItemList(items=files, path=out_path, processor=[S2SFileProcessor()])
    # Random 1% split with a fixed seed, then a constant label
    labelled = item_list.split_by_rand_pct(0.01, seed=6).label_const(label_cls=LMLabelList)
    data = labelled.databunch(bs=batch_size, preloader_cls=S2SPreloader)
    data.save(cache_name)
    return data

In [19]:
out_path = version_path/out_dir

In [20]:
# NOTE(review): `df` must already hold the frame read from out_csv (get_files needs its
# `numpy` column). On a kernel where that cell has not been run, this cell fails with
# NameError: name 'df' is not defined.
csv = df
hook_csv = csv.loc[csv.source.isin(['hooktheory'])]
hook_files = get_files(hook_csv); len(hook_files)
hook_data = create_databunch(hook_files, cache_name='tmp/hook')

NameError: name 'df' is not defined

In [87]:
hook_csv = csv.loc[csv.source.isin(['hooktheory_c'])]
hook_files = get_files(hook_csv); len(hook_files)
hook_data = create_databunch(hook_files, cache_name='tmp/hook_c')

DLTFMS: None


In [103]:
single_tfm = partial(to_single_stream, vocab=vocab)
transpose_tfm = partial(rand_transpose, note_range=vocab.note_range, rand_range=(0,12))
load_data =  MusicDataBunch.load(path=out_path, cache_name='tmp/hook_c', preloader_cls=S2SPreloader, train_tfms=[single_tfm, transpose_tfm])

DLTFMS: None


Tried: 0,1,2,3,4...
  warn(warn_msg)


In [23]:
load_data =  MusicDataBunch.load(path=out_path, cache_name='tmp/hook_c', preloader_cls=S2SPreloader)

DLTFMS: None


In [24]:
load_data.one_batch()

(tensor([[  6, 273,   8,  ...,   1,   1,   1],
         [  6, 273,   8,  ..., 142,  88, 141],
         [  6, 273,   8,  ..., 145,  85, 141],
         ...,
         [  6, 273,   8,  ..., 141,  62, 141],
         [  6, 273,   8,  ..., 141,  54, 143],
         [  6, 273,   8,  ..., 143,  61, 143]]),
 tensor([[  5, 273,   8,  ..., 147,  69, 147],
         [  5, 273,   8,  ..., 149,   8, 149],
         [  5, 273,   8,  ..., 147,  61, 147],
         ...,
         [  5, 273,   8,  ..., 155,  59, 155],
         [  5, 273,   8,  ..., 147,  64, 147],
         [  5, 273,   8,  ..., 141,  68, 141]]))

In [None]:
# ps = [S2SFileProcessor()]

# single_tfm = partial(to_single_stream, vocab=vocab)
# data = (MusicItemList(items=hook_files[:100], path=out_path, processor=ps, tfms=[single_tfm])
#         .split_by_rand_pct(0.01, seed=6)
#         .label_const(label_cls=LMLabelList))
# data.x._bunch = MusicDataBunch

In [78]:
# data.x.tfms = [single_tfm]

In [None]:
data = data.databunch(bs=4, preloader_cls=S2SPreloader, train_tfms=[single_tfm])

In [80]:
out = data.train_dl.dl.dataset[0]

In [None]:
data.one_batch()

In [None]:

def avg_tempo(t, sep_idx=0):
    "Token 'mt<N>': N is the column-1 total of rows whose column 0 equals `sep_idx`, divided by the row count, rounded, at least 4."
    is_sep = t[:, 0] == sep_idx
    sep_total = t[is_sep][:, 1].sum()
    # Divides by the number of ALL rows, not only the separator rows
    mean_val = sep_total/t.shape[0]
    floored = max(round(mean_val), 4)
    return 'mt'+str(int(floored))

class Seq2SeqProcessor(PreProcessor):
    "`PreProcessor` that loads a pair of files with `np.load` and joins them into one token sequence."
    def __init__(self, vocab=None, ds:Collection=None):
        self.vocab = vocab
        super().__init__(ds)

    def process_one(self,item):
        "Return `[BOS, PAD] + chord sequence + melody sequence + 100 entries of -100` for the two files in `item`."
        # NOTE: allow_pickle=True unpickles stored objects; only load files written by this pipeline
        left, right = [np.load(i, allow_pickle=True) for i in item]
        stoi = self.vocab.stoi
        bos_pad = np.array([stoi[BOS], stoi[PAD]])

        # First file is prefixed with the CSEQ token, second with MSEQ, each followed by its tempo token
        chord_start = np.array([stoi[CSEQ], stoi[avg_tempo(left)]])
        chord_seq = to_single_stream(left, self.vocab, start_seq=chord_start)
        melody_start = np.array([stoi[MSEQ], stoi[avg_tempo(right)]]) # pad should be average notes - tempo
        melody_seq = to_single_stream(right, self.vocab, start_seq=melody_start)

        tail = np.array([-100]*100)
        return np.concatenate([bos_pad, chord_seq, melody_seq, tail])
