### Create CSV from midi sources

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import glob
import os
from tqdm import tqdm
from IPython.display import Image, Audio
import traceback
import json

In [3]:
import pandas as pd
from fastai.data_block import get_files

In [4]:
# parallel
from functools import partial
from pathlib import Path

In [5]:
import os
os.chdir('../../../')

In [6]:
# from musicautobot import *
from musicautobot.utils.file_processing import *

In [7]:
version = 'v20'
data_path = Path('data/midi')
version_path = data_path/version
orig_path = version_path/'midi_sources'
metapath = version_path/'metadata'
combined_csv = metapath/'combined.csv'
all_csv = metapath/'midi_sources.csv'
metapath.mkdir(parents=True, exist_ok=True)

In [8]:
def create_paths(dirname):
    "Standardize midi_source paths"
    dir_path = orig_path/dirname
    csv_path = metapath/f'{dirname}_metadata.csv'
    return dir_path, csv_path

In [9]:
sources = ['hooktheory', 'hooktheory_c', 'freemidi', 'midiworld', 'ecomp', 'cprato', 'classic_piano', 'classical_archives', 'musescore', 'wikifonia', 'lmd_clean', '130k_reddit']

In [10]:
version_path.relative_to(data_path)

PosixPath('v20')

In [11]:
def relative_path(filepath):
    return str(Path(filepath).relative_to(version_path))

### Remove corrupted file - this causes deadlock with music21 processing

In [12]:
corrupted_files = [
    'midiworld/named_midi/NITRO_BRO_-_IT_WONT_DIE.mid',
    'ecomp/2004/MORET02.mid',
    'ecomp/2006/Mordvinov9.MID',
    'ecomp/2006/Na06.MID',
    'ecomp/2008/Cui01.MID',
    'ecomp/2008/Cui02.MID',
    'ecomp/2008/Cui03.MID',
    'ecomp/2008/Cui04.MID',
    'ecomp/2008/Cui05.MID',
    'ecomp/2008/Cui06.MID',
    'ecomp/2008/Cui07.MID',
    'ecomp/2008/Cui08.MID',
    'ecomp/2008/Tan01.MID',
    'ecomp/2008/Tan02.MID',
    'ecomp/2008/Tan03.MID',
    'ecomp/2018/KaszoS14.MID',
    'midiworld/named_midi/Rob_Zombie_-_Demonoid_Phenomenon.mid',
    'midiworld/named_midi/Rob_Zombie_-_Demonoid_Phenomenon.mxl',
]
for f in corrupted_files:
    fp = orig_path/f
    if fp.exists(): fp.unlink()

In [13]:
import hashlib

In [14]:

def directory2csv(files, meta_func, csv_path):
    "Iterate through midi_source dir and map file to metadata"
    
    def get_meta(fp):
        # over 350mb takes crazy long to analyze
        size = fp.stat().st_size/1000
        if fp.suffix == 'mid' and size > 350: 
#             print('Removing mid over 350mb', fp, dixr)
            return None
        if fp.suffix == 'mxl' and size > 420: 
#             print('Removing mxl over 420mb', fp, size)
            return None
#         try:
        m = meta_func(fp)
        if m: m['md5'] = hashlib.md5(open(fp,'rb').read()).hexdigest()
        return m
#         except Exception as e:
#             print('Error:', fp, e)
#         return None
    
    mlist = [get_meta(fp) for fp in files]
    mlist = [x for x in mlist if x is not None]
    arr2csv(mlist, csv_path)
    return mlist

### Hooktheory

In [15]:
ht_cat = 'hooktheory'
ht_path, ht_csv = create_paths(ht_cat)
ht_midi_list = list((ht_path/'pianoroll').glob('*/*/*/*_key_original.mid')); 
len(ht_midi_list)

20745

In [15]:
ht_cat = 'hooktheory_c'
ht_path, ht_csv = create_paths(ht_cat)
ht_path = ht_path.with_name('hooktheory')
ht_midi_list = list((ht_path/'pianoroll').glob('*/*/*/*_key_cmajor.mid'));
len(ht_midi_list)

20745

In [16]:
ht_song_list = metapath/'hooktheory_key2info.json'

In [17]:
def song_key(s): return '_'.join(s.parts[-3:-1])

In [18]:
if ht_song_list.exists():
    ht_key2info = json.load(open(ht_song_list, 'r'))
else:
    song_info = list((ht_path/'xml').glob('*/*/*/*.json'))
    ht_key2info = {song_key(s):json.load(open(s, 'r')) for s in song_info}
    with open(ht_song_list, 'w') as f: json.dump(ht_key2info, f)
len(ht_key2info)

12346

In [19]:
# ht_midi_list = list((ht_path/'pianoroll').glob('*/*/*/*_key_cmajor.mid')); 

In [20]:
def get_ht_jsonfile(midi_file): # using json instead of midi for metadata
    return str(midi_file.with_suffix('.json')).replace('pianoroll', 'event').replace('_key', '_symbol_key')

In [21]:
def get_hooktheory_attr(fp):
    song_info = ht_key2info[song_key(fp)]
    song_json = json.load(open(get_ht_jsonfile(fp), 'r'))
    metadata = song_json['metadata']
    artist = fp.parts[-3]
    title = fp.parts[-2]
    section = fp.name.split('_')[0]
    
    ht_key = metadata['key']
    ht_mode = metadata['mode']
    if ht_mode is None: ht_mode = 'major'
    
    # convert stream here
    return {
        'artist': artist,
        'title': title,
        'midi': relative_path(fp),
        'section': section,
        'parts': song_info['section'],
        'song_url': song_info['song_url'],
        'genres': song_info['genres'],
        'midi_title': metadata['title'],
        'source': ht_cat,
        'ht_bpm': metadata['BPM'],
        'ht_mode': metadata['mode'],
        'ht_key': metadata['key'],
        'ht_time_signature': metadata['beats_in_measure']
    }

In [22]:
# sanity check
# hook_out = get_hooktheory_attr(song_json[1000]); hook_out

In [23]:
ht_metadata = directory2csv(ht_midi_list, 
                            meta_func=get_hooktheory_attr, 
                            csv_path=ht_csv)
df = pd.read_csv(ht_csv); df.head()

Unnamed: 0,title,midi,source,parts,ht_time_signature,midi_title,ht_mode,md5,song_url,genres,ht_bpm,ht_key,artist,section
0,yu-gi-oh-theme-song,midi_sources/hooktheory/pianoroll/w/wayne-shar...,hooktheory,"intro,chorus",4,yu-gi-oh3,1.0,bf1f29e5ff84e3e93e37fb873bfb590e,https://www.hooktheory.com/theorytab/view/wayn...,,128,C,wayne-sharpe,chorus
1,yu-gi-oh-theme-song,midi_sources/hooktheory/pianoroll/w/wayne-shar...,hooktheory,"intro,chorus",3,yu-gi-oh,1.0,055f80ad67f64edb14a85ca8fbfe8c29,https://www.hooktheory.com/theorytab/view/wayn...,,85,C,wayne-sharpe,intro
2,kiefer,midi_sources/hooktheory/pianoroll/w/what-a-day...,hooktheory,chorus,4,kiefer,6.0,197f96f5d181f6ce1e2c5ab04ac1ff87,https://www.hooktheory.com/theorytab/view/what...,Jazz,96,D,what-a-day,chorus
3,senbonzakura,midi_sources/hooktheory/pianoroll/w/whiteflame...,hooktheory,"verse,pre-chorus,chorus",4,senbonzakura - pre-Pre-Chorus,6.0,9e7ce13a35f1314423a9a6d5a5287a4a,https://www.hooktheory.com/theorytab/view/whit...,"J-Pop,Pop",152,D,whiteflame,pre-chorus
4,senbonzakura,midi_sources/hooktheory/pianoroll/w/whiteflame...,hooktheory,"verse,pre-chorus,chorus",4,Senbonzakura,6.0,d5aaf79d0989222f1362f9f46c540a27,https://www.hooktheory.com/theorytab/view/whit...,"J-Pop,Pop",152,D,whiteflame,verse


In [24]:
df.shape

(20745, 14)

## FreeMidi

In [36]:
fm_path, fm_csv = create_paths('freemidi')
fm_dance_path = metapath/f'freemidi_dance_metadata.json'
fm_pop_path = metapath/f'freemidi_pop_metadata.json'
list(fm_path.glob('*'))

[PosixPath('data/midi/v20/midi_sources/freemidi/genre-disco'),
 PosixPath('data/midi/v20/midi_sources/freemidi/genre-pop'),
 PosixPath('data/midi/v20/midi_sources/freemidi/genre-dance-eletric'),
 PosixPath('data/midi/v20/midi_sources/freemidi/genre-punk'),
 PosixPath('data/midi/v20/midi_sources/freemidi/genre-hip-hop-rap'),
 PosixPath('data/midi/v20/midi_sources/freemidi/genre-rock')]

In [37]:
def parse_freemidi_songs(fp, genres=None, source=None):
    name = fp.with_suffix('').name.split(' - ')
    artist = name[0]
    title = name[-1]
    return {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': relative_path(fp),
        'genres': genres,
        'source': source
    }

In [38]:
d_parse_func = partial(parse_freemidi_songs, genres='dance', source='freemidi')
dir_path = fm_path/'genre-dance-eletric'
file_list = get_files(dir_path, extensions=['.mid'], recurse=True)
fm_dance_list = directory2csv(file_list, meta_func=d_parse_func, csv_path=fm_dance_path)

In [39]:
p_parse_func = partial(parse_freemidi_songs, genres='pop', source='freemidi')
dir_path = fm_path/'genre-pop'
file_list = get_files(dir_path, extensions=['.mid'], recurse=True)
fm_pop_list = directory2csv(file_list, meta_func=p_parse_func, csv_path=fm_pop_path)

In [40]:
fm_all = fm_dance_list + fm_pop_list
arr2csv(fm_all, fm_csv)
df = pd.read_csv(fm_csv); df.head()

Unnamed: 0,genres,midi,source,artist,md5,title
0,dance,midi_sources/freemidi/genre-dance-eletric/Veng...,freemidi,Vengaboys,eb504f29b1a10567814f198e7e049d15,Up And Down
1,dance,midi_sources/freemidi/genre-dance-eletric/ATB ...,freemidi,ATB,7c461c21684baee9946019c0ed7ce102,Dont stop
2,dance,midi_sources/freemidi/genre-dance-eletric/Mado...,freemidi,Madonna,ac1e447bff339c29bccbaee3deb13b24,Dress You Up
3,dance,midi_sources/freemidi/genre-dance-eletric/Aqua...,freemidi,Aqua,d0306034dbbb4bbc31a95e3232e5fb73,Dr Jones
4,dance,midi_sources/freemidi/genre-dance-eletric/Tune...,freemidi,Tune Up,996662d57a8e3236b36285c54093697e,Bounce


### Gather Lakh Midi Dataset

In [41]:
lmd_path, lmd_csv = create_paths('lmd_clean')

In [42]:
def parse_lmd_songs(fp):
    artist = fp.parts[-2]
    title = fp.parts[-1]
    return {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': relative_path(fp),
        'genres': 'pop,inferred',
        'source': 'lmd'
    }

In [43]:
file_list = get_files(lmd_path, extensions=['.mid'], recurse=True)
lmd_md = directory2csv(file_list, meta_func=parse_lmd_songs, csv_path=lmd_csv)
df = pd.read_csv(lmd_csv); df.head()

Unnamed: 0,genres,midi,source,artist,md5,title
0,"pop,inferred",midi_sources/lmd_clean/Peter Maffay/Du.mid,lmd,Peter Maffay,6d2ac0d68f5976b161afca8ce061d376,Du.mid
1,"pop,inferred",midi_sources/lmd_clean/Peter Maffay/Josie.mid,lmd,Peter Maffay,6ccac8947814b6faa132cb5bec7a3bdf,Josie.mid
2,"pop,inferred",midi_sources/lmd_clean/Anne Murray/Snowbird.mid,lmd,Anne Murray,f5069f36a7e56475d7f706ed2d2f8517,Snowbird.mid
3,"pop,inferred",midi_sources/lmd_clean/Anne Murray/You Needed ...,lmd,Anne Murray,48419c2acdc476094487157582829781,You Needed Me.mid
4,"pop,inferred",midi_sources/lmd_clean/The Tremeloes/Silence I...,lmd,The Tremeloes,3befa396df58762e746c4288fa851f03,Silence Is Golden.mid


### Gather 130k Reddit

In [44]:
reddit_path, reddit_csv = create_paths('130k_reddit')

In [45]:
def parse_reddit_songs(fp):
    name = fp.with_suffix('').name.replace('_', ' ').split(' - ')
    if len(name) == 1:
        artist = fp.parts[-1]
        title = name[0]
    else:
        artist = name[0]
        title = name[-1]
    return {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': relative_path(fp),
        'genres': 'anything,inferred',
        'source': 'reddit'
    }

In [46]:
file_list = get_files(reddit_path, extensions=['.mid'], recurse=True)
file_list = [fp for fp in file_list if fp.stat().st_size/1000 < 400] # over 200mb takes crazy long to analyze
reddit_md = directory2csv(file_list, meta_func=parse_reddit_songs, csv_path=reddit_csv)
df = pd.read_csv(reddit_csv); df.head()

Unnamed: 0,genres,midi,source,artist,md5,title
0,"anything,inferred",midi_sources/130k_reddit/Jazz_www.thejazzpage....,reddit,phasedance.mid,c175323dbdff4b676588609081bf5606,phasedance
1,"anything,inferred",midi_sources/130k_reddit/Jazz_www.thejazzpage....,reddit,IGotRhythm.MID,912b07a01ae9b81bc0d86118e3972a47,IGotRhythm
2,"anything,inferred",midi_sources/130k_reddit/Jazz_www.thejazzpage....,reddit,Cheek_To_Cheek.mid,53136c05b1dd56a9f11367f8cdda5c2e,Cheek To Cheek
3,"anything,inferred",midi_sources/130k_reddit/Jazz_www.thejazzpage....,reddit,16goingon17.mid,31ddfcdb86c20e4e67cbaa3363c88309,16goingon17
4,"anything,inferred",midi_sources/130k_reddit/Jazz_www.thejazzpage....,reddit,poinciana.mid,d15dd01250feb42f3b17251c56e6721e,poinciana


### Gather Cprato

In [47]:
cp_path, cp_csv = create_paths('cprato')
# list(cp_path.glob('*'))[:5]

In [48]:
def parse_cprato_songs(fp, genres=None, source=None):
    name = fp.with_suffix('').name.split(' - ')
    artist = name[0]
    title = name[-1].replace('(midi by Carlo Prato) (www.cprato.com)', '')
    return {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': relative_path(fp),
        'genres': genres,
        'source': source
    }

In [49]:
cp_meta = partial(parse_cprato_songs, genres='EDM,inferred', source='cprato')
file_list = get_files(cp_path, extensions=['.mid'], recurse=True)
cp_md = directory2csv(file_list, meta_func=cp_meta, csv_path=cp_csv)
df = pd.read_csv(cp_csv); df.head()

Unnamed: 0,genres,midi,source,artist,md5,title
0,"EDM,inferred",midi_sources/cprato/Basto - Again And Again (m...,cprato,Basto,44ea7e9b46e04ba6f4836f00b3cc50a3,Again And Again (midi By Carlo Prato) (www.cpr...
1,"EDM,inferred",midi_sources/cprato/The Weeknd ft. Lana Del Re...,cprato,The Weeknd ft. Lana Del Rey,d67ead892ee2c92cfbb5306bd47c9a0f,Stargirl Interlude
2,"EDM,inferred",midi_sources/cprato/Two Steps From Hell - Magi...,cprato,Two Steps From Hell,222db08d4744ab9a53ca0d9c6c6e5113,Magic of Love
3,"EDM,inferred",midi_sources/cprato/Bermuda Loverz - My Girl (...,cprato,Bermuda Loverz,2befd21ebd0f0c779f7fb436ed828ba1,My Girl (Ladidada) (Rimini Rockaz Radio Edit) ...
4,"EDM,inferred",midi_sources/cprato/Cascada - Everytime We Tou...,cprato,Cascada,b53bfa6f4ab72df165e44263d50a4cbd,Everytime We Touch (Midi By Carlo Prato) (www....


### Gather MidiWorld

In [50]:
mw_path, mw_csv = create_paths('midiworld')

In [51]:
def parse_midiworld_songs(fp):
    name = fp.with_suffix('').name.replace('_', ' ').split(' - ')
    artist = name[0]
    title = name[-1]
    return {
        'artist': artist.strip(),
        'title': title.strip(),
        'midi': relative_path(fp),
        'genres': 'pop,inferred',
        'source': 'midiworld'
    }

In [52]:
file_list = get_files(mw_path/'named_midi', extensions=['.mid'], recurse=True)
mw_md = directory2csv(file_list, meta_func=parse_midiworld_songs, csv_path=mw_csv)
df = pd.read_csv(mw_csv); df.head()

Unnamed: 0,genres,midi,source,artist,md5,title
0,"pop,inferred",midi_sources/midiworld/named_midi/The_Carpente...,midiworld,The Carpenters,6d6e23b4f0e44537f8b5309ffeaa1880,Rainy Days and Mondays
1,"pop,inferred",midi_sources/midiworld/named_midi/Joan_Jett_-_...,midiworld,Joan Jett,19efd3ac590d3aede49d2e9e62209115,I Hate Myself for Loving You
2,"pop,inferred",midi_sources/midiworld/named_midi/George_Harri...,midiworld,George Harrison,bea4eba9aa4e8154ab01108b2b808e3c,When We Was Fab
3,"pop,inferred",midi_sources/midiworld/named_midi/Video_Game_T...,midiworld,Video Game Themes,dfbd9c523e1846767746285281d5e971,Diddy Kong
4,"pop,inferred",midi_sources/midiworld/named_midi/The_Corrs_-_...,midiworld,The Corrs,2445fa5424432de2a40ece46cbbc853c,Someday


### Gather Wikifonia

In [53]:
wf_path, wf_csv = create_paths('wikifonia')

In [54]:
def parse_wikifonia_songs(fp):
    name = fp.with_suffix('').name.replace('_', ' ').split(' - ')
    artist = name[0]
    title = name[-1]
    return {
        'artist': artist.strip(),
        'title': title.strip(),
        'mxl': relative_path(fp),
        'genres': 'pop,inferred',
        'source': 'wikifonia'
    }

In [55]:
# Warning: if you get a deadlock: 
# PosixPath('data/midi/midi_sources/midiworld/named_midi/NITRO_BRO_-_IT_WONT_DIE.mid') is broken
file_list = get_files(wf_path, extensions=['.mxl'], recurse=True)
wf_md = directory2csv(file_list, meta_func=parse_wikifonia_songs, csv_path=wf_csv)
df = pd.read_csv(wf_csv); df.head()

Unnamed: 0,genres,mxl,source,artist,md5,title
0,"pop,inferred",midi_sources/wikifonia/Tommy Dorsey - Swingin'...,wikifonia,Tommy Dorsey,f3418afa104cd03604c8831123e086e4,Swingin' on Nothin'
1,"pop,inferred",midi_sources/wikifonia/Amanda McBroom - The Ro...,wikifonia,Amanda McBroom,a01bde25baf5c5a91b9f6235bf019890,The Rose
2,"pop,inferred","midi_sources/wikifonia/Hans Leo Hassler, From ...",wikifonia,"Hans Leo Hassler, From the Latin",4beb6aafed89a87a8171da906d8b5ff8,O Sacred Head Now Wounded
3,"pop,inferred","midi_sources/wikifonia/Arthur Siegel, June Car...",wikifonia,"Arthur Siegel, June Carroll",a739908a6505fcbe8c7d9bf7b1e47492,Love Is A Simple Thing
4,"pop,inferred",midi_sources/wikifonia/Unknow - KINDERLIEDJES ...,wikifonia,Unknow,cb2beddd4a86439dcf2d80b5fc6b4e4f,KINDERLIEDJES MEDLEY


### Gather Musescore

In [57]:
ms_path, ms_csv = create_paths('musescore')
ms_songs = json.load(open(ms_path/'song_map.json', 'r'))

In [59]:
def get_number(num_str):
    num_str = num_str.replace(',','').split(' ')[0]
    return int(num_str)

In [60]:
def parse_musescore_songs(fp):
    score_id = fp.with_suffix('').name
    if score_id not in ms_songs: return None
    
    meta = ms_songs[score_id]
    parts = get_number(meta['parts'])
    views = get_number(meta['views'])
    if parts > 2 or views < 150: return None
    
    return {
        'artist': meta['author'].strip(),
        'title': meta['title'].strip(),
        'mxl': relative_path(fp),
        'genres': 'classical,pop,inferred',
        'source': 'musescore'
    }

In [61]:
# Warning: if you get a deadlock: 
# PosixPath('data/midi/midi_sources/midiworld/named_midi/NITRO_BRO_-_IT_WONT_DIE.mid') is broken
file_list = get_files(ms_path, extensions=['.mxl'], recurse=True)
ms_list = directory2csv(file_list, meta_func=parse_musescore_songs, csv_path=ms_csv)
df = pd.read_csv(ms_csv); df.head()

Unnamed: 0,genres,mxl,source,artist,md5,title
0,"classical,pop,inferred",midi_sources/musescore/data/2985741.mxl,musescore,000@xn--80akgejic5ahko1h.xn--p1ai,36fb6c38927e5e025f7f8427770cf8fa,Упражнение 2
1,"classical,pop,inferred",midi_sources/musescore/data/1425126.mxl,musescore,sam027,b666753133bc72910840a97bddb1fbf9,Place de la République - Coeur de pirate
2,"classical,pop,inferred",midi_sources/musescore/data/5370824.mxl,musescore,CrazyClique,83de066be1fa555a986376170c467f99,Beyond The Trees - Original Composition
3,"classical,pop,inferred",midi_sources/musescore/data/1195001.mxl,musescore,Mjmatthews51,a85e407d2427234ee6bf5aa7ae5ce5e9,Sister Sadie
4,"classical,pop,inferred",midi_sources/musescore/data/4621586.mxl,musescore,Spencer Vanderkley,d1f72939369c106948993bba9b4fc0a9,My Top 20 Film Soundtracks Medley


### Yamaha - piano

In [62]:
ec_path, ec_csv = create_paths('ecomp')
ec_songs = json.load(open(ec_path/'song_list.json', 'r'))
# list(ec_path.glob('*'))[:5]

In [63]:
def parse_ecomp_songs(fp):
    song_info = ec_songs[fp.stem]
    return {
        'artist': song_info['artist'],
        'title': song_info['title'],
        'mxl': relative_path(fp),
        'genres': 'classical',
        'source': 'ecomp'
    }

In [64]:
file_list = get_files(ec_path, extensions=['.mxl'], recurse=True)

In [65]:
ec_md = directory2csv(file_list, meta_func=parse_ecomp_songs, csv_path=ec_csv)
df = pd.read_csv(ec_csv); df.head()

Unnamed: 0,genres,mxl,source,artist,md5,title
0,classical,midi_sources/ecomp/2017/SirajA01.mxl,ecomp,Johann Sebastian Bach,39099f086fdc79c2828c13274fcd0f25,"Prelude and Fugue in E-flat Major, WTC II, ..."
1,classical,midi_sources/ecomp/2017/LiC05.mxl,ecomp,Moritz Moszkowski,fe22a58bcff66e8b724b71c10b5465b6,"Chanson Boheme de l'Opera ""Carmen"" by Georg..."
2,classical,midi_sources/ecomp/2017/WangY05.mxl,ecomp,Nikolai Kapustin,8b38131646b8d71dc10269d3e2d608d0,Concert Etude Op. 40 No. 3
3,classical,midi_sources/ecomp/2017/SunY05.mxl,ecomp,,2637be3aec226a2fb74e86bb7a1fde81,I. Con moto agitato. Andante. Con moto agitato
4,classical,midi_sources/ecomp/2017/ZhangE06.mxl,ecomp,Giuseppe Scarlatti,c154b4269f8c8cecdb5b2972e3e2d831,"Sonata in G Major, K. 455"


### Classic Piano

In [66]:
clc_path, clc_csv = create_paths('classic_piano')
# list(clc_path.glob('*'))[:5]

In [67]:
def parse_classic_songs(fp):
    name = fp.with_suffix('').name.split('_')
    artist = name[0]
    title = ' '.join(name[1:])
    return {
        'artist': artist,
        'title': title,
        'mxl': relative_path(fp),
        'genres': 'classical',
        'source': 'classical_piano'
    }

In [68]:
file_list = get_files(clc_path, extensions=['.mxl'], recurse=True)

In [69]:
clc_md = directory2csv(file_list, meta_func=parse_classic_songs, csv_path=clc_csv)
df = pd.read_csv(clc_csv); df.head()

Unnamed: 0,genres,mxl,source,artist,md5,title
0,classical,midi_sources/classic_piano/liz_rhap15_format0.mxl,classical_piano,liz,93d730fce9a3e2fabe1a0ea3e2cf80b6,rhap15 format0
1,classical,midi_sources/classic_piano/ty_september_format...,classical_piano,ty,d45bf8f2b4b0242879305a8423a3cb99,september format0
2,classical,midi_sources/classic_piano/schumm-3_format0.mxl,classical_piano,schumm-3,2238d47b4162b55bafee79b16af46a37,format0
3,classical,midi_sources/classic_piano/chpn_op33_4_format0...,classical_piano,chpn,3d5c30ba63b886de6555a5d4911cf55f,op33 4 format0
4,classical,midi_sources/classic_piano/grieg_spring_format...,classical_piano,grieg,51758e4fb8b37d0389148d37652b1d58,spring format0


### Classical Music Archives

In [70]:
cma_path, cma_csv = create_paths('classical_archives')
# list(cma_path.glob('*'))[:5]

In [71]:
def parse_cma_songs(fp):
    name = fp.with_suffix('').name
    return {
        'artist': name,
        'title': name,
        'mxl': relative_path(fp),
        'genres': 'classical',
        'source': 'classical_archives'
    }

In [72]:
file_list = get_files(cma_path, extensions=['.mxl'], recurse=True); len(file_list)

14671

In [73]:
cma_md = directory2csv(file_list, meta_func=parse_cma_songs, csv_path=cma_csv)
df = pd.read_csv(cma_csv); df.head()

Unnamed: 0,genres,mxl,source,artist,md5,title
0,classical,midi_sources/classical_archives/021/jsrjeuxd.mxl,classical_archives,jsrjeuxd,d14444f06de8a7ad6bec95c98afa566c,jsrjeuxd
1,classical,midi_sources/classical_archives/021/men26.mxl,classical_archives,men26,42c8738df5fa98fa5b715d058890e376,men26
2,classical,midi_sources/classical_archives/021/szecheny.mxl,classical_archives,szecheny,0af7005d072bc22ea2f188569adcfa9d,szecheny
3,classical,midi_sources/classical_archives/021/acocored.mxl,classical_archives,acocored,e8842eb5b97285673075043aaab09ca8,acocored
4,classical,midi_sources/classical_archives/021/op73_2_3.mxl,classical_archives,op73_2_3,be5fea22e3d87257123c98090d9f4c6c,op73_2_3


### Creating CSV

In [25]:
combined_csvs = [create_paths(s)[-1] for s in sources if s != 'hooktheory_c']
dfs = [pd.read_csv(csv) for csv in combined_csvs]

In [26]:
combined_df = pd.concat(dfs, sort=False)
combined_df = combined_df.reset_index(drop=True); combined_df.head()

Unnamed: 0,title,midi,source,parts,ht_time_signature,midi_title,ht_mode,md5,song_url,genres,ht_bpm,ht_key,artist,section,mxl
0,yu-gi-oh-theme-song,midi_sources/hooktheory/pianoroll/w/wayne-shar...,hooktheory,"intro,chorus",4.0,yu-gi-oh3,1.0,bf1f29e5ff84e3e93e37fb873bfb590e,https://www.hooktheory.com/theorytab/view/wayn...,,128.0,C,wayne-sharpe,chorus,
1,yu-gi-oh-theme-song,midi_sources/hooktheory/pianoroll/w/wayne-shar...,hooktheory,"intro,chorus",3.0,yu-gi-oh,1.0,055f80ad67f64edb14a85ca8fbfe8c29,https://www.hooktheory.com/theorytab/view/wayn...,,85.0,C,wayne-sharpe,intro,
2,kiefer,midi_sources/hooktheory/pianoroll/w/what-a-day...,hooktheory,chorus,4.0,kiefer,6.0,197f96f5d181f6ce1e2c5ab04ac1ff87,https://www.hooktheory.com/theorytab/view/what...,Jazz,96.0,D,what-a-day,chorus,
3,senbonzakura,midi_sources/hooktheory/pianoroll/w/whiteflame...,hooktheory,"verse,pre-chorus,chorus",4.0,senbonzakura - pre-Pre-Chorus,6.0,9e7ce13a35f1314423a9a6d5a5287a4a,https://www.hooktheory.com/theorytab/view/whit...,"J-Pop,Pop",152.0,D,whiteflame,pre-chorus,
4,senbonzakura,midi_sources/hooktheory/pianoroll/w/whiteflame...,hooktheory,"verse,pre-chorus,chorus",4.0,Senbonzakura,6.0,d5aaf79d0989222f1362f9f46c540a27,https://www.hooktheory.com/theorytab/view/whit...,"J-Pop,Pop",152.0,D,whiteflame,verse,


In [27]:
deduped_df = combined_df.drop_duplicates(subset=['md5'], keep='first') # 

Midiworld - 90% duplicates with rest  
Freemidi - 50% duplicates with rest  
LMD - 70% duplicates with rest

In [28]:
# from collections import Counter
# # No dedups
# print(Counter(merged_df.source.values))
# # replacing reddit
# print(Counter(merged_df.drop_duplicates(subset=['md5'], keep='first').source.values))
# # reddit replace else
# print(Counter(merged_df.drop_duplicates(subset=['md5'], keep='last').source.values))
# # Midiworld - 90% duplicates with rest, 
# # Freemidi - 50% duplicates with rest,
# # LMD - 70% duplicates with rest

In [29]:
from collections import Counter
Counter(deduped_df.source.values)

Counter({'hooktheory': 20544,
         'freemidi': 5168,
         'midiworld': 4109,
         'ecomp': 2735,
         'cprato': 312,
         'classical_piano': 329,
         'classical_archives': 14671,
         'musescore': 11502,
         'wikifonia': 6391,
         'lmd': 13568,
         'reddit': 98683})

In [30]:
# combined df does not contain hooktheory_c files
[df.shape for df in dfs], combined_df.shape, deduped_df.shape

([(20745, 14),
  (5784, 6),
  (4711, 6),
  (2735, 6),
  (314, 6),
  (329, 6),
  (14671, 6),
  (11504, 6),
  (6391, 6),
  (17243, 6),
  (128419, 6)],
 (212846, 15),
 (178012, 15))

In [31]:
hooktheory_c_csv = pd.read_csv(create_paths('hooktheory_c')[-1])
out_df = pd.concat([deduped_df, hooktheory_c_csv], sort=False); out_df.shape

(198757, 15)

In [32]:
out_df.to_csv(combined_csv, index=False)

### Convert MXL to Midi

Makes it easier for us to process in part 2

In [34]:
df = pd.read_csv(combined_csv); df.head()
all_records = df.to_dict(orient='records'); len(all_records)

  interactivity=interactivity, compiler=compiler, result=result)


198757

In [35]:
def mxl2midi_func(metadata):
    result = metadata.copy()
    if not isinstance(result.get('mxl'), str): return result

    input_path = version_path/metadata['mxl']
    out_file = Path(str(metadata['mxl']).replace('midi_sources/', 'midi_sources/from_mxl/')).with_suffix('.mid')
    output_path = version_path/out_file
    
    if not output_path.exists():
        try:
            output_path.parent.mkdir(parents=True, exist_ok=True)
            stream = file2stream(input_path)
            stream.write('midi', fp=output_path)
            print('Encoded:', output_path)
        except Exception:
#             print(traceback.format_exc())
            return result
        
    result['midi'] = out_file
    return result

In [36]:
def timeout_func(data, seconds):
    print("Timeout:", seconds, data.get('mxl'))

In [37]:
processed = process_all(mxl2midi_func, all_records, timeout=600, timeout_func=timeout_func)

In [38]:
# converted = [(p['midi'], p['source']) for p in processed if isinstance(p.get('mxl'), str)]

# converted

In [39]:
arr2csv(processed, all_csv)
df = pd.read_csv(all_csv); df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,title,midi,source,parts,mxl,ht_time_signature,midi_title,ht_mode,md5,genres,ht_bpm,ht_key,artist,song_url,section
0,yu-gi-oh-theme-song,midi_sources/hooktheory/pianoroll/w/wayne-shar...,hooktheory,"intro,chorus",,4.0,yu-gi-oh3,1.0,bf1f29e5ff84e3e93e37fb873bfb590e,,128.0,C,wayne-sharpe,https://www.hooktheory.com/theorytab/view/wayn...,chorus
1,yu-gi-oh-theme-song,midi_sources/hooktheory/pianoroll/w/wayne-shar...,hooktheory,"intro,chorus",,3.0,yu-gi-oh,1.0,055f80ad67f64edb14a85ca8fbfe8c29,,85.0,C,wayne-sharpe,https://www.hooktheory.com/theorytab/view/wayn...,intro
2,kiefer,midi_sources/hooktheory/pianoroll/w/what-a-day...,hooktheory,chorus,,4.0,kiefer,6.0,197f96f5d181f6ce1e2c5ab04ac1ff87,Jazz,96.0,D,what-a-day,https://www.hooktheory.com/theorytab/view/what...,chorus
3,senbonzakura,midi_sources/hooktheory/pianoroll/w/whiteflame...,hooktheory,"verse,pre-chorus,chorus",,4.0,senbonzakura - pre-Pre-Chorus,6.0,9e7ce13a35f1314423a9a6d5a5287a4a,"J-Pop,Pop",152.0,D,whiteflame,https://www.hooktheory.com/theorytab/view/whit...,pre-chorus
4,senbonzakura,midi_sources/hooktheory/pianoroll/w/whiteflame...,hooktheory,"verse,pre-chorus,chorus",,4.0,Senbonzakura,6.0,d5aaf79d0989222f1362f9f46c540a27,"J-Pop,Pop",152.0,D,whiteflame,https://www.hooktheory.com/theorytab/view/whit...,verse


In [40]:
df.shape

(198757, 15)