Please use the current development version and report any bugs you encounter:
`pip install git+https://github.com/johentsch/ms3.git@corpus_structure`

In [1]:
import os, shutil, subprocess
import ms3
import ray

DEBUG:ray:[ray] Forcing OMP_NUM_THREADS=1 to avoid performance degradation with many workers (issue #6998). You can override this by explicitly setting OMP_NUM_THREADS.


In [4]:
### Arch-Dependant parameters to check

DATA_FOLDER = os.path.abspath('./mscz')
# DATA_FOLDER = os.path.abspath('/scratch/data/musescore.com/') # on the HPC: /scratch/data/musescore.com/
MUSESCORE_CMD = ms3.get_musescore('auto')
# MUSESCORE_CMD = "/usr/local/bin/AppImg???"
# MUSESCORE_CMD = "/home/erwan/.local/bin/MuseScore-3.6.2.548021370-x86_64.AppImage"

In [5]:
NB_THREADS=32
MSCZ_FILENAMES=os.listdir(DATA_FOLDER)

CONVERSION_FOLDER = os.path.abspath('./mscx')
OUTPUT_PATHS = dict(
    events = os.path.abspath('./events'),
    notes = os.path.abspath('./notes'),
    measures = os.path.abspath('./measures'),
    labels = os.path.abspath('./labels'),
    metadata = os.path.abspath('./metadata'),
)

Using the MuseScore 3 binary to convert the file format to the current version of MuseScore 3 (needs to be installed or available as AppImage). [MuseScore commandline options](https://musescore.org/en/handbook/3/command-line-options)

ToDos:

* parallelize the conversion (anyone interested in learning [ray](https://www.ray.io/)?)
* avoid processing files more than once, making use of the fact that the file names correspond to IDs
* add proper error handling, keeping track of a mapping ID -> `errors message` for files that cannot successfully be converted (checkout stdout and stderr arguments of [subprocess.run()](https://docs.python.org/3/library/subprocess.html#subprocess.run))

In [7]:
ray.init(ignore_reinit_error=True)

2022-11-30 10:33:25,818	INFO worker.py:1528 -- Started a local Ray instance.


0,1
Python version:,3.10.8
Ray version:,2.1.0


In [40]:
@ray.remote
def process_chunk(filenames):
    for i, filename in enumerate(filenames):
        ID, file_extension = os.path.splitext(filename)
        converted_file_path = os.path.join(CONVERSION_FOLDER, ID + '.mscx')
        file_path = os.path.join(SAMPLE_FOLDER, filename)
        print(f"Converting {file_path} to {converted_file_path}...", end=' ')
        
        result = subprocess.run([musescore_cmd,"--score-meta", "-o", converted_file_path, file_path], capture_output=True, text=True)
        print(f"Exit code: {result.returncode}")
        print(f"Result: {result.stdout.strip()}") # the extraction of metadata as JSON does not work on Windows; please store the JSON to the metadata output folder
        print(f"Errors: {result.stderr.strip()}")

In [41]:
threads=32
files=os.listdir(SAMPLE_FOLDER)
n=len(files)
futures = [process_chunk.remote(files[i*(n//threads):min((i+1)*(n//threads),n)]) for i in range (threads)]

TypeError: missing a required argument: 'end_idx'

In [14]:
ray.get(futures)

[2m[36m(process_chunk pid=8632)[0m Exit code: -6
[2m[36m(process_chunk pid=8632)[0m Result: 
[2m[36m(process_chunk pid=8632)[0m Errors: 
[2m[36m(process_chunk pid=8632)[0m Converting /home/nathan/Documents/Cours/EPFL_ML_CS433/ComposersClassifier/ml_project/mscz/102077.mscz to /home/nathan/Documents/Cours/EPFL_ML_CS433/ComposersClassifier/ml_project/mscx/102077.mscx... 
[2m[36m(process_chunk pid=8634)[0m Exit code: -6
[2m[36m(process_chunk pid=8634)[0m Result: 
[2m[36m(process_chunk pid=8634)[0m Errors: 
[2m[36m(process_chunk pid=8634)[0m Converting /home/nathan/Documents/Cours/EPFL_ML_CS433/ComposersClassifier/ml_project/mscz/1024771.mscz to /home/nathan/Documents/Cours/EPFL_ML_CS433/ComposersClassifier/ml_project/mscx/1024771.mscx... 
[2m[36m(process_chunk pid=8635)[0m Exit code: -6
[2m[36m(process_chunk pid=8635)[0m Result: 
[2m[36m(process_chunk pid=8635)[0m Errors: 
[2m[36m(process_chunk pid=8635)[0m Converting /home/nathan/Documents/Cours/EPFL_M

[2m[36m(process_chunk pid=8633)[0m Exit code: -6
[2m[36m(process_chunk pid=8633)[0m Result: 
[2m[36m(process_chunk pid=8633)[0m Errors: 


In [11]:
# for i, entry in enumerate(os.scandir(SAMPLE_FOLDER)):
#     if i == 10:
#         break
#     ID, file_extension = os.path.splitext(entry.name)
#     converted_file_path = os.path.join(CONVERSION_FOLDER, ID + '.mscx')
#     print(f"Converting {entry.path} to {converted_file_path}...", end=' ')
    
#     result = subprocess.run([musescore_cmd,"--score-meta", "-o", converted_file_path, entry.path], capture_output=True, text=True)
#     print(f"Exit code: {result.returncode}")
#     print(f"Result: {result.stdout.strip()}") # the extraction of metadata as JSON does not work on Windows; please store the JSON to the metadata output folder
#     print(f"Errors: {result.stderr.strip()}")

Converting /home/erwan/epfl/s1/ml/ComposersClassifier/data/mscz/100462.mscz to /home/erwan/epfl/s1/ml/ComposersClassifier/data/mscx/100462.mscx... Exit code: 0
Result: {
"metadata": {"composer":"","duration":16,"fileVersion":114,"hasHarmonies":"false","hasLyrics":"false","keysig":0,"lyrics":"","measures":8,"mscoreVersion":"1.3","pageFormat":{"height":297,"twosided":"true","width":210},"pages":1,"parts":[{"harmonyCount":0,"hasDrumStaff":"false","hasPitchedStaff":"true","hasTabStaff":"false","instrumentId":"keyboard.piano","isVisible":"true","lyricCount":0,"name":"<font face=\"Times New Roman\"/>Piano","program":0}],"poet":"","previousSource":"","subtitle":"","tempo":0,"tempoText":"","textFramesData":{"composers":[],"poets":[],"subtitles":[],"titles":["Erin Taylor Nicole"]},"timesig":"4/4","title":"Erin Taylor Nicole"}
}
Errors: /lib/x86_64-linux-gnu/libjack.so.0
/lib/x86_64-linux-gnu/libnss3.so
Creating main window…
Reading translations…
Converting /home/erwan/epfl/s1/ml/ComposersClassi

Using the ms3 parsing library to extract score information:

In [5]:
for entry in os.scandir(CONVERSION_FOLDER):
    if entry.is_dir():
        continue
    parsed = ms3.Score(entry.path, read_only=True)
    ID, _ = os.path.splitext(entry.name)
    tsv_name = f"{ID}.tsv"
    dataframes = dict(
        events = parsed.mscx.events(),
        notes = parsed.mscx.notes(),
        measures = parsed.mscx.measures(),
        labels = parsed.mscx.labels(),
    )
    for facet, df in dataframes.items():
        if df is None:
            continue
        tsv_path = os.path.join(OUTPUT_PATHS[facet], tsv_name)
        df.to_csv(tsv_path, sep='\t', index=False)
    metadata = parsed.mscx.metadata # please add this nested dictionary to the JSON stored in the previous step
    metadata['id'] = ID

	MC 15, the 1st measure of a 2nd volta, should have MN 14, not MN 15.
