### Music Sheet Data Extraction

##### This notebook extracts the data files needed to run our sheet music classification model. We plan to use previously labeled data from the OMR datasets (via `omrdatasettools`), together with scraped digital and handwritten data that we label ourselves.

#### We begin by downloading and extracting the Audiveris OMR dataset with `omrdatasettools`, using code adapted from Pacha's Music Symbol Classifier, on which our model is based.

In [3]:
#Install the library for omr datasets
pip install omrdatasettools

Collecting omrdatasettools
  Downloading omrdatasettools-1.4.0.tar.gz (41 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting muscima (from omrdatasettools)
  Downloading muscima-0.10.0.tar.gz (105 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting mung (from omrdatasettools)
  Downloading mung-1.2.1.tar.gz (117 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting tqdm (from omrdatasettools)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting scikit-image (from omrdatasettools)
  Downloading scikit_image-0.24.0-cp312-cp312-macosx_10_9_x86_64.whl.metadata (14 kB)
Collecting imageio>=2.33 (from scikit-image->omrdatasettools)
  Downloading imageio-2.36.0-py3-none-any.whl.metadata (5.2 kB)
Collecting tifffile>=2022.8.12 (from scikit-image->omrdatasettools)
  Downloading tifffile-2024.9.20-py3-none-any.whl.metadata (32 kB)
Collecting lazy-loader>=0.4 (from scikit-image->omrdatasettools)
  Downloading lazy_loader-0.4-py3-none-any.whl.met

In [23]:
import argparse
import json
import os
import shutil
from distutils import dir_util

from omrdatasettools.AudiverisOmrImageGenerator import AudiverisOmrImageGenerator
from omrdatasettools.Downloader import Downloader
from omrdatasettools.OmrDataset import OmrDataset

# Work from the "audiveris" sub-directory: the extractor class below reads its
# JSON configuration from the current working directory, and the default data
# paths are given relative to it.
# The guard keeps this cell idempotent: re-running it in the same kernel must not
# descend into a nested "audiveris/audiveris" directory.
if os.path.basename(os.getcwd()) != "audiveris":
    os.makedirs("audiveris", exist_ok=True)
    os.chdir("audiveris")

class AudiverisOmrImageExtractor():
    """Copies extracted symbol images into the final dataset layout.

    Symbol classes listed in ``AudiverisOmrIgnoredClasses.json`` are dropped and
    the remaining ones are renamed according to
    ``AudiverisOmrClassNameMapping.json``. Both files are looked up in the
    directory stored in ``path_of_this_file``.
    """

    def __init__(self) -> None:
        # NOTE: despite its name, this is the working directory at construction
        # time (there is no ``__file__`` to rely on inside a notebook).
        self.path_of_this_file = os.getcwd()
        print("self.path_of_this_file",self.path_of_this_file)

    def prepare_dataset(self, intermediate_image_directory, image_dataset_directory):
        """Filter, rename and copy the per-class image folders.

        Args:
            intermediate_image_directory: Directory that contains one
                sub-directory per source symbol class.
            image_dataset_directory: Directory that receives one sub-directory
                per destination class name; created if it does not exist.

        Raises:
            FileNotFoundError: If one of the two JSON configuration files or
                ``intermediate_image_directory`` is missing.
            KeyError: If a class directory is neither ignored nor present in
                the class-name mapping.
        """
        with open(os.path.join(self.path_of_this_file, "AudiverisOmrIgnoredClasses.json")) as file:
            ignored_classes = json.load(file)
        with open(os.path.join(self.path_of_this_file, "AudiverisOmrClassNameMapping.json")) as file:
            class_name_mapping = json.load(file)

        for symbol_class in os.listdir(intermediate_image_directory):
            source_folder = os.path.join(intermediate_image_directory, symbol_class)

            # os.listdir also returns plain files (e.g. ".DS_Store" on macOS);
            # only directories represent symbol classes.
            if not os.path.isdir(source_folder):
                continue

            if symbol_class in ignored_classes:
                continue

            if symbol_class not in class_name_mapping:
                raise KeyError(
                    "Symbol class '{0}' is neither ignored nor mapped to a destination class name".format(
                        symbol_class))

            destination_folder = os.path.join(image_dataset_directory, class_name_mapping[symbol_class])

            # copytree creates the destination (including parents) and, with
            # dirs_exist_ok=True, merges into a folder that already exists -
            # which happens on a re-run or when two source classes share a
            # destination name. It replaces distutils' copy_tree, which is no
            # longer part of the standard library as of Python 3.12.
            shutil.copytree(source_folder, destination_folder, dirs_exist_ok=True)


if __name__ == "__main__":
    # Command-line style configuration; all paths are interpreted relative to the
    # current working directory.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--raw_dataset_directory",
        type=str,
        default="../data/audiveris_omr_raw",
        help="The directory, where the raw Audiveris OMR dataset will be downloaded and extracted to")
    parser.add_argument(
        "--intermediate_image_directory",
        type=str,
        default="../data/audiveris_omr",
        help="The directory, where the raw bitmaps will be generated")
    parser.add_argument(
        "--image_dataset_directory",
        type=str,
        default="../data/images",
        help="The directory, where the processed bitmaps will be copied to after filtering and renaming classes")

    # parse_known_args (instead of parse_args) tolerates unrelated entries in
    # sys.argv, e.g. the arguments a Jupyter kernel is launched with.
    flags, _unparsed = parser.parse_known_args()

    # Step 1: download and unpack the raw dataset
    dataset_downloader = Downloader()
    dataset_downloader.download_and_extract_dataset(OmrDataset.Audiveris, flags.raw_dataset_directory)

    # Step 2: convert the raw data into images
    image_generator = AudiverisOmrImageGenerator()
    image_generator.extract_symbols(flags.raw_dataset_directory, flags.intermediate_image_directory)

    # Step 3: filter and rename the classes to build the final dataset
    dataset_preparer = AudiverisOmrImageExtractor()
    dataset_preparer.prepare_dataset(flags.intermediate_image_directory, flags.image_dataset_directory)

Extracting AudiverisOmrDataset.zip dataset...
Extracting Symbols from Audiveris OMR Dataset...
self.path_of_this_file /Users/kimiyashahamat/Desktop/DL_CV/Final Project/Sheet-Music-Parser/data/audiveris
