# Create a tokenizer

In [33]:
from miditok import REMI, TokenizerConfig  # here we choose to use REMI

# Our parameters
TOKENIZER_PARAMS = {
    "pitch_range": (21, 109),
    "beat_res": {(0, 4): 8, (4, 12): 4},
    "nb_velocities": 32,
    "special_tokens": ["PAD", "BOS", "EOS", "MASK"],
    "use_chords": True,
    "use_rests": False,
    "use_tempos": True,
    "use_time_signatures": False,
    "use_programs": False,
    "nb_tempos": 32,  # nb of tempo bins
    "tempo_range": (40, 250),  # (min, max)
}
config = TokenizerConfig(**TOKENIZER_PARAMS)

# Creates the tokenizer
tokenizer = REMI(config)

In [44]:
tokens_bpe = tokenizer.load_tokens(
    "Maestro_noBPE/MIDI-Unprocessed_XP_20_R2_2004_01_ORIG_MID--AUDIO_20_R1_2004_01_Track01_wav.json")
tokens_bpe_list = list(tokens_bpe.values())

tokens = tokenizer.decode_bpe(tokens_bpe_list)
midi = tokenizer.tokens_to_midi(tokens)


TypeError: 'NoneType' object is not iterable

# MIDI - Tokens conversion

In [32]:
from miditoolkit import MidiFile

midi_path = "we_wish_you.mid"
# Tokenize a MIDI file
midi = MidiFile("we_wish_you.mid")
tokens = tokenizer(midi)  # automatically detects MidiFile, paths
tokenizer.learn_bpe(10000, [str(tokens)])
# MidiTok can handle PyTorch / Tensorflow Tensors
generated_midi = tokenizer(tokens)
# could have been done above by giving the path argument
generated_midi.dump('we_wish_you_generated.mid')

KeyError: ' '

# Convert MIDI files to tokens, and load them for training

In [37]:
from miditok import REMI
from pathlib import Path

midi_paths = list(Path('Maestro').glob('**/*.mid')) + \
    list(Path('Maestro').glob('**/*.midi'))

# A validation method to discard MIDIs we do not want
# It can also be used for custom pre-processing, for instance if you want to merge
# some tracks before tokenizing a MIDI file


def midi_valid(midi) -> bool:
    if any(ts.numerator != 4 for ts in midi.time_signature_changes):
        return False  # time signature different from 4/*, 4 beats per bar
    if midi.max_tick < 10 * midi.ticks_per_beat:
        return False  # this MIDI is too short
    return True


tokenizer.tokenize_midi_dataset(midi_paths,
                                'Maestro_noBPE', [0, 0.5, 1, 1.5])

Tokenizing MIDIs (Maestro_noBPE):   0%|          | 0/132 [00:00<?, ?it/s]

Tokenizing MIDIs (Maestro_noBPE): 100%|██████████| 132/132 [01:44<00:00,  1.26it/s]


## Learn and apply BPE to data we just tokenized


In [40]:
tokens_path = Path("Maestro_noBPE")
tokens_bpe_path = Path("Maestro_BPE")
tokenizer.learn_bpe(
    vocab_size=500,
    tokens_paths=list(tokens_path.glob("**/*.json")),
    start_from_empty_voc=False,
)
tokenizer.save_params("tokenizer_bpe.conf")
tokenizer.apply_bpe_to_dataset(tokens_path, tokens_bpe_path)

Loading token files: 100%|██████████| 132/132 [00:00<00:00, 180.21it/s]
Applying BPE to dataset: 100%|██████████| 132/132 [00:21<00:00,  6.24it/s]


# Decoding the BPE data to the original midi file