In [9]:
from copy import deepcopy
from pathlib import Path
from random import Random, shuffle

from torch import Tensor, argmax
from torch.utils.data import DataLoader
from torch.cuda import is_available as cuda_available, is_bf16_supported
from torch.backends.mps import is_available as mps_available
from transformers import AutoModelForCausalLM, MistralConfig, Trainer, TrainingArguments, GenerationConfig, MambaConfig, MambaForCausalLM
from transformers.trainer_utils import set_seed
from evaluate import load as load_metric
from miditok import REMI, TokenizerConfig
from miditok.pytorch_data import DatasetTok, DataCollator
from tqdm import tqdm

In [10]:
# Tokenizer settings, gathered as named constants so they are easy to tune.
PITCH_RANGE = (21, 109)
BEAT_RES = {(0, 1): 8, (1, 2): 4, (2, 4): 2, (4, 8): 1}
NUM_VELOCITIES = 24
SPECIAL_TOKENS = ["PAD", "MASK", "BOS", "EOS"]

# Optional token families that can be switched on or off.
USE_CHORDS = False
USE_RESTS = False
USE_TEMPOS = True
USE_TIME_SIGNATURE = False
USE_PROGRAMS = False

# Tempo settings (tempo tokens are enabled via USE_TEMPOS).
NUM_TEMPOS = 32
TEMPO_RANGE = (50, 200)  # (min_tempo, max_tempo)

# Keyword arguments handed to TokenizerConfig, in the same order as before.
TOKENIZER_PARAMS = dict(
    pitch_range=PITCH_RANGE,
    beat_res=BEAT_RES,
    num_velocities=NUM_VELOCITIES,
    special_tokens=SPECIAL_TOKENS,
    use_chords=USE_CHORDS,
    use_rests=USE_RESTS,
    use_tempos=USE_TEMPOS,
    use_time_signatures=USE_TIME_SIGNATURE,
    use_programs=USE_PROGRAMS,
    num_tempos=NUM_TEMPOS,
    tempo_range=TEMPO_RANGE,
)
config = TokenizerConfig(**TOKENIZER_PARAMS)

# The tokenizer is restored from a previously saved parameter file, so `config`
# is not what builds it here. Building it from scratch would instead be:
# tokenizer = REMI(config)
tokenizer = REMI(params='./tokenizer.json')

# Collect every MIDI file of the dataset. `glob` yields files in a
# filesystem-dependent order, so the list is sorted to keep everything derived
# from it reproducible across machines and kernel restarts.
DATA_DIR = Path('../data/maestro-v3.0.0')
midi_paths = sorted(list(DATA_DIR.glob('**/*.mid')) + list(DATA_DIR.glob('**/*.midi')))

# Trains the tokenizer with Byte Pair Encoding (BPE) to build the vocabulary, here 10k tokens
# (left disabled: the tokenizer is loaded from its saved parameter file instead)
# tokenizer.learn_bpe(
#     vocab_size=10000,
#     files_paths=midi_paths,
#     start_from_empty_voc=False,
# )
# tokenizer.save_params("tokenizer.json")


def split_midi_paths(paths, valid_ratio=0.2, test_ratio=0.1, seed=444):
    """
    Deterministically split file paths into validation, test and training sets.

    The paths are sorted and then shuffled with a private, seeded random
    generator, so the result depends neither on the input order nor on the
    state of the global ``random`` module.

    :param paths: iterable of sortable file paths (``Path`` objects or strings).
    :param valid_ratio: fraction of the files assigned to the validation set.
    :param test_ratio: fraction of the files assigned to the test set.
    :param seed: seed of the private random generator used for shuffling.
    :return: a ``(valid, test, train)`` tuple of lists; the three lists are
        disjoint and together contain every input path exactly once.
    """
    shuffled = sorted(paths)
    Random(seed).shuffle(shuffled)
    num_valid = round(len(shuffled) * valid_ratio)
    num_test = round(len(shuffled) * test_ratio)
    held_out = num_valid + num_test
    return shuffled[:num_valid], shuffled[num_valid:held_out], shuffled[held_out:]


# Split MIDI paths in train/valid/test sets
total_num_files = len(midi_paths)
midi_paths_valid, midi_paths_test, midi_paths_train = split_midi_paths(midi_paths)
num_files_valid = len(midi_paths_valid)
num_files_test = len(midi_paths_test)

# Tokenize the three splits. Every dataset shares the same tokenizer and the
# same sequence-length bounds (256 to 1024).
kwargs_dataset = {"min_seq_len": 256, "max_seq_len": 1024, "tokenizer": tokenizer}
dataset_train = DatasetTok(midi_paths_train, **kwargs_dataset)
dataset_valid = DatasetTok(midi_paths_valid, **kwargs_dataset)
dataset_test = DatasetTok(midi_paths_test, **kwargs_dataset)

# Data collator built from the PAD/BOS/EOS token ids. `copy_inputs_as_labels=True`
# matches the collator created for the Trainer in the training cell, so batches
# from this collator carry labels as well and can be used for causal-LM training.
collator = DataCollator(
    tokenizer["PAD_None"],
    tokenizer["BOS_None"],
    tokenizer["EOS_None"],
    copy_inputs_as_labels=True,
)


Loading data: ../data/maestro-v3.0.0/2008: 100%|██████████| 893/893 [00:45<00:00, 19.81it/s]
Loading data: ../data/maestro-v3.0.0/2018: 100%|██████████| 255/255 [00:13<00:00, 19.58it/s]
Loading data: ../data/maestro-v3.0.0/2015: 100%|██████████| 128/128 [00:05<00:00, 21.46it/s]


In [29]:
# Ids of the special tokens, looked up in the tokenizer vocabulary.
special_token_ids = {
    "pad_token_id": tokenizer['PAD_None'],
    "bos_token_id": tokenizer['BOS_None'],
    "eos_token_id": tokenizer['EOS_None'],
}

# Small Mamba configuration whose vocabulary size matches the tokenizer.
# NOTE(review): `max_position_embeddings` may only be stored as an extra config
# attribute for this architecture; confirm whether it has any effect.
model_config = MambaConfig(
    vocab_size=len(tokenizer),
    hidden_size=192,
    state_size=8,
    max_position_embeddings=8192,
    num_hidden_layers=16,
    **special_token_ids,
)

# Fresh, randomly initialised model (nothing is loaded from a checkpoint here).
model = MambaForCausalLM(model_config)

Generate config GenerationConfig {
  "bos_token_id": 2,
  "eos_token_id": 3,
  "pad_token_id": 0
}



In [32]:
def preprocess_logits(logits: Tensor, _: Tensor) -> Tensor:
    """
    Preprocess the logits before accumulating them during evaluation.

    Only the predicted token ids (argmax over the last dimension) are kept,
    which significantly reduces memory usage during evaluation and keeps the
    training tractable.

    :param logits: the model logits, or a tuple/list of model outputs whose
        first element is the logits (the Trainer passes a tuple when the model
        returns more than the logits, e.g. a cache or hidden states).
    :param _: the labels; unused, but required by the signature of the
        Trainer's ``preprocess_logits_for_metrics`` hook.
    :return: tensor of predicted token ids (long dtype), i.e. the input logits
        with the last dimension reduced by argmax.
    """
    if isinstance(logits, (tuple, list)):
        # Extra model outputs are irrelevant for token-level predictions.
        logits = logits[0]
    pred_ids = argmax(logits, dim=-1)  # long dtype
    return pred_ids

# Device and mixed-precision flags used by the Trainer configuration.
# Half precision is only enabled on CUDA: bf16 when the GPU supports it,
# fp16 otherwise. Without CUDA all four precision flags are False.
USE_CUDA = cuda_available()
BF16 = BF16_EVAL = USE_CUDA and is_bf16_supported()
FP16 = FP16_EVAL = USE_CUDA and not BF16
# MPS is only considered when CUDA is not available.
USE_MPS = not USE_CUDA and mps_available()
# Hyper-parameters and runtime options of the training run.
# The leading arguments are passed positionally; the trailing comments give the
# TrainingArguments field each one is expected to fill (see the library docs).
training_config = TrainingArguments(
    "runs",   # output_dir
    False,    # overwrite_output_dir
    True,     # do_train
    False,    # do_eval
    False,    # do_predict
    "steps",  # evaluation strategy
    per_device_train_batch_size=16,
    per_device_eval_batch_size=48,
    gradient_accumulation_steps=3,
    eval_accumulation_steps=None,
    # NOTE(review): eval_steps (1000) is larger than max_steps (300), so a
    # step-based evaluation presumably never fires during this run - confirm.
    eval_steps=1000,
    learning_rate=1e-4,
    weight_decay=0.01,
    max_grad_norm=3.0,
    max_steps=300,
    lr_scheduler_type="cosine_with_restarts",
    warmup_ratio=0.3,
    log_level="debug",
    logging_strategy="steps",
    logging_steps=20,
    save_strategy="steps",
    save_steps=250,
    save_total_limit=5,
    # Only fall back to the CPU when no accelerator is available. The previous
    # `no_cuda=not USE_CUDA` also forced the CPU on machines where MPS is
    # available, leaving USE_MPS unused.
    use_cpu=not (USE_CUDA or USE_MPS),
    seed=444,
    fp16=FP16,
    fp16_full_eval=FP16_EVAL,
    bf16=BF16,
    bf16_full_eval=BF16_EVAL,
    # load_best_model_at_end=True,
    label_smoothing_factor=0.,
    optim="adamw_torch",
    report_to=["tensorboard"],
    gradient_checkpointing=True,
)

# Collator for training: besides padding with the PAD/BOS/EOS ids, it copies
# the inputs as labels so the Trainer can compute the language-modelling loss.
collator = DataCollator(
    tokenizer["PAD_None"],
    tokenizer["BOS_None"],
    tokenizer["EOS_None"],
    copy_inputs_as_labels=True,
)

# No extra callbacks are registered (the Trainer's `callbacks` default is None).
trainer = Trainer(
    model=model,
    args=training_config,
    data_collator=collator,
    train_dataset=dataset_train,
    eval_dataset=dataset_valid,
    preprocess_logits_for_metrics=preprocess_logits,
)

# Run the training, then persist the model, the metrics and the trainer state.
train_result = trainer.train()
# Writes the model weights and configs to the output directory. No tokenizer is
# handed to the Trainer, so the MidiTok tokenizer is not saved by this call.
trainer.save_model()
train_metrics = train_result.metrics
trainer.log_metrics("train", train_metrics)
trainer.save_metrics("train", train_metrics)
trainer.save_state()

PyTorch: setting up devices
max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend
Currently training with a batch size of: 16
***** Running training *****
  Num examples = 7,677
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 48
  Gradient Accumulation steps = 3
  Total optimization steps = 300
  Number of trainable parameters = 5,800,128


  0%|          | 0/300 [00:00<?, ?it/s]



{'loss': 10.7185, 'grad_norm': 1.0116009712219238, 'learning_rate': 2.2222222222222223e-05, 'epoch': 0.12}
{'loss': 10.3971, 'grad_norm': 0.9841909408569336, 'learning_rate': 4.4444444444444447e-05, 'epoch': 0.25}
{'loss': 9.7775, 'grad_norm': 0.8079961538314819, 'learning_rate': 6.666666666666667e-05, 'epoch': 0.38}
{'loss': 9.1375, 'grad_norm': 0.5027562975883484, 'learning_rate': 8.888888888888889e-05, 'epoch': 0.5}
{'loss': 8.7133, 'grad_norm': 0.39267244935035706, 'learning_rate': 9.944154131125642e-05, 'epoch': 0.62}
{'loss': 8.3953, 'grad_norm': 0.33455485105514526, 'learning_rate': 9.504844339512095e-05, 'epoch': 0.75}
{'loss': 8.1963, 'grad_norm': 0.29263848066329956, 'learning_rate': 8.665259359149132e-05, 'epoch': 0.88}
{'loss': 8.0577, 'grad_norm': 0.25291773676872253, 'learning_rate': 7.500000000000001e-05, 'epoch': 1.0}
{'loss': 7.9943, 'grad_norm': 0.20593200623989105, 'learning_rate': 6.112604669781572e-05, 'epoch': 1.12}
{'loss': 7.937, 'grad_norm': 0.19787374138832092

Saving model checkpoint to runs/checkpoint-250
Configuration saved in runs/checkpoint-250/config.json
Configuration saved in runs/checkpoint-250/generation_config.json
Model weights saved in runs/checkpoint-250/model.safetensors


{'loss': 7.877, 'grad_norm': 0.1827237755060196, 'learning_rate': 8.688061284200266e-06, 'epoch': 1.62}
{'loss': 7.8688, 'grad_norm': 0.19901618361473083, 'learning_rate': 2.221359710692961e-06, 'epoch': 1.75}




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to runs
Configuration saved in runs/config.json
Configuration saved in runs/generation_config.json
Model weights saved in runs/model.safetensors


{'loss': 7.878, 'grad_norm': 0.19314920902252197, 'learning_rate': 0.0, 'epoch': 1.88}
{'train_runtime': 261.2715, 'train_samples_per_second': 55.115, 'train_steps_per_second': 1.148, 'train_loss': 8.582714284261067, 'epoch': 1.88}
***** train metrics *****
  epoch                    =       1.88
  train_loss               =     8.5827
  train_runtime            = 0:04:21.27
  train_samples_per_second =     55.115
  train_steps_per_second   =      1.148


In [33]:
# Reload the weights saved at step 250 (an intermediate checkpoint of the run
# above, not the final model written to `runs`).
checkpoint_dir = './runs/checkpoint-250'
model = MambaForCausalLM.from_pretrained(checkpoint_dir)

loading configuration file ./runs/checkpoint-250/config.json
Model config MambaConfig {
  "architectures": [
    "MambaForCausalLM"
  ],
  "bos_token_id": 2,
  "conv_kernel": 4,
  "eos_token_id": 3,
  "expand": 2,
  "hidden_act": "silu",
  "hidden_size": 192,
  "initializer_range": 0.1,
  "intermediate_size": 384,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 8192,
  "model_type": "mamba",
  "num_hidden_layers": 16,
  "pad_token_id": 0,
  "rescale_prenorm_residual": false,
  "residual_in_fp32": true,
  "state_size": 8,
  "time_step_floor": 0.0001,
  "time_step_init_scheme": "random",
  "time_step_max": 0.1,
  "time_step_min": 0.001,
  "time_step_rank": 12,
  "time_step_scale": 1.0,
  "torch_dtype": "float32",
  "transformers_version": "4.40.0.dev0",
  "use_bias": false,
  "use_cache": true,
  "use_conv_bias": true,
  "vocab_size": 10000
}

loading weights file ./runs/checkpoint-250/model.safetensors
Generate config GenerationConfig {
  "bos_token_id": 2,
  "eos_token_id":

In [35]:
# Put the model in evaluation mode and generate up to 200 new tokens.
# No prompt is passed to `generate`.
out = model.eval().generate(max_new_tokens=200)

In [37]:
# Decode the generated token ids back to a MIDI object and write it to disk.
generated_midi = tokenizer.tokens_to_midi(out)
generated_midi.dump_midi('krowakrowa.midi')