References:
https://www.youtube.com/watch?v=CbTU92pbDKw
https://arxiv.org/pdf/2203.12105
https://github.com/jordan-bird/Keras-LSTM-Music-Generator/tree/master

In [1]:
!pip install miditok miditoolkit muspy

Collecting miditok
  Downloading miditok-3.0.6.post1-py3-none-any.whl.metadata (10 kB)
Collecting miditoolkit
  Downloading miditoolkit-1.0.1-py3-none-any.whl.metadata (4.9 kB)
Collecting muspy
  Downloading muspy-0.5.0-py3-none-any.whl.metadata (5.5 kB)
Collecting symusic>=0.5.0 (from miditok)
  Downloading symusic-0.5.8-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (9.0 kB)
Collecting mido>=1.1.16 (from miditoolkit)
  Downloading mido-1.3.3-py3-none-any.whl.metadata (6.4 kB)
Collecting bidict>=0.21 (from muspy)
  Downloading bidict-0.23.1-py3-none-any.whl.metadata (8.7 kB)
Collecting pretty-midi>=0.2 (from muspy)
  Downloading pretty_midi-0.2.10.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pypianoroll>=1.0 (from muspy)
  Downloading pypianoroll-1.0.4-py3-none-any.whl.metadata (3.8 kB)
C

In [2]:
def transpose_midi(midi, semitone_shift):
    """Return a copy of MIDI transposed by semitone_shift."""
    new_midi = deepcopy(midi)
    for inst in new_midi.instruments:
        for note in inst.notes:
            note.pitch += semitone_shift
    return new_midi

def augment_midi(midi):
    """Generate transposed versions of the MIDI file."""
    augmented = []
    for shift in range(-1, 2):  # -6 to +6 semitones; original (-6, 7)
        if shift == 0:
            augmented.append(deepcopy(midi))  # original
        else:
            augmented.append(transpose_midi(midi, shift))
    return augmented

In [3]:
from miditok import REMI, TokenizerConfig
from miditoolkit import MidiFile
from pathlib import Path
import numpy as np
from copy import deepcopy

# --- Tokenizer config ---
config = TokenizerConfig(
    use_chords=True,
    use_rests=True,
    use_tempos=True,
    use_time_signatures=True,
    nb_tempos=32,
    tempo_range=(40, 250),
    chord_types='all'
)

tokenizer = REMI(config)

# --- Tokenize all MIDI files directly in memory ---
midi_folder = Path("/kaggle/input/lstm-midi-training")
all_sequences = []

for midi_path in midi_folder.glob("*.mid"):
    try:
        midi = MidiFile(midi_path)
        augmented_versions = augment_midi(midi)

        for aug_midi in augmented_versions:
            tokens = tokenizer(aug_midi)
            if isinstance(tokens, list):
                for seq in tokens:
                    all_sequences.append(seq.ids)
            else:
                all_sequences.append(tokens.ids)

    except Exception as e:
        print(f"Error with {midi_path.name}: {e}")

print(f"Total token sequences: {len(all_sequences)}")

  config = TokenizerConfig(
  tokens = tokenizer(aug_midi)


Total token sequences: 624


In [4]:
SEQ_LENGTH = 128  # number of tokens per input sequence
X, y = [], []

for seq in all_sequences:
    if len(seq) <= SEQ_LENGTH:
        continue
    for i in range(len(seq) - SEQ_LENGTH):
        X.append(seq[i:i+SEQ_LENGTH])
        y.append(seq[i+SEQ_LENGTH])

X = np.array(X, dtype=np.int32)
y = np.array(y, dtype=np.int32)

print(f"Training samples: {X.shape[0]}")
print(f"X shape: {X.shape}, y shape: {y.shape}")

vocab_size = tokenizer.vocab_size

Training samples: 1862380
X shape: (1862380, 128), y shape: (1862380,)


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf

tf.keras.mixed_precision.set_global_policy('mixed_float16')

# --- Define model ---
# model = Sequential([
#     layers.Input(shape=(3, 2)),
#     layers.Bidirectional(layers.LSTM(128, return_sequences=True)),
#     layers.Bidirectional(layers.LSTM(64)),
#     layers.BatchNormalization(),
#     layers.Dense(128, activation='relu'),
#     layers.Dropout(0.3),
#     layers.Dense(64, activation='relu'),
#     layers.Dense(2)
# ])
model = Sequential([
  layers.Embedding(input_dim=vocab_size, output_dim=256),
  layers.LSTM(512, return_sequences=True),
  layers.LSTM(1024, return_sequences=True),
  # layers.LSTM(1024, return_sequences=True),
  layers.LSTM(512),
  layers.Dropout(0.3),
  layers.BatchNormalization(),
  layers.Dense(256),
  layers.Dropout(0.3),
  layers.Dense(128),
  layers.Dropout(0.3),
  layers.Activation(activation='swish'),
  layers.BatchNormalization(),
  layers.Dense(vocab_size, activation='softmax')
])

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy']
)

# --- Early stopping ---
# early_stop = EarlyStopping(
#     monitor='val_loss',
#     patience=10,
#     restore_best_weights=True
# )

log_dir = "logs/fit/0.0001-300epoch"
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# --- Train ---
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=15,
    # callbacks=[early_stop]
    callbacks=[tensorboard_callback],
    batch_size=512
)


2025-08-14 01:08:37.377709: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755133717.589671      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755133717.647457      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
I0000 00:00:1755133732.725013      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Epoch 1/15


I0000 00:00:1755133746.649813      76 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m2910/2910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2153s[0m 734ms/step - accuracy: 0.3239 - loss: 3.0878 - val_accuracy: 0.4635 - val_loss: 1.9412
Epoch 2/15
[1m2910/2910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2134s[0m 733ms/step - accuracy: 0.6094 - loss: 1.2430 - val_accuracy: 0.4596 - val_loss: 2.1040
Epoch 3/15
[1m2910/2910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2135s[0m 734ms/step - accuracy: 0.6783 - loss: 0.9601 - val_accuracy: 0.4609 - val_loss: 2.4467
Epoch 4/15
[1m2910/2910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2137s[0m 734ms/step - accuracy: 0.7236 - loss: 0.7922 - val_accuracy: 0.4557 - val_loss: 2.7005
Epoch 5/15
[1m2910/2910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2134s[0m 733ms/step - accuracy: 0.7546 - loss: 0.6867 - val_accuracy: 0.4558 - val_loss: 2.9436
Epoch 6/15
[1m2910/2910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2132s[0m 733ms/step - accuracy: 0.7744 - loss: 0.6186 - val_accuracy: 0.4537 - val_loss: 3.13

In [7]:
import random
import numpy as np
import muspy

# 1. Generate tokens
seed = X[random.randint(0, len(X) - 1)].tolist()
generated = seed.copy()

seq_len = 64
num_tokens_to_generate = 2000

for _ in range(num_tokens_to_generate):
    input_seq = np.array(generated[-seq_len:]).reshape(1, seq_len)
    preds = model.predict(input_seq, verbose=0)[0]
    next_token = np.argmax(preds)
    generated.append(next_token)

# Decode token sequence into ScoreTick
score_tick = tokenizer.decode([generated])

# Save MIDI file
score_tick.dump_midi("generated_music.mid")