Load and preprocess the data, based on 001.

In [1]:
import os
import numpy as np
import pretty_midi
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt

In [5]:
# Hyperparameters shared by the data loading, model building and training cells.
SEQ_LEN = 32  # number of note vectors kept per MIDI file
FEATURE_DIM = 4  # features per note vector (pitch, velocity, duration, delta time)
BATCH_SIZE = 32  # samples drawn at random for each training step
EPOCHS = 100  # number of training steps; each step uses one random batch
# NOTE(review): `path` is not defined in the visible cells -- it must be set to
# the root directory that is searched for MIDI files before this cell runs.
MIDI_ROOT = path
EMBED_DIM = 128  # width of the embedding fed through the transformer blocks
NUM_HEADS = 4  # attention heads per transformer block
FF_DIM = 256  # hidden width of each block's feed-forward network
NUM_LAYERS = 2  # number of stacked transformer blocks

In [None]:
def midi_to_note_vector(midi_file):
    """Convert a MIDI file into a fixed-length array of note features.

    Notes from all non-drum instruments are merged and ordered by onset time.
    Each output row is ``[pitch / 128, velocity / 128, duration, delta_time]``,
    where ``delta_time`` is the gap between this note's onset and the next
    note's onset. Only the first ``SEQ_LEN`` rows are kept.

    Args:
        midi_file: Path of the MIDI file to read.

    Returns:
        An array of shape ``(SEQ_LEN, 4)``, or ``None`` when the file yields
        fewer than ``SEQ_LEN`` rows or when reading/processing it raises.
    """
    try:
        midi = pretty_midi.PrettyMIDI(midi_file)

        # One entry per note: three features plus the onset used for ordering.
        events = [
            [n.pitch / 128, n.velocity / 128, n.end - n.start, n.start]
            for inst in midi.instruments
            if not inst.is_drum
            for n in inst.notes
        ]
        events.sort(key=lambda event: event[3])

        # Pair each note with its successor; the last note has no successor
        # and therefore never produces a row.
        rows = []
        for current, following in zip(events, events[1:]):
            rows.append([current[0], current[1], current[2], following[3] - current[3]])
            if len(rows) == SEQ_LEN:
                break

        if len(rows) != SEQ_LEN:
            return None
        return np.array(rows)
    except Exception as e:
        # Best-effort loading: report the problem file and let the caller skip it.
        print(f"Skipping {midi_file} due to error: {e}")
        return None


def load_midi_dataset(path):
    """Load every usable MIDI file under ``path`` into one float32 array.

    The directory tree is walked recursively. Files whose extension is
    ``.mid`` or ``.midi`` (matched case-insensitively, so ``.MID`` is included)
    are converted with ``midi_to_note_vector``; files for which it returns
    ``None`` are skipped. Directories and files are visited in sorted order so
    the resulting sample order is reproducible across runs and filesystems.

    Args:
        path: Root directory to search.

    Returns:
        A ``float32`` array stacking one entry per accepted file. When no file
        is accepted the array is empty with shape ``(0,)``.
    """
    data = []
    for root, dirs, files in os.walk(path):
        # Sorting in place makes os.walk descend into subdirectories in a
        # deterministic order.
        dirs.sort()
        for file in sorted(files):
            if not file.lower().endswith(('.mid', '.midi')):
                continue
            vec = midi_to_note_vector(os.path.join(root, file))
            if vec is not None:
                data.append(vec)
    return np.array(data, dtype=np.float32)

In [None]:
class PositionalEncoding(layers.Layer):
    """Keras layer that adds fixed sinusoidal position encodings to its input.

    The encoding table is computed once at construction time and is not
    trainable. Standard ``Layer`` keyword arguments (``name``, ``dtype``, ...)
    are forwarded to the base class, and ``get_config`` is implemented so the
    layer can be serialized together with a model.

    Args:
        sequence_length: Number of positions to precompute encodings for.
        embed_dim: Size of the last (feature) axis of the inputs.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, sequence_length, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config can rebuild an identical layer.
        self.sequence_length = sequence_length
        self.embed_dim = embed_dim
        self.pos_encoding = self.get_positional_encoding(sequence_length, embed_dim)

    def get_positional_encoding(self, length, d_model):
        """Return a float32 tensor of shape ``(1, length, d_model)``.

        Even feature columns hold the sine and odd feature columns hold the
        cosine of ``position / 10000 ** (2 * (column // 2) / d_model)``.
        """
        pos = np.arange(length)[:, np.newaxis]
        i = np.arange(d_model)[np.newaxis, :]
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        angle_rads = pos * angle_rates
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
        # Leading axis of size 1 lets the table broadcast over the batch axis.
        return tf.constant(angle_rads[np.newaxis, ...], dtype=tf.float32)

    def call(self, x):
        """Add the encodings for the first ``tf.shape(x)[1]`` positions to ``x``."""
        return x + self.pos_encoding[:, :tf.shape(x)[1], :]

    def get_config(self):
        """Return the constructor arguments needed to re-create this layer."""
        config = super().get_config()
        config.update(
            {
                "sequence_length": self.sequence_length,
                "embed_dim": self.embed_dim,
            }
        )
        return config


In [None]:
def transformer_block(embed_dim, num_heads, ff_dim, causal=False):
    """Build one post-norm Transformer block as a standalone Keras model.

    The block applies multi-head self-attention followed by a two-layer
    feed-forward network, each wrapped in a residual connection and layer
    normalization.

    Args:
        embed_dim: Size of the feature axis of the block's input and output;
            also passed as the per-head ``key_dim`` of the attention layer.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        causal: When True, self-attention uses a causal mask so each position
            attends only to itself and earlier positions. Defaults to False
            (unmasked attention).

    Returns:
        A ``tf.keras.Model`` whose input and output both have shape
        ``(batch, steps, embed_dim)`` with a variable number of steps.
    """
    inputs = layers.Input(shape=(None, embed_dim))
    attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    attn_output = attention(inputs, inputs, use_causal_mask=causal)
    x = layers.LayerNormalization()(inputs + attn_output)
    ff_output = layers.Dense(ff_dim, activation='relu')(x)
    ff_output = layers.Dense(embed_dim)(ff_output)
    x = layers.LayerNormalization()(x + ff_output)
    return tf.keras.Model(inputs=inputs, outputs=x)


def build_transformer(seq_len, feature_dim, causal=True):
    """Build the sequence-to-sequence Transformer used for note prediction.

    The input is projected to ``EMBED_DIM`` features, position encodings are
    added, ``NUM_LAYERS`` transformer blocks are applied, and a final dense
    layer maps every time step back to ``feature_dim`` values.

    Args:
        seq_len: Number of time steps in each input sequence.
        feature_dim: Number of features per time step, for input and output.
        causal: Whether the blocks use causally masked self-attention.
            Defaults to True because the model is trained to predict step
            ``t + 1`` from steps up to ``t``; without the mask, position ``t``
            could attend to input position ``t + 1``, which is exactly its
            target. Pass False to get unmasked attention.

    Returns:
        A ``tf.keras.Model`` mapping ``(batch, seq_len, feature_dim)`` to a
        tensor of the same shape.
    """
    inputs = layers.Input(shape=(seq_len, feature_dim))
    x = layers.Dense(EMBED_DIM)(inputs)
    x = PositionalEncoding(seq_len, EMBED_DIM)(x)

    for _ in range(NUM_LAYERS):
        x = transformer_block(EMBED_DIM, NUM_HEADS, FF_DIM, causal=causal)(x)

    outputs = layers.Dense(feature_dim)(x)
    return tf.keras.Model(inputs=inputs, outputs=outputs)


In [None]:
def visualize_prediction(true_seq, pred_seq):
    """Plot true and predicted values of each note feature over time.

    One subplot is drawn per feature (``FEATURE_DIM`` rows in total), with the
    true sequence as a solid black line and the prediction as a dashed blue
    line. The figure is shown immediately.

    Args:
        true_seq: Sequence indexed as ``[:, feature]`` holding ground truth.
        pred_seq: Sequence indexed as ``[:, feature]`` holding predictions.
    """
    _, panels = plt.subplots(FEATURE_DIM, 1, figsize=(10, 6))

    for feature_idx, panel in enumerate(panels):
        panel.plot(true_seq[:, feature_idx], label='True', color='black')
        panel.plot(
            pred_seq[:, feature_idx],
            label='Predicted',
            linestyle='dashed',
            color='blue',
        )
        panel.set_ylabel(f'Feature {feature_idx+1}')

    # A single legend on the top panel is enough; all panels share the styling.
    panels[0].legend()
    plt.suptitle("True vs Predicted Note Features")
    plt.xlabel("Time Step")
    plt.tight_layout()
    plt.show()


def plot_losses(losses):
    """Show a line chart of the recorded training loss values.

    Args:
        losses: Sequence of loss values, one per training step, plotted in
            order along the x axis.
    """
    plt.figure(figsize=(8, 4))
    plt.plot(losses, color='green', label='Training Loss')

    # Decorations are independent of one another; the legend needs the
    # labelled line above to exist first.
    plt.title("Transformer Training Loss")
    plt.ylabel("Loss")
    plt.xlabel("Epoch")
    plt.legend()
    plt.grid(True)

    plt.show()


def train_transformer(model, dataset):
    """Train ``model`` for next-step prediction and plot the results.

    Runs ``EPOCHS`` optimization steps. Each step draws ``BATCH_SIZE`` samples
    at random (with replacement) from ``dataset``, feeds all but the last time
    step as input and uses the sequence shifted by one step as the target, and
    minimizes mean squared error with Adam. Afterwards the loss curve is
    plotted and one random sample is shown against the model's prediction.

    Args:
        model: Keras model mapping ``(batch, steps - 1, features)`` to a tensor
            of the same shape.
        dataset: Array indexed as ``[sample, step, feature]``.

    Raises:
        ValueError: If ``dataset`` contains no samples.
    """
    # Fail early with a clear message instead of the opaque "low >= high"
    # error np.random.randint raises when there is nothing to sample from.
    if len(dataset) == 0:
        raise ValueError(
            "Cannot train on an empty dataset; check that MIDI files were loaded."
        )

    losses = []
    optimizer = tf.keras.optimizers.Adam()
    loss_fn = tf.keras.losses.MeanSquaredError()

    # Note: each "epoch" here is a single random mini-batch, not a full pass
    # over the dataset.
    for epoch in range(EPOCHS):
        idx = np.random.randint(0, len(dataset), BATCH_SIZE)
        batch = dataset[idx]
        x_input = batch[:, :-1, :]
        y_target = batch[:, 1:, :]

        with tf.GradientTape() as tape:
            y_pred = model(x_input, training=True)
            loss = loss_fn(y_target, y_pred)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Pull the scalar off the device once and reuse it.
        loss_value = float(loss.numpy())
        losses.append(loss_value)

        if epoch % 10 == 0:
            print(f"Epoch {epoch}: Loss = {loss_value:.4f}")

    plot_losses(losses)

    sample = dataset[np.random.randint(len(dataset))]
    x_in = sample[np.newaxis, :-1, :]
    true_out = sample[1:, :]
    # Convert to a NumPy array so the plotting code receives a plain array.
    pred_out = np.asarray(model(x_in, training=False)[0])
    visualize_prediction(true_out, pred_out)


# Load every usable MIDI file found under MIDI_ROOT into one array.
dataset = load_midi_dataset(MIDI_ROOT)
print(f"Loaded {len(dataset)} MIDI samples.")

# The model sees SEQ_LEN - 1 steps because training drops the last step from
# the input and the first step from the target.
transformer_model = build_transformer(SEQ_LEN - 1, FEATURE_DIM)
train_transformer(transformer_model, dataset)