# Imports:

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import pretty_midi
from music21 import midi
import IPython.display as ipd
import numpy as np

# Similar:
# https://magenta.tensorflow.org/music-vae
# https://openai.com/index/musenet/

In [23]:
# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

# Transformer Model for Music Generation
class MusicTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=512, nhead=8, num_layers=6, dim_feedforward=2048, max_seq_length=500):
        super(MusicTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_seq_length)
        self.transformer = nn.Transformer(
            d_model=d_model, 
            nhead=nhead, 
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward
        )
        self.d_model = d_model
        self.output_layer = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt):
        src = self.embedding(src) * math.sqrt(self.d_model)
        tgt = self.embedding(tgt) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        tgt = self.pos_encoder(tgt)

        # Adjusted permute calls to match the expected dimensions
        output = self.transformer(src.permute(1, 0, 2), tgt.permute(1, 0, 2)) 
        output = self.output_layer(output.permute(1, 0, 2))  # Corrected to avoid 4D tensor issues
        return output


In [24]:
# Instantiate the model
def get_model(vocab_size, max_seq_length):
    model = MusicTransformer(vocab_size=vocab_size, max_seq_length=max_seq_length)
    return model

# Example usage
vocab_size = 128  # Assuming MIDI note numbers as tokens
max_seq_length = 500
model = get_model(vocab_size, max_seq_length)

# Check model architecture
print(model)

# MIDI Preprocessing Function
def midi_to_sequence(midi_file):
    midi_data = pretty_midi.PrettyMIDI(midi_file)
    notes = []
    for instrument in midi_data.instruments:
        for note in instrument.notes:
            notes.append(note.pitch)
    return notes

# Example of preprocessing all files in a folder
import os
midi_folder = 'numbers/albeniz'
music_data = []

for file in os.listdir(midi_folder):
    if file.endswith(".mid"):
        music_data.extend(midi_to_sequence(os.path.join(midi_folder, file)))

# Convert to tensor
data_tensor = torch.tensor(music_data, dtype=torch.long)
#print("Tokenized music data size:", data_tensor.size())




MusicTransformer(
  (embedding): Embedding(128, 512)
  (pos_encoder): PositionalEncoding()
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (

In [25]:
def train_model(model, data_tensor, num_epochs=1, batch_size=32):
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        total_loss = 0
        for i in range(0, len(data_tensor) - batch_size, batch_size):
            src = data_tensor[i:i + batch_size].unsqueeze(0)
            tgt = data_tensor[i + 1:i + batch_size + 1].unsqueeze(0)

            optimizer.zero_grad()
            output = model(src, tgt)
            loss = criterion(output.view(-1, vocab_size), tgt.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

# Run a quick training loop with limited data
train_model(model, data_tensor[:1000], num_epochs=1)


Epoch 1, Loss: 125.8950


In [35]:
# Music Generation
model.eval()
def generate_music(model, start_sequence, max_length=500):  # Reduced max_length for faster testing
    generated = start_sequence.squeeze(0).tolist()  # Ensure a flat list of tokens
    input_seq = torch.tensor([generated], dtype=torch.long)

    for _ in range(max_length):
        with torch.no_grad():
            # Ensure token values are clipped within the vocabulary size
            input_seq = torch.clamp(input_seq, min=0, max=vocab_size - 1)
            embedded_seq = model.embedding(input_seq).permute(1, 0, 2)  # (seq_len, batch_size, d_model)

            # Generate output and predict next token
            output = model.transformer(embedded_seq, embedded_seq)
            next_token = torch.argmax(output[-1, 0, :]).item()

            # Prevent next token from exceeding vocabulary size
            next_token = max(0, min(next_token, vocab_size - 1))
            generated.append(next_token)

            # Update input sequence for next step
            input_seq = torch.tensor([generated[-50:]], dtype=torch.long)  # Shortened sequence length

    return generated

# Convert generated sequence to MIDI file and play it
def sequence_to_midi(sequence, output_file="generated_music.mid"):
    midi = pretty_midi.PrettyMIDI()
    instrument = pretty_midi.Instrument(program=0)
    start = 0
    for note in sequence:
        note = max(0, min(note, 127))  # Ensure valid MIDI note range
        midi_note = pretty_midi.Note(velocity=100, pitch=note, start=start, end=start + 0.5)
        instrument.notes.append(midi_note)
        start += 0.5
    midi.instruments.append(instrument)
    midi.write(output_file)

# Generate and save a music piece with fewer tokens for faster testing
start_sequence = torch.randint(0, vocab_size, (1, 5))  # Reduced start sequence length
generated_sequence = generate_music(model, start_sequence)
sequence_to_midi(generated_sequence)



# Play:

In [34]:
from music21 import midi

def play_midi_with_music21(midi_file_path):
    """Play a MIDI file using music21"""
    print("Playing generated MIDI using music21...")
    mf = midi.MidiFile()
    mf.open(midi_file_path)  # Load the specified MIDI file
    mf.read()
    mf.close()
    stream = midi.translate.midiFileToStream(mf)
    stream.show('midi')  # This will open a media player if supported

play_midi_with_music21("generated_music.mid")

Playing generated MIDI using music21...
