This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.

In [1]:
import os
import torch

In [2]:
"""
Sets up vocabulary for the representation of MIDI files using Oore et al, 2018 vocabulary

Code from Gomatam's repository https://github.com/spectraldoy/music-transformer/blob/main/vocabulary.py with modifications
Velocity events were removed
Start and end tokens were removed
LTH variable renamed to max_time_shift for easier understandability
DIV variable renamed to time_shift for easier understandability
"""
note_on_events = 128
note_off_events = note_on_events
note_events = note_on_events + note_off_events
time_shift = 8
max_time_shift = 1000
time_shift_events = max_time_shift // time_shift
total_midi_events = note_on_events + note_off_events + time_shift_events

# create vocabulary
note_on_vocab = [f"note_on_{i}" for i in range(note_on_events)]
note_off_vocab = [f"note_off_{i}" for i in range(note_off_events)]
time_shift_vocab = [f"time_shift_{i}" for i in range(time_shift_events)]
vocab = ["<pad>"] + note_on_vocab + note_off_vocab + time_shift_vocab
vocab_size = len(vocab)
pad_token = vocab.index("<pad>")

In [3]:
"""
Converts a list of events to a list of indices

Code from Gomatam's repository https://github.com/spectraldoy/music-transformer/blob/main/vocabulary.py
"""
def events_to_indices(event_list, _vocab=None):
    if _vocab is None:
        _vocab = vocab
    index_list = []
    for event in event_list:
        index_list.append(_vocab.index(event))
    return index_list

In [4]:
"""
Converts a list of indices to a list of events

Code from Gomatam's repository https://github.com/spectraldoy/music-transformer/blob/main/vocabulary.py
"""
def indices_to_events(index_list, _vocab=None):
    if _vocab is None:
        _vocab = vocab
    event_list = []
    for idx in index_list:
        event_list.append(_vocab[idx])
    return event_list

In [5]:
"""
Customing rounding function that rounds 0.5 to the greater integer

Code from Gomatam's repository https://github.com/spectraldoy/music-transformer/blob/main/vocabulary.py
"""
def round_(a):
    b = a // 1
    decimal_digits = a % 1
    adder = 1 if decimal_digits >= 0.5 else 0
    return int(b + adder)

In [6]:
"""
Handles the creation of time shift events from Oore et al, 2018 vocabulary

Code from Gomatam's repository https://github.com/spectraldoy/music-transformer/blob/main/vocabulary.py
LTH variable renamed to max_time_shift for easier understandability
DIV variable renamed to time_shift for easier understandability
"""
def time_cutter(time, max_time_shift=max_time_shift, time_shift=time_shift):
    time_shifts = []

    # assume time = k * lth, k >= 0; add k max_time_shifts (lth // div) to time_shifts
    for i in range(time // max_time_shift):
        time_shifts.append(round_(max_time_shift / time_shift))   # custom round for consistent rounding of 0.5
    leftover_time_shift = round_((time % max_time_shift) / time_shift)
    time_shifts.append(leftover_time_shift) if leftover_time_shift > 0 else None

    return time_shifts

In [7]:
"""
Adds time shift events to index list and event list using delta time value calulated in the midi parser function

Code from Gomatam's repository https://github.com/spectraldoy/music-transformer/blob/main/vocabulary.py
"""
def time_to_events(delta_time, event_list=None, index_list=None, _vocab=None):
    if _vocab is None:
        _vocab = vocab
    time = time_cutter(delta_time)
    for i in time:
        # repeatedly create and append time events to the input lists
        idx = note_on_events + note_off_events + i
        if event_list is not None:
            event_list.append(_vocab[idx])
        if index_list is not None:
            index_list.append(idx)
    return

In [8]:
"""
Samples sequences to a specified length
"""
def sample_data(seqs, length):
  data = []
  for seq in seqs:
    data.append(seq[:length])

  return data

In [9]:
"""
Connect to Google Drive for importing and exporting files
"""
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
"""
Allow for connection to GPU
"""
from torch import cuda, device as d
if cuda.is_available():
    dev = "cuda"
else:
    dev = "cpu"
device = d(dev)
print(device)

cuda


In [11]:
!pip install mido
import mido
from torch import LongTensor

Collecting mido
  Downloading mido-1.3.0-py3-none-any.whl (50 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/50.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.3/50.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: mido
Successfully installed mido-1.3.0


In [12]:
"""
Translates midi files into Oore et. al, 2018 vocabulary
Returns both a LongTensor of indices and a list of events as strings

Code from Gomatam's repository https://github.com/spectraldoy/music-transformer/blob/main/tokenizer.py with modifications
Handling of pedal events removed
Code to set tempo of midi tracks to 500000 microseconds per beat (120 bpm) added
Code to handle velocity events removed
delta_time is scaled up by a factor of 5
"""
def midi_parser(fname=None, mid=None):
    if not ((fname is None) ^ (mid is None)):
        raise ValueError("Input one of fname or mid, not both or neither")

    tempo_in_microseconds_per_beat = 500000

    if fname is not None:
        mid = mido.MidiFile(fname)

    # Find original tempo
    original_tempo = None
    for track in mid.tracks:
        for msg in track:
            if msg.is_meta and msg.type == 'set_tempo':
                original_tempo = msg.tempo
                break
        if original_tempo is not None:
            break

    # Change tempo to desired value
    for track in mid.tracks:
        for msg in track:
            if msg.is_meta and msg.type == 'set_tempo':
                msg.tempo = tempo_in_microseconds_per_beat

    # things needed for conversion
    delta_time = 0          # time between important midi messages
    event_list = []         # list of events in vocab
    index_list = []         # list of indices in vocabdelta_time = 0

    # translate midi file to event list
    for track in mid.tracks:
        for msg in track:

            # increase delta_time by msg time for all messages and scale by 5
            delta_time += msg.time * 5

            if msg.is_meta:
                continue

            t = msg.type

            if t == "note_on":
                idx = msg.note + 1
            elif t == "note_off":
                idx = note_on_events + msg.note + 1
            else:
                continue

            time_to_events(delta_time, event_list=event_list, index_list=index_list)
            delta_time = 0

            event_list.append(vocab[idx])
            index_list.append(idx)

    return LongTensor(index_list), event_list


In [13]:
"""
Specify the directory containing the MIDI files
"""
midi_directory = "/content/drive/MyDrive/dissertationMidis"

In [14]:
"""
Iterate through MIDI files, extract data, represent data as Oore et. al, 2018 vocabulary, convert to a tensor, then add to list (only indices are added not events)
Tensors are then shortened to a length of 220 if longer or padded to a length of 220 if shorter
The list of tensors is then shuffled

Based off of code from Gomatam's repository https://github.com/spectraldoy/music-transformer/blob/main/preprocessing.py (Lines 224 - 251)
"""
training_seqs = []

for filename in os.listdir(midi_directory):
    if filename.endswith(".mid"):
        file_path = os.path.join(midi_directory, filename)
        try:
            index_tensor = midi_parser(file_path)[0]
            training_seqs.append(index_tensor)
        except OSError:
            pass

training_seqs = sample_data(training_seqs, length=220)
training_seqs = torch.nn.utils.rnn.pad_sequence(training_seqs, padding_value=pad_token).transpose(-1, -2)
training_seqs = training_seqs[torch.randperm(training_seqs.shape[0])]

print("Index List:")
print(training_seqs[0])

event_list = indices_to_events(training_seqs[0].tolist())
print("Event List:")
print(event_list)

Index List:
tensor([ 72,  68,  65,  53, 381, 311, 200, 196, 193, 181,  75,  67,  70,  63,
         51, 376, 195, 198, 191, 179,  68,  65,  61,  49, 346, 196, 193, 189,
        177,  67,  63,  51, 346, 203, 195, 191, 179,  72,  68,  65,  53, 381,
        311, 200, 196, 193, 181,  75,  67,  70,  63,  51, 376, 195, 198, 191,
        179,  68,  65,  61,  49, 346, 196, 193, 189, 177,  67,  63,  51, 346,
        203, 195, 191, 179,  72,  68,  65,  53, 381, 311, 200, 196, 193, 181,
         75,  67,  70,  63,  51, 376, 195, 198, 191, 179,  68,  65,  61,  49,
        346, 196, 193, 189, 177,  67,  63,  51, 346, 203, 195, 191, 179,  72,
         68,  65,  53, 381, 311, 200, 196, 193, 181,  75,  67,  70,  63,  51,
        376, 195, 198, 191, 179,  68,  65,  61,  49, 346, 196, 193, 189, 177,
         67,  63,  51, 346, 203, 195, 191, 179,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,

In [15]:
"""
Translates either a sequence events or indices in the Oore et. al, 2018 vocabulary into a MIDI file

Code from Gomatam's repository https://github.com/spectraldoy/music-transformer/blob/main/tokenizer.py with modifications
DIV variable renamed to time_shift for easier understandability
Handling of velocity events removed
Tempo changed to 500000
"""
def list_parser(index_list=None, event_list=None, fname="bloop", tempo=500000):
    # take only one of event_list or index_list to translate
    if not ((index_list is None) ^ (event_list is None)):
        raise ValueError("Input one of index_list or event_list, not both or neither")

    # check index_list is ints, assuming 1d list
    if index_list is not None:
        try:
            # assume torch tensor
            if not all([isinstance(i.item(), int) for i in index_list]):
                raise ValueError("All indices in index_list must be int type")
        except AttributeError:
            # otherwise assume normal ,jst
            if not all([isinstance(i, int) for i in index_list]):
                raise ValueError("All indices in index_list must be int type")

    # check event_list is str, assuming 1d list and convert to index_list
    if event_list is not None:
        if not all(isinstance(i, str) for i in event_list):
            raise ValueError("All events in event_list must be str type")
        index_list = events_to_indices(event_list)

    # set up midi file
    mid = mido.MidiFile()
    meta_track = mido.MidiTrack()
    track = mido.MidiTrack()

    # meta messages; meta time is 0 everywhere to prevent delay in playing notes
    meta_track.append(mido.MetaMessage("track_name").copy(name=fname, time=0))
    meta_track.append(mido.MetaMessage("smpte_offset"))
    # assume time_signature is 4/4
    time_sig = mido.MetaMessage("time_signature")
    time_sig = time_sig.copy(numerator=4, denominator=4, time=0)
    meta_track.append(time_sig)
    # assume key_signature is C
    key_sig = mido.MetaMessage("key_signature", time=0)
    meta_track.append(key_sig)
    # assume tempo is constant at input tempo
    set_tempo = mido.MetaMessage("set_tempo")
    set_tempo = set_tempo.copy(tempo=tempo, time=0)
    meta_track.append(set_tempo)
    # end of meta track
    end = mido.MetaMessage("end_of_track").copy(time=0)
    meta_track.append(end)

    # set up the piano; default channel is 0 everywhere; program=0 -> piano
    program = mido.Message("program_change", channel=0, program=0, time=0)
    track.append(program)
    # dummy pedal off message; control should be < 64
    cc = mido.Message("control_change", time=0)
    track.append(cc)

    # things needed for conversion
    delta_time = 0

    # reconstruct the performance
    for idx in index_list:
        # if torch tensor, get item
        try:
            idx = idx.item()
        except AttributeError:
            pass
        # if pad token, continue
        if idx <= 0:
            continue
        # adjust idx to ignore pad token
        idx = idx - 1

        # note messages
        if 0 <= idx < note_on_events + note_off_events:
            # note on event
            if 0 <= idx < note_on_events:
                note = idx
                t = "note_on"
            # note off event
            else:
                note = idx - note_on_events
                t = "note_off"

            # create note message and append to track
            msg = mido.Message(t)
            msg = msg.copy(note=note, time=delta_time)
            track.append(msg)

            # reinitialize delta_time and velocity to handle subsequent notes
            delta_time = 0

        # time shift event
        elif note_on_events + note_off_events <= idx < note_on_events + note_off_events + time_shift_events:
            # find cut time in range (1, time_shift_events)
            cut_time = idx - (note_on_events + note_off_events - 1)
            # scale cut_time by time_shift (from vocabulary) to find time in ms; add to delta_time
            delta_time += cut_time * time_shift

    # end the track
    end = mido.MetaMessage("end_of_track").copy(time=0)
    track.append(end)

    # append finalized track and return midi file
    mid.tracks.append(meta_track)
    mid.tracks.append(track)
    return mid

In [16]:
"""
Converts a list of indeces to a midi file (using list_parser) and saves the file

Code from Gomatam's repository https://github.com/spectraldoy/music-transformer/blob/main/generate.py with a minor modification
tempo chnaged to 500000
verbose parameter removed
commented code for saving other file formats removed
"""
def audiate(token_ids, save_path="/content/drive/MyDrive/genMidis", tempo=500000):
    # set file to a midi file
    if save_path.endswith(".midi"):
        save_path = save_path[:-1]
    elif save_path.endswith(".mid"):
        pass
    else:
        save_path += ".mid"

    # create and save the midi file
    mid = list_parser(index_list=token_ids, fname=save_path[:-4], tempo=tempo)
    mid.save(save_path)

    print("Done")
    return

In [17]:
"""
Creates input and target sequences as a list of tensors, then converts each list to a single tensor
"""
input_sequences = []
target_sequences = []

# Iterate through the parsed MIDI data to create input and target sequences
for seq in training_seqs:

    input_seq = seq[:-1]
    target_seq = seq[1:]

    input_sequences.append(input_seq)
    target_sequences.append(target_seq)

input_data = torch.stack(input_sequences)
target_data = torch.stack(target_sequences)

print("Input Data Shape:", input_data.shape)
print("Target Data Shape:", target_data.shape)

Input Data Shape: torch.Size([1219, 219])
Target Data Shape: torch.Size([1219, 219])


In [18]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
from torch.utils.data import DataLoader

In [19]:
"""
"""
class RNNModel(nn.Module):
    #Create structure of the model
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(RNNModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm_layers = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    #Initiate a forward pass
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        lstm_out, _ = self.lstm_layers(x, (h0.detach(), c0.detach()))
        out = self.fc(lstm_out)
        return out

# Define the dimensions
input_size = vocab_size
hidden_size = 430
output_size = vocab_size
num_layers = 3
learning_rate = 0.001

#Move inputs and targets to gpu
input_data = input_data.to(device)
target_data = target_data.to(device)

#Create dataset from inputs and targets then convert to dataloader for batch training and shuffling of batches
dataset = torch.utils.data.TensorDataset(input_data, target_data)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

#Instantiate model
model = RNNModel(input_size, hidden_size, output_size, num_layers).to(device)
print(model)

#Specify loss and optimizers functions
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

#Specify number of epochs for training
EPOCHS = 150

#Iterate through epochs
for epoch in range(EPOCHS):
    total_loss = 0.0

    #Iterate through batches
    for i, (inputs, targets) in enumerate(dataloader):
        #Zero gradients
        optimizer.zero_grad()

        # One-hot encode the inputs from the dataloader
        inputs_one_hot = F.one_hot(inputs, num_classes=vocab_size).float()

        # Forward pass
        outputs = model(inputs_one_hot)

        #Loss calculation
        loss = criterion(outputs.contiguous().view(-1, output_size), targets.contiguous().view(-1))

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    #Compute average loss over each epoch and print
    average_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}, Average Loss: {loss:.4f}")


RNNModel(
  (lstm_layers): LSTM(382, 430, num_layers=3, batch_first=True)
  (fc): Linear(in_features=430, out_features=382, bias=True)
)
Epoch 1, Average Loss: 4.9564
Epoch 2, Average Loss: 3.7445
Epoch 3, Average Loss: 3.8761
Epoch 4, Average Loss: 3.9072
Epoch 5, Average Loss: 2.5802
Epoch 6, Average Loss: 3.2834
Epoch 7, Average Loss: 3.3762
Epoch 8, Average Loss: 4.4002
Epoch 9, Average Loss: 2.9282
Epoch 10, Average Loss: 3.8030
Epoch 11, Average Loss: 3.0699
Epoch 12, Average Loss: 2.0415
Epoch 13, Average Loss: 2.9115
Epoch 14, Average Loss: 3.2534
Epoch 15, Average Loss: 2.3916
Epoch 16, Average Loss: 1.9978
Epoch 17, Average Loss: 3.3815
Epoch 18, Average Loss: 3.0800
Epoch 19, Average Loss: 2.0553
Epoch 20, Average Loss: 3.2181
Epoch 21, Average Loss: 1.8199
Epoch 22, Average Loss: 2.1590
Epoch 23, Average Loss: 1.5977
Epoch 24, Average Loss: 2.9344
Epoch 25, Average Loss: 1.5438
Epoch 26, Average Loss: 2.3152
Epoch 27, Average Loss: 1.4491
Epoch 28, Average Loss: 2.0866
Epoc

In [20]:
"""
Save trained model as a .pt file in Google Drive
"""
torch.save({
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            },"/content/drive/My Drive/genModels/model41.pt")


In [23]:
"""
Load saved model from google drive, then generate the specified number of files of a specified length
"""
#Load saved model
checkpoint = torch.load("/content/drive/My Drive/genModels/model41.pt")
model.load_state_dict(checkpoint['state_dict'])

#Switch to eval mode for generation
model.eval()

#Specify length of each file and number of files
num_files = 5
sequence_length = 220

#Initialize set for generated seeds
generated_seeds = set()

# Loop to generate multiple MIDI files
for i in range(num_files):
    while True:
        #Generate random seed between 37 and 100
        seed = torch.randint(37, 100, (1, 1)).to(device)

        #Make sure seed has not already been generated
        if seed.item() not in generated_seeds:
            break

    #Add to set to check for uniqueness of following generations
    generated_seeds.add(seed.item())

    #Add generated seed to list
    generated_idx = [seed.item()]

    #Loop for specified sequence length
    for _ in range(sequence_length - 1):

        #One hot encode index list
        input_tensor = F.one_hot(torch.tensor(generated_idx).to(device), num_classes=382).float().unsqueeze(0)

        # Make a prediction using the model without calculating gradients
        with torch.no_grad():
            output = model(input_tensor)

        # Get the token ID with the highest score as the next token ID
        next_event_idx = torch.argmax(output[0, -1, :], dim=0).item()
        generated_idx.append(next_event_idx)

    print(generated_idx)
    generated_event = indices_to_events(generated_idx)
    print(generated_event)

    #Convert to MIDI and save
    audiate(torch.tensor(generated_idx), save_path=f"/content/drive/MyDrive/genMidis/generated_{i}.mid")

print("MIDI files generated and saved!")


[97, 104, 108, 72, 91, 94, 89, 286, 200, 217, 70, 89, 286, 198, 217, 68, 316, 89, 286, 217, 89, 286, 196, 217, 63, 286, 191, 77, 286, 225, 232, 236, 219, 222, 205, 75, 108, 97, 104, 87, 91, 82, 286, 215, 87, 286, 215, 316, 84, 286, 212, 84, 286, 212, 316, 203, 236, 225, 232, 219, 210, 94, 91, 103, 99, 106, 72, 89, 286, 200, 217, 70, 89, 286, 198, 217, 68, 316, 89, 286, 217, 89, 286, 196, 217, 63, 286, 191, 77, 286, 222, 219, 231, 227, 234, 205, 99, 103, 106, 91, 82, 75, 87, 286, 215, 87, 286, 215, 316, 84, 286, 212, 84, 286, 212, 316, 231, 234, 219, 210, 203, 259, 227, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['note_on_96', 'note_on_103', 'note_on_107', 'note_on_71', 'note_on_90', 'note_on_93', 'note_on_88', 'time_shift_29', 'note_off_71', 