#### Trains our transformer model on the tokenized data

1. Loads our token sequences and vocabulary 
2. Build Transformer model (probably going to use Pytorch but we will look into what)
3. Train on tokens [:-1] -> [tokens [1:]]
4. Save checkpoints and training logs


P.S This is subject to lots of change on how we prepare the model I think we will probably be training on multiple datasets and then freezing at different points
i.e train first on the accompaniment dataset so it understands general structure and how accompaniment works, then train on dataset with melodies as well so it gets how 
to play and be reactive based on the melody (we also need to add in a lot of noise and personal data for this part since we will not be perfect soloist like in the jazz
songs and MIDI from a Database)

Load the token sequences

In [1]:
import json
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

In [2]:
project_root = Path.cwd().parent
remi_segments_path = project_root / "remi_segments.jsonl"
vocab_path = project_root / "vocab.json"

# Load vocab
with open(vocab_path, "r") as f:
    vocab = json.load(f)

# I should use stoi and itos from the vocab
stoi = vocab # string to index
itos = {int(index): token for token, index in vocab.items()} # index to string


# Load token sequences
token_sequences = []
with open(remi_segments_path, "r") as f:
    for line in f:
        record = json.loads(line)
        token_sequences.append(record["tokens"])

Encode the tokens as integers

In [3]:
def encode(tokens):
    return [vocab.get(t, vocab["<UNK>"]) for t in tokens]

encoded_sequences = [encode(seq) for seq in token_sequences]

# test
print(encoded_sequences[0])
for idx in [1308, 1340, 1726]:
    print(itos.get(idx, "<UNK>"))

[2, 1640, 1072, 148, 2, 1640, 683, 770, 2, 1640, 235, 490, 2, 1640, 545, 996, 1384, 1308, 1326, 1668, 1301, 1460, 1308, 1326, 1758, 1297, 1564, 1308, 1326, 1758, 1301, 1580, 1308, 1326, 1758, 1301, 1599, 1310, 1349, 1721, 1260, 1310, 1358, 1718, 1260, 1310, 1353, 1718, 1260, 1307, 1318, 1739, 1260, 1309, 1341, 1723, 1263, 1309, 1346, 1727, 1263, 1309, 1349, 1726, 1262, 1308, 1340, 1742, 1250, 1617, 1308, 1333, 1732, 1250, 1633, 1307, 1333, 1737, 1260, 1308, 1340, 1735, 1250, 1634, 1310, 1343, 1716, 1260, 1310, 1349, 1709, 1260, 1310, 1352, 1711, 1260, 1391, 1309, 1343, 1721, 1281, 1309, 1349, 1726, 1281, 1309, 1352, 1730, 1290, 1397, 1308, 1333, 1737, 1250, 1408, 1308, 1340, 1707, 1250, 1414, 1310, 1349, 1713, 1263, 1310, 1354, 1718, 1263, 1310, 1357, 1716, 1263, 1307, 1326, 1738, 1262, 1309, 1349, 1717, 1262, 1309, 1345, 1716, 1263, 1309, 1354, 1721, 1263, 1308, 1340, 1709, 1250, 1431, 1308, 1340, 1723, 1250, 1432, 1308, 1333, 1737, 1250, 1450, 1310, 1351, 1718, 1262, 1310, 1357, 1716

Training Data

In [4]:
sequence_length = 128  # Number of tokens the model sees at once
step_size = 8  # How much to slide the window by

X, y = [], []
for seq in encoded_sequences:
    for i in range(0, len(seq) - sequence_length, step_size):
        X.append(seq[i:i+sequence_length])
        y.append(seq[i+sequence_length])

In [5]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU found")


True
1
NVIDIA GeForce RTX 2060 SUPER


In [6]:
X = torch.tensor(X, dtype=torch.long)
y = torch.tensor(y, dtype=torch.long)

In [7]:

batch_size = 64
dataset = TensorDataset(X, y)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [8]:
class MusicLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim=256, hidden_dim=512, num_layers=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        x = self.embed(x)
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out[:, -1])  # Use only last output for prediction
        return out, hidden

In [9]:
# After defining your model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MusicLSTM(vocab_size=len(stoi)).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

epochs = 10
for epoch in range(epochs):
    total_loss = 0
    for xb, yb in loader:
        xb = xb.to(device)  # Move batch to GPU
        yb = yb.to(device)  # Move batch to GPU
        optimizer.zero_grad()
        preds, _ = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 11239.9710
Epoch 2, Loss: 5090.4671
Epoch 3, Loss: 4196.6616
Epoch 4, Loss: 3783.8724
Epoch 5, Loss: 3523.0093
Epoch 6, Loss: 3321.9635
Epoch 7, Loss: 3171.2735
Epoch 8, Loss: 3033.8396
Epoch 9, Loss: 2935.0199
Epoch 10, Loss: 2849.3381


In [10]:
# save the model
model_path = project_root / "music_lstm_model.pth"
torch.save(model.state_dict(), model_path)
