In [1]:
# external imports
import music21 as m21
import ast
import os

In [14]:
# reload cell
from src.midi import *
from src.train import *
import sys, importlib
importlib.reload(sys.modules['src.midi'])
importlib.reload(sys.modules['src.train'])

<module 'src.train' from 'D:\\Documents\\GitHub\\Pyotr\\src\\train.py'>

# Encoding and Read-in

In [10]:
# Read in all of the midi files
path = './data/sample'
mdl = gen_md_from_path(path, by_measure=False, verbose=False)
mdm = gen_md_from_path(path, by_measure=True, verbose=False)

In [11]:
# Encode them (as entire piece)
me = MidiEncoder()
mdl_enc = {}
for piece in mdl:
    mdl_enc[piece] = me.Encode(mdl[piece].flat, 'pitch_position_duration_strings')

In [12]:
# Encode them (by measure)
me = MidiEncoder()
mdm_enc = {}
for piece in mdm:
    mdm_enc[piece] = {}
    for i, m in enumerate(mdm[piece]):
        mdm_enc[piece][i] = me.Encode(m, 'pitch_position_duration_strings')

# Pre-processing

In [15]:
data = TrainingSet(mdm_enc, by_measure=True, num_notes=32, build_type='next_note')

In [18]:
data.get_vocab_size()

589

# Modeling

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [98]:
# Sample network
class Net_1(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        super(Net_1, self).__init__()
        self.hidden_dim = hidden_dim

        self.token_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.out = nn.Linear(hidden_dim, 1)

    def forward(self, note_sequence, state=None):
        embeds = self.token_embeddings(note_sequence)
        x, lstm_state = self.lstm(embeds, state)
        x = self.out(x)
        return x[-1]

In [99]:
# Params and setup
embedding_dim = 6
hidden_dim = 6

model = Net_1(embedding_dim, hidden_dim, data.get_vocab_size())
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [114]:
# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    inputs = torch.tensor(data.Xnp[:1].tolist())
    note_scores = model(inputs)
    print(note_scores[-1])

tensor([0.0128])


In [116]:
torch.tensor([data.ynp[0]], dtype=torch.long)

tensor([30])

In [125]:
# Training Loop
for epoch in range(10):  
    for i in range(len(data.Xnp)):
        
        # Reset
        model.zero_grad()

        # Data
        inputs = torch.tensor([data.Xnp[i].tolist()], dtype=torch.long)
        target = torch.tensor([data.ynp[i]], dtype=torch.long)

        # Forward
        score = model(inputs)[-1]

        # Backward
        loss = loss_function(score.view(1, 1), target)
        loss.backward()
        optimizer.step()

IndexError: Target 30 is out of bounds.

In [123]:
score.view(1, 1)

tensor([[0.0128]], grad_fn=<ViewBackward>)

In [None]:
# See what the scores are after training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
    print(tag_scores)