# LSTMusic with key and meter

In [33]:
import pandas as pd
import re
from music21 import *
import numpy as np

import time
import math
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [34]:
# Select the compute device once; later cells move the model and every tensor onto it.
# torch.cuda.is_available() is True only when a usable GPU is present.
is_cuda = torch.cuda.is_available()

# Use the GPU when there is one, otherwise stay on the CPU.
device = torch.device("cuda" if is_cuda else "cpu")

In [35]:
# Google Colab locations, kept for reference (uncomment when running on Colab):
# INPUT_PATH = "/content/drive/My Drive/Colab Notebooks/MusicGenerator/data/jiggs.txt"
# OUTPUT_PATH = "/content/drive/My Drive/Colab Notebooks/MusicGenerator/models"
# Local paths: INPUT_PATH is the ABC text file that gets parsed below,
# OUTPUT_PATH is the directory where model checkpoints are written and read back.
INPUT_PATH = "../data/jiggs.txt"
OUTPUT_PATH = "../models/"

## Data Formatting

In [36]:
class Repertoir():
    """A collection of tunes read from one ABC text file.

    The file is read in full, tokenised with music21's ``ABCHandler`` and split
    by reference number; every resulting handler is wrapped in a ``Song``.

    Attributes:
        path: path of the file that was read.
        string: raw text content of the file.
        handler: the ``ABCHandler`` that processed the whole file.
        songs_handlers: mapping reference number -> per-tune handler.
        songs: mapping reference number -> ``Song``.
    """

    def __init__(self, path):
        self.path = path
        # Read through a context manager so the file handle is always closed,
        # even if reading raises.
        with open(path, "r") as f:
            self.string = f.read()
        self.handler = abcFormat.ABCHandler()
        self.handler.process(self.string)
        self.songs_handlers = self.handler.splitByReferenceNumber()
        self.songs = {}
        self.__process()

    def __str__(self):
        """Return the raw text of the source file."""
        return self.string

    def __process(self):
        """Build one ``Song`` per reference number found in the file."""
        for ref_number, handler in self.songs_handlers.items():
            self.songs[ref_number] = Song(handler)

    def get_part_vocab(self):
        """Return the sorted list of distinct tokens used in any song's part."""
        tokens = set()
        for song in self.songs.values():
            tokens.update(song.part)
        return sorted(tokens)

    def get_metadata_vocab(self, key):
        """Return the sorted list of distinct values of metadata field ``key``.

        Raises:
            KeyError: if a song has no entry for ``key`` in its metadata.
        """
        return sorted({song.metadata[key] for song in self.songs.values()})

In [37]:
class Song():
    """One tune extracted from an ABC handler.

    Attributes:
        handler: the per-tune handler whose ``tokens`` are scanned.
        metadata: ABC header fields keyed by tag (``X``, ``T``, ``S``, ``M``,
            ``L``, ``Q``, ``K`` are pre-populated with defaults; any other tag
            met in the tokens is added).
        part: list of the ``src`` strings of the note and bar tokens, in order.
    """

    def __init__(self, handler):
        self.handler = handler
        self.metadata = {
            'X': 1,
            'T': 'Unknown',
            'S': 'Unknown',
            'M': 'none',
            'L': '',
            'Q': '',
            'K': ''
        }
        self.part = []
        self.__process()

    def __process(self):
        """Fill ``metadata`` and ``part`` from the handler's tokens.

        Known header fields are taken from the header (tokens seen before the
        first note or bar). After the music has started, a known field is only
        set if it is still empty, so inline changes such as a mid-tune ``K:``
        do not replace the header value. Unknown tags are always recorded.
        """
        # The flag must live outside the loop: resetting it on every token
        # would make the "header has ended" guard below ineffective.
        meta_data_ended = False
        for token in self.handler.tokens:
            if isinstance(token, abcFormat.ABCMetadata):
                if token.tag in self.metadata:
                    if self.metadata[token.tag] == '' or not meta_data_ended:
                        self.metadata[token.tag] = token.data
                else:
                    self.metadata[token.tag] = token.data
            elif isinstance(token, (abcFormat.ABCNote, abcFormat.ABCBar)):
                meta_data_ended = True
                self.part.append(token.src)

    def __str__(self):
        return self.to_abc()

    def to_abc(self):
        """Serialise the song as ``tag:value`` header lines followed by the part.

        Values are converted with ``str`` so that non-string metadata (the
        default ``'X': 1`` is an int) does not break the concatenation.
        """
        header = ''.join(
            '{}:{}\n'.format(key, str(value)) for key, value in self.metadata.items()
        )
        return header + ''.join(self.part)

In [38]:
def generate_char_idx_mappings(vocab):
    """Build the two lookup tables between vocabulary tokens and integer ids.

    Returns:
        (char2idx, idx2char): a dict mapping each token to its position in
        ``vocab``, and a numpy array holding the tokens so that
        ``idx2char[i]`` is the token with id ``i``.
    """
    char2idx = {}
    for position, symbol in enumerate(vocab):
        char2idx[symbol] = position
    idx2char = np.array(vocab)
    return char2idx, idx2char

In [39]:
def get_input_tensors(part, k, m, part_char2idx, k_char2idx, m_char2idx):
    """Encode the model inputs for one tune.

    The last element of ``part`` is left out, so the inputs line up with the
    targets built from ``part[1:]``. The key ``k`` and meter ``m`` are repeated
    once per input position.

    Returns:
        (part_tensor, k_tensor, m_tensor): three 1-D ``torch.long`` tensors of
        length ``len(part) - 1``.
    """
    inputs = part[0:-1]
    part_ids = [part_char2idx[symbol] for symbol in inputs]
    k_ids = [k_char2idx[k] for _ in inputs]
    m_ids = [m_char2idx[m] for _ in inputs]
    return (
        torch.tensor(part_ids, dtype=torch.long),
        torch.tensor(k_ids, dtype=torch.long),
        torch.tensor(m_ids, dtype=torch.long),
    )

def get_target_tensor(part, part_char2idx):
    """Encode the next-token targets for one tune.

    Every element of ``part`` except the first is mapped to its id, giving a
    1-D ``torch.long`` tensor of length ``len(part) - 1``.
    """
    target_ids = []
    for symbol in part[1:]:
        target_ids.append(part_char2idx[symbol])
    return torch.tensor(target_ids, dtype=torch.long)

In [40]:
# Parse the ABC file at INPUT_PATH into a Repertoir (one Song per reference number).
rep = Repertoir(INPUT_PATH)

In [41]:
# Sorted vocabularies: every distinct note/bar token, and every distinct
# value of the meter ('M') and key ('K') metadata fields across all songs.
part_vocab = rep.get_part_vocab()
m_vocab = rep.get_metadata_vocab('M')
k_vocab = rep.get_metadata_vocab('K')


In [42]:
# Token <-> integer id lookup tables for each of the three vocabularies.
part_char2idx, part_idx2char = generate_char_idx_mappings(part_vocab)
k_char2idx, k_idx2char = generate_char_idx_mappings(k_vocab)
m_char2idx, m_idx2char = generate_char_idx_mappings(m_vocab)

## LSTMusic

In [0]:
class LSTMusic(nn.Module):
    """LSTM next-token model conditioned on key and meter.

    Each step's input is the concatenation of three embeddings (meter, key and
    the current part token); the LSTM output is projected to one score per
    part-vocabulary token.

    Args:
        part_vocab_size: number of distinct part tokens (also the output size).
        k_vocab_size: number of distinct keys.
        m_vocab_size: number of distinct meters.
        part_embedding_dim, k_embedding_dim, m_embedding_dim: embedding sizes.
        lstm_dim: hidden size of each LSTM layer.
        drop_prob: dropout applied between stacked LSTM layers.
        num_lstm_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
    """

    def __init__(self,
                 part_vocab_size,
                 k_vocab_size,
                 m_vocab_size,
                 part_embedding_dim = 512,
                 k_embedding_dim = 8,
                 m_embedding_dim = 8,
                 lstm_dim = 256,
                 drop_prob = 0.2,
                 num_lstm_layers = 3,
                 bidirectional = False
):
        super(LSTMusic, self).__init__()
        self.lstm_dim = lstm_dim
        self.num_lstm_layers = num_lstm_layers
        # A bidirectional LSTM doubles both the feature size of its output and
        # the number of hidden/cell states it expects.
        self.num_directions = 2 if bidirectional else 1

        self.part_embeddings = nn.Embedding(part_vocab_size, part_embedding_dim)
        self.k_embeddings = nn.Embedding(k_vocab_size, k_embedding_dim)
        self.m_embeddings = nn.Embedding(m_vocab_size, m_embedding_dim)

        self.lstm = nn.LSTM(input_size = m_embedding_dim + k_embedding_dim + part_embedding_dim,
                            hidden_size = lstm_dim,
                            num_layers = num_lstm_layers,
                            dropout = drop_prob,
                            bidirectional = bidirectional
                           )

        # Same shape as before when unidirectional, so existing checkpoints still load.
        self.dense = nn.Linear(lstm_dim * self.num_directions, part_vocab_size)

    def forward(self, part, k, m, hidden):
        """Score the next token for every position of one sequence.

        Args:
            part, k, m: 1-D index tensors of equal length (one sequence; the
                embeddings are concatenated along dim 1 and reshaped to
                ``(len(part), 1, features)``, i.e. batch size 1).
            hidden: ``(h, c)`` tuple as returned by ``init_hidden(1)`` or by a
                previous call.

        Returns:
            (output, hidden): ``output`` has shape
            ``(len(part), part_vocab_size)``; ``hidden`` is the updated state.
        """
        part_embeds = self.part_embeddings(part)
        k_embeds = self.k_embeddings(k)
        m_embeds = self.m_embeddings(m)
        combined = torch.cat((m_embeds, k_embeds, part_embeds), 1)
        lstm_out, hidden = self.lstm(combined.view(len(part), 1, -1), hidden)
        output = self.dense(lstm_out.view(len(part), -1))
        return output, hidden

    def init_hidden(self, batch_size):
        """Return zeroed ``(h, c)`` states for the LSTM.

        The tensors take the dtype and device of the model's own parameters, so
        the method does not rely on any notebook-level ``device`` variable.
        Each has shape ``(num_layers * num_directions, batch_size, lstm_dim)``.
        """
        weight = next(self.parameters()).data
        shape = (self.num_lstm_layers * self.num_directions, batch_size, self.lstm_dim)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [0]:
def _sample_top_k(logits, top_k):
    """Pick one index uniformly at random among the ``top_k`` highest scores.

    ``top_k`` is clamped to the number of scores so that asking for more
    candidates than there are tokens does not make ``torch.topk`` fail.
    """
    k_eff = min(top_k, logits.size(-1))
    _, top_idx = torch.topk(logits, k=k_eff)
    return np.random.choice(top_idx.tolist())


def generate(model, start, k, m, length, top_k=2):
    """Continue a tune from a list of seed tokens and print it as ABC.

    Relies on the notebook-level ``device``, ``part_char2idx``,
    ``k_char2idx``, ``m_char2idx`` and ``part_idx2char``.

    Args:
        model: network exposing ``init_hidden`` and a
            ``(part, k, m, hidden)`` forward call; it is switched to eval mode.
        start: non-empty list of seed part tokens.
        k: key token used as conditioning.
        m: meter token used as conditioning.
        length: number of sampling steps after the first predicted token, so
            ``length + 1`` tokens are produced in total.
        top_k: number of best-scoring candidates to sample from at each step.

    Returns:
        (abc, pred_notes): the ABC string (header, seed and prediction) and
        the list of predicted tokens.

    Raises:
        ValueError: if ``start`` is empty or ``top_k`` is smaller than 1.
    """
    if len(start) == 0:
        raise ValueError("start must contain at least one token")
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    model = model.eval()
    pred_notes = []

    with torch.no_grad():  # no need to track history in sampling
        hidden = model.init_hidden(1)
        # get_input_tensors drops the last element, so a dummy token is appended
        # to make the whole seed part of the input.
        start_tensor, k_tensor, m_tensor = get_input_tensors(start + [' '], k, m, part_char2idx, k_char2idx, m_char2idx)
        start_tensor, k_tensor, m_tensor = start_tensor.to(device), k_tensor.to(device), m_tensor.to(device)

        # Feed the whole seed at once; only the last position predicts a new token.
        output, hidden = model(start_tensor, k_tensor, m_tensor, hidden)
        choice = _sample_top_k(output[-1], top_k)
        pred_notes.append(part_idx2char[choice])

        # From here on the model is fed one token at a time.
        k_tensor = k_tensor[0].unsqueeze(0)
        m_tensor = m_tensor[0].unsqueeze(0)
        for _ in range(length):
            note_tensor = torch.tensor(choice, dtype=torch.long).unsqueeze(0).to(device)
            output, hidden = model(note_tensor, k_tensor, m_tensor, hidden)
            choice = _sample_top_k(output[-1], top_k)
            pred_notes.append(part_idx2char[choice])

    abc = "M:{}\nK:{}\n".format(m,k) + ''.join(start) + ''.join(pred_notes)
    print(abc)
    return abc, pred_notes

In [0]:
def timeSince(since):
    """Format the wall-clock time elapsed since ``since`` as ``'<min>m <sec>s'``.

    ``since`` is a timestamp as returned by ``time.time()``.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

In [0]:
# Sizes of the three vocabularies (embedding / output dimensions of the model).
part_vocab_size, k_vocab_size, m_vocab_size = len(part_vocab), len(k_vocab), len(m_vocab)

# Optimisation settings.
nb_epoch = 10000
lr = 0.001
max_norm = 5  # gradient-norm clipping threshold

# Seed used for the sample generated after every epoch:
# the first five tokens, key and meter of song 15.
seed_song = rep.songs[15]
start_part = seed_song.part[0:5]
start_k = seed_song.metadata['K']
start_m = seed_song.metadata['M']
length = 100

In [0]:
# Train one song at a time (batch size 1). After every epoch an example tune is
# generated and a checkpoint is written to OUTPUT_PATH.
model = LSTMusic(part_vocab_size, k_vocab_size, m_vocab_size)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Create the checkpoint directory up front; otherwise the first torch.save
# would only fail after a whole epoch of training.
os.makedirs(OUTPUT_PATH, exist_ok=True)

start = time.time()
songs_idxs = list(rep.songs.keys())
nb_iters = len(songs_idxs)
# At least 1, so the modulo below cannot divide by zero on small repertoires.
print_every = max(1, nb_iters // 10)

for epoch in range(1,nb_epoch+1):
    print('\n========= Epoch {} out of {} ({} %) =========\n'.format(epoch, nb_epoch, (epoch)/nb_epoch*100))
    # NOTE(review): the hidden state is reset once per epoch and then carried
    # (detached) from one randomly drawn song to the next, whereas generate()
    # always starts from a zero state — confirm this is intended.
    hidden = model.init_hidden(1)

    for iteration in range(1, nb_iters+1):
        # Songs are drawn with replacement, so an epoch is nb_iters random
        # draws rather than one pass over every song.
        song_idx = np.random.choice(songs_idxs)
        song = rep.songs[song_idx]

        # A song needs at least two tokens to yield one (input, target) pair;
        # shorter ones would produce empty tensors and crash the forward pass.
        if len(song.part) < 2:
            continue

        model = model.train()

        model.zero_grad()

        part, k, m  = get_input_tensors(song.part, song.metadata['K'], song.metadata['M'], part_char2idx, k_char2idx, m_char2idx)
        target = get_target_tensor(song.part, part_char2idx)

        part, k, m, target = part.to(device), k.to(device), m.to(device), target.to(device)

        output, hidden = model(part, k, m, hidden)

        # Cut the graph so back-propagation stops at the current song.
        hidden = (hidden[0].detach(), hidden[1].detach())

        loss = criterion(output, target)
        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), max_norm)

        optimizer.step()

        if iteration % print_every == 0:
            # Accuracy of the forward pass that was just used for the update.
            pred = torch.argmax(output, dim=1)
            acc = pred.eq(target).sum().item()/len(part)*100
            print('%s (%d %d%%) || loss: %.4f || accuracy: %.3f%% || Song ID: %d' % (timeSince(start), iteration, iteration / nb_iters * 100, loss.item(), acc, song_idx))
    print('\nGenerating example :\n')
    abc, op_notes = generate(model, start_part, start_k, start_m, length)
    torch.save(model.state_dict(),os.path.join(OUTPUT_PATH,'model-{}.pth'.format(epoch)))



0m 3s (34 10%) || loss: 3.5811 || accuracy: 15.534% || Song ID: 114
0m 6s (68 20%) || loss: 4.0778 || accuracy: 17.204% || Song ID: 135
0m 10s (102 30%) || loss: 4.1833 || accuracy: 16.438% || Song ID: 75
0m 14s (136 40%) || loss: 4.8430 || accuracy: 12.261% || Song ID: 223
0m 18s (170 50%) || loss: 4.2700 || accuracy: 22.727% || Song ID: 304
0m 21s (204 60%) || loss: 4.5954 || accuracy: 15.625% || Song ID: 123
0m 25s (238 70%) || loss: 3.5743 || accuracy: 20.202% || Song ID: 166
0m 29s (272 80%) || loss: 3.4665 || accuracy: 14.151% || Song ID: 317
0m 32s (306 90%) || loss: 3.5938 || accuracy: 22.936% || Song ID: 177
0m 36s (340 100%) || loss: 3.6060 || accuracy: 21.053% || Song ID: 303

Generating example :

M:6/8
K:G
"G"G2G"D"A2A|AeAG|BeGGGG|AGAA|AeAG|BGGGG|BeGA|AeGA|BeAG|AeGA|AeGGG|AeAAGG|BGAA|BGAA|AeGGG|BGGA|BGGG|AGGGGG|BeAGGG|


0m 40s (34 10%) || loss: 3.4803 || accuracy: 17.431% || Song ID: 234
0m 44s (68 20%) || loss: 3.4620 || accuracy: 17.910% || Song ID: 201
0m 47s (102 30

## Load model

In [43]:
# Rebuild the network and restore the weights saved after the chosen epoch.
epoch = 100
model = LSTMusic(part_vocab_size, k_vocab_size, m_vocab_size)
checkpoint_path = os.path.join(OUTPUT_PATH, 'model-{}.pth'.format(str(epoch)))
# Load onto the CPU first so a GPU-trained checkpoint can be read anywhere;
# the model is moved to the selected device right after.
state_dict = torch.load(checkpoint_path, map_location=torch.device('cpu'))
model.load_state_dict(state_dict)
model.to(device)
model.eval()

LSTMusic(
  (part_embeddings): Embedding(1104, 512)
  (k_embeddings): Embedding(11, 8)
  (m_embeddings): Embedding(2, 8)
  (lstm): LSTM(528, 256, num_layers=3, dropout=0.2)
  (dense): Linear(in_features=256, out_features=1104, bias=True)
)

In [52]:
# Continue an existing tune: seed the model with the first 20 tokens of the
# chosen song, together with that song's key and meter.
song_idx = 321
song = rep.songs[song_idx]
start_part = song.part[0:20]
start_k = song.metadata['K']
start_m = song.metadata['M']
length = 400
# generate() returns (abc_string, predicted_tokens); unpack it so that `abc`
# holds the ABC text, as in the training cell.
abc, pred_notes = generate(model, start_part, start_k, start_m, length, top_k=2)

M:6/8
K:D
A|"D"d2d"A"c2A|"G"BdB"D"AGF|"G"G2e"D"F2d|"Em"E2f"A7"ecc|"D"d3d2:||"Em"g2fe3|"A7"ed^c"D"A2a|"D"g2f"A"e3|"A7"e2cB2A|"D"d2ef2a|"G"bag"D"faa|"A7"gag"D"fgfe|"D"f2ff2f"f#"f|"D"f2dfga|"Em"gfge2d|"A7"cdccAF:|[2"A7"Add"D"def|"G"g3eBc|"G"d3/2ba"A7"gfe|"Bm"dcd"E7"ccB:|[2"A"e3"D"d3|:"A"ecAcee|"D"f2gfef|"C"g2ccfg|"D"a2fa2f|"E7"eBcdcB|"F#m"A3AFA|e2dcBA:|[2"Dm"d3dc|d^cffe=c|:"G"Bdd"A7"efb|"D"afddfa|"G"gfd"A7"ece:|gfe|:"D"fga"A"ece|"D"faa"A7/e"gfe||"D/f+"aga"G"bgg|"D"agf"A7"ede||"D"f2g"Bm"aff||"G"gfedBA|"D"defAFF|:"G"GAB"D7"ABA|d2AABA||d2fa2f|:dBA=FF|"Gm"GBd^c2f|"C"gaag^fg|"D""Bm"f^cfAdefg|"D"abaafg|"D7"adddAF:|[2"D"dcd"A"ede|:fg||"D"fdd"A"efgf/2g/2|"D"a2aba^g|"D"abafag||"Bm"fBgfed|"Em"edBd2A|"D"dfAA2fe|:"G"d3/2e/2d"D"dcd|"G"B2d"D/f+"A2B|"D"A3A2Bc|:"G"d2GBAB|:A|"D"F2DF2d|"D"AGFAGF|"A7"ABAFGF:|F/2F/2G/2|:"A7"AEB"A"EAB|"F#m"AB


In [70]:
# Generate from scratch: seed with a single vocabulary token and pick the key
# and meter directly from their vocabularies.
start_part = [part_vocab[500]]
print(start_part)
start_k = k_vocab[7]
start_m = m_vocab[0]
length = 400
# generate() returns (abc_string, predicted_tokens); unpack it so that `abc`
# holds the ABC text, as in the training cell.
abc, pred_notes = generate(model, start_part, start_k, start_m, length, top_k=2)

['"D7"F2']
M:5/4
K:Em
"D7"F2|d2^cdFE|"G"DEFGAcB^A|"Bb"d^cdfFG|"D"FDDFED|FGc"D"DFA|"D7"d2cAFD|"Em"EFG"A7"F2E|"A7"E2A"D"D2F||"Em"G=FE||"D"DEDF2:||:A|:"G"BAGBddB"Em"e2f|:"G"dc^cd2e|egfe2Bc|"G"dge"D7"dBA:|"G"BAAdef|"G"gbg"D"aba|:"G"g2B"A7"A2B|:ABA|"D"D2DF3:|[2"A7"GECD3|:EA/2B/2|AEAABc|:BB=B"A"A|"A"e2cAB^c|:d|"F"cFAdAF:|E|"Am"E"3"AcA2B|:"G"DEFG2D|"Em"GEEBED|"Em"E2FG2E:|[2"Em""Em"GFAB2A||"D"FEF"G"GFED|"Em"E2FG2"E"^G|:"Em"e2e|"G"dBAgbgf|"G"gbg"D"afd:|"C"ceagec|"G"d3/2e/2afgfg|BAffe=c|"G"BAB"A7"ece||"Bm"dBBfff|efeBAB=ce|:e|"A"aedcde|:ef|"C"g2efgfeBBef|"C"e3BcB|:A|"G"BAB"D"def||"Em"e2b"A7"a2A/2G/2|"G"DGGB/2c/2d/2B/2|"G"GAB"D7"AFD|"Em"EEDG2A|"D"FAAFDF|"G"GBdg2a/2e/2f/2g/2|"G"Bgb"Em"ef^g|"C"c'ba"Em"g2f|"Em"g2eefe:|"D7"d3/2a/2g/2"G"gdg"Em/d+"ggf||Bgf|"Em"ede"A7"gfe|"D"dfe"G"dBG:|"Em"g3"Bm"dedc
