# To-do

1. Consider pickling the model instead of putting it in here
2. Also consider storing all the helper functions in a separate python file
3. Decode the generated text and listen to the result
4. Consider sequencing the interactions, to save all outputs
5. Stylize the piano to be nicer
6. Add acknowledgement

In [83]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
from IPython.display import (
    Audio, display, clear_output)
from ipywidgets import widgets, Button, Layout, ButtonStyle
from functools import partial

import torch
from torch import nn
from torch import optim
import torch.nn.functional as F

import keras
from keras.utils import to_categorical

import time

## Set up 3 helper functions

In [2]:
# Play the .wav sample that belongs to one piano note.
def play(noteIdx):
    """Display an autoplaying audio player for one piano sample.

    noteIdx is the string spliced into the sample file name
    ('61-notes-piano/piano-ff-<noteIdx>.wav').  It has to be a str because
    it is concatenated directly with the path pieces.
    """
    sample_path = '61-notes-piano/piano-ff-' + noteIdx
    sample_path = sample_path + '.wav'
    player = Audio(sample_path, autoplay=True)
    display(player)

In [3]:
# Click handler for a piano key: play the note inside a dedicated output
# widget and remember which note was pressed.
def on_button_clicked(noteIdx, b):
    """Play the sample for noteIdx and record it in the seed sequence.

    noteIdx is the string sample index of the pressed key; b is the
    argument supplied by the button's click callback and is not used.
    The integer value of noteIdx is appended to the module-level
    ``sequence`` list.
    """
    sink = widgets.Output()
    with sink:
        play(noteIdx)
        sequence.append(int(noteIdx))

In [33]:
# Turn the note numbers collected from the piano into the model's input text.
def intToText(int_seq):
    """Return the space-separated token string for a sequence of notes.

    Every element ``i`` of int_seq contributes the two tokens ``p<i>`` and
    ``wait6``, in order; e.g. ``[33, 32]`` gives ``"p33 wait6 p32 wait6"``.
    An empty sequence gives the empty string.
    """
    tokens = [
        token
        for note in int_seq
        for token in ("p" + str(note), "wait6")
    ]
    return " ".join(tokens)

## Styling the widgets to resemble piano keys

In [5]:
# Geometry shared by every key: white and black keys currently use exactly
# the same box; only the button colour (set elsewhere) tells them apart.
_key_css = dict(
    width='40px',
    height='200px',
    padding='1px',
    border='1px solid black',
)

# css for white keys
layout_white = widgets.Layout(**_key_css)

# css for black keys (a separate Layout object with the same settings)
layout_black = widgets.Layout(**_key_css)

## Set Device

In [26]:
dtype = torch.float
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# Show which device was selected (a CUDA device on a CUDA machine, else cpu).
print(device)

cpu


## Load word2int dictionary

In [24]:
# Path of the corpus; any text file containing the full data set works.
mozart_data = './mozart.txt'

# Read the whole corpus into memory.
with open(mozart_data, 'r') as file:
    text = file.read()

# Vocabulary: every distinct whitespace-separated token, in sorted order.
words = sorted(set(text.split()))
n = len(words)

# Encoder (word -> index) and decoder (index -> word); indices follow the
# sorted vocabulary order.
word2int = {word: index for index, word in enumerate(words)}
int2word = dict(enumerate(words))

## Define model

In [42]:
# define neural net
class WordLSTM(nn.ModuleList):
    """Two-layer word-level LSTM built from explicit ``nn.LSTMCell`` steps.

    Inputs are one-hot word vectors of width ``vocab_size``.  Each time step
    runs the first cell on the input, the second cell on the first cell's
    hidden state, and a linear layer maps the second hidden state back to
    ``vocab_size`` scores.

    All tensors the model creates itself (hidden states, one-hot inputs,
    output buffer) are placed on the device of the model's own parameters,
    so the model works no matter which device it was loaded onto.
    """

    def __init__(self, sequence_len, vocab_size, hidden_dim, batch_size):
        """Store the hyperparameters and build the layers.

        sequence_len: number of time steps processed by ``forward``.
        vocab_size:   width of the one-hot input and of the output scores.
        hidden_dim:   size of the hidden/cell state of both LSTM cells.
        batch_size:   batch size assumed by ``forward`` and ``init_hidden``.
        """
        super(WordLSTM, self).__init__()

        # init the hyperparameters
        self.vocab_size = vocab_size
        self.sequence_len = sequence_len
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim

        # first layer lstm cell
        self.lstm_1 = nn.LSTMCell(input_size=vocab_size, hidden_size=hidden_dim)

        # second layer lstm cell
        self.lstm_2 = nn.LSTMCell(input_size=hidden_dim, hidden_size=hidden_dim)

        # dropout layer
        self.dropout = nn.Dropout(p=0.5)

        # fully connected layer
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def _param_device(self):
        """Return the device the model's parameters currently live on."""
        return next(self.parameters()).device

    def _zero_state(self, rows):
        """Return a zero (hidden, cell) pair with *rows* batch entries."""
        dev = self._param_device()
        return (torch.zeros(rows, self.hidden_dim, device=dev),
                torch.zeros(rows, self.hidden_dim, device=dev))

    def _one_hot(self, index):
        """Return a (1, vocab_size) one-hot row for vocabulary index *index*."""
        ref = self.fc.weight
        row = torch.zeros(1, self.vocab_size, dtype=ref.dtype, device=ref.device)
        row[0, int(index)] = 1
        return row

    def _step(self, word, hc_1, hc_2):
        """Advance both LSTM layers by one time step and return their states."""
        hc_1 = self.lstm_1(word, hc_1)
        # layer 2 consumes the hidden state (not the cell state) of layer 1
        hc_2 = self.lstm_2(hc_1[0], hc_2)
        return hc_1, hc_2

    # forward pass in training
    def forward(self, x, hc):
        """
            accepts 2 arguments:
            1. x: one-hot input batch, indexed by time step first
                - shape sequence_len * batch_size * vocab_size
            2. hc: tuple of init hidden, cell states
                - each of shape batch_size * hidden_dim
                - the same initial pair is used for both LSTM layers

            returns the output scores for every step, flattened to shape
            (sequence_len * batch_size) * vocab_size
        """

        # create empty output seq directly on the model's device
        output_seq = torch.empty((self.sequence_len,
                                  self.batch_size,
                                  self.vocab_size),
                                 device=self._param_device())

        # init hidden, cell states for lstm layers
        hc_1, hc_2 = hc, hc

        # for t-th word in every sequence
        for t in range(self.sequence_len):

            # layer 1 and layer 2 lstm
            hc_1, hc_2 = self._step(x[t], hc_1, hc_2)

            # dropout and fully connected layer
            output_seq[t] = self.fc(self.dropout(hc_2[0]))

        return output_seq.view((self.sequence_len * self.batch_size, -1))

    def init_hidden(self):
        """Return zero (hidden, cell) states for a training batch.

        Each tensor has shape batch_size * hidden_dim and lives on the
        model's device.
        """
        return self._zero_state(self.batch_size)

    def init_hidden_generator(self):
        """Return zero (hidden, cell) states for generating 1 sequence.

        Each tensor has shape 1 * hidden_dim and lives on the model's device.
        """
        return self._zero_state(1)

    def predict(self, seed_seq, top_k=5, pred_len=256, word_to_int=None):
        """
            accepts 4 arguments:
            1. seed_seq: seed string sequence for prediction (prompt),
               words separated by whitespace; must contain at least one word
            2. top_k: top k words to sample prediction from
            3. pred_len: number of words to generate after the seed seq
            4. word_to_int: optional word -> index mapping; when omitted the
               module-level ``word2int`` dictionary is used

            returns a 1-D numpy float array of length
            len(seed words) + pred_len holding the word indices of the seed
            followed by the generated words

            raises IndexError if seed_seq contains no words and KeyError if
            a seed word is missing from the mapping

            note: switches the model to evaluation mode and leaves it there
        """

        if word_to_int is None:
            word_to_int = word2int

        # set evaluation mode
        self.eval()

        # split string into list of words
        seed_seq = seed_seq.split()
        if not seed_seq:
            # the last seed word is the first input of the generation loop
            raise IndexError("seed_seq must contain at least one word")

        # get seed sequence length
        seed_len = len(seed_seq)

        # create output sequence
        out_seq = np.empty(seed_len+pred_len)

        # append input seq to output seq
        out_seq[:seed_len] = np.array([word_to_int[word] for word in seed_seq])

        # init hidden, cell states for generation
        hc = self.init_hidden_generator()
        hc_1, hc_2 = hc, hc

        # generation never backpropagates, so do not record the graph:
        # otherwise it would grow with every generated word
        with torch.no_grad():

            # feed seed string into lstm to get the hidden state set up
            for word in seed_seq[:-1]:
                hc_1, hc_2 = self._step(self._one_hot(word_to_int[word]),
                                        hc_1, hc_2)

            # the last seed word is the first input of the generation loop
            word = self._one_hot(word_to_int[seed_seq[-1]])

            # forward pass
            for t in range(pred_len):

                # layer 1 and layer 2 lstm
                hc_1, hc_2 = self._step(word, hc_1, hc_2)

                # fully connected layer without dropout (no need), then
                # softmax to get probabilities of output options
                output = F.softmax(self.fc(hc_2[0]), dim=1)

                # get top k words and corresponding probabilities
                p, top_word = output.topk(top_k)

                # bring BOTH results to the cpu as flat numpy arrays;
                # reshape(-1) keeps them 1-D even when top_k == 1
                p = p.cpu().numpy().reshape(-1).astype(np.float64)
                top_word = top_word.cpu().numpy().reshape(-1)

                # sample from top k words to get next word
                next_word = np.random.choice(top_word, p=p/p.sum())

                # add word to sequence
                out_seq[seed_len+t] = next_word

                # encode predicted word to one-hot encoding for next step
                word = self._one_hot(next_word)

        return out_seq

## Run only one of the following cells

1. **25-keys** piano OR
2. **49-keys** piano OR
3. **61-keys** piano

In [6]:
# notes for 25 keys piano: two full octaves followed by a closing C
octave = "C,C#,D,D#,E,F,F#,G,G#,A,A#,B,"
# the trailing comma leaves an empty last item; swap it for the final "C"
notes = (octave * 2).split(",")[:-1] + ["C"]

# note index for 25 key piano: zero-padded sample numbers "025".."049"
noteIdxs = [str(i).zfill(3) for i in range(25, 50)]

# pair every note name with its sample index
noteDict = [(name, idx) for name, idx in zip(notes, noteIdxs)]

In [100]:
# notes for 49 keys piano: four full octaves followed by a closing C
octave = "C,C#,D,D#,E,F,F#,G,G#,A,A#,B,"
# the trailing comma leaves an empty last item; swap it for the final "C"
notes = (octave * 4).split(",")[:-1] + ["C"]

# note index for 49 key piano: zero-padded sample numbers "013".."061"
noteIdxs = [str(i).zfill(3) for i in range(13, 62)]

# pair every note name with its sample index
noteDict = [(name, idx) for name, idx in zip(notes, noteIdxs)]

In [101]:
# notes for 61 keys piano: five full octaves followed by a closing C
octave = "C,C#,D,D#,E,F,F#,G,G#,A,A#,B,"
# the trailing comma leaves an empty last item; swap it for the final "C"
notes = (octave * 5).split(",")[:-1] + ["C"]

# note index for 61 key piano: zero-padded sample numbers "001".."061"
noteIdxs = [str(i).zfill(3) for i in range(1, 62)]

# zip notes and index into tuples
noteDict = [(name, idx) for name, idx in zip(notes, noteIdxs)]

## Load Pre-Trained Model

In [113]:
# load model
model_path = './models/lstm20_ed'
# NOTE: torch.load unpickles arbitrary Python objects, so only load
# checkpoints that come from a trusted source.
# Map the checkpoint onto the `device` selected in the "Set Device" cell:
# the model's methods move their inputs and hidden states to that device,
# so loading onto a hard-coded 'cpu' would break on a CUDA machine.
model = torch.load(model_path, map_location=device)



# Demo

## Step 1: Play a starting tune

Re-run the cell to clear and reset the starting tune.

In [180]:
# init buttons and seed sequence (re-running this cell empties both lists)
sequence = []
buttons = []

for note, noteIdx in noteDict:
    # sharps get the black-key look, every other note the white-key look
    is_sharp = '#' in note
    button = widgets.Button(
        description=note,
        layout=layout_black if is_sharp else layout_white,
        style=ButtonStyle(button_color='gray' if is_sharp else 'white'))

    # bind this key's sample index as the handler's first argument
    button.on_click(partial(on_button_clicked, noteIdx))
    buttons.append(button)

# We place all buttons horizontally.
widgets.Box(children=buttons)

Box(children=(Button(description='C', layout=Layout(border='1px solid black', height='200px', padding='1px', w…

In [181]:
sequence

[33, 32, 38, 39, 40, 45]

## Step 2: Let maia complete the piece for you

In [182]:
# format piano sequence into input text
seed_seq = intToText(sequence)

# predict using model, then map every predicted index back to its word
output_text = ' '.join(
    map(int2word.__getitem__, model.predict(seed_seq, pred_len=256)))

# name the output file after the current time in whole seconds
timestamp = str(int(time.time()))
filename = "generated" + timestamp + ".txt"

# save the generated text
with open("../demo/output/text/" + filename, "w") as outfile:
    outfile.write(output_text)

## Step 3: Enjoy the final piece finished by maia!

In [183]:
# `decoder` is a project-local module that is not part of this notebook.
# Hand it the name of the text file written in Step 2.
# NOTE(review): judging by the message printed below, it presumably decodes
# that text into a MIDI file — confirm against decoder.main.
import decoder
decoder.main(filename)

Done! Decoded midi file saved to '/output/midi/'


https://freesound.org/people/jobro/packs/2489/?page=1#sound

Sound pack downloaded from Freesound
----------------------------------------

This pack of sounds contains sounds by the following user:
 - jobro ( https://freesound.org/people/jobro/ )

You can find this pack online at: https://freesound.org/people/jobro/packs/2489/

License details
---------------

Attribution: http://creativecommons.org/licenses/by/3.0/

In [44]:
# notes for 88 keys piano: eight full octaves followed by a closing C
octave = "C,C#,D,D#,E,F,F#,G,G#,A,A#,B,"
# the trailing comma leaves an empty last item; swap it for the final "C"
notes = (octave * 8).split(",")[:-1] + ["C"]

# note index for 88 key piano: zero-padded sample numbers "001".."088"
noteIdxs = [str(i).zfill(3) for i in range(1, 89)]

# skip the first nine names so the keyboard starts on A, then pair each
# remaining note name with its sample index
noteDict = [(name, idx) for name, idx in zip(notes[9:], noteIdxs)]