In [None]:
# This is an LSTM network that will be used for generating music based on the 
# Irish song dataset provided

# Dependencies



In [None]:
import tensorflow as tf
import numpy as np
import os
import time
import functools
from IPython import display as ipythondisplay
from tqdm import tqdm
import matplotlib.pyplot as plt
!apt-get install abcmidi timidity 

# Stop immediately if TensorFlow reports no GPU device.
assert len(tf.config.list_physical_devices('GPU')) > 0

# Dataset



The Irish songs are stored in ABC notation in the file `irish.abc`, which contains 817 songs in total.

In [None]:
# !/bin/bash abc2wav tmp.abc
# os.system('/bin/bash abc2wav tmp.abc')

In [None]:
from IPython.display import Audio
import subprocess


# NOTE(review): os.path.dirname('./content') evaluates to '.', not './content' —
# confirm this is the intended base directory.
cwd = os.path.dirname('./content')

def save_song_to_abc(song, filename="tmp"):
    """Write an ABC song to ``<filename>.abc`` and return the base name.

    Args:
        song: ABC text to write.
        filename: Output path without the ``.abc`` extension.

    Returns:
        ``filename`` unchanged (i.e. without the extension).
    """
    abc_path = f"{filename}.abc"
    with open(abc_path, "w") as abc_handle:
        abc_handle.write(song)
    return filename

def abc2wav(abc_file):
    """Convert an ABC file to WAV by running the ``abc2wav`` script with bash.

    Args:
        abc_file: Path of the ``.abc`` file to convert.

    Returns:
        The status returned by ``os.system``; 0 means the command succeeded.
    """
    import shlex  # local import: only needed to quote the shell argument

    # Convert the file that was actually requested rather than a hard-coded
    # 'tmp.abc'. Quoting keeps paths containing spaces or shell
    # metacharacters from being split or interpreted by the shell.
    cmd = '/bin/bash abc2wav {}'.format(shlex.quote(abc_file))
    return os.system(cmd)

def play_wav(wav_file):
    """Wrap a WAV file in an IPython ``Audio`` object and return it."""
    audio = Audio(wav_file)
    return audio

def play_song(song):
    """Save a song as ABC, convert it to WAV and return a playable object.

    Args:
        song: ABC text of a single song.

    Returns:
        The result of ``play_wav`` when ``abc2wav`` returns status 0
        (success), otherwise ``None``.
    """
    stem = save_song_to_abc(song)
    status = abc2wav(stem + '.abc')
    # A non-zero status means the conversion failed, so there is nothing to play.
    if status != 0:
        return None
    return play_wav(stem + '.wav')

def play_generated_song(generated_text):
    """Extract every song from generated text and play those that convert.

    Args:
        generated_text: Raw text expected to contain ABC songs.

    Returns:
        None. Each song that converts successfully is shown with
        ``ipythondisplay.display``.
    """
    songs = extract_song_snippet(generated_text)
    if len(songs) == 0:
        print("No valid songs found in generated text. Try training the \
            model longer or increasing the amount of generated music to \
            ensure complete songs are generated!")
        # Nothing to play; do not fall through to the second message.
        return

    played_any = False
    for song in songs:
        waveform = play_song(song)
        # play_song returns None when the conversion fails; display only
        # real audio objects (previously the result was discarded).
        if waveform is not None:
            ipythondisplay.display(waveform)
            played_any = True

    # Report failure only when nothing could be played; this message used to
    # be printed unconditionally, even after successful playback.
    if not played_any:
        print("None of the songs were valid, try training longer to improve \
        syntax.")


In [None]:
import regex as re

def extract_song_snippet(generated_text):
  """Split a block of ABC text into individual songs.

  A song is the text between the start of the input (or a blank line) and the
  next blank line.

  Args:
    generated_text: Text to search (dataset contents or model output).

  Returns:
    List of song strings in order of appearance; empty if none are found.
  """
  pattern = '(^|\n\n)(.*?)\n\n'
  # Search the argument itself. The previous code searched the notebook-level
  # `text` variable, so the songs returned never depended on the input.
  # `re` is the third-party `regex` package here; `overlapped=True` lets
  # consecutive songs share the blank line that separates them.
  search_results = re.findall(pattern, generated_text, overlapped=True, flags=re.DOTALL)
  # Group 1 is the song body; group 0 is only the leading delimiter.
  songs = [song[1] for song in search_results]

  return songs

# Read the whole ABC corpus into memory, then split it into individual songs.
filename = 'irish.abc'
songs = []
with open(filename, 'r') as file:
  text = file.read()
songs = extract_song_snippet(text)

# Report how many songs were extracted.
print(f'Number of songs: {len(songs)}')

In [None]:
# Play the first song of the dataset; play_song's return value (an audio
# object, or None if conversion failed) is the cell's output.
play_song(songs[0])

### Process the dataset for generation task

In [None]:
# Extract the vocabulary: every distinct character that occurs in the songs

# Join the songs back into one string, separated by blank lines.
songs_str = '\n\n'.join(songs)
# Sorted so that each character has a deterministic position in the list.
vocab = sorted(set(songs_str))
print(f'There are {len(vocab)} unique characters')

In [None]:
# Lookup tables between characters and integer ids

# character -> index (the character's position in the sorted vocabulary)
char2idx = dict(zip(vocab, range(len(vocab))))
print(char2idx)

# index -> character (an array, so it can be indexed directly by id)
idx2char = np.array(vocab)
print(idx2char)

### Vectorize the string

In [None]:
# Map every character of a string to its id using the char2idx lookup table
def vectorize(string):
  """Return a NumPy array holding the ``char2idx`` id of each character."""
  ids = []
  for symbol in string:
    ids.append(char2idx[symbol])
  return np.array(ids)

# Vectorize the whole dataset (all songs joined into a single string)
vectorized_songs = vectorize(songs_str)
# Total number of characters in the vectorized corpus; shown as the cell output.
vectorized_songs.shape[0]

### Generating Batches
Each sequence in a batch has length `seq_length`. The input batch and the output batch have the same length, but the output batch is shifted one character to the right.

In [None]:
# generate batches
def get_batch(vectorized_songs, seq_length, batch_size):
  """Sample a random batch of input/target sequences from the corpus.

  Args:
    vectorized_songs: 1-D array of character ids.
    seq_length: Number of characters in each sequence.
    batch_size: Number of sequences to draw.

  Returns:
    ``(x_batch, y_batch)``, both of shape ``(batch_size, seq_length)``; each
    target row is its input row shifted one character to the right.
  """
  last_index = vectorized_songs.shape[0] - 1
  # Random start positions that leave room for a full sequence plus the shift.
  starts = np.random.choice(last_index - seq_length, batch_size)

  inputs, targets = [], []
  for start in starts:
    stop = start + seq_length
    inputs.append(vectorized_songs[start:stop])
    # The target window is the input window moved one character to the right.
    targets.append(vectorized_songs[start + 1:stop + 1])

  batch_shape = (batch_size, seq_length)
  x_batch = np.asarray(inputs).reshape(batch_shape)
  y_batch = np.asarray(targets).reshape(batch_shape)
  return x_batch, y_batch

In [None]:
# Sanity check: draw a single short sequence and print each input/target pair.
x_batch, y_batch = get_batch(vectorized_songs, seq_length=5, batch_size=1)

# np.squeeze drops the size-1 batch axis so the loop runs over characters.
for i, (input_idx, target_idx) in enumerate(zip(np.squeeze(x_batch), np.squeeze(y_batch))):
    print("Step {:3d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

# The RNN Model
The RNN model has three layers:

*  `tf.keras.layers.Embedding`- The input layer with a trainable lookup table that maps each number to a vector of dimensions `embedding_dim`
*  `tf.keras.layers.LSTM` - The RNN with size rnn_units
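*  `tf.keras.layers.Dense` - The final layer, which outputs one unnormalized score (logit) per vocabulary character; the loss (`from_logits=True`) and the sampling step both consume these logits directly, so no softmax is applied in the model itself.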



In [None]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Build the Embedding -> LSTM -> Dense character model.

  Args:
    vocab_size: Number of distinct characters (embedding rows and output units).
    embedding_dim: Size of each embedding vector.
    rnn_units: Number of LSTM units.
    batch_size: Fixed batch size of the model input.

  Returns:
    A ``tf.keras.Sequential`` model.
  """
  # Layer 1: trainable lookup table from character ids to embedding vectors.
  embedding_layer = tf.keras.layers.Embedding(
      vocab_size,
      embedding_dim,
      batch_input_shape=[batch_size, None],
  )

  # Layer 2: stateful LSTM that returns an output for every time step.
  lstm_layer = tf.keras.layers.LSTM(
      units=rnn_units,
      return_sequences=True,
      recurrent_initializer='glorot_uniform',
      recurrent_activation='sigmoid',
      stateful=True,
  )

  # Layer 3: dense projection to one value per vocabulary entry.
  output_layer = tf.keras.layers.Dense(units=vocab_size)

  return tf.keras.Sequential([embedding_layer, lstm_layer, output_layer])

In [None]:
# Optimization parameters
num_training_iterations = 2000  # number of training steps
batch_size = 32  # sequences per training batch
seq_length = 100  # characters per sequence
learning_rate = 1e-3  # passed to the Adam optimizer


# Model parameters
vocab_size = len(vocab)  # one entry per distinct character
embedding_dim = 256
rnn_units = 1024  # number of LSTM units

# Checkpoint location: weights are saved under this prefix
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, 'my_ckpt')

In [None]:
def compute_loss(y_true, y_pred):
  """Sparse categorical cross-entropy between integer targets and logits.

  ``y_pred`` is treated as unnormalized logits (``from_logits=True``).
  """
  return tf.keras.losses.sparse_categorical_crossentropy(
      y_true, y_pred, from_logits=True)

In [None]:
# Training model, built with the training batch size.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size)
# Adam optimizer using the configured learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate)

@tf.function
def train_step(x, y):
  """Run one optimization step on a batch and return the loss tensor.

  Uses the notebook-level ``model`` and ``optimizer``.
  """
  with tf.GradientTape() as tape:
    logits = model(x)
    loss = compute_loss(y, logits)

  # Differentiate the loss w.r.t. the model weights and apply the update.
  weights = model.trainable_variables
  gradients = tape.gradient(loss, weights)
  optimizer.apply_gradients(zip(gradients, weights))

  return loss

In [None]:
# Mean loss of every training step, in order.
history = []
if hasattr(tqdm, '_instances'): tqdm._instances.clear() # clear if it exists

# The loop variable is named `step` (not `iter`) so that the builtin iter()
# is not shadowed in the notebook namespace after this cell runs.
for step in tqdm(range(num_training_iterations)):

  # Grab a batch and propagate it through the network
  x_batch, y_batch = get_batch(vectorized_songs, seq_length, batch_size)
  loss = train_step(x_batch, y_batch)

  # Record the mean loss of this step
  history.append(loss.numpy().mean())

  # Periodically checkpoint the current weights
  if step % 100 == 0:
    model.save_weights(checkpoint_prefix)

# Save the final trained weights
model.save_weights(checkpoint_prefix)

In [None]:
# Visualize the training loss.
# The x axis is derived from the recorded history instead of a hard-coded
# 2000, so the plot still works when num_training_iterations is changed.
x = list(range(len(history)))
plt.plot(x, history)
plt.xlabel('Iterations')
plt.ylabel('Loss')

In [None]:
'''TODO: Rebuild the model using a batch_size=1'''
# Same architecture as the training model, but with a batch size of 1.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Restore the model weights for the last checkpoint after training
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

# Print the layer overview of the rebuilt model.
model.summary()

In [None]:
### Prediction of a generated song ###

def generate_text(model, start_string, generation_length=1000):
  """Generate ABC text one character at a time with the trained model.

  Args:
    model: Model exposing ``reset_states``; called with inputs of batch size 1.
    start_string: Non-empty seed text. Every character must be a key of
      ``char2idx``.
    generation_length: Number of characters to sample after the seed.

  Returns:
    ``start_string`` followed by the generated characters.

  Raises:
    ValueError: If ``start_string`` is empty.
    KeyError: If ``start_string`` contains a character missing from ``char2idx``.
  """
  # An empty seed would give the model a zero-length sequence and fail later
  # with an obscure indexing error; fail early with a clear message instead.
  if not start_string:
    raise ValueError("start_string must contain at least one character")

  # Convert the start string to numbers (vectorize) and add a batch dimension
  input_eval = [char2idx[char] for char in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Characters sampled so far
  text_generated = []

  # Here batch size == 1
  model.reset_states()
  # tqdm._instances is not guaranteed to exist; guard to avoid AttributeError.
  if hasattr(tqdm, '_instances'):
    tqdm._instances.clear()

  for _ in tqdm(range(generation_length)):
      # Evaluate the inputs and generate the next character predictions
      predictions = model(input_eval)

      # Remove the batch dimension
      predictions = tf.squeeze(predictions, 0)

      # Sample from the categorical distribution defined by the logits and
      # keep the sample for the last time step
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      # Pass the prediction along with the previous hidden state
      #   as the next inputs to the model
      input_eval = tf.expand_dims([predicted_id], 0)

      # Map the sampled id back to its character and record it
      text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [None]:
# Generate text from a seed and extract the complete songs it contains.
# NOTE(review): the seed "#" must be a key of char2idx, otherwise generate_text
# raises KeyError — confirm it occurs in the dataset.
generated_text = generate_text(model, start_string="#", generation_length=1000) # TODO
generated_songs = extract_song_snippet(generated_text)

for i, song in enumerate(generated_songs): 
  # Synthesize the waveform from a song
  waveform = play_song(song)

  # If it's a valid song (play_song returned audio rather than None), play it
  if waveform:
    print("Generated song", i)
    ipythondisplay.display(waveform)