Imports

In [None]:
import numpy as np
import cv2
import re
import tensorflow.compat.v1 as tf
# Disable TensorFlow v2 behavior
tf.compat.v1.disable_v2_behavior()

Utility functions

In [2]:
# Function to convert target text to CTC format
def convert_inputs_to_ctc_format(target_text):
    # Constant that represents blank spaces
    SPACE_TOKEN = '-' 
    # Constant that represents the index value of the blank space in the set
    SPACE_INDEX = 4 
    # Constant that represents the index value of the first character in the set
    FIRST_INDEX = 0 
    
    # Remove unwanted characters from target_text and convert them to lowercase
    original = ' '.join(target_text.strip().lower().split(' ')).replace('.', '').replace('?', ''). \
                    replace(',', '').replace("'", '').replace('!', '').replace('-', '')
    # Insert double spaces between each character and split
    targets = original.replace(' ', '  ')
    targets = targets.split(' ')

    # Replace an empty space with 'SPACE_TOKEN' and converts each character to 
            # a list of characters then stacks them horizontally
    targets = np.hstack([SPACE_TOKEN if x == '' else list(x) for x in targets])

    # Convert characters to their corresponding index values
    targets = np.asarray([SPACE_INDEX if x == SPACE_TOKEN else ord(x) - FIRST_INDEX
                          for x in targets])

    # Convert targets array to a sparse representation
    train_targets = sparse_tuple_from([targets])

    # Returns 'train_targets' and the original input text as a tuple
    return train_targets, original

# Function to create sparse representation
def sparse_tuple_from(sequences, dtype=np.int32):
    # Initialising empty lists for indices and values
    indices = []
    values = []
    
    # Loop through each sequence
    for n, seq in enumerate(sequences):
        # Append the indices and values of the sequence
        indices.extend(zip([n] * len(seq), range(len(seq))))
        values.extend(seq)
    
    # Convert the indices and values to NumPy arrays with the specified data type
    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=dtype)
    
    # Calculate the shape of the tensor
    shape = np.asarray([len(sequences), np.asarray(indices).max(0)[1] + 1], dtype=np.int64) 

    # Returns a sparse tensor representation
    return indices, values, shape

# Function to convert sparse tensor to strings
def sparse_tensor_to_strs(sparse_tensor):
    # Extract the indices, values and the shape of the sparse tensor
    indices= sparse_tensor[0][0]
    values = sparse_tensor[0][1]
    dense_shape = sparse_tensor[0][2]
    
    # Create a list of empty lists with length equal to the number of rows in the dense shape
    strs = [ [] for i in range(dense_shape[0]) ]
    
    # Initialize variables
    string = []
    ptr = 0
    b = 0
    
    # Loop through the indices
    for idx in range(len(indices)):
        # If the row index changes, append the current string to the list of strings and reset it
        if indices[idx][0] != b:
            # Append the current string to the list of strings
            strs[b] = string
            # Reset string to an empty list
            string = []
            # Set 'b' to the new row index
            b = indices[idx][0]
            
        # Append the current value to the current string
        string.append(values[ptr])
        # Increment the pointer
        ptr = ptr + 1
        
    # Append the final string to the list of strings
    strs[b] = string

    # Returns a list of strings, where each string is a row of dense tensor represented by the 
            # sparse tensor
    return strs

# Function to pad sequences to a specified maximum length, or the maximum length of the longest 
        # sequence if no maxlen is given
def pad_sequences(sequences, maxlen=None, dtype=np.float32, padding='post', 
                          truncating='post', value=0.):
    # Calculate the lengths of each sequence
    lengths = np.asarray([len(s) for s in sequences], dtype=np.int64)
    
    # Get the number of samples
    nb_samples = len(sequences)
    
    # If no maxlen is specified, use the length of the longest sequence
    if maxlen is None:
        maxlen = np.max(lengths)

    # Take the sample shape from the first non-empty sequence as the sample shape
    # Checking for consistency in the main loop
    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:
            sample_shape = np.asarray(s).shape[1:]
            break

    # Create a mask to identify which elements of the output array should be populated
    mask = np.arange(maxlen)[None, :] < lengths[:, None]

    # Initialize the output array with the specified datatype and fill it with the value parameter
    x = np.full((nb_samples, maxlen) + sample_shape, value, dtype=dtype)
    
    # Set the relevant elements of the output array to the values of the input sequences
    if padding == 'post':
        x[mask] = np.concatenate(sequences)[mask]
    elif padding == 'pre':
        x[:, -np.max(lengths):][mask] = np.concatenate(sequences)[mask]
    else:
        raise ValueError('Padding type "%s" not understood' % padding)

    # Truncate the output array if necessary
    if truncating == 'pre':
        x = x[:, -maxlen:]
    elif truncating == 'post':
        x = x[:, :maxlen]
    else:
        raise ValueError('Truncating type "%s" not understood' % truncating)

    # Returns a padded NumPy array of sequences and their lengths
    return x, lengths

# Function that computes the Levenshtein distance between two strings
def levenshtein(a,b):
    "Computes the Levenshtein distance between a and b."
    n, m = len(a), len(b) # Get the length of strings a and b
    
    # If string a is longer than string b, swap them
    if n > m:
        a,b = b,a
        n,m = m,n
        
    # Create a range object for the current row
    current = range(n+1)
    
    # Iterate through each row
    for i in range(1,m+1):
        # Set the current row and store the previous row
        previous, current = current, [i]+[0]*n
        
         # Iterate through each column
        for j in range(1,n+1):
            # Calculate the cost of adding or deleting a character
            add, delete = previous[j]+1, current[j-1]+1
            # Calculate the cost of changing a character
            change = previous[j-1]
            
            # If the characters in the two strings are not the same, add the cost of changing 
                    # a character
            if a[j-1] != b[i-1]:
                change = change + 1
                
            # Choose the minimum cost and update the current row
            current[j] = min(add, delete, change)
            
    # Returns the value at the bottom-right corner of the matrix; the Levenshtein distance
    return current[n]

# Function used to measure the difference between two sequence tokens
def edit_distance(a,b,EOS=-1,PAD=-1):
     # Remove any instances of the EOS and PAD tokens from the input sequences
    _a = [s for s in a if s != EOS and s != PAD]
    _b = [s for s in b if s != EOS and s != PAD]

    # Calculate the Levenshtein distance between the two sequences
    return levenshtein(_a,_b)

# Function that normalizes an image
def normalize(image):
    
    # Normalize the image by scaling the pixel values from 0 to 1 
    return (255. - image)/255.

# Function that resized an imput image to a given height while maintaining the aspect ratio
def resize(image, height):
    # Calculate the new width of the image using proportional scaling based on the given height
    width = int(float(height * image.shape[1]) / image.shape[0])
    # Resize the image using OpenCV's resize function
    sample_img = cv2.resize(image, (width, height))
    
    # Returns the resized image
    return sample_img

Paths

In [51]:
# Path to the input image.
image_path = 'Image Path/Stave.png'

# Path to the trained model.
model_path = 'Model Path/semantic_model.meta'

# Path to the vocabulary file.
voc_file_path = 'Vocabulary path/vocabulary_semantic.txt'

Prediction code

In [None]:
# Reset the TensorFlow default graph
tf.compat.v1.reset_default_graph()
# Start a new TensorFlow interactive session
sess = tf.InteractiveSession()

# Read the vocabulary file specified by the path in 'voc_file_path'
dict_file = open(voc_file_path, 'r')
# Split the contents of the vocabulary file into a list of words
dict_list = dict_file.read().splitlines()
# Create an empty dictionary to hold the integer-to-word mappings
int2word = dict()

# Loop through each word in the vocabulary list
for word in dict_list:
    # Get the current length of the 'int2word' dictionary
    word_idx = len(int2word)
    # Add an entry to the 'int2word' dictionary mapping the current index to the current word
    int2word[word_idx] = word
# Close the vocabulary file.
dict_file.close()

# Restore weights
#Load the trained model graph
saver = tf.train.import_meta_graph(model_path)
# Restore the trained model weights
saver.restore(sess, model_path[:-5])

# Get the graph
graph = tf.get_default_graph()

# Get the input tensor
input = graph.get_tensor_by_name("model_input:0")
# Get the tensor representing sequance length
seq_len = graph.get_tensor_by_name("seq_lengths:0")
# Get the tensor representing dropout probability during training
rnn_keep_prob = graph.get_tensor_by_name("keep_prob:0")
# Get the tensor representing image height
height_tensor = graph.get_tensor_by_name("input_height:0")
# Get the tensor representing width reduction function
width_reduction_tensor = graph.get_tensor_by_name("width_reduction:0")
# Get the tensor representing the output of the model
logits = tf.get_collection("logits")[0]

# Constants that are saved inside the model itself
WIDTH_REDUCTION, HEIGHT = sess.run([width_reduction_tensor, height_tensor])

# Read the input image and preprocess it
image = cv2.imread(image_path,cv2.IMREAD_GRAYSCALE)
image = resize(image, HEIGHT)
image = normalize(image)
image = np.asarray(image).reshape(1, image.shape[0], image.shape[1], 1)

# Calculate the sequence length based on the width of the image
seq_lengths = [image.shape[2] / WIDTH_REDUCTION]

# Using CTC greedy decoder to decode the logits into the final output
decoded, _ = tf.nn.ctc_greedy_decoder(logits, seq_len)

# Running the prediction session and get the predicted text
prediction = sess.run(decoded,
                      feed_dict={
                          input: image,
                          seq_len: seq_lengths,
                          rnn_keep_prob: 1.0,
                      })

Modifying the output to be more readable

In [53]:
# Define regular expressions to extract elements from the outputed list
note_regex = r'^note-([A-G][b#]?\d+)(_([a-z]+))?$'
key_regex = r'^keySignature-([A-G][b#]?)(M|m)?$'
time_regex = r'^timeSignature-(\d+)/(\d+)$'
clef_regex = r'^clef-([A-G])([1-5])$'
rest_regex = r'^rest-([a-z]+)(\.|_([a-z]+))?(\.)?$'
multirest_regex = r'^multirest-(\d+)$'
gracenote_regex = r'^gracenote-([A-G][b#]?\d+)_(\w+)$'
barline_regex = r'^barline-(\w+)$'

# Define function to map labels to their respective names
def map_label(label):
    # Check if the label matches any of the regular expressions and extract the relevant elements
    note_match = re.match(note_regex, label)
    key_match = re.match(key_regex, label)
    time_match = re.match(time_regex, label)
    clef_match = re.match(clef_regex, label)
    rest_match = re.match(rest_regex, label)
    multirest_match = re.match(multirest_regex, label)
    gracenote_match = re.match(gracenote_regex, label)
    barline_match = re.match(barline_regex, label)
    
    # If the label matches a note regular expression, return the note name and duration (if any)
    if note_match:
        note_name, _, duration = note_match.groups()
        if duration:
            return f'note {note_name} ({duration})'
        else:
            return f'note {note_name}'
        
    # If the label matches a key signature regular expression, return the key name and 
            # minor/major mode (if any)
    elif key_match:
        key_name, minor = key_match.groups()
        if minor:
            return f'key signature {key_name} {minor}'
        else:
            return f'key signature {key_name}'
        
     # If the label matches a time signature regular expression, return the time signature
    elif time_match:
        numerator, denominator = time_match.groups()
        return f'time signature {numerator}/{denominator}'
    
    # If the label matches a clef regular expression, return the clef name and line number
    elif clef_match:
        clef_name, line_number = clef_match.groups()
        return f'{clef_name} clef line {line_number}'
    
    # If the label matches a rest regular expression, return the rest name and duration (if any)
    elif rest_match:
        rest_name, _, duration, _ = rest_match.groups()
        if duration:
            return f'rest {rest_name} ({duration})'
        else:
            return f'rest {rest_name}'
        
    # If the label matches a multirest regular expression, return the number of measures 
            # in the multirest
    elif multirest_match:
        num_measures, = multirest_match.groups()
        return f'multirest {num_measures} measures'
    
    # If the label matches a grace note regular expression, return the note name and duration
    elif gracenote_match:
        note_name, duration = gracenote_match.groups()
        return f'grace note {note_name} ({duration})'
    
    # If the label matches a barline regular expression, return the type of barline
    elif barline_match:
        barline_type, = barline_match.groups()
        return f'barline {barline_type}'
    else:
        return label

Output

In [54]:
# Convert the sparse tensor predicted by the model into a list of strings
str_predictions = sparse_tensor_to_strs(prediction)
# Create an empty list to store the predicted labels
predicted_labels = []

# Loop through each predicted word in the list of strings and use the 'int2word' dictionary 
        # to get the corresponding label
for w in str_predictions[0]:
    predicted_labels.append(int2word[w])
    
# Map each predicted label to a human-readable string using the 'map_label' function
mapped_labels = [map_label(label) for label in predicted_labels]

# Join the mapped labels into a comma-separated string and print it
symbols = ', '.join(mapped_labels)
print(symbols)

G clef line 2, key signature F M, time signature 2/4, note C6 (sixteenth), note A5 (sixteenth), barline, note F5 (eighth), note-C6_sixteenth., note-A5_thirty_second, note F5 (eighth), note-A5_sixteenth., note-F5_thirty_second, barline, note G5 (eighth), note Bb5 (eighth), rest eighth, note-Bb5_sixteenth., note-G5_thirty_second, barline
