# Music Score Page Turner by 06_Chan_Yat_Long_Ariel and 08_Cheung_Tsz_Hong_Edward (Project ID 04)
A Music Score Page Turner that uses Optical Music Recognition (OMR) from a GitHub repo (https://github.com/cal-pratt/SheetVision) to convert images of a music score to MIDI files and then to .wav files, and Dynamic Time Warping (DTW) for page turning.

*If the kernel dies or Python crashes after closing the tkinter main window, restart the kernel to solve the problem.


# Import required modules
Running on: Python 3.8.10

Non-python libraries needed:
timidity 2.15.0_2 --> brew install timidity
SheetVision (already cloned into this repo) --> https://github.com/cal-pratt/SheetVision

In [None]:
import os
import sys
import subprocess
import tkinter as tk
from tkinter import ttk
import tkinter.filedialog as fd
import numpy as np
import matplotlib.pyplot as plt
import shutil
from PIL import ImageTk, Image

#dtw modules
import IPython.display as ipd
import librosa, librosa.display
%matplotlib inline
import libfmp.c3

#realtime_audio_input modules
import cv2
import pyaudio
import time
from math import floor
from numpy import loadtxt

print('Imported modules')

# Define constants and variables

In [None]:
# Working directory shared by the GUI steps: selected page images are copied
# here and the generated .mid / .wav files are written here as well.
target_dir = './SheetVision/TempInput/'


# Page image paths and generated MIDI paths, filled in by the GUI callbacks.
img_paths = ()
midi_paths = []
selected_files = False  # set to True by select_files()

# Audio settings. `sr` is used both for loading the reference audio and as
# the microphone stream rate; `fs` is an alias used by the chroma/DTW code.
sr = 16384
fs = sr
chunk_size = sr//4  # samples per microphone read (a quarter second at rate sr)
hop_length = 1024  # hop length passed to every chroma / frame-time conversion
n_fft = 2048  # NOTE(review): not referenced by the code visible in this notebook
aud_in_device = None  # NOTE(review): init_microphone_and_stream() uses its own local of this name
aud_in_channels = 1  # NOTE(review): the stream is opened with a literal channels=1
ref_chroma = None  # reference chromagram, set by init_audio()
ref_chroma_window_start = 0  # first reference frame of the current DTW window
ref_chroma_frames_increment_constant = 20  # base step scaled by determine_too_quick_or_slow()

# Page-turning / viewer state.
img_index = 0  # passed to perform_dtw(); never changed in the visible code
turn_page = False  # raised by get_ref_chroma_window(), cleared by start_dtw()
# Placeholders that appear to belong to an in-app Tk page viewer; only
# `sheet` is read by live code (update_dtw_gui returns it).
sheet = None
dtw_win = None
imgs = None
frame = None

# Main helper functions 
## OMR-Related

In [None]:
def print_instructions():
    """Open a pop-up window listing the seven workflow steps.

    Creates a ``tk.Toplevel`` child of the module-level ``root`` window and
    packs a single wrapped label into it. The text interpolates the
    module-level ``target_dir`` so the user knows where edited MIDI files
    have to be saved.
    """
    instructions_win = tk.Toplevel(root)
    instructions_win.title('Instructions')
    instructions_win.geometry('500x600')
    instructions_text = f'''Welcome to Scan and Play! 
    1. Click the "1. Select Music Sheet Images" button to select images of your music sheet, you are reminded to select each page in order
    
    2. Click the "2. OMR selected files" button to convert the selected images to midi
    
    3. Click the "3. Check and Validate MIDI" button to open the midi files, you may need to install software for editing midi files. If you have edited the files, please make sure you have saved the updated midi file in {target_dir}
    
    4. Click the "4. Convert Midi to Audio" button to convert the midi files to wav files for realtime page turning
    
    5. Click the "5. Load Reference Audio for Page Turning" button to load the generated reference audio for page turning
    
    6. Click the "6. Initialise GUI for Page Turning" button to open in the 1st page of the score
    
    7. Click the "7. Start Tracking for Page Turning" button and you may now start play the instrument.
    '''
    # The option is spelled out in full (`wraplength`); the previous
    # `wraplengt` spelling presumably only worked because Tk tolerates
    # unambiguous option-name abbreviations.
    instructions_label = tk.Label(
        instructions_win,
        text=instructions_text,
        font='TkDefaultFont',
        wraplength=380,
    )
    instructions_label.pack(pady=30)

In [None]:
def copy_and_rename_selected_files():
    """Copy the selected page images into ``target_dir`` as p1, p2, ...

    Reads the module-level ``img_paths`` (in selection order) and replaces it
    with a list of the new paths, ``target_dir + 'p<N><ext>'``. Does nothing
    except print a notice when ``selected_files`` is still false.

    Each file is copied straight to its final name instead of being copied
    under its original name and renamed afterwards, so the result no longer
    depends on ``/`` being the path separator and a source file named like
    an already-copied page cannot overwrite it. ``target_dir`` is created
    when it does not exist yet.
    """
    global img_paths
    if not selected_files:
        print('Not yet selected files!')
        return
    os.makedirs(target_dir, exist_ok=True)
    img_paths = list(img_paths)
    for i, img_path in enumerate(img_paths):
        # splitext keeps the leading dot and yields '' for extension-less files.
        extension = os.path.splitext(img_path)[1]
        new_path = target_dir + f'p{i+1}{extension}'
        # Re-selecting a file that already is the destination would make
        # shutil raise SameFileError; there is nothing to copy in that case.
        if os.path.abspath(img_path) != os.path.abspath(new_path):
            shutil.copy2(img_path, new_path)
        img_paths[i] = new_path
    print(img_paths)

In [None]:
def OMR_files():
    """Run SheetVision on every page image and collect the resulting MIDI files.

    For each entry of the module-level ``img_paths`` the SheetVision script is
    executed, and the ``./output.mid`` it leaves behind is copied to
    ``target_dir + 'p<N>.mid'``. ``midi_paths`` is rebuilt from scratch on
    every call so that pressing the button twice does not leave duplicate or
    stale entries behind.

    The command is passed as an argument list rather than a shell string, so
    image paths containing spaces or shell metacharacters are handled.
    """
    global img_paths, midi_paths
    midi_paths.clear()
    for i, img_path in enumerate(img_paths):
        print(img_path)
        try:
            result = subprocess.run(['python3.8', './SheetVision/main.py', img_path])
        except FileNotFoundError:
            # os.system never raised when the interpreter was missing; keep
            # that best-effort behaviour but tell the user what went wrong.
            print('Could not run python3.8 - is it installed and on PATH?')
        else:
            if result.returncode != 0:
                print(f'SheetVision exited with status {result.returncode} for {img_path}')
        midi_path = target_dir + f'p{i+1}.mid'
        shutil.copy2('./output.mid', midi_path)
        midi_paths.append(midi_path)

In [None]:
def open_file(path):
    """Open ``path`` with the operating system's default application.

    This helper is used for both page images and MIDI files, so it defers to
    the desktop's file association instead of a specific viewer:

    * Windows: ``os.startfile`` on the normalised path (the paths built in
      this notebook use forward slashes).
    * macOS: the ``open`` command.
    * anything else: ``xdg-open``. This also covers platforms that the
      previous lookup table did not list and raised ``KeyError`` for.
    """
    if sys.platform == 'win32':
        os.startfile(os.path.normpath(path))
    elif sys.platform == 'darwin':
        subprocess.run(['open', path])
    else:
        subprocess.run(['xdg-open', path])

In [None]:
def validate_midi():
    """Open every generated MIDI file so the user can inspect or edit it.

    Iterates the module-level ``midi_paths`` in order and hands each path to
    ``open_file``.
    """
    for path in midi_paths:
        open_file(path)

In [None]:
def convert_midi_to_audio():
    """Render each MIDI file to ``target_dir + 'p<N>.wav'`` with timidity.

    ``midi_paths`` is first truncated to the number of selected images, so
    surplus entries are never rendered. timidity is invoked with an argument
    list instead of a shell string, which keeps paths with spaces or shell
    metacharacters intact.
    """
    global midi_paths
    if len(midi_paths) > len(img_paths):
        midi_paths = midi_paths[:len(img_paths)]
    for i, midi_path in enumerate(midi_paths):
        wav_path = target_dir + f'p{i+1}.wav'
        try:
            subprocess.run(['timidity', midi_path, '-Ow', '-o', wav_path])
        except FileNotFoundError:
            # os.system did not raise when timidity was missing; stay
            # best-effort but make the cause visible, then stop trying.
            print('Could not run timidity - is it installed and on PATH?')
            break

## DTW-related

In [None]:
def time_to_frames(time, sr, hop_length):
    """Return ``librosa.time_to_frames(time, sr=sr, hop_length=hop_length)``.

    Thin wrapper that forwards ``sr`` and ``hop_length`` as keyword
    arguments. Note that the ``time`` parameter shadows the ``time`` module
    inside this function only.
    """
    return librosa.time_to_frames(time, sr=sr, hop_length=hop_length)

def frames_to_time(frames, sr, hop_length):
    """Return ``librosa.frames_to_time(frames, sr=sr, hop_length=hop_length)``.

    Thin wrapper that forwards ``sr`` and ``hop_length`` as keyword arguments.
    """
    return librosa.frames_to_time(frames,sr=sr,hop_length=hop_length)

# Length of the microphone window handed to each DTW call, expressed in chroma
# frames: the frame count for a time value of 10 (presumably seconds) at the
# configured sample rate and hop length.
mic_chroma_frames_per_dtw = time_to_frames(10, sr, hop_length)
# The reference window is 1.5 times as long as the microphone window.
ref_chroma_frames_per_dtw = floor(mic_chroma_frames_per_dtw * 1.5)
# print(mic_chroma_frames_per_dtw, ref_chroma_frames_per_dtw)

In [None]:
def init_audio():
    """Build the padded reference chromagram for the first page.

    Loads ``target_dir + 'p1.wav'`` at the module-level sample rate, extracts
    its CQT chroma, and stores it in the module-level ``ref_chroma`` with the
    noise chroma read from ``./noise_for_ref_chroma.csv`` attached to the
    front and the back of each of the 12 rows. The result is also drawn with
    ``librosa.display.specshow``.
    """
    global sr, ref_chroma

    # Background-noise chroma, one CSV row per chroma row.
    noise_chroma = loadtxt('./noise_for_ref_chroma.csv', delimiter=',')

    # Reference audio for page 1; librosa returns the sample rate it used.
    page_audio, sr = librosa.load(target_dir+'p1.wav', sr=sr)

    ref_chroma = librosa.feature.chroma_cqt(y=page_audio, sr=fs,
                                             hop_length=hop_length)

    # Pad row by row: noise + reference + noise
    # (the author's "approach 1" for better accuracy).
    padded_rows = []
    for row in range(12):
        padded_rows.append(
            np.concatenate((noise_chroma[row], ref_chroma[row], noise_chroma[row]))
        )
    ref_chroma = np.array(padded_rows)

    # Show the padded reference chromagram.
    librosa.display.specshow(ref_chroma, x_axis='time', y_axis='chroma', hop_length=hop_length, cmap='coolwarm')

In [None]:
def determine_too_quick_or_slow(wp_s):
    """Advance ``ref_chroma_window_start`` based on one DTW warping path.

    ``wp_s`` is indexed as a 2-D array whose first row holds the path's end
    point. Column 0 is treated as the reference position and column 1 as the
    microphone position.
    NOTE(review): perform_dtw() calls librosa.sequence.dtw with X=mic and
    Y=ref, which suggests column 0 is actually the microphone axis - confirm
    the column order before retuning the thresholds below.

    The first path row whose column-0 value is at or below 85% of the end
    value is picked (the last row if none qualifies), both coordinates are
    normalised by the end point, and the window start is moved forward by a
    fraction of ``ref_chroma_frames_increment_constant``:

    * reference ahead of microphone: 0.4, 0.2 or no shift
    * otherwise:                     0.5, 0.6 or 0.7
    """
    global ref_chroma_window_start

    x_ref_max = wp_s[0, 0]
    y_mic_max = wp_s[0, 1]
    x_ref_th = x_ref_max * 0.85

    # After the loop `over` is the first qualifying row, or the final row.
    for over in wp_s:
        if over[0] <= x_ref_th:
            print(over)
            break

    y_mic_progress = over[1] / y_mic_max
    x_ref_progress = over[0] / x_ref_max

    # Pick the message and the shift fraction; None means "leave the window".
    if x_ref_progress < y_mic_progress:
        print(f'ref ahead of mic, y_mic={y_mic_progress}, x_ref={x_ref_progress}')
        if x_ref_progress > 0.7:
            verdict, fraction = 'not far ahead', 0.4
        elif x_ref_progress > 0.5:
            verdict, fraction = 'ahead by a lot', 0.2
        else:
            verdict, fraction = 'ahead by a hell lot', None
    else:
        print(f'ref behind of mic y_mic={y_mic_progress}, x_ref={x_ref_progress}')
        if y_mic_progress > 0.7:
            verdict, fraction = 'not far behind', 0.5
        elif y_mic_progress > 0.5:
            verdict, fraction = 'behind by a lot', 0.6
        else:
            verdict, fraction = 'behind by a hell lot', 0.7

    print(verdict)
    if fraction is not None:
        ref_chroma_window_start += floor(ref_chroma_frames_increment_constant*fraction)
            
def get_ref_chroma_window():
    """Return the slice of ``ref_chroma`` that the next DTW call compares against.

    The window covers ``ref_chroma_frames_per_dtw`` columns starting at
    ``ref_chroma_window_start``. When the window would run past the end of
    the reference, ``turn_page`` is set and the second page image is opened;
    the (shorter) remaining slice is still returned.

    Returns:
        A 2-D slice of ``ref_chroma`` with all rows and at most
        ``ref_chroma_frames_per_dtw`` columns.
    """
    global turn_page
    window_end = ref_chroma_window_start + ref_chroma_frames_per_dtw
    print(ref_chroma_window_start, window_end)
    if window_end > len(ref_chroma[0]):
        print('turn page')
        turn_page = True
        # A single-page score has no img_paths[1]; previously this raised
        # IndexError in the middle of tracking.
        if len(img_paths) > 1:
            open_file(img_paths[1])
        else:
            print('No next page to open')

    return ref_chroma[:, ref_chroma_window_start:window_end]

def perform_dtw(mic_chroma_input, img_index=img_index, display=False, full=False):
    """Align microphone chroma against the reference and update the tracker.

    Args:
        mic_chroma_input: chroma matrix of the recent microphone audio,
            passed to ``librosa.sequence.dtw`` as ``X``.
        img_index: not used inside this function. Its default is the value
            the module-level ``img_index`` had when this ``def`` was executed.
        display: when true, plot both chromagrams and the accumulated cost
            matrix with the warping path. NOTE(review): two new figures are
            created per call and never closed here.
        full: when true, align against the whole ``ref_chroma`` instead of
            the sliding window from ``get_ref_chroma_window()``.

    Side effects:
        Calls ``determine_too_quick_or_slow``, which moves
        ``ref_chroma_window_start``; ``get_ref_chroma_window`` may set
        ``turn_page`` and open the next page.
    """
    if full:
        ref_chroma_window = ref_chroma
    else:
        ref_chroma_window = get_ref_chroma_window()
        
#     print(ref_chroma_window.shape)

    # Cosine-distance DTW between the microphone (X) and reference (Y) chroma.
    D, wp = librosa.sequence.dtw(X=mic_chroma_input, Y=ref_chroma_window, metric='cosine')
    # Convert the path's frame indices to seconds.
    # NOTE(review): the original comment described each wp row as
    # [ref, mic], but with X=mic and Y=ref the rows are presumably
    # [mic, ref] - the plot below also draws column 1 on the axis labelled
    # Ref. Confirm against the librosa.sequence.dtw documentation.
    wp_s = librosa.frames_to_time(wp, sr=fs, hop_length=hop_length)
    determine_too_quick_or_slow(wp_s)
    
    if display:
        # Figure 1: microphone chroma (top) and reference window (bottom).
        fig, ax = plt.subplots(nrows=2, sharey=True)
        img = librosa.display.specshow(mic_chroma_input, x_axis='time',
                                       y_axis='chroma',
                                       hop_length=hop_length, ax=ax[0])
#         ax[0].set(title='Chroma Representation of $Mic$')
        librosa.display.specshow(ref_chroma_window, x_axis='time',
                                 y_axis='chroma',
                                 hop_length=hop_length, ax=ax[1])
#         ax[1].set(title='Chroma Representation of $Ref$')
        fig.colorbar(img, ax=ax)
        # Figure 2: accumulated cost matrix with the warping path on top.
        fig, ax = plt.subplots()
        img = librosa.display.specshow(D, x_axis='time', y_axis='time', sr=fs,
                                       cmap='gray_r', hop_length=hop_length, ax=ax)
        ax.plot(wp_s[:, 1], wp_s[:, 0], marker='o', color='r')

        ax.set(title='Warping Path on Acc. Cost Matrix $D$',
               xlabel='Time $(Ref)$', ylabel='Time $(Mic)$')
        fig.colorbar(img, ax=ax)

In [None]:
def init_microphone_and_stream():
    """Open a float32 mono PyAudio input stream and return it.

    Every audio device whose name contains ``'Microphone'`` is printed, and
    the last such device is used as the input. When no device matches, the
    index stays ``None`` so that PyAudio falls back to its default input
    device; previously the index variable was never assigned in that case
    and opening the stream raised ``UnboundLocalError``.

    Returns:
        The opened ``pyaudio`` stream, sampling at the module-level ``sr``
        with ``chunk_size`` frames per buffer.
    """
    p = pyaudio.PyAudio()
    # Local on purpose: the module-level aud_in_device is left untouched,
    # exactly as before.
    aud_in_device = None
    for i in range(p.get_device_count()):
        info = p.get_device_info_by_index(i)
        if 'Microphone' in info['name']:
            print("Device {} = {}".format(info["index"], info["name"]))
            aud_in_device = int(info['index'])
    if aud_in_device is None:
        print('No device named "Microphone" found, using the default input device')
    stream = p.open(format=pyaudio.paFloat32,
                    channels=1,
                    rate=sr,
                    input=True,
                    input_device_index=aud_in_device,
                    frames_per_buffer=chunk_size)  # samples per buffer = chunk_size

    print('Stream init')
    return stream

In [None]:
# Runtime state for live tracking. Running this cell opens the microphone
# stream immediately.
stream = init_microphone_and_stream()
mic_chroma_frames = []  # sliding list of per-read chroma blocks
mic_chroma = []  # chroma of the most recent read
mic_chroma_save = []  # only referenced by commented-out debugging code
ref_chroma_window_start = 0  # restart the reference window at the beginning
# counts = [0,0,0,0,0,0]
started = False  # becomes True once read_stream_and_cal_mic_chroma() starts running DTW
turn_page = False

In [None]:
def read_stream_and_cal_mic_chroma():
    """Read one microphone buffer, update the chroma window, maybe run DTW.

    One call reads ``chunk_size`` samples from the module-level ``stream``,
    stores their CQT chroma in ``mic_chroma`` and appends it to
    ``mic_chroma_frames``. Once that list holds exactly
    ``mic_chroma_frames_per_dtw // len(mic_chroma[0])`` blocks, the blocks are
    stacked side by side, the oldest block is dropped, and ``perform_dtw`` is
    called - provided tracking has already ``started`` or the mean/std gate
    below passes (the author's "approach 2" for telling music from noise).
    """
    global mic_chroma, started

    # One buffer (chunk_size sample frames) from the microphone, as float32.
    raw_bytes = stream.read(chunk_size, exception_on_overflow=False)
    data = np.frombuffer(raw_bytes, dtype=np.float32)

    # Chroma of this buffer; the number of columns depends on hop_length.
    mic_chroma = librosa.feature.chroma_cqt(y=data, sr=sr, hop_length=hop_length)
    mic_chroma_frames.append(mic_chroma)

    # Wait until enough blocks are buffered for a meaningful DTW.
    blocks_needed = mic_chroma_frames_per_dtw // len(mic_chroma[0])
    if len(mic_chroma_frames) != blocks_needed:
        return

    mic_chroma_frames_stack = np.hstack(mic_chroma_frames)
    mic_chroma_frames.pop(0)

    # Diagnostics over the upper half of the chroma rows (last row excluded).
    # NOTE(review): these slices select rows of the latest chroma block, not
    # time frames - confirm that this is the intended statistic.
    half = len(mic_chroma) // 2
    upper_rows = mic_chroma[half:-1]
    print(np.mean([np.mean(row) for row in upper_rows]), np.mean([np.std(row) for row in upper_rows]))

    if not started:
        # Gate on the lower half of the rows: average mean below 0.5 and
        # average standard deviation below 0.3.
        lower_rows = mic_chroma[:half]
        if not (np.mean([np.mean(row) for row in lower_rows]) < 0.5
                and np.mean([np.std(row) for row in lower_rows]) < 0.3):
            return

    started = True
    perform_dtw(mic_chroma_frames_stack, img_index, True)

In [None]:
# librosa.display.specshow(np.hstack(mic_chroma_save), x_axis='time', y_axis='chroma', hop_length=hop_length, cmap='coolwarm')
# print(np.array(mic_chroma_save).shape)
# print(np.hstack(mic_chroma_save).shape)
# perform_dtw(np.hstack(mic_chroma_save), True, True)

In [None]:
def init_dtw_gui():
    """Show the first page of the score in the operating system's viewer.

    Prints a notice instead of raising ``IndexError`` when no images have
    been selected yet. An in-app Tk viewer was drafted here earlier but was
    disabled; pages are opened through ``open_file`` instead.
    """
    if not img_paths:
        print('Not yet selected files!')
        return
    open_file(img_paths[0])
    

def update_dtw_gui():
    """Return the module-level ``sheet`` object unchanged.

    The code that would swap the displayed page is disabled, so this is
    currently a plain accessor.
    """
    return sheet

In [None]:
def start_dtw():
    """Keep processing microphone buffers until a page turn is signalled.

    Calls ``read_stream_and_cal_mic_chroma`` repeatedly while ``turn_page``
    is false, then resets the flag and returns. The loop runs in the caller's
    thread and does not return until the flag is raised.
    """
    global turn_page
    while not turn_page:
        read_stream_and_cal_mic_chroma()
    turn_page = False

# Main

In [None]:
# Main application window: fixed-size 400x350 launcher holding the step buttons.
root = tk.Tk()
root.title('Scan and Play!')
root.resizable(False, False)
root.geometry('400x350')

def select_files():
    """Ask for the page images and copy them into the working directory.

    Opens a multi-file chooser. When at least one file is picked, the
    selection is stored in ``img_paths``, ``selected_files`` is set, and
    ``copy_and_rename_selected_files`` is run. Cancelling the dialog now
    leaves any previous selection untouched instead of replacing it with an
    empty one and marking it as selected.
    """
    global img_paths, selected_files
    chosen = fd.askopenfilenames(title='Choose a file(s) for image of each page in order')
    if not chosen:
        print('Not yet selected files!')
        return
    img_paths = chosen
    selected_files = True
    copy_and_rename_selected_files()
    
# One button per workflow step, packed top to bottom in the order the user is
# expected to press them. Each button calls its handler directly.
instructions_button = ttk.Button(root, text='Instructions', command=print_instructions)
instructions_button.pack(expand=True)

# Step 1: choose the page images (they are copied into target_dir).
open_button = ttk.Button(root, text='1. Select Music Sheet Images', command=select_files)
open_button.pack(expand=True)

# Step 2: run OMR on the copied images.
omr_button = ttk.Button(root,text='2. OMR Selected Files', command=OMR_files)
omr_button.pack(expand=True)
        
# Step 3: open the generated MIDI files for review.
validate_midi_button = ttk.Button(root,text='3. Check and Validate MIDI', command=validate_midi)
validate_midi_button.pack(expand=True)

# Step 4: render the MIDI files to wav.
convert_to_audio_button = ttk.Button(root,text='4. Convert Midi to Audio', command=convert_midi_to_audio)
convert_to_audio_button.pack(expand=True)

# Step 5: load the reference audio and compute its chroma.
load_audio_for_page_turning = ttk.Button(root,text='5. Load Reference Audio for Page Turning', command=init_audio)
load_audio_for_page_turning.pack(expand=True)

# Step 6: show the first page.
init_dtw_gui_button = ttk.Button(root,text='6. Initialise GUI for Page Turning', command=init_dtw_gui)
init_dtw_gui_button.pack(expand=True)

# Step 7: start live tracking. start_dtw loops inside this callback until a
# page turn is signalled.
start_tracking_button = ttk.Button(root,text='7. Start Tracking for Page Turning', command=start_dtw)
start_tracking_button.pack(expand=True)

# Blocks until the main loop ends; the statements below run afterwards.
root.mainloop()
print(img_paths)
root.quit()