In [1]:
!pip install preprocessing
!pip install connected-components-3d
!pip install tables
!pip install music21



In [2]:
import pandas as pd
import cv2
import pickle
import os
import zipfile
import warnings
import numpy as np
import matplotlib.pyplot as plt
import requests
warnings.filterwarnings('ignore')

from skimage import io
from tqdm import tqdm
from skimage.color import rgb2gray
from preprocessing import *
from scipy.ndimage import binary_fill_holes
from skimage.morphology import thin
from skimage.color import label2rgb
from collections import Counter

#NN
from sklearn import preprocessing
from skimage.filters import threshold_otsu, gaussian, median
from tensorflow.keras.utils import to_categorical
from skimage.morphology import binary_opening, binary_closing, binary_dilation, binary_erosion, closing, opening, square, skeletonize, disk
from sklearn.model_selection import train_test_split
from skimage.measure import label, regionprops
from keras.models import Sequential, load_model
from keras.layers import Dense, Conv2D, MaxPooling2D, GlobalAveragePooling2D, InputLayer, Dropout

# web scraping imports
from bs4 import BeautifulSoup
# audio file
from scipy.io.wavfile import write
from music21 import note, stream, tempo, instrument




In [3]:
def IsHorizontal(img):
    """Report whether the image contains at least one nearly full-width row of zeros.

    Args:
        img: 2-D array; pixels equal to 0 are the ones being counted.

    Returns:
        True if some row has zero-valued pixels in at least 90% of its
        columns, otherwise False.
    """
    img = np.asarray(img)
    _, cols = img.shape
    # Count the zero pixels of every row in one vectorized pass instead of a
    # per-pixel Python loop.
    zeros_per_row = np.count_nonzero(img == 0, axis=1)
    return bool(np.any(zeros_per_row >= 0.9 * cols))

In [4]:
# Lookup used by get_labeled_data as label_map[line_idx][p], where (line_idx, p)
# is the pair returned by estim(). Entry 0 only has key 0 ('N0'), which is what
# estim() yields when no row matches.
label_map = {0:{0: 'N0'}, 1:{0:'b4',1:'a4'}, 2:{0:'g4',1:'f4'}, 3:{0:'e4',1:'d4'}, 4:{0:'c4',1:'b3'},
             5:{0:'a3',1:'g3'}, 6:{0:'f3',1:'e3'}, 7:{0:'d3',1:'c3'}}

In [5]:
# with zipfile.ZipFile('Notes.zip', 'r') as zip_ref:
#       zip_ref.extractall('')

In [6]:
# Used by remove_staff_lines_2: a row whose count of 1-valued pixels is at most
# this fraction of the image width is overwritten with 1.
row_percentage = 0.3

def calculate_thickness_spacing(rle, most_common):
    """Estimate staff-line thickness and spacing from run-length data.

    Args:
        rle: iterable of per-column run-length sequences.
        most_common: the run-pair sum that matching pairs must add up to.

    Returns:
        (line_thickness, line_spacing): the smaller and the larger member of
        the most frequent matching run pair.
    """
    # Columns without a matching pair contribute nothing to the flat list.
    candidate_pairs = [
        run_pair
        for column in rle
        for run_pair in most_common_bw_pattern(column, most_common)
    ]

    dominant_pair = Counter(candidate_pairs).most_common()[0][0]
    return min(dominant_pair), max(dominant_pair)


def whitene(rle, vals, max_height):
    """Turn short 0-runs into 1-runs and merge neighbouring 1-runs.

    Args:
        rle: run lengths of one column.
        vals: run values matching `rle`.
        max_height: 0-runs shorter than 1.1 * max_height are flipped to 1.

    Returns:
        (lengths, values): the rewritten run-length encoding, where consecutive
        1-runs are fused into a single run.
    """
    limit = 1.1 * max_height
    merged_lengths, merged_values = [], []
    pending_white = 0  # accumulated length of the current stretch of 1-runs

    for run_len, run_val in zip(rle, vals):
        becomes_white = run_val == 0 and run_len < limit
        if becomes_white or run_val == 1:
            pending_white = pending_white + run_len
            continue

        # A run that stays non-white closes the pending white stretch.
        if pending_white > 0:
            merged_lengths.append(pending_white)
            merged_values.append(1)
        pending_white = 0
        merged_lengths.append(run_len)
        merged_values.append(0)

    if pending_white > 0:
        merged_lengths.append(pending_white)
        merged_values.append(1)

    return merged_lengths, merged_values


def remove_staff_lines(rle, vals, thickness, shape):
    """Apply `whitene` to every encoded column and decode the result to an image.

    Args:
        rle: per-column run lengths.
        vals: per-column run values, parallel to `rle`.
        thickness: passed to `whitene` as its height limit.
        shape: shape of the decoded output image.

    Returns:
        The array produced by `hv_decode` for the rewritten runs.
    """
    cleaned_rle, cleaned_vals = [], []
    for col_idx, col_rle in enumerate(rle):
        new_rle, new_vals = whitene(col_rle, vals[col_idx], thickness)
        cleaned_rle.append(new_rle)
        cleaned_vals.append(new_vals)

    return hv_decode(cleaned_rle, cleaned_vals, shape)


def remove_staff_lines_2(thickness, img_with_staff):
    """Blank out sparse rows and apply a vertical binary opening.

    Args:
        thickness: line thickness; the opening uses a (3 * thickness, 1) column
            of ones as its structuring element.
        img_with_staff: 2-D binary image; it is copied, not modified.

    Returns:
        The result of `binary_opening` on the row-blanked copy.
    """
    img = img_with_staff.copy()
    _, cols = img.shape
    # Vectorized per-row count of 1-valued pixels (replaces the per-pixel loop
    # and the unused `projected` list).
    ones_per_row = np.count_nonzero(img == 1, axis=1)
    img[ones_per_row <= row_percentage * cols, :] = 1
    closed = binary_opening(img, np.ones((3*thickness, 1)))
    return closed


def get_rows(start, most_common, thickness, spacing):
    """Generate the pixel rows covered by consecutive, evenly spaced lines.

    Args:
        start: first pixel row of the first detected line.
        most_common: distance by which `start` is moved up when that stays >= 0.
        thickness: number of pixel rows per line.
        spacing: gap in pixel rows between two lines.

    Returns:
        A list of 7 entries. When `start` can be moved up, these are 7 lists of
        row indices; otherwise 6 lists preceded by the scalar 0.
    """
    line_count = 6
    if start - most_common >= 0:
        start -= most_common
        line_count = 7

    rows = []
    cursor = start
    for _ in range(line_count):
        rows.append([cursor + offset for offset in range(thickness)])
        cursor += thickness + spacing

    # Keep the output at 7 entries when no extra line fits above.
    if len(rows) == 6:
        rows = [0] + rows
    return rows


def horizontal_projection(img):
    """Return the index of the first row that is almost free of 1-valued pixels.

    Args:
        img: 2-D binary image.

    Returns:
        Index of the first row whose count of pixels equal to 1 is at most 10%
        of the image width, or 0 when no row qualifies.
    """
    img = np.asarray(img)
    _, cols = img.shape
    # Vectorized row counts replace the per-pixel loop and the unused
    # `projected` list.
    ones_per_row = np.count_nonzero(img == 1, axis=1)
    sparse_rows = np.flatnonzero(ones_per_row <= 0.1 * cols)
    return int(sparse_rows[0]) if sparse_rows.size else 0


def get_staff_row_position(img):
    """Find the first row of `img` that contains a zero-valued pixel.

    Args:
        img: 2-D array.

    Returns:
        The row index, or -1 when no pixel equals 0.
    """
    for row_idx in range(img.shape[0]):
        if (img[row_idx] == 0).any():
            return row_idx
    return -1


def coordinator(bin_img, horizontal):
    """Remove staff lines from one region and estimate where the staff rows are.

    Args:
        bin_img: 2-D binary image of a single region.
        horizontal: result of `IsHorizontal(bin_img)`; selects the
            projection-based removal (True) or the run-length one (False).

    Returns:
        (spacing, staff_row_positions, no_staff_img): the estimated line
        spacing, the average pixel row of each entry produced by `get_rows`,
        and the staff-removed image.
    """
    rle, vals = hv_rle(bin_img)
    most_common = get_most_common(rle)
    thickness, spacing = calculate_thickness_spacing(rle, most_common)
    start = 0
    if horizontal:
        no_staff_img = remove_staff_lines_2(thickness, bin_img)
        # The staff-line image was computed here but never used in this
        # branch, so the extra gaussian/otsu pass has been dropped.
        start = horizontal_projection(bin_img)
    else:
        no_staff_img = remove_staff_lines(rle, vals, thickness, bin_img.shape)
        no_staff_img = binary_closing(
            no_staff_img, np.ones((thickness+2, thickness+2)))
        no_staff_img = median(no_staff_img)
        no_staff_img = binary_opening(
            no_staff_img, np.ones((thickness+2, thickness+2)))
        staff_lines = otsu(bin_img - no_staff_img)
        staff_lines = binary_erosion(
            staff_lines, np.ones((thickness+2, thickness+2)))
        # Pass the neighbourhood positionally: the keyword was renamed from
        # `selem` to `footprint` in scikit-image, so the positional form works
        # with both old and new releases.
        staff_lines = median(staff_lines, square(21))
        start = get_staff_row_position(staff_lines)
    staff_row_positions = get_rows(
        start, most_common, thickness, spacing)
    staff_row_positions = [np.average(x) for x in staff_row_positions]
    return spacing, staff_row_positions, no_staff_img

In [7]:
def rle_encode(arr):
    """Run-length encode a 1-D sequence.

    Args:
        arr: 1-D array-like.

    Returns:
        (rle, values): the length of each run and the value of each run, in
        order. Empty input yields two empty lists, so callers can always
        unpack two results (the previous empty-input branch returned three).
    """
    if len(arr) == 0:
        return [], []

    x = np.copy(arr)
    # Last index of every run: wherever the next element differs, plus the end.
    first_dismatch = np.array(x[1:] != x[:-1])
    distmatch_positions = np.append(np.where(first_dismatch), len(x)-1)
    rle = np.diff(np.append(-1, distmatch_positions))
    # The value of a run is the element at its first index.
    values = [x[i] for i in np.cumsum(np.append(0, rle))[:-1]]
    return rle, values

In [8]:
def hv_rle(img, axis=1):
    '''
    Run-length encode every column or every row of an image.

    img: binary image
    axis: 0 for rows, 1 for cols
    return: (rle, values), one entry per encoded column/row
    '''
    if axis == 1:
        lanes = (img[:, col] for col in range(img.shape[1]))
    else:
        lanes = (img[row] for row in range(img.shape[0]))

    rle, values = [], []
    for lane in lanes:
        lane_rle, lane_values = rle_encode(lane)
        rle.append(lane_rle)
        values.append(lane_values)

    return rle, values

In [9]:
def rle_decode(starts, lengths, values):
    """Expand runs back into a flat float array.

    Args:
        starts: start index of each run.
        lengths: length of each run.
        values: value of each run.

    Returns:
        1-D float array whose length is the end of the last run; positions not
        covered by any run stay NaN.
    """
    starts = np.asarray(starts)
    lengths = np.asarray(lengths)
    values = np.asarray(values)

    stops = starts + lengths
    decoded = np.full(stops[-1], np.nan)
    for begin, stop, run_value in zip(starts, stops, values):
        decoded[begin:stop] = run_value
    return decoded

In [10]:
def hv_decode(rle, values, output_shape, axis=1):
    """Rebuild an image from per-column or per-row run-length encodings.

    Args:
        rle: per-lane run lengths.
        values: per-lane run values, parallel to `rle`.
        output_shape: shape of the returned image.
        axis: 1 to treat each entry as a column, anything else as a row.

    Returns:
        int32 array of shape `output_shape`.
    """
    # Start of each run = exclusive prefix sum of the run lengths. One cumsum
    # per lane replaces re-summing a growing slice for every run (quadratic).
    starts = [(np.cumsum(runs) - np.asarray(runs)).astype(int) for runs in rle]

    decoded = np.zeros(output_shape, dtype=np.int32)
    if axis == 1:
        for i in range(decoded.shape[1]):
            decoded[:, i] = rle_decode(starts[i], rle[i], values[i])
    else:
        for i in range(decoded.shape[0]):
            decoded[i] = rle_decode(starts[i], rle[i], values[i])

    return decoded

In [11]:
def calculate_pair_sum(arr):
    """Sum neighbouring run lengths two at a time.

    Args:
        arr: sequence of run lengths.

    Returns:
        A list with arr[0]+arr[1], arr[2]+arr[3], ...; for odd lengths the sum
        of the last two elements is appended. A single-element input is
        returned as a list unchanged.
    """
    if len(arr) == 1:
        return list(arr)

    sums = [left + right for left, right in zip(arr[::2], arr[1::2])]
    if len(arr) % 2 == 1:
        sums.append(arr[-2] + arr[-1])
    return sums


def get_most_common(rle):
    """Return the most frequent pair sum over all encoded columns.

    Args:
        rle: iterable of per-column run-length sequences.

    Returns:
        The value with the highest count among all `calculate_pair_sum` results.
    """
    all_sums = [
        pair_total
        for column in rle
        for pair_total in calculate_pair_sum(column)
    ]
    return np.argmax(np.bincount(all_sums))


def most_common_bw_pattern(arr, most_common):
    """Collect the run pairs of one column that add up to `most_common`.

    Args:
        arr: run lengths of one column.
        most_common: required sum of a pair.

    Returns:
        List of (first, second) tuples; empty for a single-run column.
    """
    if len(arr) == 1:
        return []

    matches = [
        (first, second)
        for first, second in zip(arr[::2], arr[1::2])
        if first + second == most_common
    ]
    # For odd lengths the last two runs form one more candidate pair.
    if len(arr) % 2 == 1 and arr[-2] + arr[-1] == most_common:
        matches.append((arr[-2], arr[-1]))
    return matches

In [12]:
def gray_img(img):
    '''
    Convert an image to 8-bit grayscale.

    img: rgb or rgba image, or an already single-channel (2-D) image
    return: gray image, pixel values 0:255
    '''
    img = np.asarray(img)
    if img.ndim == 2:
        # Already single-channel: there is no channel axis to inspect or
        # convert (the previous code raised IndexError on img.shape[2]).
        if np.issubdtype(img.dtype, np.floating):
            # assumes float images are scaled to 0..1, like rgb2gray output
            return (img * 255).astype(np.uint8)
        return img.astype(np.uint8)
    if img.shape[2] == 4:  # the image has an alpha channel
        img = img[:, :, :3]  # drop the alpha channel
    gray = rgb2gray(img)
    gray = (gray * 255).astype(np.uint8)  # convert to 8-bit gray
    return gray

In [13]:
class Box(object):
    """Axis-aligned rectangle given by its top-left corner, width and height."""

    def __init__(self, x, y, w, h):
        """Store the geometry and precompute `center` and `area`."""
        self.x = x
        self.y = y
        self.w = w
        self.h = h
        self.center = x + w/2, self.y+self.h/2
        self.area = w*h

    def overlap(self, other):
        """Return the intersection area with `other` as a fraction of this box's area."""
        x = max(0, min(self.x+self.w, other.x+other.w) - max(other.x, self.x))
        y = max(0, min(self.y+self.h, other.y+other.h) - max(other.y, self.y))
        area = x*y
        return area/self.area

    def distance(self, other):
        """Return the Euclidean distance between the two box centers."""
        # Local import: `math` is not imported at the top of the notebook.
        import math
        return math.hypot(self.center[0]-other.center[0],
                          self.center[1]-other.center[1])

    def merge(self, other):
        """Return the smallest Box that contains both boxes."""
        x = min(self.x, other.x)
        # The top edge is the smaller y, mirroring x. Using max() here shifted
        # the merged box down and shrank its height.
        y = min(self.y, other.y)
        w = max(self.x+self.w, other.x+other.w) - x
        h = max(self.y+self.h, other.y+other.h) - y
        return Box(x, y, w, h)

    def draw(self, img, color, thickness):
        """Draw the rectangle on `img` with cv2.rectangle."""
        pos = ((int)(self.x), (int)(self.y))
        size = ((int)(self.x + self.w), (int)(self.y + self.h))
        cv2.rectangle(img, pos, size, color, thickness)

In [14]:
def read_all_images(num_of_images, images_dir='C:/Users/User/Desktop/Diplomna/notes'):
    """Read and binarize the numbered sheet images.

    Args:
        num_of_images: how many images to read; files are named
            'music sheet (1).png' .. 'music sheet (N).png'.
        images_dir: directory holding the images. The default keeps the
            original hard-coded location; pass another path to run elsewhere.

    Returns:
        List of binary images thresholded at each image's Otsu threshold.
    """
    all_images_list = []
    for i in range(num_of_images):
        path = f'{images_dir}/music sheet ({i+1}).png'
        img = gray_img(io.imread(path))
        all_images_list.append(get_thresholded(img, threshold_otsu(img)))
    return all_images_list


In [15]:
def predict(img):
    """Classify one primitive image with the pickled NN-HOG model.

    The model is looked up first at 'nn_trained_model_hog.sav' (the original
    location) and then at 'trained_models/nn_trained_model_hog.sav', which is
    where `train` writes it. Previously a missing model triggered training and
    then failed to load, because the two paths did not match.

    Args:
        img: image passed to `extract_features(img, 'hog')`.

    Returns:
        The label array returned by `model.predict` for this single sample.
    """
    legacy_path = 'nn_trained_model_hog.sav'
    trained_path = 'trained_models/nn_trained_model_hog.sav'

    if os.path.exists(legacy_path):
        model_path = legacy_path
    else:
        if not os.path.exists(trained_path):
            print('Please wait while training the NN-HOG model....')
            train('NN', 'hog', 'nn_trained_model_hog')
        model_path = trained_path

    # NOTE: pickle can execute arbitrary code; only load model files you created.
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)
    features = extract_features(img, 'hog')
    labels = model.predict([features])

    return labels

In [16]:
class Segmenter(object):
    """Split a binary sheet image into horizontal bands.

    After construction, `regions_with_staff` holds slices of the input image
    and `regions_without_staff` holds the matching slices of the staff-removed
    image after a morphological opening.
    """

    def __init__(self, bin_img):
        """Measure staff geometry on `bin_img`, remove staff lines and run `segment()`."""
        self.bin_img = bin_img
        # hv_rle defaults to axis=1, i.e. one run-length sequence per column.
        self.rle, self.vals = hv_rle(self.bin_img)
        self.most_common = get_most_common(self.rle)
        self.thickness, self.spacing = calculate_thickness_spacing(
            self.rle, self.most_common)
        self.thick_space = self.thickness + self.spacing
        self.no_staff_img = remove_staff_lines(
            self.rle, self.vals, self.thickness, self.bin_img.shape)

        self.segment()

    def open_region(self, region):
        """Apply `opening` to `region` with a thickness x thickness block of ones."""
        thickness = np.copy(self.thickness)
        # if thickness % 2 == 0:
        #     thickness += 1
        return opening(region, np.ones((thickness, thickness)))

    def segment(self):
        """Populate `regions_with_staff` and `regions_without_staff`."""
        # Row indices at which the thresholded row histogram turns positive.
        self.line_indices = get_line_indices(histogram(self.bin_img, 0.8))
        # With fewer than 10 detected line starts the whole image is kept as a
        # single region.
        if len(self.line_indices) < 10:
            self.regions_without_staff = [
                np.copy(self.open_region(self.no_staff_img))]
            self.regions_with_staff = [np.copy(self.bin_img)]
            return

        generated_lines_img = np.copy(self.no_staff_img)
        # One full-width horizontal segment ((x0, y), (x1, y)) per detected row.
        lines = []
        for index in self.line_indices:
            line = ((0, index), (self.bin_img.shape[1]-1, index))
            lines.append(line)

        # Keep the first line, then only lines that lie at least 4 * spacing
        # below the previously kept one.
        end_of_staff = []
        for index, line in enumerate(lines):
            if index > 0 and (line[0][1] - end_of_staff[-1][1] < 4*self.spacing):
                pass
            else:
                p1, p2 = line
                x0, y0 = p1
                x1, y1 = p2
                end_of_staff.append((x0, y0, x1, y1))

        box_centers = []
        spacing_between_staff_blocks = []
        for i in range(len(end_of_staff)-1):
            # Vertical distance between consecutive kept lines.
            spacing_between_staff_blocks.append(
                end_of_staff[i+1][1] - end_of_staff[i][1])
            # For every even i, record the midpoint between kept lines i and
            # i+1 together with half their distance.
            if i % 2 == 0:
                offset = (end_of_staff[i+1][1] - end_of_staff[i][1])//2
                center = end_of_staff[i][1] + offset
                box_centers.append((center, offset))

        max_staff_dist = np.max(spacing_between_staff_blocks)
        max_margin = max_staff_dist // 2
        margin = max_staff_dist // 10

        end_points = []
        regions_without_staff = []
        regions_with_staff = []
        for index, (center, offset) in enumerate(box_centers):
            # Vertical extent of the band around `center`.
            y0 = int(center) - max_margin - offset + margin
            y1 = int(center) + max_margin + offset - margin
            end_points.append((y0, y1))

            region = self.bin_img[y0:y1, 0:self.bin_img.shape[1]]
            regions_with_staff.append(region)
            staff_block = self.no_staff_img[y0:y1,
                                            0:self.no_staff_img.shape[1]]

            regions_without_staff.append(self.open_region(staff_block))

        self.regions_without_staff = regions_without_staff
        self.regions_with_staff = regions_with_staff

In [17]:
def segmenting(img):
    """Segment a gray image and analyse each resulting region.

    Args:
        img: gray image; it is binarized at its Otsu threshold first.

    Returns:
        (segmenter, imgs_spacing, imgs_rows, coord_imgs, imgs_with_staff):
        the Segmenter, followed by per-region spacing, staff row positions and
        staff-removed images from `coordinator`, and the regions with staff.
    """
    binary = get_thresholded(img, threshold_otsu(img))
    segmenter = Segmenter(binary)
    imgs_with_staff = segmenter.regions_with_staff

    analysed = [coordinator(region, IsHorizontal(region))
                for region in imgs_with_staff]
    imgs_spacing = [spacing for spacing, _, _ in analysed]
    imgs_rows = [rows for _, rows, _ in analysed]
    coord_imgs = [no_staff for _, _, no_staff in analysed]

    return segmenter, imgs_spacing, imgs_rows, coord_imgs, imgs_with_staff

In [18]:
def estim(c, idx, imgs_spacing, imgs_rows):
    """Match a vertical coordinate against the staff rows of one region.

    Args:
        c: vertical coordinate to classify.
        idx: index of the region.
        imgs_spacing: per-region line spacing.
        imgs_rows: per-region list of row positions.

    Returns:
        (position, 0) when `c` lies within the margin around a row,
        (position, 1) when it lies in the band directly below that row, with
        `position` counted from 1; (0, 0) when nothing matches.
    """
    margin = 1 + (imgs_spacing[idx] / 4)
    for position, line in enumerate(imgs_rows[idx], start=1):
        if line - margin <= c <= line + margin:
            return position, 0
        if line + margin <= c <= line + 3*margin:
            return position, 1
    return 0, 0

In [19]:
def filter_beams(prims, prim_with_staff, bounds):
    """Drop primitives that are at least twice as wide as they are tall.

    Args:
        prims: list of 2-D arrays.
        prim_with_staff: items parallel to `prims`.
        bounds: bounding boxes parallel to `prims`.

    Returns:
        (prims, prim_with_staff, bounds) restricted to the kept entries.
    """
    kept = [pos for pos, prim in enumerate(prims)
            if prim.shape[1] < 2*prim.shape[0]]

    n_prims = [prims[pos] for pos in kept]
    n_prim_with_staff = [prim_with_staff[pos] for pos in kept]
    n_bounds = [bounds[pos] for pos in kept]
    return n_prims, n_prim_with_staff, n_bounds

In [20]:
def show_images(images, titles=None):
    """Show the images side by side in one matplotlib figure.

    Args:
        images: list of image arrays.
        titles: optional list of titles; defaults to '(1)', '(2)', ...
    """
    n_ims = len(images)
    if titles is None:
        titles = ['(%d)' % i for i in range(1, n_ims + 1)]

    fig = plt.figure()
    for slot, (image, title) in enumerate(zip(images, titles), start=1):
        axis = fig.add_subplot(1, n_ims, slot)
        if image.ndim == 2:
            plt.gray()
        plt.imshow(image)
        axis.set_title(title)
        plt.axis('off')

    # Scale the figure with the number of images shown.
    fig.set_size_inches(np.array(fig.get_size_inches()) * n_ims)
    plt.show()

In [21]:
def showHist(img):
    """Plot the 256-bin intensity histogram of `img` as a bar chart.

    Args:
        img: image array.
    """
    # The notebook defines its own histogram(img, thresh), which has no `nbins`
    # argument, and never defines `bar`. Use scikit-image's intensity histogram
    # under a local alias and matplotlib's bar explicitly.
    from skimage.exposure import histogram as intensity_histogram

    plt.figure()
    imgHist = intensity_histogram(img, nbins=256)

    plt.bar(imgHist[1].astype(np.uint8), imgHist[0], width=0.8, align='center')

In [22]:
def otsu(img):
    '''
    Otsu thresholding applied after gaussian smoothing.

    img: gray image
    return: binary image, pixel values 0:1 (int32)
    '''
    smoothed = gaussian(img)
    cutoff = threshold_otsu(smoothed)
    return (smoothed > cutoff).astype(np.int32)

In [23]:
# Size passed to cv2.resize by the feature extractors below.
target_img_size = (100, 100)
# Upper bound on the number of images sampled per directory in load_dataset.
sample_count = 50


def extract_raw_pixels(img):
    """Resize `img` to `target_img_size` and return its pixels as a flat array."""
    return cv2.resize(img, target_img_size).flatten()


def extract_hsv_histogram(img):
    """Return a flattened, normalized 8x8x8 HSV colour histogram of `img`.

    Args:
        img: BGR image; it is resized to `target_img_size` first.

    Returns:
        1-D array with the normalized histogram values.
    """
    resized = cv2.resize(img, target_img_size)
    hsv = cv2.cvtColor(resized, cv2.COLOR_BGR2HSV)
    hist = cv2.calcHist([hsv], [0, 1, 2], None, [8, 8, 8],
                        [0, 180, 0, 256, 0, 256])
    # `imutils` is never imported in this notebook; check the OpenCV major
    # version directly instead of calling imutils.is_cv2().
    if cv2.__version__.startswith('2.'):
        hist = cv2.normalize(hist)
    else:
        cv2.normalize(hist, hist)
    return hist.flatten()


def extract_hog_features(img):
    """Compute a flat HOG descriptor for `img`.

    The image is resized to `target_img_size`; the descriptor uses a 100x100
    window, 4x4 cells, 2x2 cells per block, a one-cell block stride and 9
    orientation bins.
    """
    resized = cv2.resize(img, target_img_size)

    cell = (4, 4)
    cells_per_block = (2, 2)
    block = (cells_per_block[1] * cell[1], cells_per_block[0] * cell[0])
    stride = (cell[1], cell[0])
    orientation_bins = 9

    descriptor = cv2.HOGDescriptor((100, 100), block, stride, cell,
                                   orientation_bins)
    return descriptor.compute(resized).flatten()


def extract_features(img, feature_set='raw'):
    """Dispatch to the feature extractor named by `feature_set`.

    'hog' selects HOG features and 'raw' the flattened pixels; any other value
    selects the HSV histogram.
    """
    if feature_set == 'hog':
        extractor = extract_hog_features
    elif feature_set == 'raw':
        extractor = extract_raw_pixels
    else:
        extractor = extract_hsv_histogram
    return extractor(img)


def load_dataset(feature_set='raw', dir_names=()):
    """Load up to `sample_count` PNG images per directory and extract features.

    Args:
        feature_set: forwarded to `extract_features`.
        dir_names: sub-directory names under `dataset_path`; each name is also
            used as the label of its images. The default is an immutable empty
            tuple rather than a shared mutable list.

    Returns:
        (features, labels): parallel lists.
    """
    # Local imports: neither name is imported at the top of the notebook.
    # NOTE(review): `dataset_path` is not defined in the visible cells either;
    # confirm where it comes from.
    import random
    from glob import glob

    features = []
    labels = []
    count = 0
    for dir_name in dir_names:
        print(dir_name)
        imgs = glob(f'{dataset_path}/{dir_name}/*.png')
        count += len(imgs)
        # Sampling the paths directly picks the same elements as sampling
        # their indices.
        for img_path in random.sample(imgs, min(len(imgs), sample_count)):
            img = cv2.imread(img_path)
            labels.append(dir_name)
            features.append(extract_features(img, feature_set))
    print(f'Total: {len(dir_names)} directories, and {count} images')
    return features, labels


def load_classifiers():
    """Seed the random generators and build the available classifiers.

    Returns:
        (classifiers, random_seed): a dict with the keys 'SVM', 'KNN' and 'NN',
        and the seed that was applied to `random` and `numpy.random`.
    """
    # Local imports: none of these names are imported at the top of the notebook.
    import random
    from sklearn import svm
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.neural_network import MLPClassifier

    random_seed = 42
    random.seed(random_seed)
    np.random.seed(random_seed)

    classifiers = {
        'SVM': svm.LinearSVC(random_state=random_seed),
        'KNN': KNeighborsClassifier(n_neighbors=7),
        'NN': MLPClassifier(activation='relu', hidden_layer_sizes=(200,),
                            max_iter=10000, alpha=1e-4,
                            solver='adam', verbose=20,
                            tol=1e-8, random_state=1,
                            learning_rate_init=.0001,
                            learning_rate='adaptive')
    }
    return classifiers, random_seed


def run_experiment(classifier='SVM', feature_set='hog', dir_names=[]):
    """Train one classifier on the dataset and report its held-out accuracy.

    Args:
        classifier: key into the dict returned by `load_classifiers`.
        feature_set: forwarded to `load_dataset`.
        dir_names: dataset directories, forwarded to `load_dataset`.

    Returns:
        (model, accuracy): the fitted model and its score on the 20% test split.
    """
    print('Loading dataset. This will take time ...')
    features, labels = load_dataset(feature_set, dir_names)
    print('Finished loading dataset.')

    classifiers, random_seed = load_classifiers()
    model = classifiers[classifier]

    split = train_test_split(
        features, labels, test_size=0.2, random_state=random_seed)
    train_features, test_features, train_labels, test_labels = split

    print('############## Training', classifier, "##############")
    model.fit(train_features, train_labels)
    accuracy = model.score(test_features, test_labels)
    print(classifier, 'accuracy:', accuracy*100, '%')

    return model, accuracy


def train(model_name, feature_name, saved_model_name):
    """Train a classifier on every dataset directory and pickle the model.

    Args:
        model_name: classifier key for `run_experiment`.
        feature_name: feature set for `run_experiment`.
        saved_model_name: file stem; the model is written to
            'trained_models/<saved_model_name>.sav'.
    """
    # Local import: `glob` is not imported at the top of the notebook.
    from glob import glob

    # Use the last path component as the directory name. The previous
    # split('/')[2] only worked for one specific depth of `dataset_path` and
    # for forward-slash separators.
    dir_names = [os.path.basename(os.path.normpath(path))
                 for path in glob(f'{dataset_path}/*')]

    model, accuracy = run_experiment(model_name, feature_name, dir_names)

    # Create the output directory if needed and close the file deterministically.
    os.makedirs('trained_models', exist_ok=True)
    filename = f'trained_models/{saved_model_name}.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

In [24]:
def get_gray(img):
    """Return the `rgb2gray` conversion of a copy of `img`."""
    return rgb2gray(np.copy(img))

In [25]:
def get_thresholded(img, thresh):
    """Binarize `img`: 1 where a pixel is above `thresh`, 0 elsewhere."""
    above = img > thresh
    return above * 1

In [26]:
def histogram(img, thresh):
    """Row-wise sum of (1 - img), with weak rows zeroed.

    Args:
        img: 2-D image; for a 0/1 image each sum is the row's count of zeros.
        thresh: rows whose sum is below `thresh` times the maximum are set to 0.

    Returns:
        1-D int32 array with one value per row.
    """
    inverted = np.ones(img.shape) - img
    hist = inverted.sum(axis=1, dtype=np.int32)
    peak = np.amax(hist)
    hist[hist < peak * thresh] = 0
    return hist

In [27]:
def get_line_indices(hist):
    """Return the indices at which `hist` switches from non-positive to positive.

    The value before the first element is treated as 0.
    """
    rising = []
    last = 0
    for pos, current in enumerate(hist):
        if current > 0 >= last:
            rising.append(pos)
        last = current
    return rising

In [28]:
def get_region_lines_indices(self, region):
    """Append the average row of every detected line in `region` to `self.rows`.

    Each line starts at an index reported by `get_line_indices` and spans
    `self.thickness` consecutive rows.
    """
    line_starts = get_line_indices(histogram(region, 0.8))
    lines = [
        [line_start + k for k in range(self.thickness)]
        for line_start in line_starts
    ]
    self.rows.append([np.average(line) for line in lines])

In [29]:
def get_connected_components(img_without_staff, img_with_staff):
    """Crop every sufficiently large connected component, ordered left to right.

    Args:
        img_without_staff: binary image; components are labelled on its inverse.
        img_with_staff: image of the same size, cropped with the same boxes.

    Returns:
        (components, comp_with_staff, boundary): crops of both images and the
        bounding boxes (min_row, min_col, max_row, max_col) of all regions with
        an area of at least 100, sorted by min_col.
    """
    # Imported under a local alias because later notebook cells rebind the
    # global name `label` to a string, which would break a call to `label(bw)`.
    from skimage.measure import label as label_regions

    bw = 1-img_without_staff
    label_img = label_regions(bw)
    # The label2rgb overlay that used to be built here was never used, so it
    # has been removed.
    boundary = [region.bbox for region in regionprops(label_img)
                if region.area >= 100]
    boundary = sorted(boundary, key=lambda b: b[1])

    components = []
    comp_with_staff = []
    for bbox in boundary:
        minr, minc, maxr, maxc = bbox
        components.append(img_without_staff[minr:maxr, minc:maxc])
        comp_with_staff.append(img_with_staff[minr:maxr, minc:maxc])
    return components, comp_with_staff, boundary

In [30]:
def get_labeled_data(img):
    """Produce (note label, image crop) pairs for one sheet image.

    Every connected component of every region is classified with `predict`.
    Depending on the predicted class, its note heads are located and mapped to
    a note name through `estim` and `label_map`.

    Returns:
        (labels_list, images_list): parallel lists of note names and the
        with-staff crops they were derived from.
    """
    labels_list, images_list = [], []
    segmenter, imgs_spacing, imgs_rows, coord_imgs, imgs_with_staff = segmenting(img)
    # Predicted classes that are handled by the dilation-based branch below.
    black_names = ['4', '8', '8_b_n', '8_b_r', '16', '16_b_n', '16_b_r', '32', '32_b_n', '32_b_r', 'a_4',
                   'a_8', 'a_16', 'a_32', 'chord']
    disk_size = segmenter.most_common / 4
    for i, img in enumerate(coord_imgs):
        primitives, prim_with_staff, boundary = get_connected_components(img, imgs_with_staff[i])
        for j, prim in enumerate(primitives):
            prim = binary_opening(prim, square(segmenter.most_common-imgs_spacing[i]))
            # predict() expects a uint8 image; the primitive is inverted and scaled to 0/255.
            label = predict((255*(1 - prim)).astype(np.uint8))[0]
            if label in black_names:
                test_img = binary_dilation(np.copy(prim_with_staff[j]), disk(disk_size))
                comps, comp_w_staff, bounds = get_connected_components(test_img, prim_with_staff[j])
                comps, comp_w_staff, bounds = filter_beams(comps, comp_w_staff, bounds)
                bounds = [np.array(bound)+disk_size-2 for bound in bounds]
                # Only label when at most one component remains, or the class is one of the *_b_* classes.
                if len(bounds) <= 1 or label in ['8_b_n', '8_b_r', '16_b_n', '16_b_r', '32_b_n', '32_b_r']:
                    for bbox in bounds:
                        # bbox[2] is the component's max row inside the primitive; boundary[j][0] is the
                        # primitive's min row inside the region, so the sum is a row in region coordinates.
                        line_idx, p = estim(int(bbox[2]+boundary[j][0]), i, imgs_spacing, imgs_rows)
                        labels_list.append(label_map[line_idx][p])
                        images_list.append(prim_with_staff[j])
            elif label in ['2', 'a_2']:
                head_img = binary_closing(1-binary_fill_holes(1-prim), disk(disk_size))
                comps, comp_w_staff, bounds = get_connected_components(head_img, prim_with_staff[j])
                for bbox in bounds:
                    line_idx, p = estim(int(bbox[2]+boundary[j][0]), i, imgs_spacing, imgs_rows)
                    labels_list.append(label_map[line_idx][p])
                    images_list.append(prim_with_staff[j])
            elif label in ['1', 'a_1']:
                # Here the max row of the whole primitive is used directly.
                line_idx, p = estim(int(boundary[j][2]), i, imgs_spacing, imgs_rows)
                labels_list.append(label_map[line_idx][p])
                images_list.append(prim_with_staff[j])
    return labels_list, images_list

In [31]:
# all_images = read_all_images(2500)
# all_images_labels, all_images_arr, all_images_ids =[], [], []
# for i, image in tqdm(enumerate(all_images)):
#     labels_list, images_list = get_labeled_data(image)
#     all_images_labels += labels_list
#     all_images_arr += images_list
#     all_images_ids += [i] * len(labels_list)

In [32]:
#view some labels
# print(all_images_labels[:20])

In [33]:
#view some images
# show_images(all_images_arr[:20])

In [34]:
#check that all lengths match
# len(all_images_labels) == len(all_images_arr) == len(all_images_ids)

In [35]:
#Store the data in a dataframe
# df = pd.DataFrame(data= {'ID':all_images_ids, 'img': all_images_arr, 'label': all_images_labels})
# df.head()

In [36]:
# df.shape

In [37]:
# df.to_hdf('all_images_data.h5', key='data', mode='w')


In [38]:
# Load the labelled note images; presumably the file written by the commented-out
# df.to_hdf('all_images_data.h5', ...) cell above.
df = pd.read_hdf('all_images_data.h5')

In [39]:
#creating a column for each unique label
unique_labels = df.label.unique()
# The loop variable is deliberately not called `label`: that name is the
# skimage.measure function imported at the top of the notebook and used by
# get_connected_components, and a module-level loop would rebind it.
for label_name in unique_labels:
    df[label_name] = 0
df.head()

Unnamed: 0,ID,img,label,c4,g4,a4,f4,e4,d4,a3,N0,b4,b3,e3,g3,f3,d3,c3
0,0,"[[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,...",c4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,"[[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",c4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,"[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,...",g4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,"[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,...",g4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,"[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",a4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [40]:
#filling each column with the corresponding label
# One vectorized assignment per label column. The previous df[label][i] = 1 was
# chained assignment, which writes to a temporary under pandas copy-on-write,
# and its loop variable rebound the imported `label` function.
for label_name in df.label.unique():
    df[label_name] = (df.label == label_name).astype(int)
df.head()

Unnamed: 0,ID,img,label,c4,g4,a4,f4,e4,d4,a3,N0,b4,b3,e3,g3,f3,d3,c3
0,0,"[[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,...",c4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,"[[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",c4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,"[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,...",g4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,"[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,...",g4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,"[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",a4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [41]:
def unify_shape(df):
    """Pad every image in df['img'] with ones to the largest height and width.

    Rows are appended at the bottom and columns on the right.

    Args:
        df: DataFrame with an 'img' column of 2-D arrays; the column is
            replaced in place.

    Returns:
        (df, rows, cols): the same DataFrame and the common image shape.
    """
    rows = max(image.shape[0] for image in df.img)
    cols = max(image.shape[1] for image in df.img)

    padded = []
    for image in df.img:
        add_rows = np.ones((rows - image.shape[0], image.shape[1]), dtype = int)
        taller = np.vstack((image, add_rows))

        add_cols = np.ones((rows, cols - taller.shape[1]), dtype = int)
        padded.append(np.hstack((taller, add_cols)))

    # Assign the whole column at once. The previous df['img'][i] = ... was
    # chained assignment and also required the index to be 0..n-1.
    df['img'] = pd.Series(padded, index=df.index, dtype=object)
    return df, rows, cols

In [42]:
# Pad all images to one common shape; `rows` and `cols` are read later by reshape_x.
df, rows, cols = unify_shape(df)

In [43]:
#checking shapes
for x in df.img[:5]:
    print(x.shape)

(169, 280)
(169, 280)
(169, 280)
(169, 280)
(169, 280)


In [44]:

#split the data based on the number of images -> each music sheet
# Sheets whose ID is below 20% of the number of distinct IDs form the test set.
test_id_cutoff = int(df.ID.nunique() * 0.2)
train_data = df[df.ID >= test_id_cutoff]
test_data = df[df.ID < test_id_cutoff]

In [45]:
# Everything except these columns is a one-hot label column.
non_target_columns = ['ID','img','label']

X_train = train_data['img']
y_train = np.asarray(train_data.drop(columns=non_target_columns))

X_test = test_data['img']
y_test = np.asarray(test_data.drop(columns=non_target_columns))

In [46]:
def reshape_x(X):
    """Stack the images in `X` into one array of shape (n, rows, cols, 1).

    `rows` and `cols` are the notebook-level values returned by unify_shape.
    """
    target_shape = (rows, cols, 1)
    stacked = [np.reshape(sample, target_shape) for sample in X]
    return np.asarray(stacked)

In [47]:
# Stack the training images into a single array with a trailing channel axis.
X_train = reshape_x(X_train)
X_train.shape

(19552, 169, 280, 1)

In [48]:
y_train.shape

(19552, 15)

In [49]:
# Stack the test images the same way as the training images.
X_test = reshape_x(X_test)
X_test.shape

(4345, 169, 280, 1)

In [50]:
y_test.shape

(4345, 15)

In [51]:
# NN = Sequential()
# NN.add(InputLayer(input_shape=X_train.shape[1:]))

# NN.add(Conv2D(filters=64, kernel_size=3, activation='relu'))
# NN.add(MaxPooling2D())
# NN.add(Conv2D(filters=128, kernel_size=3, activation='relu'))
# NN.add(MaxPooling2D())
# NN.add(Conv2D(filters=256, kernel_size=3, activation='relu'))
# NN.add(MaxPooling2D())
# NN.add(Conv2D(filters=512, kernel_size=3, activation='relu'))
# NN.add(MaxPooling2D())
# NN.add(Conv2D(filters=1024, kernel_size=3, activation='relu'))
# NN.add(MaxPooling2D())

# NN.add(GlobalAveragePooling2D())

# # NN.add(Dropout(0.5))

# NN.add(Dense(2048, activation='relu'))
# NN.add(Dense(df.label.nunique(), activation='softmax'))

# NN.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# NN.summary()

In [52]:
# NN.fit(X_train, y_train, epochs=30, verbose=1, validation_split=0.2);

In [53]:
# NN.save("trained_model.h5")

In [54]:
# Load the saved network from disk; the cells that build, fit and save it are
# commented out above.
NN = load_model("trained_model.h5")





In [55]:
#storing the predictions
pred = NN.predict(X_test)
# The highest-scoring output index selects the entry of `unique_labels`.
results = [unique_labels[np.argmax(scores)] for scores in pred]
print(results[:50])

['d4', 'd4', 'b3', 'b3', 'N0', 'N0', 'b3', 'b3', 'b3', 'b3', 'd4', 'd4', 'f4', 'f4', 'f4', 'd4', 'b3', 'b3', 'f4', 'b3', 'd4', 'd4', 'f4', 'f4', 'b3', 'b3', 'e4', 'b3', 'd4', 'e4', 'e4', 'a3', 'g4', 'N0', 'g4', 'g4', 'N0', 'g4', 'N0', 'e4', 'g4', 'f4', 'b3', 'f4', 'a3', 'N0', 'f4', 'N0', 'f4', 'g4']


In [56]:
# NN.evaluate returns a sequence; index 1 is the value reported as accuracy.
test_metrics = NN.evaluate(X_test, y_test, verbose=0)
test_accuracy_percent = round(test_metrics[1] * 100, 2)
print('The test accuracy is', test_accuracy_percent,'%')


The test accuracy is 71.32 %


In [57]:
#storing the list of predicted notes for each sheet (image)
IDs = list(test_data.ID)
# Group the predictions by the sheet ID of their row. `results` is in the same
# order as the rows of `test_data`, so pairing them needs no assumption that
# the IDs are contiguous or start at 0. The previous slicing by
# IDs.count(i + 1) misassigned notes when an ID was missing, and rescanned the
# whole list for every sheet.
results_dict = {}
for sheet_id, predicted_note in zip(IDs, results):
    results_dict.setdefault(sheet_id, []).append(predicted_note)

In [61]:
print(results_dict[2]) # predicted notes stored under key 2

['b4', 'b4', 'b4', 'b4', 'b4', 'a4', 'a4', 'a4', 'a4', 'a4', 'b4', 'b4', 'b4', 'b4']


In [62]:
def notes_to_music_with_instrument(notes_sequence, instrument_name):
    """Build a music21 stream that plays the given notes on one instrument.

    Args:
        notes_sequence: iterable of two-character labels such as 'a4'; the
            label 'N0' is the rest class and is rendered as a rest.
        instrument_name: name understood by music21's `instrument.fromString`.

    Returns:
        The stream, containing the instrument, a 120 bpm metronome mark and one
        note or rest per label.
    """
    # Imported locally under aliases: later notebook cells rebind the global
    # name `note` (to a list, then to strings), which breaks `note.Note` when
    # this function is called afterwards.
    from music21 import instrument as m21_instrument
    from music21 import note as m21_note
    from music21 import stream as m21_stream
    from music21 import tempo as m21_tempo

    # Create a new music stream
    music_stream = m21_stream.Stream()

    # Set the instrument
    instr = m21_instrument.fromString(instrument_name)
    music_stream.insert(0, instr)

    # Add the notes to the stream
    for note_name in notes_sequence:
        if note_name == 'N0':
            # 'N0' is not a pitch name, so it cannot be passed to Note.
            music_stream.append(m21_note.Rest())
        elif len(note_name) == 2:
            music_stream.append(m21_note.Note(note_name))

    # Set the tempo
    music_stream.insert(0, m21_tempo.MetronomeMark(number=120))

    return music_stream

# Convert the note sequence to music with the violin as the instrument
music_with_violin = notes_to_music_with_instrument(results_dict[2], 'Violin')

# Save the music in MIDI format
file_path = "music_with_violin.mid"
music_with_violin.write('midi', fp=file_path)


'music_with_violin.mid'

In [63]:
#getting the notes and frequencies from wikipedia
# A timeout keeps the cell from hanging indefinitely, and raise_for_status
# stops here on an HTTP error instead of parsing an error page below.
page = requests.get('https://en.wikipedia.org/wiki/Piano_key_frequencies', timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, "html.parser")

In [64]:
#initializing with a Quarter Note Rest, which has a 0 frequency
# NOTE(review): the list name `note` rebinds the `note` module imported from
# music21 at the top of the notebook; cells that use `note.Note` only work if
# they run before this one.
note = ['N0']
frequency = ['0']
for i, row in enumerate(soup.find('table', class_="wikitable sortable").find_all('tr')):
    if i < 2: #skipping the first two <tr> rows (headers)
        continue
    note.append(row.find_all('td')[3].text.strip().lower()) #the <td> at index 3 holds the note name
    frequency.append(row.find_all('td')[4].text.strip()) #the <td> at index 4 is stored as the frequency; NOTE(review): the values displayed below (e.g. 49 for a4) look like piano key numbers rather than Hz - confirm the column index

In [65]:
#Store the data in a dataframe
frequency_df = pd.DataFrame(data= {'note':note, 'frequency': frequency})
frequency_df.head()

Unnamed: 0,note,frequency
0,N0,0
1,b8,99
2,a♯8/b♭8,98
3,a8,97
4,g♯8/a♭8,96


In [66]:
#getting the list of labels
# One row per key of label_map, one column per inner key (0 and 1).
original_labels_df = pd.DataFrame.from_dict(label_map, orient='index').reset_index(drop=True)
# All of column 0, then column 1 without its first row: label_map[0] has no
# entry for key 1, so that cell is empty.
original_labels_list = list(original_labels_df[0]) + list(original_labels_df[1][1:])
print(original_labels_list)

['N0', 'b4', 'g4', 'e4', 'c4', 'a3', 'f3', 'd3', 'a4', 'f4', 'd4', 'b3', 'g3', 'e3', 'c3']


In [67]:
#collecting the index of the notes that are not in our labels list
# A comprehension keeps the loop variables local. The previous module-level
# `for i, note in ...` left `note` bound to the last table entry, overwriting
# the `note` list built earlier.
idx_to_drop = [
    row_idx
    for row_idx, note_text in enumerate(frequency_df.note)
    if note_text.split()[0] not in original_labels_list
]

In [68]:
#dropping the notes that are not in our labels list
frequency_df = frequency_df.drop(index=idx_to_drop).reset_index(drop=True)
frequency_df

Unnamed: 0,note,frequency
0,N0,0
1,b4,51
2,a4 a440,49
3,g4,47
4,f4,45
5,e4,44
6,d4,42
7,c4 middle c,40
8,b3,39
9,a3,37


In [69]:
#manually clean two rows
# Keep only the first token of every note cell ('a4 a440' -> 'a4',
# 'c4 middle c' -> 'c4'), the same token the filtering step matched on.
# The previous frequency_df.iloc[n].note = ... assigned to a row extracted with
# iloc, which does not write back under pandas copy-on-write, and relied on
# hard-coded row positions.
frequency_df['note'] = frequency_df['note'].str.split().str[0]
frequency_df

Unnamed: 0,note,frequency
0,N0,0
1,b4,51
2,a4,49
3,g4,47
4,f4,45
5,e4,44
6,d4,42
7,c4,40
8,b3,39
9,a3,37


In [70]:
#check that both lengths match
len(original_labels_list) == len(frequency_df)

True

In [71]:
#from the official documentation of SciPy
samplerate = 44100 # sample rate in Hz
amplitude = np.iinfo(np.int16).max # largest int16 value, used as the peak amplitude of the sine waves
t = np.linspace(0., 0.5, int(samplerate * 0.5)) # time axis of one note: 0.5 s sampled at `samplerate`

In [72]:
#producing the music of the first sheet
# Semitone offset of each natural note above C within an octave.
_SEMITONES_FROM_C = {'c': 0, 'd': 2, 'e': 4, 'f': 5, 'g': 7, 'a': 9, 'b': 11}


def note_to_hz(note_name):
    """Return the equal-temperament frequency in Hz of a label such as 'a4'.

    The rest label 'N0' maps to 0.0 Hz, which produces silence.
    """
    if note_name == 'N0':
        return 0.0
    semitone = _SEMITONES_FROM_C[note_name[0].lower()]
    octave = int(note_name[1:])
    midi_number = 12 * (octave + 1) + semitone
    # A4 (MIDI note 69) is 440 Hz; every semitone is a factor of 2 ** (1 / 12).
    return 440.0 * 2 ** ((midi_number - 69) / 12)


# The "frequency" column scraped into frequency_df holds piano key numbers
# (a4 -> 49, c4 -> 40 in the table shown above), not Hz, so the pitch is derived
# from the note name instead of looked up there.
waves = []
for note_name in results_dict[0]:
    freq = note_to_hz(note_name)
    wave = amplitude * np.sin(2 * np.pi * freq * t)
    waves.append(wave)
# np.concatenate raises on an empty list; fall back to an empty signal.
song_data = np.concatenate(waves) if waves else np.zeros(0)

In [73]:
#outputs a wav audio file of the song
write('song.wav', samplerate, song_data.astype(np.int16))