<a href="https://colab.research.google.com/github/Danpollak/Word2Note/blob/master/DALI_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook is designed to run in a local environment and requires a local copy of the DALI dataset.

In [None]:
!pip install dali-dataset --upgrade

In [None]:
import os
import DALI as dali_code
from math import log2, pow
import pickle as pk
import re
import copy

In [None]:
# Root folder of the local DALI dataset.
# Raw string: the backslashes of the Windows path are kept literally instead of
# being parsed as (invalid) escape sequences.
dali_data_path = r'D:\DALI\DALI_v1.0/'

In [None]:
# Load the dataset, takes time.
# Both `skip` and `keep` are empty lists, i.e. this is the "load everything" call
# rather than a sample load.
dali_data = dali_code.get_the_DALI_dataset(dali_data_path, skip=[], keep=[])

# Confirmation of data load
# Should print ['DALI_ID' 'NAME' 'YOUTUBE' 'WORKING'] if working
# os.path.join avoids the doubled separator produced by concatenating
# a path that already ends with '/' with '/info/...'.
dali_info = dali_code.get_info(os.path.join(dali_data_path, 'info', 'DALI_DATA_INFO.gz'))
# NOTE(review): assumes get_info returns an indexable table whose first row is the header - confirm
print(dali_info[0])


In [None]:
# Auxiliary functions to preprocess dataset

# Reference frequency of A4; C0 is derived from it as 4.75 octaves (57 semitones) lower.
A4 = 440
C0 = A4*pow(2, -4.75)
# Pitch-class names indexed by the number of semitones above C
name = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]


def pitch(freq, isFirstNote=False):
    """Convert a frequency to the name of the nearest note.

    Args:
        freq: frequency, in the same unit as ``A4``; must be positive.
        isFirstNote: when true, the marker ``'_S'`` is appended to the name.
            Defaults to False so the function can be called with the
            frequency alone.

    Returns:
        The pitch-class name followed by the octave number, e.g. ``'A4'``,
        or ``'A4_S'`` when ``isFirstNote`` is true.

    Raises:
        ValueError: if ``freq`` is zero or negative (``log2`` domain error).
    """
    # number of semitones above C0, rounded to the nearest semitone
    h = round(12*log2(freq/C0))
    octave, n = divmod(h, 12)
    suffix = '_S' if isFirstNote else ''
    return name[n] + str(octave) + suffix

def round_time_delta(delta):
  """Round a time difference to one decimal place.

  The value is scaled to tenths, rounded with the built-in ``round``
  (so exact halves go to the even neighbour) and scaled back.
  """
  tenths = round(delta * 10)
  return tenths / 10

In [None]:
### Create dataset to subwords-notes

def DALI_to_subwords_notes(dataset=None):
  """Build the subword/note dataset from DALI entries.

  Only entries whose ``info['metadata']['language']`` is ``'english'`` are
  used. Entries whose annotations are not of type ``'vertical'`` are
  converted with ``horizontal2vertical()``; entries that are still not
  ``'vertical'`` afterwards are skipped.

  Args:
    dataset: mapping of song id to DALI entry (an object exposing ``info``,
      ``annotations`` and ``horizontal2vertical()``). Defaults to the
      notebook-level ``dali_data``.

  Returns:
    A list with one item per lyric line, each of the form
    ``[line_in_notes, line_in_words]``. ``line_in_notes`` is a list of
    ``[note_text, pitch_name, duration]`` and ``line_in_words`` is the
    cleaned text of the line, every word followed by a single space.
    Later cells read the dataset from a notebook-level ``ds`` variable, so
    assign the result, e.g. ``ds = DALI_to_subwords_notes()``.
  """
  if dataset is None:
    dataset = dali_data
  # keeps only ASCII letters, underscores and apostrophes
  only_char_reg = re.compile("[^a-zA-Z_\']")
  failed_song = 0

  ds = []

  for song, entry in dataset.items():
    # get the data
    song_info = entry.info
    song_annotations = entry.annotations

    # remove non-english songs
    if song_info['metadata']['language'] != 'english':
      continue

    if song_annotations['type'] != 'vertical':
      try:
        entry.horizontal2vertical()
      except Exception as e:
        # best effort: report the problem, the type check below skips the song
        print('horizontal2vertical failed for %s: %s' % (song, e))
      # re-read in case the conversion replaced the annotations object
      song_annotations = entry.annotations
      if song_annotations['type'] != 'vertical':
        failed_song += 1  # counted once per skipped song
        continue

    # iterate over paragraphs
    for paragraph in song_annotations['annot']['hierarchical']:
      for line in paragraph['text']:
        line_in_notes = []
        line_in_words = ''
        for word in line['text']:
          for note in word['text']:
            # Fall back to the word itself when its children are not note dicts.
            # NOTE(review): if word['text'] is a plain string this runs once per
            # character and repeats the word each time - confirm against the data.
            if not isinstance(note, dict):
              note = word

            note_text = only_char_reg.sub('', note['text'])
            delta = round_time_delta(note['time'][1] - note['time'][0])
            # the first-note marker is not used for subwords, so pass False explicitly
            line_in_notes.append([note_text, pitch(note['freq'][0], False), delta])
            line_in_words += note_text
          line_in_words += ' '
        ds.append([line_in_notes, line_in_words])

  if failed_song:
    print('Skipped %d song(s) without vertical annotations' % failed_song)
  return ds

In [None]:
### Create dataset characters-notes

def DALI_to_characters_notes(dataset=None):
  """Build the character/note dataset from DALI entries.

  Only entries whose ``info['metadata']['language']`` is ``'english'`` are
  used. Entries whose annotations are not of type ``'vertical'`` are
  converted with ``horizontal2vertical()``; entries that are still not
  ``'vertical'`` afterwards are skipped.

  Args:
    dataset: mapping of song id to DALI entry (an object exposing ``info``,
      ``annotations`` and ``horizontal2vertical()``). Defaults to the
      notebook-level ``dali_data``.

  Returns:
    A list with one item per lyric line, each of the form
    ``[line_in_notes, line_in_words]``. ``line_in_notes`` holds one
    ``(character, pitch_name, note_text)`` tuple per character, where the
    first character of every note gets the ``'_S'`` pitch marker, plus
    ``(' ', 'BREAK', 'BREAK')`` after every word. ``line_in_words`` is the
    cleaned text of the line, every word followed by a single space.
    Later cells read the dataset from a notebook-level ``ds`` variable, so
    assign the result, e.g. ``ds = DALI_to_characters_notes()``.
  """
  if dataset is None:
    dataset = dali_data
  # keeps only ASCII letters, underscores and apostrophes
  only_char_reg = re.compile("[^a-zA-Z_\']")
  failed_song = 0

  # appended after every word to mark the space between words
  EMPTY_SPACE_NOTE = (' ', 'BREAK','BREAK')

  ds = []

  for song, entry in dataset.items():
    # get the data
    song_info = entry.info
    song_annotations = entry.annotations

    # remove non-english songs
    if song_info['metadata']['language'] != 'english':
      continue

    if song_annotations['type'] != 'vertical':
      try:
        entry.horizontal2vertical()
      except Exception as e:
        # best effort: report the problem, the type check below skips the song
        print('horizontal2vertical failed for %s: %s' % (song, e))
      # re-read in case the conversion replaced the annotations object
      song_annotations = entry.annotations
      if song_annotations['type'] != 'vertical':
        failed_song += 1  # counted once per skipped song
        continue

    # iterate over paragraphs
    for paragraph in song_annotations['annot']['hierarchical']:
      for line in paragraph['text']:
        line_in_notes = []
        line_in_words = ''
        for word in line['text']:
          for note in word['text']:
            # Fall back to the word itself when its children are not note dicts.
            # NOTE(review): if word['text'] is a plain string this runs once per
            # character and repeats the word each time - confirm against the data.
            if not isinstance(note, dict):
              note = word

            note_text = only_char_reg.sub('', note['text'])

            # NOTE(review): the third element is the text of the current note; the
            # output file name ("withword") suggests the complete word may have
            # been intended - confirm before changing.
            for index, character in enumerate(note_text):
              line_in_notes.append((character, pitch(note['freq'][0], index == 0), note_text))
            line_in_words += note_text
          line_in_notes.append(EMPTY_SPACE_NOTE)
          line_in_words += ' '
        ds.append([line_in_notes, line_in_words])

  if failed_song:
    print('Skipped %d song(s) without vertical annotations' % failed_song)
  return ds

In [None]:
# Serialize the dataset to a pickle file using the highest available protocol.
# NOTE(review): `ds` is not assigned at notebook scope in the visible cells; it has to
# be produced first (e.g. from DALI_to_characters_notes) before this cell can run.
output =  'D:/DALI/output/ds_characters_withword.pickle'
with open(output, 'wb') as sink:
    pk.dump(ds, sink, protocol=pk.HIGHEST_PROTOCOL)


In [None]:
# Split the dataset into train and test parts.
# The first `split_rate` fraction of `ds` (in its existing order, no shuffling)
# becomes the test set; everything after it is the train set.
split_rate = 0.2
split_point = round(split_rate * len(ds))
test, train = ds[:split_point], ds[split_point:]
DALI_dataset = dict(train=train, test=test)

In [None]:
# Serialize the train/test dictionary to its own pickle file,
# using the highest available protocol.
output =  'D:/DALI/output/ds_characters_withword_split.pickle'
with open(output, 'wb') as sink:
    pk.dump(DALI_dataset, sink, protocol=pk.HIGHEST_PROTOCOL)