<a href="https://colab.research.google.com/github/AryanMethil/Brain_Tumor_Detection/blob/master/Melody_Generation_Preprocesing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import zipfile

# Unpack the dataset archive stored under the mounted Drive path into /content/.
# The context manager closes the archive even if extraction raises.
local_zip = '/content/drive/My Drive/deutschl.zip'
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
  zip_ref.extractall('/content/')

In [None]:
!pip install music21



In [None]:
import music21 as m21

In [None]:
# Directory that is walked for kern (.krn) files
KERN_DATASET_PATH = '/content/essen/europa/deutschl/erk'

# Durations, expressed in quarter lengths, that every note/rest of a song
# must have for the song to be kept
ACCEPTABLE_DURATIONS = [
    0.25,  # sixteenth note
    0.5,   # eighth note
    0.75,  # dotted eighth note
    1,     # quarter note
    1.5,   # dotted quarter note
    2,     # half note
    3,     # dotted half note
    4,     # whole note
]

# File that receives all encoded songs joined into one string
SINGLE_FILE_DATASET = 'file_dataset'

# Number of symbols per training window; also the number of "/" delimiters
# inserted between songs
SEQUENCE_LENGTH = 64

In [None]:
def load_songs_in_kern(dataset_path):
  """Parse every kern file found under ``dataset_path`` with music21.

  Args:
    dataset_path: Directory that is walked recursively.

  Returns:
    A list with one ``m21.converter.parse`` result per ``.krn`` file, in
    sorted traversal order (empty if no such file exists).
  """
  songs=[]

  for root,subdirs,files in os.walk(dataset_path):
    # os.walk lists entries in arbitrary order; sorting (in place for the
    # sub-directories, so the walk itself is ordered) makes the position of
    # each song in the returned list reproducible between runs
    subdirs.sort()
    for filename in sorted(files):
      # require the real ".krn" extension, not just a name ending in "krn"
      if filename.endswith('.krn'):
        songs.append(m21.converter.parse(os.path.join(root,filename)))
  return songs

In [None]:
def has_acceptable_durations(song,ACCEPTABLE_DURATIONS):
  """Tell whether every note and rest of ``song`` has an allowed duration.

  Args:
    song: Object exposing ``flat.notesAndRests`` (a music21 stream).
    ACCEPTABLE_DURATIONS: Container of allowed durations in quarter lengths.

  Returns:
    True if each event's ``duration.quarterLength`` is in
    ``ACCEPTABLE_DURATIONS`` (also True for a song without events),
    False as soon as one event is not.
  """
  return all(
      event.duration.quarterLength in ACCEPTABLE_DURATIONS
      for event in song.flat.notesAndRests
  )

In [None]:
def transpose(song):
  """Transpose a song to C major or A minor.

  The key is first looked up at a fixed position (element 4 of the first
  measure of the first part). If that lookup fails, does not yield a
  ``m21.key.Key``, or yields a key whose mode is neither major nor minor,
  the key is estimated with ``song.analyze('key')`` instead.

  Args:
    song: music21 score to transpose.

  Returns:
    The result of ``song.transpose(interval)``, where ``interval`` goes from
    the key's tonic to C (major keys) or A (minor keys).

  Raises:
    ValueError: If even the estimated key is neither major nor minor.
  """
  #get key from the song
  try:
    parts=song.getElementsByClass(m21.stream.Part)
    measures_part0=parts[0].getElementsByClass(m21.stream.Measure)
    key=measures_part0[0][4]
  except IndexError:
    # no part, no measure, or a first measure with fewer than 5 elements
    key=None

  #estimate key using music21 when no usable key was notated
  if not isinstance(key,m21.key.Key) or key.mode not in ('major','minor'):
    key=song.analyze('key')

  #pick the tonic to transpose to
  if key.mode=='major':
    target=m21.pitch.Pitch('C')
  elif key.mode=='minor':
    target=m21.pitch.Pitch('A')
  else:
    raise ValueError('cannot transpose a song in mode %r'%(key.mode,))

  #transpose song by the interval between its tonic and the target
  interval=m21.interval.Interval(key.tonic,target)
  return song.transpose(interval)

In [None]:
def encode_song(song,time_step=0.25):
  """Encode a song as a space-separated time-series string.

  Every event of ``song.flat.notesAndRests`` is expanded into
  ``int(quarterLength/time_step)`` slots. The first slot carries the symbol
  (the MIDI pitch number for a note, 'r' otherwise) and each remaining slot
  carries '_' to mark that the symbol is held.

  Example with the default step: a note with pitch 60 lasting one quarter
  length becomes "60 _ _ _" (four slots of 0.25 quarter lengths each).

  Args:
    song: Object exposing ``flat.notesAndRests`` (a music21 stream).
    time_step: Length of one slot in quarter lengths (default 0.25).

  Returns:
    The encoded song as a single string of symbols separated by spaces.
  """
  encoded_song=[]

  for event in song.flat.notesAndRests:

    if isinstance(event,m21.note.Note):
      symbol=event.pitch.midi #60
    else:
      # Rests, plus any event that is not a single note (e.g. a chord):
      # writing a rest keeps the time grid aligned instead of silently
      # repeating the previous event's symbol
      symbol='r'

    steps=int(event.duration.quarterLength/time_step)

    # first slot holds the symbol, the remaining ones mark it as held
    if steps>0:
      encoded_song.append(symbol)
      encoded_song.extend(['_']*(steps-1))

  return " ".join(map(str,encoded_song))

In [None]:
# Folder that receives one encoded text file per song.
# exist_ok=True lets this cell be re-run without raising FileExistsError.
os.makedirs('/content/dataset/',exist_ok=True)

In [None]:
def preprocess(dataset_path,save_dir='/content/dataset/'):
  """Load, filter, transpose and encode the songs, saving one file per song.

  Args:
    dataset_path: Directory handed to ``load_songs_in_kern``.
    save_dir: Directory receiving the encoded songs; created if missing.
      Each file is named after the song's index in the loaded list, so
      indices of filtered-out songs are simply absent.
  """
  #load the songs
  songs=load_songs_in_kern(dataset_path)

  # make sure the output folder exists before the first write
  os.makedirs(save_dir,exist_ok=True)

  for i,song in enumerate(songs):

    #filter out songs that have non-acceptable durations
    if not has_acceptable_durations(song,ACCEPTABLE_DURATIONS):
      continue

    #transpose songs to C major / A minor
    song=transpose(song)

    #encode songs with music time series representation
    encoded_song=encode_song(song)

    #save songs to text file
    save_path=os.path.join(save_dir,str(i))
    with open(save_path,'w') as f:
      f.write(encoded_song)

In [None]:
# Run the pipeline on the kern dataset: load, filter, transpose, encode and
# save every accepted song as a text file
preprocess(KERN_DATASET_PATH)

In [None]:
def load(file_path):
  """Return the entire content of the text file at ``file_path`` as a string."""
  with open(file_path,'r') as handle:
    return handle.read()

In [None]:
def create_single_file_dataset(dataset_path,single_file_dataset,sequence_length):
  """Join all encoded songs into one string and write it to a file.

  Each song is followed by a space and ``sequence_length`` "/" delimiter
  symbols; the trailing space of the final delimiter is dropped.

  Args:
    dataset_path: Directory walked recursively for encoded song files.
    single_file_dataset: Path of the file that receives the joined string.
    sequence_length: Number of "/" symbols inserted after every song.

  Returns:
    The joined string (empty if ``dataset_path`` contains no files).
  """
  new_song_delimiter="/ "*sequence_length

  # load encoded songs and add delimiters; pieces are collected in a list and
  # joined once instead of growing a string with +=
  pieces=[]
  for root,subdirs,files in os.walk(dataset_path):
    # os.walk lists entries in arbitrary order; sort so that the dataset is
    # assembled identically on every run
    subdirs.sort()
    for filename in sorted(files):
      song=load(os.path.join(root,filename))
      pieces.append(song+" "+new_song_delimiter)

  # drop the trailing space left by the last delimiter
  songs="".join(pieces)[:-1]

  # save string that contains all dataset
  with open(single_file_dataset,'w') as f:
    f.write(songs)

  return songs


In [None]:
# Merge every encoded song under /content/dataset/ into one string, written to
# SINGLE_FILE_DATASET and kept in `songs` for the vocabulary step
songs=create_single_file_dataset('/content/dataset/',SINGLE_FILE_DATASET,SEQUENCE_LENGTH)

In [None]:
import json
def create_mapping(songs,mapping_path='mapping.json'):
  """Build the symbol-to-integer vocabulary and save it as JSON.

  Args:
    songs: String of whitespace-separated symbols.
    mapping_path: File that receives the mapping (default 'mapping.json').
  """
  # identify the vocabulary; sorting gives every symbol the same id on every
  # run, whereas iterating a set of strings is not stable between runs
  vocabulary=sorted(set(songs.split()))

  # create mapping
  mapping={symbol:i for i,symbol in enumerate(vocabulary)}

  # save the vocabulary in json file
  with open(mapping_path,'w') as f:
    json.dump(mapping,f,indent=4)

In [None]:
# Derive the vocabulary from the merged dataset string and save it to mapping.json
create_mapping(songs)

In [None]:
def convert_songs_to_int(songs,mapping_path='mapping.json'):
  """Translate a string of symbols into their integer ids.

  Args:
    songs: String of whitespace-separated symbols.
    mapping_path: JSON file holding the symbol-to-integer mapping
      (default 'mapping.json').

  Returns:
    List with the integer id of every symbol, in order.

  Raises:
    KeyError: If a symbol is missing from the mapping.
  """
  # load mappings
  with open(mapping_path,'r') as f:
    mapping=json.load(f)

  # split into symbols and map each one to its id
  return [mapping[symbol] for symbol in songs.split()]

In [None]:
import tensorflow as tf
import numpy as np

def generate_training_sequences(sequence_length):
  """Build one-hot input windows and next-symbol targets from the dataset.

  The single-file dataset is read, mapped to integers, and cut into every
  window of ``sequence_length`` consecutive symbols; the symbol right after
  each window is its target.

  Args:
    sequence_length: Number of symbols per input window.

  Returns:
    Tuple ``(inputs, targets)``: ``inputs`` is the one-hot encoding of the
    windows (num_sequences x sequence_length x vocabulary_size) and
    ``targets`` is an array with the integer id following each window.
  """
  # read the merged dataset and turn its symbols into integer ids
  encoded=convert_songs_to_int(load(SINGLE_FILE_DATASET))

  # every start position that still leaves room for a target symbol
  window_starts=range(len(encoded)-sequence_length)
  windows=[encoded[start:start+sequence_length] for start in window_starts]
  next_symbols=[encoded[start+sequence_length] for start in window_starts]

  # one-hot encode the windows, e.g. with 3 classes
  # [[0,1,2],...] -> [[[1,0,0],[0,1,0],[0,0,1]],...]
  vocabulary_size=len(set(encoded))
  inputs=tf.keras.utils.to_categorical(windows,num_classes=vocabulary_size)
  targets=np.array(next_symbols)
  return inputs,targets

In [None]:
# Build the training data and report the shape of each array on its own line
inputs,targets=generate_training_sequences(SEQUENCE_LENGTH)
print(inputs.shape,targets.shape,sep='\n')

(362178, 64, 38)
(362178,)


In [None]:
from shutil import make_archive,rmtree

In [None]:
# Pack the encoded-song folder /content/dataset/ into a zip archive named "dataset"
make_archive('dataset','zip','/content/dataset/')

'/content/dataset.zip'