In [88]:
import numpy as np
import matplotlib.pyplot as plt 
import pandas 
import mido
import os
import time
from tqdm import tqdm
from mido.midifiles.meta import KeySignatureError
from mido import MidiTrack, MetaMessage, Message, MidiFile

In [None]:
#Root folder of the MIDI dataset, resolved against the current working directory.
#CleaningData, RecreateDatabase and PreProcessing recompute this same path locally.
InputPath = os.path.realpath('clean_midi')
#Folder in which the cleaning helpers write their log files
LogFolder = os.path.realpath('LogFolder')
#Log file names; each one is joined with LogFolder by the helper that writes it
EmptyFolder = 'EmptyFolder.txt'
CorruptedSongs = 'CorruptedSongs.txt'
WrongTimeStamp = 'WrongTimeStamp.txt'

# Cleaning data

In [8]:
#Checks whether the folder at DirPath contains any regular file.
#If it does not, the folder name is written to the log file and the folder is deleted.
def Func_EmptyFolder(DirPath, dir):
   """Log and delete a folder that contains no files.

   Args:
      DirPath: Path of the folder to inspect.
      dir: Folder name appended (one per line) to the EmptyFolder log file.

   Only regular files are counted, so a folder holding nothing but
   sub-folders is still logged as empty; os.rmdir then refuses to remove it
   and the failure is printed instead of raised.
   """
   #scandir is used as a context manager so the directory handle is released
   #even though any() stops at the first file found
   with os.scandir(DirPath) as Entries:
      HasFiles = any(entry.is_file() for entry in Entries)
   if HasFiles:
      return

   #The log folder may not exist yet (e.g. on a fresh checkout)
   os.makedirs(LogFolder, exist_ok=True)
   LogFilePath = os.path.join(LogFolder, EmptyFolder)
   with open(LogFilePath, 'a') as f:
      f.write(f'{dir}\n')

   try:
      os.rmdir(DirPath)
   except OSError as rm_err:
      #Say which folder failed and why
      print(f"Failed to delete Folder {dir}: {rm_err}")



#Check if the file is corrupted (there are just a few)
def Func_CorruptedFile(FilePath, file, dir):
   """Load a MIDI file; log and delete it when it cannot be parsed.

   Args:
      FilePath: Full path of the file to load.
      file: File name, used in the log entry and in the failure message.
      dir: Name of the containing folder, used in the log entry.

   Returns:
      The loaded mido.MidiFile, or None when loading raised one of the
      handled errors. In that case '<dir>/<file>' is appended to the
      CorruptedSongs log and the file is removed from disk.
   """
   try:
      return mido.MidiFile(FilePath)

   except (OSError, ValueError, KeyError, KeySignatureError, EOFError):

      #The log folder may not exist yet (e.g. on a fresh checkout)
      os.makedirs(LogFolder, exist_ok=True)
      CorruptedFilePath = os.path.join(LogFolder, CorruptedSongs)
      with open(CorruptedFilePath, 'a') as f:
         f.write(f'{dir}/{file}\n')

      #Deletion happens after the log file is closed, as in Func_CheckTimeStamp
      try:
         os.remove(FilePath)
      except OSError as rm_err:
         print(f"Failed to delete {file}: {rm_err}")

      return None



#Check the time signature of the file, for now keeping only the ones in 4/4
def Func_CheckTimeStamp(FilePath, track, file, dir):
   """Log and delete a MIDI file whose track declares a non-4/4 time signature.

   Args:
      FilePath: Full path of the MIDI file on disk.
      track: Iterable of messages scanned for 'time_signature' messages.
      file: File name, used in the log entry and in the failure message.
      dir: Name of the containing folder, used in the log entry.

   A track without any time_signature message is kept. As soon as one
   time_signature message is not 4/4, '<dir>/<file>' is appended to the
   WrongTimeStamp log and the file is removed from disk.
   """
   #any() stops at the first offending message, like the previous break
   invalid = any(
      msg.type == 'time_signature' and (msg.numerator != 4 or msg.denominator != 4)
      for msg in track
   )
   if not invalid:
      return

   #The log folder may not exist yet (e.g. on a fresh checkout)
   os.makedirs(LogFolder, exist_ok=True)
   WrongTimeStampPath = os.path.join(LogFolder, WrongTimeStamp)
   with open(WrongTimeStampPath, 'a') as f:
      f.write(f'{dir}/{file}\n')

   try:
      os.remove(FilePath)
   except OSError as e:
      print(f"Failed to delete {file}: {e}")

In [None]:
def CleaningData():
   """Run the cleaning pass over every artist folder under 'clean_midi'.

   For each file of each folder:
      1. Func_CorruptedFile loads it (a None result means it was rejected).
      2. Func_CheckTimeStamp inspects the first track for the time signature.
   Once all files of a folder are processed, Func_EmptyFolder is called on it.

   The helpers delete rejected files and empty folders from disk, so this
   function modifies the dataset in place.
   """
   InputPath = os.path.realpath('clean_midi')

   for Artist in tqdm(os.listdir(InputPath)):

      DirPath = os.path.join(InputPath, Artist)

      #to avoid .DS_Store (or any other stray file) being treated as a folder
      if not os.path.isdir(DirPath):
         continue

      for FileName in os.listdir(DirPath):
         FilePath = os.path.join(DirPath, FileName)

         mid = Func_CorruptedFile(FilePath, FileName, Artist)
         if mid is None:
            continue

         #A file can load correctly and still hold no track at all;
         #there is nothing to inspect then, and tracks[0] would raise IndexError
         if not mid.tracks:
            continue

         #Check the time signature (found in the first track as convention)
         Func_CheckTimeStamp(FilePath, mid.tracks[0], FileName, Artist)

      Func_EmptyFolder(DirPath, Artist)

In [10]:
#Runs the whole cleaning pass. The helpers it calls use os.remove / os.rmdir,
#so rejected files and empty folders are deleted from clean_midi in place.
CleaningData()

  0%|          | 0/2135 [00:00<?, ?it/s]

100%|██████████| 2135/2135 [09:59<00:00,  3.56it/s]


# Preprocessing data:

In [114]:
#Transform a given track into a monophonic one
def ToMonphonic(track):
   """Reduce a MIDI track to a single voice, keeping the highest sounding note.

   Args:
      track: Iterable of mido messages whose `time` is a delta in ticks.

   Returns:
      A new MidiTrack containing the meta messages of `track` and the
      monophonic note_on / note_off messages, merged in chronological order
      with their delta times recomputed. If `track` contained an
      end_of_track message, a single one is emitted as the last message, at
      the original end time of the track.

   Notes:
      - Channel messages other than note_on / note_off are dropped.
      - Note messages are rebuilt with velocity 64 and without a channel
        argument.
      - Errors raised while rebuilding a meta message propagate to the
        caller (RecreateDatabase catches KeyError around this call).
   """
   absTime = 0
   Events, Metadata = [], []
   EndOfTrackSeen = False

   for msg in track:
      absTime += msg.time

      if msg.is_meta:
         if msg.type == 'end_of_track':
            #emitted once, after every other message (see the end of the function)
            EndOfTrackSeen = True
         else:
            #keep the meta message with its absolute time
            Metadata.append((absTime, msg))
      elif msg.type == 'note_on' and msg.velocity > 0:
                        #time, note, velocity and kind
         Events.append((absTime, msg.note, msg.velocity, 'on'))
      elif msg.type in ('note_off', 'note_on'):
         #a note_on reaching this branch has velocity 0, i.e. it is a note off
         Events.append((absTime, msg.note, 0, 'off'))

   #absolute time of the end of the original track
   TrackEnd = absTime

   #sort events by time, prioritizing the higher notes at equal time
   Events.sort(key = lambda x: (x[0], -x[1]))

   activeNote = None
   monoEvents = []

   #Walk the events keeping a single active note:
   #a higher note cuts the active one, a lower or equal one is ignored
   for tick, note, _, kind in Events:
      if kind == 'on':
         if activeNote is None or note > activeNote:
            if activeNote is not None:
               monoEvents.append(('off', activeNote, tick))
            activeNote = note
            monoEvents.append(('on', note, tick))
      elif kind == 'off' and note == activeNote:
         monoEvents.append(('off', note, tick))
         activeNote = None

   #Merge meta and note messages into ONE timeline before computing deltas.
   #Emitting all meta messages first and all notes afterwards would give the
   #first note a negative delta whenever a meta message occurs later than it.
   Timeline = [(tick, 'meta', msg) for tick, msg in Metadata]
   Timeline += [(tick, kind, note) for kind, note, tick in monoEvents]
   #stable sort: at equal ticks meta messages stay ahead of the notes
   Timeline.sort(key = lambda item: item[0])

   #Rebuild the monophonic track
   newTrack = MidiTrack()
   prevTime = 0

   for tick, kind, payload in Timeline:
      Delta = tick - prevTime

      if kind == 'meta':
         #copy every field of the meta message except the ones passed explicitly
         msgDict = payload.dict().copy()
         msgDict.pop('time', None)
         msgDict.pop('type', None)
         newTrack.append(MetaMessage(payload.type, time=Delta, **msgDict))
      elif kind == 'on':                                 #Flatten the velocity to 64 (can do better)
         newTrack.append(Message('note_on', note = payload, velocity = 64, time = Delta))
      else:
         newTrack.append(Message('note_off', note = payload, velocity = 64, time = Delta))

      prevTime = tick

   if EndOfTrackSeen:
      #the track keeps its original length
      newTrack.append(MetaMessage('end_of_track', time=TrackEnd - prevTime))

   return newTrack




#Recreate the whole database with monophonic information
def RecreateDatabase():
   """Write a monophonic copy of every file of 'clean_midi' into 'Mono_CleanMidi'.

   The folder layout is mirrored: '<artist>/<file>' of the input becomes
   '<artist>/<file>' of the output. Every track of a file goes through
   ToMonphonic; a track for which it raises KeyError is left out of the
   output file.

   A file that cannot be read or saved is reported and skipped, and no
   partially written output file is left behind for it.
   """
   InputPath = os.path.realpath('clean_midi')
   OutputPath = os.path.realpath('Mono_CleanMidi')
   SkippedTracks = 0

   for Artist in tqdm(os.listdir(InputPath)):
      DirPath = os.path.join(InputPath, Artist)

      if not os.path.isdir(DirPath):
         continue

      #In the output path, create the folder of the artist if it does not exist
      OutDir = os.path.join(OutputPath, Artist)
      os.makedirs(OutDir, exist_ok=True)

      for FileName in os.listdir(DirPath):
         FilePath = os.path.join(DirPath, FileName)
         OutFile = os.path.join(OutDir, FileName)

         #Same errors as the ones handled by Func_CorruptedFile: one unreadable
         #file must not abort the whole run
         try:
            mid = mido.MidiFile(FilePath)
         except (OSError, ValueError, KeyError, KeySignatureError, EOFError) as e:
            print(f"Skipped unreadable file {Artist}/{FileName}: {e}")
            continue

         #Instantiate the new monophonic midi file
         newMid = mido.MidiFile(ticks_per_beat=mid.ticks_per_beat)

         #loop over all the tracks of the original file
         for track in mid.tracks:
            try:
               newMid.tracks.append(ToMonphonic(track))
            except KeyError:
               #counted and reported once at the end instead of being silent
               SkippedTracks += 1

         try:
            newMid.save(OutFile)
         except (ValueError, KeyError) as e:
            print(f"Failed to save {Artist}/{FileName}: {e}")
            #the output file is opened before the error shows up, so a
            #truncated file may have been created: do not keep it
            try:
               os.remove(OutFile)
            except FileNotFoundError:
               pass
            except OSError as rm_err:
               print(f"Failed to delete {FileName}: {rm_err}")

   if SkippedTracks:
      print(f"{SkippedTracks} track(s) could not be converted and were left out")

In [115]:
#Builds the monophonic copy of the dataset: reads clean_midi, writes Mono_CleanMidi
RecreateDatabase()

100%|██████████| 2079/2079 [14:46<00:00,  2.34it/s] 


In [None]:
def PreProcessing():
   """Visit every file of every artist folder under 'clean_midi'.

   Each file is loaded with mido and its first track is handed to
   ExtractTempo. The value returned by ExtractTempo is stored in a local
   variable and not used further, and the function returns nothing.

   NOTE(review): ExtractTempo is not defined in the visible part of the
   notebook - confirm where it lives before running this.
   """
   Root = os.path.realpath('clean_midi')

   for Artist in tqdm(os.listdir(Root)):
      ArtistDir = os.path.join(Root, Artist)

      #only folders are visited, so stray files such as .DS_Store are ignored
      if os.path.isdir(ArtistDir):
         for SongName in os.listdir(ArtistDir):
            Song = mido.MidiFile(os.path.join(ArtistDir, SongName))
            Tempo = ExtractTempo(Song.tracks[0])





In [None]:
#NOTE(review): PreProcessing calls ExtractTempo, which is not defined in the
#visible cells - confirm it exists, otherwise this call raises NameError.
PreProcessing()

In [64]:
#Collect the meta messages of all the tracks of the file, in track order.
#NOTE(review): `mid` is only assigned inside functions in the visible cells, so this
#cell presumably relies on a MidiFile left over from an earlier kernel session -
#load one explicitly (mid = mido.MidiFile(...)) before running it.
meta_messages = [msg for track in mid.tracks for msg in track if msg.is_meta]
meta_messages

[MetaMessage('set_tempo', tempo=500000, time=0),
 MetaMessage('time_signature', numerator=4, denominator=4, clocks_per_click=24, notated_32nd_notes_per_beat=8, time=0),
 MetaMessage('key_signature', key='C', time=0),
 MetaMessage('set_tempo', tempo=633245, time=38400),
 MetaMessage('end_of_track', time=0),
 MetaMessage('track_name', name='Acoustic Guitar', time=0),
 MetaMessage('midi_port', port=0, time=0),
 MetaMessage('channel_prefix', channel=0, time=0),
 MetaMessage('end_of_track', time=0),
 MetaMessage('track_name', name='Piano', time=0),
 MetaMessage('midi_port', port=0, time=0),
 MetaMessage('channel_prefix', channel=1, time=0),
 MetaMessage('end_of_track', time=0),
 MetaMessage('track_name', name='Whistle', time=0),
 MetaMessage('midi_port', port=0, time=0),
 MetaMessage('channel_prefix', channel=2, time=0),
 MetaMessage('end_of_track', time=0),
 MetaMessage('track_name', name='Bass!', time=0),
 MetaMessage('midi_port', port=0, time=0),
 MetaMessage('channel_prefix', channel=3,

In [67]:
# #for track in mid.tracks[0]:
# for msg in mid.tracks[1]:
#    if msg.type in ('note_on', 'note_off'):
#       print(msg)