# AMMA Pre-Processing

# Imports and package installations needed for pre-processing of MIDI data

In [0]:
!pip install suffix-tree
!pip install pypianoroll

Collecting suffix-tree
  Downloading https://files.pythonhosted.org/packages/56/ff/97c5c2307642bb2eed08b5ba589a5c54111507aed8c6761cfd655afb5c03/suffix_tree-0.0.6-py3-none-any.whl
Installing collected packages: suffix-tree
Successfully installed suffix-tree-0.0.6
Collecting pypianoroll
  Downloading https://files.pythonhosted.org/packages/aa/33/fa38c07909e425add987146cb0f8d5ad80262f6a72cc820bf7e5f690d527/pypianoroll-0.5.0.tar.gz
Building wheels for collected packages: pypianoroll
  Building wheel for pypianoroll (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/ed/f6/fb/5d070524ecf7ba9ed201247a293c01945cfd7f840f8ef338c0
Successfully built pypianoroll
Installing collected packages: pypianoroll
Successfully installed pypianoroll-0.5.0


In [0]:
from suffix_tree import Tree
import numpy as np
import pretty_midi
import os
import pypianoroll

Imageio: 'ffmpeg-linux64-v3.3.1' was not found on your computer; downloading it now.
Try 1. Download from https://github.com/imageio/imageio-binaries/raw/master/ffmpeg/ffmpeg-linux64-v3.3.1 (43.8 MB)
Downloading: 8192/45929032 bytes (0.0%)1892352/45929032 bytes (4.1%)5865472/45929032 bytes (12.8%)9814016/45929032 bytes (21.4%)13590528/45929032 bytes (29.6%)17309696/45929032 bytes (37.7%)20938752/45929032 bytes (45.6%)24748032/45929032 bytes (53.9%)28418048/45929032 bytes (61.9%)32014336/45929032 bytes (69.7%)35725312/45929032 bytes (77.8%)39583744/45929032 bytes (86.2%)43253760/45929032 bytes (94.2%)

# Google Drive authentication and file sourcing

In [0]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from google.colab import files
from google.colab import drive
from oauth2client.client import GoogleCredentials

drive.mount('/content/gdrive', force_remount=True)

[?25l[K    1% |▎                               | 10kB 19.6MB/s eta 0:00:01[K    2% |▋                               | 20kB 1.8MB/s eta 0:00:01[K    3% |█                               | 30kB 2.7MB/s eta 0:00:01[K    4% |█▎                              | 40kB 1.7MB/s eta 0:00:01[K    5% |█▋                              | 51kB 2.1MB/s eta 0:00:01[K    6% |██                              | 61kB 2.5MB/s eta 0:00:01[K    7% |██▎                             | 71kB 2.9MB/s eta 0:00:01[K    8% |██▋                             | 81kB 3.2MB/s eta 0:00:01[K    9% |███                             | 92kB 3.6MB/s eta 0:00:01[K    10% |███▎                            | 102kB 2.8MB/s eta 0:00:01[K    11% |███▋                            | 112kB 2.8MB/s eta 0:00:01[K    12% |████                            | 122kB 4.0MB/s eta 0:00:01[K    13% |████▎                           | 133kB 4.0MB/s eta 0:00:01[K    14% |████▋                           | 143kB 7.8MB/s eta 0:00:01[

In [0]:
midi_folder = '/content/gdrive/My Drive/Senior Design Project/MIDI Training Data/'
os.chdir(midi_folder)
num_of_files = len(os.listdir(midi_folder))

# Loading and classification

In [0]:
note_shift = 24
num_of_notes = 83

In [0]:
def sublist(lst1, lst2):
    result = all(elem in lst2 for elem in lst1)
    return result

def notes_to_string(notes):
    notes_string = []
    for note in notes:
        notes_string.append(str(note.pitch))
    return ','.join(notes_string)

def get_notes_dict_int(pm):
    notes_dict = dict()
    for instrument in pm.instruments:
        pitches = []
        for note in instrument.notes:
            pitches.append(note.pitch)
        notes_dict[instrument.program] = pitches
    return notes_dict

def get_notes_dict(pm):
    notes_dict = dict()
    for instrument in pm.instruments:
        notes_dict[instrument.program] = notes_to_string(instrument.notes)
    return notes_dict

def grab_paths(tree):
    paths = []
    for C, path in sorted (tree.maximal_repeats ()):
        path_str = str(path)
        path_str.replace(' ', '')
        paths.append(path_str)
    paths.sort(key = len, reverse = True)
    return paths
    
def filter_paths(paths):
    results = []
    for path in paths:
        result = path.split(',')
        result2 = []
        for r in result:
            j = r.replace(' ', '')
            result2.append(j)
        result3 = list(filter(None, result2))
        result3 = [int(i) for i in result3]
        results.append(result3)
    return list(filter(None, results))

def construct_weighted_seq(filtered_paths):
    weighted_seq = dict()
    for path in filtered_paths:
        max_pitch = max(path)
        min_pitch = min(path)
        if min_pitch >= note_shift:
            weighted_seq[len(path) * (max_pitch - min_pitch)] = path
    return weighted_seq

def get_melody_instrument(weighted_seq, notes_int):
    best_fit = max(weighted_seq.keys())
    seq = weighted_seq[best_fit]
    for key in notes_int.keys():
        list1 = notes_int[key]  
        if sublist(seq, list1):
            return key

def melody_identifier(pm):
    notes_dict = get_notes_dict(pm)
    notes_int = get_notes_dict_int(pm)
    tree = Tree(notes_dict)
    paths = grab_paths(tree)
    results = filter_paths(paths)
    weighted_seq = construct_weighted_seq(results)
    return get_melody_instrument(weighted_seq, notes_int)

In [0]:
def readLabels(instrProgram, instrName, is_drum):
    if (is_drum):
        return 'Percussion'
    elif ((instrProgram in range(32,40) or "bass" in instrName.lower())):
        return 'Bass'
    elif(("vocal" in instrName.lower()) or ("voice" in instrName.lower())):
        return 'Vocals'
    elif("chord" in instrName.lower()):
        return 'Chords'
    else:
        return None

In [0]:
def readNumbers(instrProgram):
    if((instrProgram in range(24,32) or instrProgram in range(40, 52))):
        return 'Strings'
    elif((instrProgram in range(80,96))):
        return 'Chords'
    elif((instrProgram in range(56, 80))):
        return 'Winds'
    else:
        return None

Define matrices for each instrument classification of dimension (num_ticks x num_of_notes) for each song

In [0]:
global strings_pianorolls
strings_pianorolls = [0]*num_of_files

global melody_pianorolls
melody_pianorolls = [0]*num_of_files

global percussion_pianorolls
percussion_pianorolls = [0]*num_of_files

global bass_pianorolls
bass_pianorolls = [0]*num_of_files

global winds_pianorolls
winds_pianorolls = [0]*num_of_files

global vocals_pianorolls
vocals_pianorolls = [0]*num_of_files

global chords_pianorolls
chords_pianorolls = [0]*num_of_files

Setting up the Pianoroll Matrices for each Instrument Class

In [0]:
i = 0
j = 0
i_loaded = 0
melodyNumber = [0]*num_of_files
stringsNumber = [0]*num_of_files
chordsNumber = [0]*num_of_files
vocalsNumber = [0]*num_of_files
windsNumber = [0]*num_of_files
percussionNumber = [0]*num_of_files
bassNumber = [0]*num_of_files
unclassifiedTracks = dict()

for file in os.listdir(midi_folder):
    filename = os.fsdecode(file)
    print(str(j) + ' - ' + filename)
    j+=1
    

    pr = pypianoroll.parse(midi_folder + filename)
    pm_song = pretty_midi.PrettyMIDI(midi_folder + filename)

    if(pm_song.resolution == 96): # need the file resolution to be constant
        
        tracklist = pr.tracks
        for t, track in enumerate(tracklist):
            instrument_class = readLabels(track.program, track.name, track.is_drum)

            if(t == 0): # do it for first track only as an initialization

                # Initialize every entry with an empty pianoroll         
                percussion_pianorolls[i] = np.zeros(np.shape(track.pianoroll[:, note_shift:(note_shift+num_of_notes)]), dtype=np.uint8)
                melody_pianorolls[i] = np.zeros(np.shape(track.pianoroll[:, note_shift:(note_shift+num_of_notes)]), dtype=np.uint8)
                bass_pianorolls[i] = np.zeros(np.shape(track.pianoroll[:, note_shift:(note_shift+num_of_notes)]), dtype=np.uint8)
                vocals_pianorolls[i] = np.zeros(np.shape(track.pianoroll[:, note_shift:(note_shift+num_of_notes)]), dtype=np.uint8)
                chords_pianorolls[i] = np.zeros(np.shape(track.pianoroll[:, note_shift:(note_shift+num_of_notes)]), dtype=np.uint8)
                strings_pianorolls[i] = np.zeros(np.shape(track.pianoroll[:, note_shift:(note_shift+num_of_notes)]), dtype=np.uint8)
                winds_pianorolls[i] = np.zeros(np.shape(track.pianoroll[:, note_shift:(note_shift+num_of_notes)]), dtype=np.uint8)
               
            # append the pianoroll that belongs to that instrument class matrix
            if instrument_class=='Percussion':
                percussion_pianorolls[i] = np.bitwise_or(percussion_pianorolls[i], track.pianoroll[:, note_shift:(note_shift+num_of_notes)])
                percussionNumber[i] += track.program
            elif instrument_class=='Vocals':
                vocals_pianorolls[i] = np.bitwise_or(vocals_pianorolls[i], track.pianoroll[:, note_shift:(note_shift+num_of_notes)])
                vocalsNumber[i] += track.program
            elif instrument_class=='Bass':
                bass_pianorolls[i] = np.bitwise_or(bass_pianorolls[i], track.pianoroll[:, note_shift:(note_shift+num_of_notes)])
                bassNumber[i] += track.program
            elif instrument_class=='Chords':
                chords_pianorolls[i] = np.bitwise_or(chords_pianorolls[i], track.pianoroll[:, note_shift:(note_shift+num_of_notes)])
                chordsNumber[i] += track.program

            pr.remove_tracks(t) # remove the classified track afterwards to avoid reclassification

            
        melodyNumber[i] = melody_identifier(pr.to_pretty_midi()) # identify the melody
        tracklist = pr.tracks
        for t, track in enumerate(tracklist):            
            if(track.program == melodyNumber[i]):
                melody_pianorolls[i] = np.bitwise_or(melody_pianorolls[i], track.pianoroll[:, note_shift:(note_shift+num_of_notes)])
                pr.remove_tracks(t)
            else:
                instrument_class = readNumbers(track.program)
                if instrument_class == 'Chords':
                    chords_pianorolls[i] = np.bitwise_or(chords_pianorolls[i], track.pianoroll[:, note_shift:(note_shift+num_of_notes)])
                    chordsNumber[i] += track.program
                elif instrument_class == 'Strings':
                    strings_pianorolls[i] = np.bitwise_or(strings_pianorolls[i], track.pianoroll[:, note_shift:(note_shift+num_of_notes)])
                    stringsNumber[i] += track.program
                elif instrument_class == 'Winds':
                    winds_pianorolls[i] = np.bitwise_or(winds_pianorolls[i], track.pianoroll[:, note_shift:(note_shift+num_of_notes)])
                    windsNumber[i] += track.program
                pr.remove_tracks(t)

        for track in pr.tracks:
            unclassifiedTracks[filename] = (track.program, track.name)

        i += 1
    i_loaded += 1
                                  
print(i_loaded,'/', num_of_files, 'files loaded')
print(i,'/', i_loaded,'files loaded with 96 resolution')

0 - Dropgun-Aspyer-Next-To-Me-rlc-winston-20180418044640-nonstop2k.com.mid
1 - Daddy-s-Groove-Ferdy-Latido-rlc-winston-20180305092352-nonstop2k.com.mid
2 - R3HAB-BAD-max123a-20190127181244-nonstop2k.com.mid
3 - Mike-Williams-Dastic-You-I-rlc-winston-20180103060010-nonstop2k.com.mid
4 - Mike-Williams-Melody-Tip-of-my-Tongue-rlc-winston-20171001140716-nonstop2k.com.mid
5 - San-Holo-Duskus-Forever-Free-max123a-20190120103950-nonstop2k.com.mid
6 - Nora-En-Pure-We-Found-Love-ft-Ashibah-max123a-20190120104015-nonstop2k.com.mid
7 - Alan-Walker-Darkside-ft-Tomine-Harket-Au-Ra-rlc-winston-20180819174026-nonstop2k.com.mid
8 - Porter-Robinson-Unison-Huntroxic-20171214141416-nonstop2k.com.mid
9 - Sako-Isoyan-Irina-Makosh-Dreamer-rlc-winston-20170817223548-nonstop2k.com.mid
10 - The-Chainsmokers-Winona-Oak-Hope-theseus-20190105151228-nonstop2k.com.mid
11 - Dropgun-Kaleena-Zanders-Nothing-New-rlc-winston-20180103054507-nonstop2k.com.mid
12 - Mike-Perry-Shy-Martin-The-Ocean-rlc-winston-20170922112042

In [0]:
# Remove any extra entries at the end if not all files had same resolution
# i = num of files with 96 res

strings_pianorolls = strings_pianorolls[:i]
melody_pianorolls = melody_pianorolls[:i]
percussion_pianorolls = percussion_pianorolls[:i]
bass_pianorolls = bass_pianorolls[:i]
chords_pianorolls = chords_pianorolls[:i]
vocals_pianorolls = vocals_pianorolls[:i]
winds_pianorolls = winds_pianorolls[:i]

# Phrase Creation

Creating phrases (blocks of ticks) to use as training examples

In [0]:
tpqn = 96 # Ticks Per Quarter Note resolution. Varies with MIDI file, currently using same resolution
num_measures = 0.25 # number of bars/measures

In [0]:
# Output Phrase Lists

# i is from earlier, equals num of files loaded with tpqn = 96
global strings_phrases
strings_phrases = []

global percussion_phrases
percussion_phrases = []

global bass_phrases
bass_phrases = []

global chords_phrases
chords_phrases = []

global winds_phrases
winds_phrases = []

In [0]:
# Input Pianorolls Lists for Each Output

global X_strings 
X_strings = melody_pianorolls

global X_percussion 
X_percussion = melody_pianorolls

global X_bass 
X_bass = melody_pianorolls

global X_chords
X_chords = melody_pianorolls

global X_winds
X_winds = melody_pianorolls

In [0]:
# Input Phrase Lists for Each Output

global X_strings_phrases
X_strings_phrases = []

global X_percussion_phrases
X_percussion_phrases = []

global X_bass_phrases
X_bass_phrases = []

global X_chords_phrases
X_chords_phrases = []

global X_winds_phrases
X_winds_phrases = []

In [0]:
phrase_length = int(num_measures*4*tpqn) # in ticks

Creating Phrases From Pianoroll Lists


In [0]:
# Strings

for j, song in enumerate(strings_pianorolls):
    
    phrase_end = phrase_length # initialize the end of a phrase to be 4 bars from first tick
    # print(j)
    for phrase_start in range(0,len(song)-phrase_length + 1, phrase_length):
        # print(phrase_start, phrase_end)
        y_phrase = song[phrase_start:phrase_end] # grab a phrase
        
        if(np.any(np.count_nonzero(y_phrase, axis=1)) > 0): # if any string bar is not empty
            X_phrase = X_strings[j][phrase_start:phrase_end] # grab the input phrase at same song/indices
            
            if(np.any(np.count_nonzero(X_phrase, axis=1)) > 0):# if any melody bar is not empty
                strings_phrases.append(y_phrase)
                X_strings_phrases.append(X_phrase)
                # print("adding phrases")
           
        phrase_end += phrase_length

In [0]:
# Chords

for j, song in enumerate(chords_pianorolls):
    
    phrase_end = phrase_length # initialize the end of a phrase to be 4 bars from first tick
    # print(j)
    for phrase_start in range(0,len(song)-phrase_length + 1, phrase_length):
        # print(phrase_start, phrase_end)
        y_phrase = song[phrase_start:phrase_end] # grab a phrase
        
        if(np.any(np.count_nonzero(y_phrase, axis=1)) > 0): # if any string bar is not empty
            X_phrase = X_chords[j][phrase_start:phrase_end] # grab the input phrase at same song/indices
            
            if(np.any(np.count_nonzero(X_phrase, axis=1)) > 0):# if any melody bar is not empty
                chords_phrases.append(y_phrase)
                X_chords_phrases.append(X_phrase)
                # print("adding phrases")
           
        phrase_end += phrase_length

In [0]:
# Bass

for j, song in enumerate(bass_pianorolls):
    
    phrase_end = phrase_length # initialize the end of a phrase to be 4 bars from first tick
    # print(j)
    for phrase_start in range(0,len(song)-phrase_length + 1, phrase_length):
        # print(phrase_start, phrase_end)
        y_phrase = song[phrase_start:phrase_end] # grab a phrase
        
        if(np.any(np.count_nonzero(y_phrase, axis=1)) > 0): # if any string bar is not empty
            X_phrase = X_bass[j][phrase_start:phrase_end] # grab the input phrase at same song/indices
            
            if(np.any(np.count_nonzero(X_phrase, axis=1)) > 0):# if any melody bar is not empty
                bass_phrases.append(y_phrase)
                X_bass_phrases.append(X_phrase)
                # print("adding phrases")
           
        phrase_end += phrase_length

In [0]:
# Winds

for j, song in enumerate(winds_pianorolls):
    
    phrase_end = phrase_length # initialize the end of a phrase to be 4 bars from first tick
    # print(j)
    for phrase_start in range(0,len(song)-phrase_length + 1, phrase_length):
        # print(phrase_start, phrase_end)
        y_phrase = song[phrase_start:phrase_end] # grab a phrase
        
        if(np.any(np.count_nonzero(y_phrase, axis=1)) > 0): # if any string bar is not empty
            X_phrase = X_winds[j][phrase_start:phrase_end] # grab the input phrase at same song/indices
            
            if(np.any(np.count_nonzero(X_phrase, axis=1)) > 0):# if any melody bar is not empty
                winds_phrases.append(y_phrase)
                X_winds_phrases.append(X_phrase)
                # print("adding phrases")
           
        phrase_end += phrase_length

In [0]:
# Percussion

for j, song in enumerate(percussion_pianorolls):
    
    phrase_end = phrase_length # initialize the end of a phrase to be 4 bars from first tick
    # print(j)
    for phrase_start in range(0,len(song)-phrase_length + 1, phrase_length):
        # print(phrase_start, phrase_end)
        y_phrase = song[phrase_start:phrase_end] # grab a phrase
        
        if(np.any(np.count_nonzero(y_phrase, axis=1)) > 0): # if any string bar is not empty
            X_phrase = X_percussion[j][phrase_start:phrase_end] # grab the input phrase at same song/indices
            
            if(np.any(np.count_nonzero(X_phrase, axis=1)) > 0):# if any melody bar is not empty
                percussion_phrases.append(y_phrase)
                X_percussion_phrases.append(X_phrase)
                # print("adding phrases")
           
        phrase_end += phrase_length

# Saving The Data

In [0]:
cleaned_data_folder = '/content/gdrive/My Drive/Senior Design Project/Cleaned Data/'

os.chdir(cleaned_data_folder)
import glob

files = glob.glob(cleaned_data_folder + '*')
for f in files:
    os.remove(f) # clear folder each time

# Input Data
np.save(cleaned_data_folder + 'X_strings', X_strings_phrases)
np.save(cleaned_data_folder + 'X_chords', X_chords_phrases)
np.save(cleaned_data_folder + 'X_bass', X_bass_phrases)
np.save(cleaned_data_folder + 'X_winds', X_winds_phrases)
np.save(cleaned_data_folder + 'X_percussion', X_percussion_phrases)

# Output Data
np.save(cleaned_data_folder + 'y_strings', strings_phrases)
np.save(cleaned_data_folder + 'y_chords', chords_phrases)
np.save(cleaned_data_folder + 'y_bass', bass_phrases)
np.save(cleaned_data_folder + 'y_winds', winds_phrases)
np.save(cleaned_data_folder + 'y_percussion', percussion_phrases)

In [0]:
# melodyNumber
# stringsNumber
# chordsNumber
# vocalsNumber
# windsNumber
# percussionNumber
# bassNumber
# unclassifiedTracks = dict()

from collections import Counter
#print(Counter(melodyNumber))
#print(Counter(chordsNumber))
#print(Counter(windsNumber))
#print(Counter(bassNumber))
print(len(stringsNumber))
print(Counter(stringsNumber))

129
Counter({0: 92, 48: 7, 44: 4, 24: 4, 28: 4, 26: 3, 25: 2, 51: 1, 42: 1, 91: 1, 79: 1, 74: 1, 78: 1, 50: 1, 54: 1, 76: 1, 49: 1, 103: 1, 58: 1, 29: 1})
