In [1]:
# Export each MIDI instrument track as a table of notes (start, end, pitch, velocity)

import pretty_midi
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
def note_vec(note):
    """Pack one note's attributes into a 1-D NumPy array.

    Parameters
    ----------
    note : object
        Any object exposing ``start``, ``end``, ``pitch`` and ``velocity``
        attributes.

    Returns
    -------
    numpy.ndarray
        Four elements, in the order ``[start, end, pitch, velocity]``.
    """
    fields = ("start", "end", "pitch", "velocity")
    return np.array([getattr(note, field) for field in fields])

In [3]:
# Write one CSV-formatted note table per instrument track, for every song.
note_columns = ['start', 'end', 'pitch', 'velocity']
for i in tqdm(range(1, 910)):
    idx = f"{i:0>3}"  # zero-pad the song number to three digits: 001 ... 909
    mid = pretty_midi.PrettyMIDI(f"POP909/POP909/{idx}/{idx}.mid")
    for inst in mid.instruments:
        rows = [note_vec(note) for note in inst.notes]
        # np.vstack raises ValueError on an empty list, so a track without
        # notes is written as a header-only table instead of aborting the run.
        data = np.vstack(rows) if rows else np.empty((0, len(note_columns)))
        seq = pd.DataFrame(data, columns=note_columns)
        # The output file is named after the instrument track (inst.name).
        seq.to_csv(rf"POP909/POP909/{idx}/{inst.name}.txt")
        

100%|██████████| 909/909 [01:21<00:00, 11.15it/s]


In [10]:
# Align the beat, melody, bridge, piano and chord tables on one shared timeline
import numpy as np
import pandas as pd
from tqdm import tqdm

def _load_notes(path, prefix):
    """Read one per-track note table and prefix its column names.

    The rows are sorted by ``start`` and re-indexed from 0; that new row
    number is stored in a ``{prefix}_index`` column. The ``start``, ``end``,
    ``pitch`` and ``velocity`` columns are then renamed to ``{prefix}_start``
    and so on, so that several tracks can live side by side in one frame.
    """
    notes = (
        pd.read_csv(path, sep=",", index_col='Unnamed: 0')
        .sort_values(by='start')
        .reset_index(drop=True)
    )
    notes[f'{prefix}_index'] = notes.index
    return notes.rename(columns={
        'start': f'{prefix}_start',
        'end': f'{prefix}_end',
        'pitch': f'{prefix}_pitch',
        'velocity': f'{prefix}_velocity',
    })


def _blank_expired(table, end_col, cols):
    """Set ``cols`` to NaN, in place, on rows whose ``time`` exceeds ``end_col``."""
    expired = table['time'] > table[end_col]
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    table.loc[expired, cols] = np.nan


for i in tqdm(range(1, 910)):
    idx = f"{i:0>3}"  # zero-pad the song number to three digits: 001 ... 909
    song_dir = rf"POP909/POP909/{idx}"

    # Beat table: space-separated, no header row.
    beat = pd.read_csv(f"{song_dir}/beat_midi.txt", sep=" ", header=None)
    beat = beat.rename(columns={0: 'beat_start', 1: 'beat1', 2: 'beat2'})
    beat['beat_index'] = beat.index

    # Note tables for the three tracks.
    mel = _load_notes(f"{song_dir}/MELODY.txt", 'mel')
    bri = _load_notes(f"{song_dir}/BRIDGE.txt", 'bri')
    pia = _load_notes(f"{song_dir}/PIANO.txt", 'pia')

    # Chord table: tab-separated, no header row.
    chord = pd.read_csv(f"{song_dir}/chord_midi.txt", sep="\t", header=None)
    chord = (
        chord.rename(columns={0: 'chord_start', 1: 'chord_end', 2: 'chord'})
        .sort_values(by='chord_start')
        .reset_index(drop=True)
    )
    chord['chord_index'] = chord.index

    # Every instant at which something starts or ends, in ascending order.
    # np.unique already returns its result sorted, so no extra sort is needed.
    relevant_times = np.unique(np.hstack((
        beat.beat_start,
        mel.mel_start, mel.mel_end,
        bri.bri_start, bri.bri_end,
        pia.pia_start, pia.pia_end,
        chord.chord_start, chord.chord_end,
    )))

    # Attach each source to the timeline at the instant it starts. With a left
    # merge, several source rows sharing one start time each get their own row.
    midi_in_txt = pd.DataFrame({'time': relevant_times})
    for frame, start_col in (
        (beat, 'beat_start'),
        (mel, 'mel_start'),
        (bri, 'bri_start'),
        (pia, 'pia_start'),
        (chord, 'chord_start'),
    ):
        midi_in_txt = midi_in_txt.merge(frame, left_on='time', right_on=start_col, how='left')

    # Carry every value forward onto the following rows ...
    midi_in_txt = midi_in_txt.ffill()

    # ... then clear the carried-forward values on rows that lie after the
    # corresponding end time (strictly greater, so a row at the end time keeps them).
    for prefix in ('mel', 'bri', 'pia'):
        _blank_expired(
            midi_in_txt,
            f'{prefix}_end',
            [f'{prefix}_index', f'{prefix}_start', f'{prefix}_end',
             f'{prefix}_pitch', f'{prefix}_velocity'],
        )
    _blank_expired(midi_in_txt, 'chord_end',
                   ['chord_index', 'chord_start', 'chord_end', 'chord'])

    midi_in_txt.to_csv(rf"POP909/POP909/{idx}/all_in_text_{idx}.csv")

100%|██████████| 909/909 [01:00<00:00, 14.92it/s]


In [None]:
    # chord = pd.read_csv(rf"POP909/POP909/{idx}/beat_chord_match_midi.txt", sep=",", index_col = 'Unnamed: 0')
    # chord = chord[chord['chord'].notna()].sort_values(by = 'beat').reset_index(drop = True)
    # chord['chord_index'] = chord.index
    # chord = chord.rename(columns={'beat': 'chord_start'})