#### This will Convert our MIDI files into Tokenized Training data so that we can train our model 

1. Load the .mid files from the databases (right now we only have the accompaniment database but we will add more)
2. Extract piano, bass, drums etc any other instruments 
3. Slice into 4 bar segments (could choose 8,16,32 etc) maybe do multiple we will need to do some testing
4. Convert to REMI-style tokens (subject to change depending on how well this works out since this has a different functionality)
5. Save token sequences as .json or .npz 

### Import the dependencies

In [1]:
import os 
import pretty_midi
import json 
from pathlib import Path
import pretty_midi
import unicodedata 
import re

### Setup Paths 

In [2]:
project_root = Path.cwd().parent
midi_files_a_l_path = project_root / "midi_files_A-L"
midi_files_m_z_path = project_root / "midi files"

print(f"midi_files_a_l_path exists: {midi_files_a_l_path.exists()}")
print(f"midi_files_m_z_path exists: {midi_files_m_z_path.exists()}")

# collect all midi files from both and then merge 
midi_files = list(midi_files_a_l_path.glob("*.mid")) + list(midi_files_m_z_path.glob("*.mid"))

print(f"Found {len(midi_files)} midi files in total.")

midi_files_a_l_path exists: True
midi_files_m_z_path exists: True
Found 1445 midi files in total.


load the JSON file with Jazz Standards that has the chord data and section info plus rhythm feel since 
these will probably be useful


In [3]:
with open(project_root / "JazzStandards.json", "r") as f:
    jazz_standards = json.load(f)

print (f"Found {len(jazz_standards)} jazz standards in the JSON file.")

Found 1382 jazz standards in the JSON file.


In [4]:
# match these files with the midi files

def normalize_title(title):
    """
    Normalize titles by:
    - Lowercasing
    - Removing articles (a, an, the)
    - Removing punctuation and special characters
    - Removing extra spaces
    """
    title = title.lower()
    title = unicodedata.normalize("NFKD", title).encode("ascii", "ignore").decode("utf-8")
    title = re.sub(r"[^a-z0-9\s]", "", title)  # Remove punctuation
    title = re.sub(r"\b(the|a|an)\b", "", title)  # Remove articles
    title = re.sub(r"\s+", "", title)  # Remove all whitespace
    return title.strip()
 

def match_json_with_midi(midi_files, jazz_standards):
    """
    Matches MIDI files with their corresponding JSON chord data.

    Args:
        midi_files (list): List of Path objects or filenames (e.g., Path("Autumn_Leaves.mid")).
        jazz_standards (list): List of entries from JazzStandards.json (each with "Title" key).

    Returns:
        dict: Mapping of {Path(midi_file): matching_json_entry}
    """
    matched_data = {}

    # Create a normalized lookup from JSON titles
    json_lookup = {
        normalize_title(entry["Title"]): entry
        for entry in jazz_standards
    }

    for midi_path in midi_files:
        midi_title = midi_path.stem  # filename without .mid
        midi_norm = normalize_title(midi_title)

        if midi_norm in json_lookup:
            matched_data[midi_path] = json_lookup[midi_norm]

    return matched_data

In [5]:
# Match MIDI files with JSON data
matched_data = match_json_with_midi(midi_files, jazz_standards)

# Process each MIDI file
for midi_file, json_data in matched_data.items():
    print(f"Processing {midi_file} with JSON data for {json_data['Title']}")

    # Example: Print the rhythm and sections from the JSON data
    print(f"Rhythm: {json_data['Rhythm']}")
    print(f"Sections: {json_data['Sections']}")

# check how many matched 
print(f"Matched {len(matched_data)} MIDI files with JSON data.")

Processing d:\musicJamAI\Music_Jam_AI_Real\Music-Jam-AI\midi_files_A-L\26-2.mid with JSON data for 26-2
Rhythm: Medium Up Swing
Sections: [{'Label': 'A', 'MainSegment': {'Chords': 'Fmaj7,Ab7|Dbmaj7,E7|Amaj7,C7|Cm7,F7|Bbmaj7,Db7|Gbmaj7,A7|Dm7,G7|Gm7,C7'}}, {'Label': 'A', 'MainSegment': {'Chords': 'Fmaj7,Ab7|Dbmaj7,E7|Amaj7,C7|Cm7,F7|Bbmaj7,Ab7|Dbmaj7,E7|Amaj7,C7|Fmaj7'}}, {'Label': 'B', 'MainSegment': {'Chords': 'Cm7,F7|Em7,A7|Dmaj7,F7|Bbmaj7|Ebm7|Ab7|Dbmaj7|Gm7,C7'}}, {'Label': 'A', 'MainSegment': {'Chords': 'Fmaj7,Ab7|Dbmaj7,E7|Amaj7,C7|Cm7,F7|Bbmaj7,Ab7|Dbmaj7,E7|Amaj7,C7|Fmaj7'}}]
Processing d:\musicJamAI\Music_Jam_AI_Real\Music-Jam-AI\midi_files_A-L\500-miles-high.mid with JSON data for 500 Miles High
Rhythm: Bossa Nova
Sections: [{'MainSegment': {'Chords': 'Em7|Em7|Gm7|Gm7|Bbmaj7|Bbmaj7|Bm7b5|E7#9|Am7|Am7|F#m7b5|F#m7b5|Fm7|Fm7|Cm7|Cm7|B7#9|B7#9'}}, {'Repeats': 1, 'MainSegment': {'Chords': 'Cm7|Cm7|Abmaj7|Abmaj7'}}]
Processing d:\musicJamAI\Music_Jam_AI_Real\Music-Jam-AI\midi_files

### Define all the Instrument Filters for MIDI

In [6]:
# accoustic piano + electric piano
piano_instruments = [0,1,2,3,4,5,]

# acoustic + electric bass
bass_instruments = list(range(32, 40))

# Jazz, Electric, Acoustic Guitar
guitar_instruments = list(range(24, 32))

# no need for drums 

### Extract the different instrument parts

In [7]:
# function to extract the seperate instruments 

def extract_instruments(midi_file):
    """
    Extracts the instruments from a midi file and returns them as a dictionary
    """

    try:
        midi_data = pretty_midi.PrettyMIDI(str(midi_file))

        instruments = {
            "piano": [],
            "bass": [],
            "guitar": [],
            "drums": []
        }

        # need to convert the note into a json serializable format
        def note_to_dict(note):
            return {
                "pitch": note.pitch,
                "start": note.start,
                "end": note.end,
                "velocity": note.velocity
            }
        
        for instrument in midi_data.instruments:
            if instrument.is_drum:
                instruments["drums"].extend([note_to_dict(note) for note in instrument.notes])
            elif instrument.program in piano_instruments:
                instruments["piano"].extend([note_to_dict(note) for note in instrument.notes])
            elif instrument.program in bass_instruments:
                instruments["bass"].extend([note_to_dict(note) for note in instrument.notes])
            elif instrument.program in guitar_instruments:
                instruments["guitar"].extend([note_to_dict(note) for note in instrument.notes])

        # Skip empty instruments
        if not any(instruments.values()):
            return None
        
        # Tempo 
        tempo = midi_data.get_tempo_changes()
        tempo = tempo[1][0] if len(tempo[1]) > 0 else 120.0

        # Time signature
        if midi_data.time_signature_changes:
            time_signature = midi_data.time_signature_changes[0]
            time_signature = (time_signature.numerator, time_signature.denominator)
        else:
            time_signature = (4, 4)
        
        # Key signature
        if midi_data.key_signature_changes:
            key_signature = midi_data.key_signature_changes[0]
            key_signature = key_signature.key_number
        else:
            key_signature = 0

        # start and end times
        start_time = min([note.start for instrument in midi_data.instruments for note in instrument.notes]) if any(instruments.values()) else 0
        end_time = max([note.end for instrument in midi_data.instruments for note in instrument.notes]) if any(instruments.values()) else 0
        
        return {
            "instruments": instruments,
            "tempo": tempo,
            "time_signature": time_signature,
            "key_signature": key_signature,
            "start_time": start_time,
            "end_time": end_time
        }

    except Exception as e:
        print(f"Error processing {midi_file}: {e}")
        return None


### Loop through all the files and extract them 

In [9]:
extracted_data = []

# loop through MIDI files 
for midi_file in matched_data.keys():
    print(f"Processing {midi_file}")
    data = extract_instruments(midi_file)
    if data:
        extracted_data.append(data)

print(f"Extracted data from {len(extracted_data)} midi files.")

# Save the extracted data to a JSON file
output_file = project_root / "extracted_instruments.json"
with open(output_file, "w") as f:
    json.dump(extracted_data, f)

print(f"Extracted data saved to {output_file}")

Processing d:\musicJamAI\Music_Jam_AI_Real\Music-Jam-AI\midi_files_A-L\26-2.mid
Processing d:\musicJamAI\Music_Jam_AI_Real\Music-Jam-AI\midi_files_A-L\500-miles-high.mid
Processing d:\musicJamAI\Music_Jam_AI_Real\Music-Jam-AI\midi_files_A-L\502-blues.mid
Processing d:\musicJamAI\Music_Jam_AI_Real\Music-Jam-AI\midi_files_A-L\52nd-street-theme.mid
Processing d:\musicJamAI\Music_Jam_AI_Real\Music-Jam-AI\midi_files_A-L\9.20-special.mid
Processing d:\musicJamAI\Music_Jam_AI_Real\Music-Jam-AI\midi_files_A-L\ablution.mid
Processing d:\musicJamAI\Music_Jam_AI_Real\Music-Jam-AI\midi_files_A-L\adam-s-apple.mid
Processing d:\musicJamAI\Music_Jam_AI_Real\Music-Jam-AI\midi_files_A-L\affirmation.mid
Processing d:\musicJamAI\Music_Jam_AI_Real\Music-Jam-AI\midi_files_A-L\african-flower.mid
Processing d:\musicJamAI\Music_Jam_AI_Real\Music-Jam-AI\midi_files_A-L\afro-blue.mid
Processing d:\musicJamAI\Music_Jam_AI_Real\Music-Jam-AI\midi_files_A-L\afro-centric.mid
Processing d:\musicJamAI\Music_Jam_AI_Real

### Now that we've extracted the important components, we need to slice the songs into 4 bar segments since that is probably the most common phrase in jazz and even if not the others are all going to be factors of 4 

In [13]:
# Helper function to extract chords and section labels from JSON data
def extract_chords_and_sections(json_data):

    bar_chords = []
    bar_sections = []

    for section in json_data.get("Sections", []):
        label = section.get("Label", "")
        bars = section["MainSegment"]["Chords"].split("|")
        for bar in bars:
            chords = [ch.strip() for ch in bar.split(",")]
            bar_chords.append(chords)
            bar_sections.append(label)

    return bar_chords, bar_sections


def slice_4bar_segments(song_data, bar_chords = None, bar_sections = None):
    """
    Slices the song data into 4 bar segments

    we can use the time signature and tempo to calculate the length of 
    each songs bar 
    ex. 4/4 time signature at 120 BPM = 2 seconds per bar

    Also we are adding the bar chords and sections to the segments 
    """
    
    segments = []

    # extract all the important metadata 
    tempo = song_data["tempo"]
    time_signature_changes = song_data.get("time_signature_changes", [(0, (4, 4))])  
    key_signature = song_data["key_signature"]
    instruments = song_data["instruments"]

    song_duration = max(
        max((note["end"] for note in notes), default=0)
        for notes in instruments.values()
    )

    # Calculate the length of bar
    current_time = 0
    bar_index = 0

    while current_time < song_duration:
        # Find the active time signature at the current time
        active_time_signature = time_signature_changes[0][1]  # Default to 4/4
        for change_time, time_sig in reversed(time_signature_changes):
            if current_time >= change_time:
                active_time_signature = time_sig
                break

        
        beats_per_bar = int(active_time_signature[0])
        seconds_per_beat = 60 / tempo
        seconds_per_bar = beats_per_bar * seconds_per_beat
        segment_length = 4 * seconds_per_bar

        # Define the start and end times for the current segment
        start_time = current_time
        end_time = min(current_time + segment_length, song_duration)

        # Create a new segment
        segment = {
            "start_time": start_time,
            "end_time": end_time,
            "tempo": tempo,
            "time_signature": active_time_signature,
            "key_signature": key_signature,
            "instruments": {}
        }

        # Slice the instruments into the segment 
        for instrument, notes in instruments.items():
            segment["instruments"][instrument] = [
                note for note in notes if note["start"] < end_time and note["end"] > start_time
            ]

        if bar_chords and bar_sections:
            segment["bar_chords"] = bar_chords[bar_index:bar_index + 4]
            segment["bar_sections"] = bar_sections[bar_index:bar_index + 4]

        segments.append(segment)
        current_time = end_time
        bar_index += 4

        

    return segments

In [15]:
# Apply to your extracted_data list
# Iterate over matched_data.keys() and extracted_data together
for midi_file, item in zip(matched_data.keys(), extracted_data):
    json_data = matched_data.get(midi_file)
    if json_data:
        bar_chords, bar_sections = extract_chords_and_sections(json_data)
        item["segments"] = slice_4bar_segments(item, bar_chords, bar_sections)
    else:
        item["segments"] = slice_4bar_segments(item)


# display the first 5 segments of the first midi file
for i in range(5):
    print(extracted_data[0]["segments"][i])
    print("\n")

{'start_time': 0, 'end_time': np.float64(4.682928), 'tempo': np.float64(204.9999487500128), 'time_signature': (4, 4), 'key_signature': 0, 'instruments': {'piano': [{'pitch': 60, 'start': np.float64(1.170732), 'end': np.float64(1.70426871875), 'velocity': 56}, {'pitch': 69, 'start': np.float64(1.170732), 'end': np.float64(1.7073175), 'velocity': 53}, {'pitch': 64, 'start': np.float64(1.170732), 'end': np.float64(1.71036628125), 'velocity': 53}, {'pitch': 54, 'start': np.float64(1.7591467812500001), 'end': np.float64(2.30182984375), 'velocity': 51}, {'pitch': 60, 'start': np.float64(1.7591467812500001), 'end': np.float64(2.3024396), 'velocity': 45}, {'pitch': 63, 'start': np.float64(1.7597565375), 'end': np.float64(2.3024396), 'velocity': 47}, {'pitch': 60, 'start': np.float64(2.34451278125), 'end': np.float64(2.9268300000000003), 'velocity': 49}, {'pitch': 65, 'start': np.float64(2.34451278125), 'end': np.float64(2.9268300000000003), 'velocity': 53}, {'pitch': 68, 'start': np.float64(2.

### Tokenize 

In [20]:
# we will use the REMI tokens which are of this format
# <Bar>, <Position_x>,<Instrument_X>,<Note_ON_p>,<Duration_x>,<Velocity_X>

# now we can look into adding other things like
# Chord, Key, Temp and Time signature 

# I think I am going to add the chord and key signature since they seem relevant 

def convert_to_token(segment, resolution = 16):
    
    tokens = []

    tempo = segment["tempo"]
    time_signature = segment["time_signature"]
    beats_per_bar = time_signature[0]
    seconds_per_beat = 60 / tempo
    seconds_per_bar = beats_per_bar * seconds_per_beat
    segment_length = 4 * seconds_per_bar
    ticks_per_bar = resolution * beats_per_bar

    # add the chord and section info for each bar
    for bar_idx in range(4):
        tokens.append(f"<Bar>")
        if "bar_sections" in segment and bar_idx < len(segment["bar_sections"]):
            section = segment["bar_sections"][bar_idx]
            tokens.append(f"<Section_{section}>")
        if "bar_chords" in segment and bar_idx < len(segment["bar_chords"]):
            chords = segment["bar_chords"][bar_idx]
            for chord in chords:
                tokens.append(f"<Chord_{chord}>")
    
    all_notes = []
    for instrument, notes in segment["instruments"].items():
        for note in notes:
            relative_start = note["start"] - segment["start_time"]
            position = int((relative_start / segment_length) * (4 * ticks_per_bar))
            position = max(0, min(position, 4 * ticks_per_bar - 1))

            duration_beats = (note["end"] - note["start"]) / seconds_per_beat
            duration_token = int(duration_beats * resolution)
            duration_token = max(1, duration_token)  

            all_notes.append({
                "position": position,
                "instrument": instrument,
                "pitch": note["pitch"],
                "velocity": note["velocity"],
                "duration": duration_token
            })

    # Sort all notes by position
    all_notes.sort(key=lambda x: x["position"])

    last_position = -1
    for note in all_notes:
        if note["position"] != last_position:
            tokens.append(f"<Position_{note['position']}>")
            last_position = note["position"]

        tokens.append(f"<Instrument_{note['instrument']}>")
        tokens.append(f"<Note_ON_{note['pitch']}>")
        tokens.append(f"<Velocity_{note['velocity']}>")
        tokens.append(f"<Duration_{note['duration']}>")

    return tokens




In [47]:
output_file = project_root / "remi_segments.jsonl"

with open(output_file, "w") as f:
    for item, midi_file in zip(extracted_data, matched_data.keys()):
        # Use the MIDI file name (without extension) as the filename
        title = matched_data[midi_file]["Title"]
        for i, segment in enumerate(item["segments"]):
            tokens = convert_to_token(segment)
            record = {
                "title": title,
                "segment_index": i,
                "tokens": tokens
            }
            f.write(json.dumps(record) + "\n")