In [1]:
# Import dependencies
import numpy as np
import pretty_midi
import xml.etree.ElementTree as ET
import sys
import importlib

# Import project modules
from voicing import Voicing
from utils import MPE_MIDI_Exporter

In [2]:
# Load only what we need to test "Something"
import xmlTranslator as xmlT
from utils import get_project_root

# Load the dataset (quietly)
directory = '/dataset/iRealXML'
root = get_project_root()
myPath = str(root) + str(directory)

print("Loading dataset for 'Something' test...")
theChordDataset, theDurationsDataset, all_meta = xmlT.parse_info_from_XML(myPath)

# Just get "Something" 
something_id = None
for i, meta in enumerate(all_meta):
    if meta['song_name'] == 'Something':
        something_id = i
        break

if something_id is None:
    raise ValueError("'Something' not found in dataset!")

dataset = theChordDataset
print(f"‚úì Found 'Something' at index {something_id}")
print(f"‚úì Ready to test slash chords")

Loading dataset for 'Something' test...


  0%|          | 0/4005 [00:00<?, ?it/s]

(4005,) (4005,) (4005,)
‚úì Found 'Something' at index 3986
‚úì Ready to test slash chords


## 1. Find "Something" in Dataset

In [3]:
# Find "Something" in the dataset
something_id = None
for i, meta in enumerate(all_meta):
    if 'Something' in meta['song_name']:
        print(f"[{i}] {meta['song_name']} - {meta['tonality']}")
        if meta['song_name'] == 'Something':
            something_id = i

if something_id is not None:
    print(f"\n‚úì Found 'Something' at index {something_id}")
    print(f"Metadata: {all_meta[something_id]}")
else:
    print("‚ö†Ô∏è  'Something' not found in dataset")

[267] Something From Everyone - G major
[766] You Do Something To Me - Eb major
[1263] You Do Something To Me 1 - Eb major
[2063] Tell Me Something Good 1 - Ab major
[2303] Something 1 - C major
[2333] Tell Me Something Good - Ab major
[2404] This Could Be The Start Of Something Big - Eb major
[3986] Something - C major

‚úì Found 'Something' at index 3986
Metadata: {'composer': 'The Beatles (George Harrison)', 'style': 'Rock Pop', 'song_name': 'Something', 'tonality': 'C major', 'midi_key': 12, 'time_signature': '4/4', 'decade': 'Null', 'software': 'iReal Pro 2022.2', 'encoding_date': '2022-10-16'}


## 2. Extract Chord Sequence

In [4]:
def extract_readable_chords(token_sequence):
    """Extract chords in readable format: root + nature + extensions"""
    structural = {'.', '|', ':|', '|:', 'N.C.', '<end>'}
    
    chords = []
    i = 0
    while i < len(token_sequence):
        token = token_sequence[i]
        
        if token == '.':
            # Start of chord - look ahead to collect all parts
            chord_parts = []
            j = i + 1
            
            # Skip duration
            while j < len(token_sequence):
                next_token = token_sequence[j]
                
                # Stop at next structural element
                if next_token in structural or next_token.startswith('Form_'):
                    break
                
                # Skip duration numbers
                try:
                    float(next_token)
                    j += 1
                    continue
                except:
                    pass
                
                chord_parts.append(next_token)
                j += 1
            
            if chord_parts:
                chord_str = ' '.join(chord_parts)
                chords.append((i, chord_str))
            i = j
        else:
            i += 1
    
    return chords

# Extract readable chord sequence
something_id = 3986
something_tokenized = dataset[something_id]
something_chords = extract_readable_chords(something_tokenized)

print("="*80)
print("CHORD SEQUENCE FOR 'SOMETHING'")
print("="*80)
for pos, chord in something_chords:
    # Highlight slash chords
    marker = " ‚Üê SLASH CHORD" if '/' in chord else ""
    print(f"[{pos:3d}] {chord:<40}{marker}")

print(f"\nTotal chords: {len(something_chords)}")
slash_count = sum(1 for _, c in something_chords if '/' in c)
print(f"Slash chords: {slash_count}")

CHORD SEQUENCE FOR 'SOMETHING'
[  4] F major                                 
[  8] Eb major                                
[ 12] G dominant / D                           ‚Üê SLASH CHORD
[ 20] C major                                 
[ 25] C major-seventh                         
[ 30] C dominant                              
[ 35] F major                                 
[ 39] F major / E                              ‚Üê SLASH CHORD
[ 46] D dominant                              
[ 51] G dominant                              
[ 55] G dominant / A                           ‚Üê SLASH CHORD
[ 61] G dominant / B                           ‚Üê SLASH CHORD
[ 69] A minor                                 
[ 73] A minor add #7                          
[ 79] A minor-seventh / G                      ‚Üê SLASH CHORD
[ 85] D dominant-ninth                        
[ 91] F major                                 
[ 95] Eb major                                
[ 99] G dominant / D                       

## 3. Verify XML Extraction

In [5]:
# Check the original XML file for "Something"
xml_path = '../dataset/iRealXML/Something.xml'
tree = ET.parse(xml_path)
root = tree.getroot()

# Extract all chords from XML
xml_chords = []
harmonies = root.findall('.//{*}harmony')

print("="*80)
print("ORIGINAL XML CHORDS (from iReal Pro)")
print("="*80)

for i, harmony in enumerate(harmonies):
    root_step = harmony.find('.//{*}root-step')
    root_alter = harmony.find('.//{*}root-alter')
    kind = harmony.find('.//{*}kind')
    
    # Bass note for slash chords
    bass_step = harmony.find('.//{*}bass-step')
    bass_alter = harmony.find('.//{*}bass-alter')
    
    if root_step is not None:
        root_note = root_step.text
        if root_alter is not None and root_alter.text:
            alter = int(root_alter.text)
            if alter == 1:
                root_note += '#'
            elif alter == -1:
                root_note += 'b'
        
        kind_text = kind.get('text', '') if kind is not None else ''
        kind_name = kind.text if kind is not None else 'major'
        
        # Extract bass
        bass_note = None
        if bass_step is not None:
            bass_note = bass_step.text
            if bass_alter is not None and bass_alter.text:
                alter = int(bass_alter.text)
                if alter == 1:
                    bass_note += '#'
                elif alter == -1:
                    bass_note += 'b'
        
        chord_display = f"{root_note}{kind_text}"
        if bass_note:
            chord_display += f"/{bass_note}"
            xml_chords.append({
                'root': root_note,
                'kind': kind_name,
                'bass': bass_note,
                'display': chord_display
            })
            print(f"[{i:2d}] {chord_display:<30} (bass: {bass_note})  ‚Üê SLASH CHORD")
        else:
            xml_chords.append({
                'root': root_note,
                'kind': kind_name,
                'bass': None,
                'display': chord_display
            })
            print(f"[{i:2d}] {chord_display}")

print(f"\nTotal chords in XML: {len(xml_chords)}")
xml_slash_count = sum(1 for c in xml_chords if c['bass'] is not None)
print(f"Slash chords in XML: {xml_slash_count}")

ORIGINAL XML CHORDS (from iReal Pro)
[ 0] F
[ 1] Eb
[ 2] G7/D                           (bass: D)  ‚Üê SLASH CHORD
[ 3] C
[ 4] Cmaj7
[ 5] C7
[ 6] F
[ 7] F/E                            (bass: E)  ‚Üê SLASH CHORD
[ 8] D7
[ 9] G7
[10] G7/A                           (bass: A)  ‚Üê SLASH CHORD
[11] G7/B                           (bass: B)  ‚Üê SLASH CHORD
[12] Am
[13] Am
[14] Am7/G                          (bass: G)  ‚Üê SLASH CHORD
[15] D9
[16] F
[17] Eb
[18] G7/D                           (bass: D)  ‚Üê SLASH CHORD
[19] A
[20] A
[21] A/G#                           (bass: G#)  ‚Üê SLASH CHORD
[22] A/F#                           (bass: F#)  ‚Üê SLASH CHORD
[23] A/E                            (bass: E)  ‚Üê SLASH CHORD
[24] D
[25] G
[26] A
[27] A
[28] A/G#                           (bass: G#)  ‚Üê SLASH CHORD
[29] A/F#                           (bass: F#)  ‚Üê SLASH CHORD
[30] A/E                            (bass: E)  ‚Üê SLASH CHORD
[31] D
[32] G
[33] C
[34] C
[35] Cmaj7
[36] C7
[37] F
[38]

## 4. Test Corrected Slash Chord Implementation

In [6]:
# CRITICAL FIX: Convert XML chord names to internal tokens FIRST!
print("\n" + "="*80)
print("STEP 1: Converting XML chord names to internal tokens (replaceTheseChords)")
print("="*80)

import xmlTranslator as xmlT
import importlib
importlib.reload(xmlT)

# Convert XML chord names like 'dominant' ‚Üí 'dom7', 'major' ‚Üí 'maj', etc.
corrected_sequence = xmlT.replaceTheseChords([something_tokenized], False)
something_tokenized_corrected = corrected_sequence[0]

print(f"‚úì Converted: {len(something_tokenized)} ‚Üí {len(something_tokenized_corrected)} tokens")
print("="*80 + "\n")

# STEP 2: Add 'maj' tokens to single notes
print("="*80)
print("STEP 2: Adding 'maj' tokens to single notes")
print("="*80)

from voicing import Voicing as TempVoicing
voicing_temp = TempVoicing()

# Process the sequence to add 'maj' after single notes
processed_sequence = []
added_maj_count = 0

for i, token in enumerate(something_tokenized_corrected):
    processed_sequence.append(token)
    
    # Check if this is a single note that needs 'maj'
    if i < len(something_tokenized_corrected) - 1:
        next_token = something_tokenized_corrected[i + 1]
        previous = something_tokenized_corrected[i - 1] if i > 0 else ''
        
        # If current token is a note, and next is structural, and previous wasn't slash
        if token in voicing_temp.all_notes and next_token != 'N.C.' and previous != '/' \
           and (next_token in voicing_temp.structural_elements or next_token.startswith('Form_')):
            processed_sequence.append('maj')
            added_maj_count += 1

print(f"‚úì Added {added_maj_count} 'maj' tokens")
print(f"‚úì Tokens: {len(something_tokenized_corrected)} ‚Üí {len(processed_sequence)}")
print("="*80 + "\n")

# STEP 3: Convert to voicing with FULLY preprocessed sequence
# FORCE RELOAD - delete from sys.modules completely
for mod_name in list(sys.modules.keys()):
    if 'voicing' in mod_name.lower():
        del sys.modules[mod_name]

import voicing as voicing_module

# Create new Voicing instance
voicing_corrected = voicing_module.Voicing()

# Convert "Something" with FULLY PREPROCESSED sequence
something_midi_corrected, status = voicing_corrected.convert_chords_to_voicing(processed_sequence)
print(f"Conversion status: {status}")
print(f"Converted {len(something_midi_corrected)} events")

# Count REAL chords (3+ notes)
real_chords = sum(1 for m,d,l in something_midi_corrected if len([n for n in m if n>0])>=3)
print(f"‚úì REAL CHORDS WITH 3+ NOTES: {real_chords}")

print("\nTesting corrected slash chord implementation:")
print("="*90)

# Find and display first 5 slash chords
slash_indices = [i for i, item in enumerate(something_midi_corrected) if item[2] == '/']
print(f"Found {len(slash_indices)} slash markers\n")

for slash_num, idx in enumerate(slash_indices[:5], 1):
    print(f"--- SLASH CHORD #{slash_num} at index {idx} ---")
    
    # Show chord before slash, slash marker, and bass note after
    for i in range(max(0, idx-2), min(len(something_midi_corrected), idx+3)):
        midi, duration, label = something_midi_corrected[i]
        
        if i == idx-2:
            # Root note
            note_name = pretty_midi.note_number_to_name(midi[0]) if midi[0] > 0 else "silence"
            print(f"[{i:>3}]  {str(label):<20} {str(midi):<50} ‚Üê Root: {note_name}")
        elif i == idx-1:
            # Full chord
            notes = [pretty_midi.note_number_to_name(m) for m in midi if m > 0]
            print(f"[{i:>3}]  {str(label):<20} {str(midi):<50} ‚Üê Chord: {notes}")
        elif i == idx:
            print(f"[{i:>3}]  {str(label):<20} {str(midi):<50} ‚Üê SLASH MARKER")
        elif i == idx+1:
            # Slash bass note (should have full voicing with new bass + moved root)
            notes = [pretty_midi.note_number_to_name(m) for m in midi if m > 0]
            note_count = len(notes)
            print(f"[{i:>3}]  {str(label):<20} {str(midi):<50} ‚Üê BASS+CHORD: {notes}")
            if len(notes) > 1:
                print(f"       ‚Üí {note_count} notes: Bass={notes[0]}, OldRoot+12={notes[1]}")
    print()

print("="*90)
print("‚úì Expected: Each slash chord should have:")
print("  1. New bass note at [0]")
print("  2. Old root moved up octave (+12) at [1]")
print("  3. Rest of chord voicing preserved")



STEP 1: Converting XML chord names to internal tokens (replaceTheseChords)


  0%|          | 0/1 [00:00<?, ?it/s]

‚úì Converted: 300 ‚Üí 300 tokens

STEP 2: Adding 'maj' tokens to single notes
‚úì Added 0 'maj' tokens
‚úì Tokens: 300 ‚Üí 300

Conversion status: True
Converted 55 events
‚úì REAL CHORDS WITH 3+ NOTES: 53

Testing corrected slash chord implementation:
Found 0 slash markers

‚úì Expected: Each slash chord should have:
  1. New bass note at [0]
  2. Old root moved up octave (+12) at [1]
  3. Rest of chord voicing preserved


In [7]:
# ACTUALLY LOOK AT THE DATA STRUCTURE - Stop making blind assumptions!
print("="*100)
print("RAW DATA INSPECTION - What's ACTUALLY in something_midi_corrected?")
print("="*100)

print(f"\nTotal elements: {len(something_midi_corrected)}")
print("\nFirst 50 elements with ALL details:\n")

for i in range(min(50, len(something_midi_corrected))):
    midi, duration, label = something_midi_corrected[i]
    note_count = len([n for n in midi if n > 0])
    notes = [pretty_midi.note_number_to_name(n) for n in midi if n > 0]
    
    # Color code by type
    if midi == [0, 0, 0, 0, 0, 0, 0, 0]:
        marker = "‚Üê EMPTY"
    elif note_count == 1:
        marker = "‚Üê SINGLE NOTE (root?)"
    elif note_count >= 3:
        marker = "‚Üê FULL CHORD ‚úì"
    else:
        marker = "‚Üê PARTIAL"
    
    print(f"[{i:3d}] label='{label:<15}' dur={duration:4.1f} notes={note_count} {str(notes):<40} {marker}")

# Count by type
empty = sum(1 for m,d,l in something_midi_corrected if m == [0,0,0,0,0,0,0,0])
single = sum(1 for m,d,l in something_midi_corrected if len([n for n in m if n>0])==1)
full = sum(1 for m,d,l in something_midi_corrected if len([n for n in m if n>0])>=3)
partial = sum(1 for m,d,l in something_midi_corrected if 1<len([n for n in m if n>0])<3)

print(f"\n{'='*100}")
print(f"SUMMARY:")
print(f"  Empty MIDI:     {empty}")
print(f"  Single notes:   {single}")
print(f"  Partial (2):    {partial}")
print(f"  Full chords(3+): {full}")
print(f"{'='*100}")

RAW DATA INSPECTION - What's ACTUALLY in something_midi_corrected?

Total elements: 55

First 50 elements with ALL details:

[  0] label='maj            ' dur= 2.0 notes=3 ['F3', 'A3', 'C4']                       ‚Üê FULL CHORD ‚úì
[  1] label='maj            ' dur= 1.0 notes=3 ['D#3', 'G3', 'A#3']                     ‚Üê FULL CHORD ‚úì
[  2] label='D              ' dur= 1.0 notes=4 ['D3', 'G3', 'B3', 'F4']                 ‚Üê FULL CHORD ‚úì
[  3] label='maj            ' dur= 4.0 notes=4 ['C3', 'G3', 'C4', 'E4']                 ‚Üê FULL CHORD ‚úì
[  4] label='maj7           ' dur= 4.0 notes=4 ['C3', 'G3', 'B3', 'E4']                 ‚Üê FULL CHORD ‚úì
[  5] label='dom7           ' dur= 4.0 notes=4 ['C3', 'G3', 'A#3', 'E4']                ‚Üê FULL CHORD ‚úì
[  6] label='maj            ' dur= 3.0 notes=4 ['F3', 'C3', 'A3', 'F4']                 ‚Üê FULL CHORD ‚úì
[  7] label='E              ' dur= 1.0 notes=5 ['E3', 'A3', 'C4', 'F4', 'F4']           ‚Üê FULL CHORD ‚úì
[  8] label='dom7  

In [8]:
# DIAGNOSTIC: Check what the conversion did
print("="*80)
print("DIAGNOSTIC: Checking conversion results")
print("="*80)

# Show first 30 tokens before and after conversion
print("\nBEFORE replaceTheseChords (first 30 tokens):")
for i, token in enumerate(something_tokenized[:30]):
    print(f"  [{i:2d}] {token}")

print("\nAFTER replaceTheseChords (first 30 tokens):")
for i, token in enumerate(something_tokenized_corrected[:30]):
    print(f"  [{i:2d}] {token}")

print("\nAFTER adding 'maj' tokens (first 30 tokens):")
for i, token in enumerate(processed_sequence[:30]):
    print(f"  [{i:2d}] {token}")

print("\n" + "="*80)


DIAGNOSTIC: Checking conversion results

BEFORE replaceTheseChords (first 30 tokens):
  [ 0] <style>
  [ 1] Rock Pop
  [ 2] Form_intro
  [ 3] |
  [ 4] .
  [ 5] 2.0
  [ 6] F
  [ 7] major
  [ 8] .
  [ 9] 1.0
  [10] Eb
  [11] major
  [12] .
  [13] 1.0
  [14] G
  [15] dominant
  [16] /
  [17] D
  [18] Form_Segno
  [19] |:
  [20] .
  [21] 4.0
  [22] C
  [23] major
  [24] |
  [25] .
  [26] 4.0
  [27] C
  [28] major-seventh
  [29] |

AFTER replaceTheseChords (first 30 tokens):
  [ 0] <style>
  [ 1] Rock Pop
  [ 2] Form_intro
  [ 3] |
  [ 4] .
  [ 5] 2.0
  [ 6] F
  [ 7] maj
  [ 8] .
  [ 9] 1.0
  [10] Eb
  [11] maj
  [12] .
  [13] 1.0
  [14] G
  [15] dom7
  [16] /
  [17] D
  [18] Form_Segno
  [19] |:
  [20] .
  [21] 4.0
  [22] C
  [23] maj
  [24] |
  [25] .
  [26] 4.0
  [27] C
  [28] maj7
  [29] |

AFTER adding 'maj' tokens (first 30 tokens):
  [ 0] <style>
  [ 1] Rock Pop
  [ 2] Form_intro
  [ 3] |
  [ 4] .
  [ 5] 2.0
  [ 6] F
  [ 7] maj
  [ 8] .
  [ 9] 1.0
  [10] Eb
  [11] maj
  [12] .
  [13]

In [9]:
# ROOT CAUSE ANALYSIS: Why is export_to_midi broken?
print("="*100)
print("EXPORT FUNCTION DEBUG - Understanding the BROKEN logic")
print("="*100)

print("\nThe export function looks for '.' markers and then searches FORWARD for the chord.")
print("Let's see what it's actually capturing:\n")

# Simulate what export_to_midi does
after_chords = {'.', '|', ':|', '|:', 'N.C.', '<end>', '/'}
dot_captures = []

for i, element in enumerate(something_midi_corrected):
    chord = element[2]
    
    if chord == '.' and i < len(something_midi_corrected) - 2:
        ref = i
        counter = 0
        doIt = True
        
        # Look ahead to find next non-structural element
        while doIt and ref < len(something_midi_corrected)-1:       
            counter += 1 
            ref += 1
            next_element = something_midi_corrected[ref]
            next_label = next_element[2]
            
            if next_label in after_chords or str(next_label).startswith('Form_'):
                doIt = False
                counter -= 1
        
        if counter > 0:
            captured = something_midi_corrected[i+counter]
            midi_data = captured[0]
            label = captured[2]
            note_count = len([n for n in midi_data if n > 0])
            
            dot_captures.append({
                'dot_index': i,
                'captured_index': i+counter,
                'label': label,
                'midi': midi_data,
                'note_count': note_count
            })
            
            if len(dot_captures) <= 10:
                print(f"Dot at [{i:3d}] ‚Üí captured [{i+counter:3d}] '{label}': {midi_data[:4]}... ({note_count} notes)")

print(f"\nTotal dots processed: {len(dot_captures)}")
print(f"\nPROBLEM DIAGNOSIS:")

# Check what labels are being captured
captured_labels = {}
for cap in dot_captures:
    label = cap['label']
    captured_labels[label] = captured_labels.get(label, 0) + 1

print(f"\nWhat the export function is capturing:")
for label, count in captured_labels.items():
    print(f"  {label}: {count} times")

# Check for full chord voicings being skipped
print(f"\n\nWhat's in the FULL converted data (showing all chord types):")
full_chords = []
for i, (midi, duration, label) in enumerate(something_midi_corrected):
    if midi != [0, 0, 0, 0, 0, 0, 0, 0]:
        note_count = len([n for n in midi if n > 0])
        if note_count >= 3:  # Real chords
            full_chords.append((i, label, midi, note_count))

print(f"\nTotal REAL chords in converted data: {len(full_chords)}")
print("First 10 real chords:")
for i, label, midi, note_count in full_chords[:10]:
    notes = [pretty_midi.note_number_to_name(n) for n in midi if n > 0]
    print(f"  [{i:3d}] '{label}': {notes} ({note_count} notes)")

print(f"\n{'='*100}")
print("THE BUG: Export function's 'dot logic' is NOT finding the full chord voicings!")
print("It's only capturing root notes or partial data, not the complete chords.")
print("='*100}")

EXPORT FUNCTION DEBUG - Understanding the BROKEN logic

The export function looks for '.' markers and then searches FORWARD for the chord.
Let's see what it's actually capturing:


Total dots processed: 0

PROBLEM DIAGNOSIS:

What the export function is capturing:


What's in the FULL converted data (showing all chord types):

Total REAL chords in converted data: 53
First 10 real chords:
  [  0] 'maj': ['F3', 'A3', 'C4'] (3 notes)
  [  1] 'maj': ['D#3', 'G3', 'A#3'] (3 notes)
  [  2] 'D': ['D3', 'G3', 'B3', 'F4'] (4 notes)
  [  3] 'maj': ['C3', 'G3', 'C4', 'E4'] (4 notes)
  [  4] 'maj7': ['C3', 'G3', 'B3', 'E4'] (4 notes)
  [  5] 'dom7': ['C3', 'G3', 'A#3', 'E4'] (4 notes)
  [  6] 'maj': ['F3', 'C3', 'A3', 'F4'] (4 notes)
  [  7] 'E': ['E3', 'A3', 'C4', 'F4', 'F4'] (5 notes)
  [  8] 'dom7': ['D3', 'A3', 'C4', 'F#4'] (4 notes)
  [  9] 'dom7': ['G3', 'D3', 'B3', 'F4'] (4 notes)

THE BUG: Export function's 'dot logic' is NOT finding the full chord voicings!
It's only capturing root notes 

## 5. Export and Verify MIDI

Export the corrected MIDI and verify the slash chords are correct in the final output.

In [10]:
# Use the WORKING export_to_midi method from voicing.py
output_path = "../dataset/midi_files/mpe/TESTING_Something_FIXED.mid"

print("Exporting 'Something' to MIDI using voicing.export_to_midi()...")
voicing_corrected.export_to_midi(
    something_midi_corrected,
    "TESTING_Something_FIXED",
    "../dataset/midi_files/mpe/"
)

print(f"‚úì Exported to: {output_path}")

# Load and verify
midi_data = pretty_midi.PrettyMIDI(output_path)
print(f"\nMIDI File: TESTING_Something_FIXED.mid")
print(f"Duration: {midi_data.get_end_time():.2f} seconds")
print(f"Total instruments (MPE channels): {len(midi_data.instruments)}")

# Collect all notes
all_notes = []
for inst in midi_data.instruments:
    all_notes.extend(inst.notes)
all_notes = sorted(all_notes, key=lambda n: n.start)

print(f"Total notes: {len(all_notes)}")

# Show chord summary - group by time
if len(all_notes) > 0:
    print("\nChord progression (first 10 chords):")
    from itertools import groupby
    chord_count = 0
    for time, notes_at_time in groupby(all_notes, key=lambda n: round(n.start, 2)):
        notes_list = list(notes_at_time)
        pitches = [pretty_midi.note_number_to_name(n.pitch) for n in notes_list]
        print(f"  Time {time:6.2f}s: {pitches}")
        chord_count += 1
        if chord_count >= 10:
            break
    
    print(f"\n‚úì SUCCESS! Song has {chord_count}+ chords")
else:
    print("\n‚ö†Ô∏è  NO NOTES IN MIDI FILE!")

Exporting 'Something' to MIDI using voicing.export_to_midi()...
‚úì MIDI file created: TESTING_Something_FIXED.mid
‚úì Exported to: ../dataset/midi_files/mpe/TESTING_Something_FIXED.mid

MIDI File: TESTING_Something_FIXED.mid
Duration: 124.00 seconds
Total instruments (MPE channels): 1
Total notes: 201

Chord progression (first 10 chords):
  Time   0.00s: ['F3', 'A3', 'C4']
  Time   2.00s: ['D#3', 'G3', 'A#3']
  Time   3.00s: ['D3', 'G3', 'B3', 'F4']
  Time   4.00s: ['C3', 'G3', 'C4', 'E4']
  Time   8.00s: ['C3', 'G3', 'B3', 'E4']
  Time  12.00s: ['C3', 'G3', 'A#3', 'E4']
  Time  16.00s: ['F3', 'C3', 'A3', 'F4']
  Time  19.00s: ['E3', 'A3', 'C4', 'F4']
  Time  20.00s: ['D3', 'A3', 'C4', 'F#4']
  Time  24.00s: ['G3', 'D3', 'B3', 'F4']

‚úì SUCCESS! Song has 10+ chords


In [11]:
# TIMING DIAGNOSTIC: Check durations from XML through conversion
print("="*100)
print("TIMING ANALYSIS - Tracking chord durations from XML to MIDI")
print("="*100)

# 1. Check XML durations
print("\n1. DURATIONS FROM XML PARSING:")
print("-"*100)
something_durations = theDurationsDataset[something_id]
print(f"Total duration entries: {len(something_durations)}")
print(f"First 20 durations: {something_durations[:20]}")

# 2. Check token sequence with durations
print("\n2. TOKEN SEQUENCE WITH DURATIONS:")
print("-"*100)
print("First 50 tokens showing dots and durations:")
for i in range(min(50, len(something_tokenized_corrected))):
    token = something_tokenized_corrected[i]
    if token == '.' or token in voicing_temp.durations:
        print(f"  [{i:3d}] {token}")

# 3. Check converted MIDI events with durations
print("\n3. CONVERTED MIDI EVENTS WITH DURATIONS:")
print("-"*100)
print("First 20 events with their durations:")
for i in range(min(20, len(something_midi_corrected))):
    midi, duration, label = something_midi_corrected[i]
    note_count = len([n for n in midi if n > 0])
    print(f"  [{i:3d}] label='{label:<15}' duration={duration:6.3f}s  notes={note_count}")

# 4. Calculate cumulative timing
print("\n4. CUMULATIVE TIMING (first 10 chords):")
print("-"*100)
cumulative_time = 0.0
chord_count = 0
for i, (midi, duration, label) in enumerate(something_midi_corrected):
    if len([n for n in midi if n > 0]) >= 3:  # Real chords
        print(f"  Chord {chord_count+1}: starts at {cumulative_time:6.2f}s, duration={duration:6.3f}s, label='{label}'")
        cumulative_time += duration
        chord_count += 1
        if chord_count >= 10:
            break

# 5. Check actual MIDI file timing
print("\n5. ACTUAL MIDI FILE TIMING:")
print("-"*100)
midi_data = pretty_midi.PrettyMIDI(output_path)
all_notes = []
for inst in midi_data.instruments:
    all_notes.extend(inst.notes)
all_notes = sorted(all_notes, key=lambda n: n.start)

from itertools import groupby
print("First 10 chords in MIDI file:")
chord_count = 0
prev_time = 0.0
for time, notes_at_time in groupby(all_notes, key=lambda n: round(n.start, 2)):
    notes_list = list(notes_at_time)
    pitches = [pretty_midi.note_number_to_name(n.pitch) for n in notes_list]
    delta = time - prev_time if chord_count > 0 else 0
    print(f"  Chord {chord_count+1}: time={time:6.2f}s (Œî={delta:5.2f}s) {pitches}")
    prev_time = time
    chord_count += 1
    if chord_count >= 10:
        break

print("\n" + "="*100)

TIMING ANALYSIS - Tracking chord durations from XML to MIDI

1. DURATIONS FROM XML PARSING:
----------------------------------------------------------------------------------------------------
Total duration entries: 300
First 20 durations: [0. 0. 0. 0. 2. 2. 2. 2. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0.]

2. TOKEN SEQUENCE WITH DURATIONS:
----------------------------------------------------------------------------------------------------
First 50 tokens showing dots and durations:
  [  4] .
  [  5] 2.0
  [  8] .
  [  9] 1.0
  [ 12] .
  [ 13] 1.0
  [ 20] .
  [ 21] 4.0
  [ 25] .
  [ 26] 4.0
  [ 30] .
  [ 31] 4.0
  [ 35] .
  [ 36] 3.0
  [ 39] .
  [ 40] 1.0
  [ 46] .
  [ 47] 4.0

3. CONVERTED MIDI EVENTS WITH DURATIONS:
----------------------------------------------------------------------------------------------------
First 20 events with their durations:
  [  0] label='maj            ' duration= 2.000s  notes=3
  [  1] label='maj            ' duration= 1.000s  notes=3
  [  2] label='D      

In [12]:
# SIMULATE export_to_midi to see what it captures
print("="*100)
print("SIMULATING export_to_midi() - What durations are being captured?")
print("="*100)

midi_capture = []

# Extract all real chord voicings from the sequence (same logic as export_to_midi)
for i, (midi, duration, label) in enumerate(something_midi_corrected):
    # Skip empty MIDI data
    if midi == [0, 0, 0, 0, 0, 0, 0, 0]:
        continue
    
    # Check if this is a chord nature OR a slash chord bass note
    if label in voicing_corrected.natures:
        midi_capture.append((midi, duration, label, i))
    elif len([n for n in midi if n > 0]) >= 3:
        # This has 3+ notes - check if previous element was a slash
        if i > 0 and something_midi_corrected[i-1][2] == '/':
            midi_capture.append((midi, duration, label, i))

print(f"\nCaptured {len(midi_capture)} chords for export")
print("\nFirst 15 captured chords with their durations:")
for idx, (midi, duration, label, orig_idx) in enumerate(midi_capture[:15]):
    notes = [pretty_midi.note_number_to_name(n) for n in midi if n > 0]
    print(f"  Capture[{idx:2d}] from seq[{orig_idx:3d}]: dur={duration:5.2f}s  label='{label:<10}' {notes}")

# Calculate cumulative time as export_to_midi does
print("\nCumulative timing (as export_to_midi calculates it):")
time = 0.0
for idx, (midi, duration, label, orig_idx) in enumerate(midi_capture[:15]):
    notes = [pretty_midi.note_number_to_name(n) for n in midi if n > 0]
    print(f"  Chord {idx+1}: time={time:6.2f}s, duration={duration:5.2f}s ‚Üí {notes}")
    time += duration

print("\n" + "="*100)

SIMULATING export_to_midi() - What durations are being captured?

Captured 35 chords for export

First 15 captured chords with their durations:
  Capture[ 0] from seq[  0]: dur= 2.00s  label='maj       ' ['F3', 'A3', 'C4']
  Capture[ 1] from seq[  1]: dur= 1.00s  label='maj       ' ['D#3', 'G3', 'A#3']
  Capture[ 2] from seq[  3]: dur= 4.00s  label='maj       ' ['C3', 'G3', 'C4', 'E4']
  Capture[ 3] from seq[  4]: dur= 4.00s  label='maj7      ' ['C3', 'G3', 'B3', 'E4']
  Capture[ 4] from seq[  5]: dur= 4.00s  label='dom7      ' ['C3', 'G3', 'A#3', 'E4']
  Capture[ 5] from seq[  6]: dur= 3.00s  label='maj       ' ['F3', 'C3', 'A3', 'F4']
  Capture[ 6] from seq[  8]: dur= 4.00s  label='dom7      ' ['D3', 'A3', 'C4', 'F#4']
  Capture[ 7] from seq[  9]: dur= 1.00s  label='dom7      ' ['G3', 'D3', 'B3', 'F4']
  Capture[ 8] from seq[ 12]: dur= 2.00s  label='m         ' ['A3', 'C4', 'E4']
  Capture[ 9] from seq[ 13]: dur= 2.00s  label='m         ' ['A3', 'C4', 'E4']
  Capture[10] from seq[ 16

In [13]:
# BRUTAL HONESTY DIAGNOSTIC: Compare what we WANTED vs what we GOT
print("="*100)
print("STUPIDITY DIAGNOSTIC REPORT")
print("="*100)

# 1. What we WANTED (from converted data)
print("\n1. EXPECTED CHORDS (from convert_chords_to_voicing):")
print("-"*100)
expected_chords = []
for i, (midi, duration, label) in enumerate(something_midi_corrected):
    # Only count actual chord voicings (skip structural markers)
    if midi != [0, 0, 0, 0, 0, 0, 0, 0] and len([n for n in midi if n > 0]) >= 3:
        notes = [pretty_midi.note_number_to_name(n) for n in midi if n > 0]
        expected_chords.append((i, notes, midi))
        if len(expected_chords) <= 15:
            print(f"  [{i:3d}] {str(notes):<50} {midi}")

print(f"\nTotal expected chords: {len(expected_chords)}")

# 2. What we GOT (from MIDI file)
print("\n2. ACTUAL MIDI FILE OUTPUT:")
print("-"*100)
try:
    midi_data = pretty_midi.PrettyMIDI(output_path)
    
    # Collect all notes
    all_notes = []
    for inst in midi_data.instruments:
        all_notes.extend(inst.notes)
    all_notes = sorted(all_notes, key=lambda n: (n.start, n.pitch))
    
    # Group by start time
    from itertools import groupby
    actual_chords = []
    for time, notes_at_time in groupby(all_notes, key=lambda n: round(n.start, 3)):
        notes_list = list(notes_at_time)
        pitches = [n.pitch for n in notes_list]
        pitch_names = [pretty_midi.note_number_to_name(p) for p in pitches]
        actual_chords.append((time, pitch_names, pitches))
        if len(actual_chords) <= 15:
            print(f"  Time {time:6.2f}s: {str(pitch_names):<50} {pitches}")
    
    print(f"\nTotal actual chords: {len(actual_chords)}")
    
    # 3. BRUTAL COMPARISON
    print("\n3. REALITY CHECK:")
    print("-"*100)
    
    expected_count = len(expected_chords)
    actual_count = len(actual_chords)
    missing_chords = expected_count - actual_count
    
    print(f"  Expected: {expected_count} chords")
    print(f"  Got:      {actual_count} chords")
    print(f"  Missing:  {missing_chords} chords ({100*missing_chords/expected_count:.1f}% loss!)")
    
    if len(actual_chords) == 0:
        print("\n  ‚ùå MIDI FILE IS COMPLETELY EMPTY!")
        print("  ‚ùå EXPORT FUNCTION WROTE NOTHING!")
        print("  ‚ùå TOTAL FAILURE!")
        stupidity_score = 100
        
    elif len(actual_chords) == 1:
        print("\n  ‚ùå MIDI FILE HAS ONLY ONE CHORD!")
        print("  ‚ùå Lost 99.9% of the song!")
        stupidity_score = 100
        
    elif missing_chords > expected_count * 0.9:
        print(f"\n  ‚ùå LOST {100*missing_chords/expected_count:.0f}% OF THE SONG!")
        print("  ‚ùå EXPORT FUNCTION IS COMPLETELY BROKEN!")
        stupidity_score = 100
        
    elif missing_chords > expected_count * 0.5:
        print(f"\n  ‚ùå LOST MORE THAN HALF THE CHORDS!")
        print("  ‚ùå EXPORT LOGIC IS FUNDAMENTALLY BROKEN!")
        stupidity_score = 95
        
    else:
        # Check if chords actually match
        mismatches = 0
        for i in range(min(len(expected_chords), len(actual_chords))):
            exp_midi = expected_chords[i][2]
            act_midi = actual_chords[i][2]
            exp_notes = sorted([n for n in exp_midi if n > 0])
            act_notes = sorted(act_midi)
            if exp_notes != act_notes:
                mismatches += 1
                if mismatches <= 3:
                    print(f"  ‚ùå Chord {i}: Expected {exp_notes} but got {act_notes}")
        
        if mismatches > expected_count * 0.5:
            print(f"\n  ‚ùå {mismatches}/{expected_count} chords are WRONG!")
            print("  ‚ùå VOICING IS COMPLETELY BROKEN!")
            stupidity_score = 90
        elif mismatches > expected_count * 0.2:
            print(f"\n  ‚ö†Ô∏è  {mismatches}/{expected_count} chords don't match")
            stupidity_score = 70
        elif mismatches > 0:
            print(f"\n  ‚ö†Ô∏è  {mismatches} chords have differences")
            stupidity_score = 30
        else:
            print("\n  ‚úì Chords look correct!")
            stupidity_score = 5
    
    print(f"\n{'='*100}")
    print(f"STUPIDITY SCORE: {stupidity_score}/100")
    print(f"{'='*100}")
    
    if stupidity_score >= 90:
        print("\nüí©üí©üí© VERDICT: CATASTROPHICALLY BROKEN")
        print("The export function is writing almost nothing. The 'continue' fix broke everything.")
        print("Need to revert to original export logic and fix the ROOT CAUSE of empty MIDI chords.")
        
    elif stupidity_score >= 70:
        print("\nüí©üí© VERDICT: SERIOUSLY BROKEN")
        print("Export produces wrong chords. Voicing logic is broken.")
        
    elif stupidity_score >= 30:
        print("\nüí© VERDICT: PARTIALLY BROKEN")
        print("Has some issues but mostly works.")
        
    else:
        print("\n‚úì VERDICT: WORKING")
        
except Exception as e:
    print(f"\n‚ùå ERROR LOADING MIDI: {e}")
    import traceback
    traceback.print_exc()
    stupidity_score = 100
    print(f"\nSTUPIDITY SCORE: {stupidity_score}/100 - CAN'T EVEN LOAD THE FILE!")

STUPIDITY DIAGNOSTIC REPORT

1. EXPECTED CHORDS (from convert_chords_to_voicing):
----------------------------------------------------------------------------------------------------
  [  0] ['F3', 'A3', 'C4']                                 [53, 57, 60, 0, 0, 0, 0, 0]
  [  1] ['D#3', 'G3', 'A#3']                               [51, 55, 58, 0, 0, 0, 0, 0]
  [  2] ['D3', 'G3', 'B3', 'F4']                           [50, 55, 59, 65, 0, 0, 0, 0]
  [  3] ['C3', 'G3', 'C4', 'E4']                           [48, 55, 60, 64, 0, 0, 0, 0]
  [  4] ['C3', 'G3', 'B3', 'E4']                           [48, 55, 59, 64, 0, 0, 0, 0]
  [  5] ['C3', 'G3', 'A#3', 'E4']                          [48, 55, 58, 64, 0, 0, 0, 0]
  [  6] ['F3', 'C3', 'A3', 'F4']                           [53, 48, 57, 65, 0, 0, 0, 0]
  [  7] ['E3', 'A3', 'C4', 'F4', 'F4']                     [52, 57, 60, 65, 65, 0, 0, 0]
  [  8] ['D3', 'A3', 'C4', 'F#4']                          [50, 57, 60, 66, 0, 0, 0, 0]
  [  9] ['G3', 'D3', 'B3',

## Summary

**Slash Chord Fix**:
- Keep full chord voicing
- Move old root UP one octave (+12 semitones)
- Add new bass note at the beginning in bass range

**Example: G7/D**
- G7 chord: `[43, 65, 71]` (G2, F4, B4)
- Old root G2 (43) ‚Üí move up to G3 (55)
- Add D bass (50) at beginning
- **Result**: `[50, 55, 65, 71]` (D3, G3, F4, B4) ‚úì

In [14]:
# INTELLIGENT ANALYSIS: Compare current extraction vs. proper MusicXML parsing
print("="*100)
print("INTELLIGENT TIMING ANALYSIS - Why the durations are misaligned")
print("="*100)

import xml.etree.ElementTree as ET

xml_path = '../dataset/iRealXML/Something.xml'
tree = ET.parse(xml_path)
root = tree.getroot()

# Get divisions value (should be 768)
divisions_elem = root.find('.//divisions')
divisions = int(divisions_elem.text) if divisions_elem is not None else 1
print(f"\n‚úì MusicXML divisions per quarter note: {divisions}")

# PROPER EXTRACTION: Harmony + Duration pairs
print("\n" + "-"*100)
print("PROPER EXTRACTION (harmony-duration pairs as they appear in XML):")
print("-"*100)

proper_events = []
measures = root.findall('.//measure')

for measure in measures:
    measure_num = measure.get('number')
    children = list(measure)
    
    i = 0
    while i < len(children):
        child = children[i]
        
        if child.tag == 'harmony':
            # Extract chord
            root_step = child.find('.//root-step')
            root_alter = child.find('.//root-alter')
            kind = child.find('kind')
            bass = child.find('bass/bass-step')
            
            chord_root = root_step.text if root_step is not None else ''
            if root_alter is not None and root_alter.text:
                alter = int(root_alter.text)
                chord_root += '#' if alter == 1 else ('b' if alter == -1 else '')
            
            kind_text = kind.get('text', '') if kind is not None else ''
            chord = chord_root + kind_text
            if bass is not None:
                chord += '/' + bass.text
            
            # Find the NEXT note element to get duration
            duration_divisions = None
            j = i + 1
            while j < len(children):
                if children[j].tag == 'note':
                    dur_elem = children[j].find('duration')
                    if dur_elem is not None:
                        duration_divisions = int(dur_elem.text)
                        break
                j += 1
            
            if duration_divisions:
                duration_beats = duration_divisions / divisions
                proper_events.append({
                    'measure': measure_num,
                    'chord': chord,
                    'divisions': duration_divisions,
                    'beats': duration_beats
                })
        
        i += 1

print(f"\nTotal chord events with proper timing: {len(proper_events)}")
print("\nFirst 15 events (CORRECT extraction):")
for i, event in enumerate(proper_events[:15]):
    print(f"  M{event['measure']:>2} | {event['chord']:<12} | {event['divisions']:>4} divs = {event['beats']:.2f} beats")

# COMPARE with current system
print("\n" + "-"*100)
print("CURRENT SYSTEM OUTPUT (from theDurationsDataset):")
print("-"*100)

something_durations = theDurationsDataset[something_id]
print(f"\nTotal duration entries: {len(something_durations)}")
print(f"First 30 durations: {something_durations[:30]}")

# Show the mismatch
print("\n" + "-"*100)
print("DIAGNOSIS:")
print("-"*100)
print(f"‚úó Current system extracts durations SEPARATELY from chords")
print(f"‚úó This breaks the harmony-duration correlation")
print(f"‚úó The xmlTranslator.py extracts ALL <note> durations, not just harmony-linked ones")
print(f"\nCurrent durations dataset has {len(something_durations)} entries for {len(proper_events)} chords")
print(f"That's {len(something_durations) / len(proper_events):.1f}x too many duration entries!")

print("\n" + "="*100)
print("SOLUTION: Use proper MusicXML parsing that links <harmony> to its <note> duration")
print("="*100)

INTELLIGENT TIMING ANALYSIS - Why the durations are misaligned

‚úì MusicXML divisions per quarter note: 768

----------------------------------------------------------------------------------------------------
PROPER EXTRACTION (harmony-duration pairs as they appear in XML):
----------------------------------------------------------------------------------------------------

Total chord events with proper timing: 55

First 15 events (CORRECT extraction):
  M 1 | F            | 1536 divs = 2.00 beats
  M 1 | Eb           |  768 divs = 1.00 beats
  M 1 | G7/D         |  768 divs = 1.00 beats
  M 2 | C            | 3072 divs = 4.00 beats
  M 3 | Cmaj7        | 3072 divs = 4.00 beats
  M 4 | C7           | 3072 divs = 4.00 beats
  M 5 | F            | 2304 divs = 3.00 beats
  M 5 | F/E          |  768 divs = 1.00 beats
  M 6 | D7           | 3072 divs = 4.00 beats
  M 7 | G7           |  768 divs = 1.00 beats
  M 7 | G7/A         |  768 divs = 1.00 beats
  M 7 | G7/B         | 1536 divs =

In [15]:
# TEST THE FIXED xmlTranslator - Reload and verify durations are correct
print("="*100)
print("TESTING FIXED XML PARSER")
print("="*100)

# Force reload the fixed xmlTranslator
import importlib
import xmlTranslator as xmlT
importlib.reload(xmlT)

# Reload the dataset with the FIXED parser
print("\n1. Reloading dataset with FIXED parser...")
from utils import get_project_root
directory = '/dataset/iRealXML'
root = get_project_root()
myPath = str(root) + str(directory)

theChordDataset_FIXED, theDurationsDataset_FIXED, all_meta_FIXED = xmlT.parse_info_from_XML(myPath)

# Find "Something" again
something_id_fixed = None
for i, meta in enumerate(all_meta_FIXED):
    if meta['song_name'] == 'Something':
        something_id_fixed = i
        break

print(f"‚úì Found 'Something' at index {something_id_fixed}")

# Check the FIXED durations
something_chords_fixed = theChordDataset_FIXED[something_id_fixed]
something_durations_fixed = theDurationsDataset_FIXED[something_id_fixed]

print(f"\n2. FIXED EXTRACTION RESULTS:")
print("-"*100)
print(f"   Chord tokens: {len(something_chords_fixed)}")
print(f"   Duration entries: {len(something_durations_fixed)}")

# Compare with expected (55 chords from proper XML parsing)
print(f"\n3. VALIDATION:")
print("-"*100)

# Count actual chord tokens (not structural elements)
structural = {'<style>', '|', ':|', '|:', 'Form_intro', 'Form_Segno', 'Form_Coda', 'Form_A', 'Form_B', 'Rock Pop'}
chord_count = 0
for token in something_chords_fixed:
    if token not in structural and not str(token).startswith('Form_') and token != '|' and token != ':|' and token != '|:':
        # This might be a chord or related token
        pass

# Show first 30 chord tokens with their durations
print("\nFirst 30 tokens with durations:")
for i in range(min(30, len(something_chords_fixed))):
    token = something_chords_fixed[i]
    dur = something_durations_fixed[i] if i < len(something_durations_fixed) else 'N/A'
    print(f"  [{i:2d}] {str(token):<20} duration={dur}")

# Check if durations look correct (should be 2.0, 1.0, 1.0, 4.0, etc.)
print(f"\n4. DURATION VALUES:")
print("-"*100)
non_zero_durations = [d for d in something_durations_fixed if d > 0]
print(f"   Non-zero durations: {len(non_zero_durations)}")
print(f"   First 20 non-zero: {non_zero_durations[:20]}")

# Expected from XML: 2.0, 1.0, 1.0, 4.0, 4.0, 4.0, 3.0, 1.0, 4.0, ...
expected_first = [2.0, 1.0, 1.0, 4.0, 4.0, 4.0, 3.0, 1.0, 4.0]
print(f"   Expected first 9:  {expected_first}")

if non_zero_durations[:9] == expected_first:
    print("\n   ‚úì DURATIONS MATCH EXPECTED VALUES!")
else:
    print("\n   ‚ö†Ô∏è Durations don't match - checking further...")

print("\n" + "="*100)

TESTING FIXED XML PARSER

1. Reloading dataset with FIXED parser...


  0%|          | 0/4005 [00:00<?, ?it/s]

(4005,) (4005,) (4005,)
‚úì Found 'Something' at index 3986

2. FIXED EXTRACTION RESULTS:
----------------------------------------------------------------------------------------------------
   Chord tokens: 300
   Duration entries: 300

3. VALIDATION:
----------------------------------------------------------------------------------------------------

First 30 tokens with durations:
  [ 0] <style>              duration=0.0
  [ 1] Rock Pop             duration=0.0
  [ 2] Form_intro           duration=0.0
  [ 3] |                    duration=0.0
  [ 4] .                    duration=2.0
  [ 5] 2.0                  duration=2.0
  [ 6] F                    duration=2.0
  [ 7] major                duration=2.0
  [ 8] .                    duration=1.0
  [ 9] 1.0                  duration=1.0
  [10] Eb                   duration=1.0
  [11] major                duration=1.0
  [12] .                    duration=1.0
  [13] 1.0                  duration=1.0
  [14] G                    duration=1.

In [16]:
# FINAL VALIDATION: Run full pipeline with FIXED durations
print("="*100)
print("FINAL PIPELINE TEST WITH FIXED XML PARSER")
print("="*100)

# Use the FIXED dataset
something_fixed = theChordDataset_FIXED[something_id_fixed]

# Step 1: replaceTheseChords
import xmlTranslator as xmlT
corrected = xmlT.replaceTheseChords([something_fixed], False)[0]
print(f"‚úì Step 1: replaceTheseChords - {len(corrected)} tokens")

# Step 2: Add 'maj' tokens
for mod_name in list(sys.modules.keys()):
    if 'voicing' in mod_name.lower():
        del sys.modules[mod_name]
import voicing as voicing_module
voicing_new = voicing_module.Voicing()

processed = []
for i, token in enumerate(corrected):
    processed.append(token)
    if i < len(corrected) - 1:
        next_token = corrected[i + 1]
        previous = corrected[i - 1] if i > 0 else ''
        if token in voicing_new.all_notes and next_token != 'N.C.' and previous != '/' \
           and (next_token in voicing_new.structural_elements or str(next_token).startswith('Form_')):
            processed.append('maj')

print(f"‚úì Step 2: Add 'maj' tokens - {len(processed)} tokens")

# Step 3: Convert to voicing
midi_result, status = voicing_new.convert_chords_to_voicing(processed)
print(f"‚úì Step 3: convert_chords_to_voicing - {len(midi_result)} events")

# Count real chords
real = sum(1 for m,d,l in midi_result if len([n for n in m if n>0])>=3)
print(f"‚úì Real chords (3+ notes): {real}")

# Check timing of first 10 chords
print(f"\n{'='*100}")
print("CHORD TIMING CHECK:")
print("-"*100)
cumulative = 0.0
chord_num = 0
for i, (midi, dur, label) in enumerate(midi_result):
    if len([n for n in midi if n > 0]) >= 3:
        notes = [pretty_midi.note_number_to_name(n) for n in midi if n > 0]
        print(f"  Chord {chord_num+1:2d}: time={cumulative:6.2f}s  dur={dur:4.1f}  {label:<12} {notes}")
        cumulative += dur
        chord_num += 1
        if chord_num >= 15:
            break

# Compare with expected timing
print(f"\n{'='*100}")
print("EXPECTED TIMING (from MusicXML):")
print("-"*100)
expected = [
    ("F", 2.0),
    ("Eb", 1.0),
    ("G7/D", 1.0),
    ("C", 4.0),
    ("Cmaj7", 4.0),
    ("C7", 4.0),
    ("F", 3.0),
    ("F/E", 1.0),
    ("D7", 4.0),
    ("G7", 1.0),
]
for chord, dur in expected:
    print(f"  {chord:<12} duration={dur:.1f} beats")

print("\n" + "="*100)

FINAL PIPELINE TEST WITH FIXED XML PARSER


  0%|          | 0/1 [00:00<?, ?it/s]

‚úì Step 1: replaceTheseChords - 300 tokens
‚úì Step 2: Add 'maj' tokens - 300 tokens
‚úì Step 3: convert_chords_to_voicing - 55 events
‚úì Real chords (3+ notes): 53

CHORD TIMING CHECK:
----------------------------------------------------------------------------------------------------
  Chord  1: time=  0.00s  dur= 2.0  maj          ['F3', 'A3', 'C4']
  Chord  2: time=  2.00s  dur= 1.0  maj          ['D#3', 'G3', 'A#3']
  Chord  3: time=  3.00s  dur= 1.0  D            ['D3', 'G3', 'B3', 'F4']
  Chord  4: time=  4.00s  dur= 4.0  maj          ['C3', 'G3', 'C4', 'E4']
  Chord  5: time=  8.00s  dur= 4.0  maj7         ['C3', 'G3', 'B3', 'E4']
  Chord  6: time= 12.00s  dur= 4.0  dom7         ['C3', 'G3', 'A#3', 'E4']
  Chord  7: time= 16.00s  dur= 3.0  maj          ['F3', 'C3', 'A3', 'F4']
  Chord  8: time= 19.00s  dur= 1.0  E            ['E3', 'A3', 'C4', 'F4', 'F4']
  Chord  9: time= 20.00s  dur= 4.0  dom7         ['D3', 'A3', 'C4', 'F#4']
  Chord 10: time= 24.00s  dur= 1.0  dom7       

In [17]:
# EXPORT AND FINAL VALIDATION
print("="*100)
print("EXPORTING FINAL MIDI WITH CORRECT TIMING")
print("="*100)

output_path = "../dataset/midi_files/mpe/TESTING_Something_PERFECT.mid"

voicing_new.export_to_midi(
    midi_result,
    "TESTING_Something_PERFECT",
    "../dataset/midi_files/mpe/"
)
print(f"‚úì Exported to: {output_path}")

# Load and verify
midi_data = pretty_midi.PrettyMIDI(output_path)
print(f"\nMIDI File Duration: {midi_data.get_end_time():.2f} seconds")

# Collect all notes and group by time
all_notes = []
for inst in midi_data.instruments:
    all_notes.extend(inst.notes)
all_notes = sorted(all_notes, key=lambda n: n.start)

print(f"Total notes: {len(all_notes)}")

# Show timing comparison
from itertools import groupby
print(f"\n{'='*100}")
print("MIDI FILE CHORD TIMING:")
print("-"*100)
chord_count = 0
prev_time = 0.0
for time, notes_at_time in groupby(all_notes, key=lambda n: round(n.start, 2)):
    notes_list = list(notes_at_time)
    pitches = [pretty_midi.note_number_to_name(n.pitch) for n in notes_list]
    delta = time - prev_time if chord_count > 0 else 0
    print(f"  Chord {chord_count+1:2d}: time={time:6.2f}s (Œî={delta:4.1f}s) {pitches}")
    prev_time = time
    chord_count += 1
    if chord_count >= 15:
        break

# FINAL STUPIDITY CHECK
print(f"\n{'='*100}")
print("FINAL STUPIDITY SCORE")
print("="*100)

# Expected cumulative times: 0, 2, 3, 4, 5, 9, 13, 17, 20, 21, 22, 26, 27, 28, 29...
expected_times = [0, 2, 3, 4, 5, 9, 13, 17, 20, 21, 22, 26, 27, 28, 29]

actual_times = []
for time, _ in groupby(all_notes, key=lambda n: round(n.start, 2)):
    actual_times.append(time)
    if len(actual_times) >= 15:
        break

timing_errors = 0
for i in range(min(len(expected_times), len(actual_times))):
    if abs(expected_times[i] - actual_times[i]) > 0.1:
        print(f"  ‚ö†Ô∏è Chord {i+1}: expected {expected_times[i]}s, got {actual_times[i]}s")
        timing_errors += 1

if timing_errors == 0:
    print("  ‚úÖ ALL CHORD TIMINGS ARE CORRECT!")
    print(f"\n  üéâ STUPIDITY SCORE: 0/100")
    print(f"  üéâ VERDICT: PERFECT!")
else:
    print(f"\n  STUPIDITY SCORE: {timing_errors * 10}/100")

print("\n" + "="*100)

EXPORTING FINAL MIDI WITH CORRECT TIMING
‚úì MIDI file created: TESTING_Something_PERFECT.mid
‚úì Exported to: ../dataset/midi_files/mpe/TESTING_Something_PERFECT.mid

MIDI File Duration: 124.00 seconds
Total notes: 201

MIDI FILE CHORD TIMING:
----------------------------------------------------------------------------------------------------
  Chord  1: time=  0.00s (Œî= 0.0s) ['F3', 'A3', 'C4']
  Chord  2: time=  2.00s (Œî= 2.0s) ['D#3', 'G3', 'A#3']
  Chord  3: time=  3.00s (Œî= 1.0s) ['D3', 'G3', 'B3', 'F4']
  Chord  4: time=  4.00s (Œî= 1.0s) ['C3', 'G3', 'C4', 'E4']
  Chord  5: time=  8.00s (Œî= 4.0s) ['C3', 'G3', 'B3', 'E4']
  Chord  6: time= 12.00s (Œî= 4.0s) ['C3', 'G3', 'A#3', 'E4']
  Chord  7: time= 16.00s (Œî= 4.0s) ['F3', 'C3', 'A3', 'F4']
  Chord  8: time= 19.00s (Œî= 3.0s) ['E3', 'A3', 'C4', 'F4']
  Chord  9: time= 20.00s (Œî= 1.0s) ['D3', 'A3', 'C4', 'F#4']
  Chord 10: time= 24.00s (Œî= 4.0s) ['G3', 'D3', 'B3', 'F4']
  Chord 11: time= 25.00s (Œî= 1.0s) ['A3', 'B3', 'D4

In [18]:
# ============================================================================
# COMPLETE TEST: "Something" - Chords, Slash Chords, Timing
# ============================================================================
import sys
import importlib
import numpy as np
import pretty_midi
import xml.etree.ElementTree as ET
from itertools import groupby

print("="*70)
print("TESTING 'SOMETHING' - COMPLETE VALIDATION")
print("="*70)

# Helper: Split compound tokens
def fix_compound_tokens(token_list):
    """Split tokens like 'dom7 add 9' into ['dom7', 'add 9']"""
    fixed = []
    for token in token_list:
        if isinstance(token, str) and ' add ' in token:
            parts = token.split(' add ')
            fixed.append(parts[0])
            fixed.append('add ' + parts[1])
        elif isinstance(token, str) and ' alter ' in token:
            parts = token.split(' alter ')
            fixed.append(parts[0])
            fixed.append('alter ' + parts[1])
        else:
            fixed.append(token)
    return fixed

# STEP 1: Load dataset
import xmlTranslator as xmlT
importlib.reload(xmlT)
from utils import get_project_root

directory = '/dataset/iRealXML'
root = get_project_root()
myPath = str(root) + str(directory)

print("\n1. Loading dataset...")
chords, durations, meta = xmlT.parse_info_from_XML(myPath)

song_id = None
for i, m in enumerate(meta):
    if m['song_name'] == 'Something':
        song_id = i
        break
print(f"   ‚úì Found 'Something' at index {song_id}")

# STEP 2: Get expected from XML
print("\n2. Reading expected chords from XML...")
xml_path = '../dataset/iRealXML/Something.xml'
tree = ET.parse(xml_path)
xml_root = tree.getroot()
divisions = int(xml_root.find('.//divisions').text)

expected_chords = []
for measure in xml_root.findall('.//measure'):
    children = list(measure)
    for i, child in enumerate(children):
        if child.tag == 'harmony':
            root_step = child.find('.//root-step')
            kind = child.find('kind')
            bass = child.find('bass/bass-step')
            
            chord = root_step.text if root_step is not None else ''
            alter = child.find('.//root-alter')
            if alter is not None and alter.text:
                chord += '#' if int(alter.text) == 1 else 'b'
            chord += kind.get('text', '') if kind is not None else ''
            if bass is not None:
                chord += '/' + bass.text
            
            dur = 0
            for j in range(i+1, len(children)):
                if children[j].tag == 'note':
                    d = children[j].find('duration')
                    if d is not None:
                        dur = int(d.text) / divisions
                    break
            expected_chords.append({'chord': chord, 'duration': dur, 'is_slash': '/' in chord})

expected_slash = sum(1 for c in expected_chords if c['is_slash'])
print(f"   ‚úì Found {len(expected_chords)} chords ({expected_slash} slash chords)")

# STEP 3: Process through pipeline
print("\n3. Processing through voicing pipeline...")
song = chords[song_id]
song = xmlT.replaceTheseChords([song], False)[0]
song = fix_compound_tokens(song)  # FIX compound tokens

# Reload voicing
for mod_name in list(sys.modules.keys()):
    if 'voicing' in mod_name.lower():
        del sys.modules[mod_name]
import voicing as voicing_module
v = voicing_module.Voicing()

# Add 'maj' tokens
processed = []
for i, token in enumerate(song):
    processed.append(token)
    if i < len(song) - 1:
        next_t = song[i + 1]
        prev_t = song[i - 1] if i > 0 else ''
        if token in v.all_notes and next_t != 'N.C.' and prev_t != '/' \
           and (next_t in v.structural_elements or str(next_t).startswith('Form_')):
            processed.append('maj')

# Convert
midi_seq, _ = v.convert_chords_to_voicing(processed)
print(f"   ‚úì Generated {len(midi_seq)} chords")

# STEP 4: Validation
print("\n4. VALIDATION:")
print("-"*70)

errors = 0

# Chord count
if abs(len(midi_seq) - len(expected_chords)) <= 5:
    print(f"   ‚úì Chord count: {len(midi_seq)} (expected {len(expected_chords)})")
else:
    print(f"   ‚ùå Chord count: {len(midi_seq)} (expected {len(expected_chords)})")
    errors += 1

# All 3+ notes
bad_chords = [i for i, (m,d,l) in enumerate(midi_seq) if len([n for n in m if n>0]) < 3]
if len(bad_chords) == 0:
    print(f"   ‚úì All chords have 3+ notes")
else:
    print(f"   ‚ùå {len(bad_chords)} chords have < 3 notes: {bad_chords}")
    errors += 1

# Timing
expected_times = [sum(c['duration'] for c in expected_chords[:i]) for i in range(15)]
actual_times = [sum(midi_seq[j][1] for j in range(i)) for i in range(min(15, len(midi_seq)))]
timing_errors = sum(1 for i in range(min(len(expected_times), len(actual_times))) 
                   if abs(expected_times[i] - actual_times[i]) > 0.5)
if timing_errors == 0:
    print(f"   ‚úì Timing: All correct")
else:
    print(f"   ‚ùå Timing: {timing_errors} errors")
    errors += 1

# STEP 5: Export
print("\n5. Exporting MIDI...")
output_file = "TESTING_Something"
output_path = f"../dataset/midi_files/mpe/{output_file}.mid"
v.export_to_midi(midi_seq, output_file, "../dataset/midi_files/mpe/")

midi = pretty_midi.PrettyMIDI(output_path)
notes = []
for inst in midi.instruments:
    notes.extend(inst.notes)
notes = sorted(notes, key=lambda n: n.start)
midi_chord_count = len(list(groupby(notes, key=lambda n: round(n.start, 2))))
print(f"   ‚úì MIDI: {len(notes)} notes, {midi_chord_count} chords, {midi.get_end_time():.1f}s")

# STEP 6: Show chords
print("\n6. FIRST 10 CHORDS:")
print("-"*70)
print(f"{'#':<4} {'Time':<8} {'Expected':<15} {'Notes':<35}")
print("-"*70)
for i in range(min(10, len(midi_seq))):
    midi_notes, dur, label = midi_seq[i]
    notes_names = [pretty_midi.note_number_to_name(n) for n in midi_notes if n > 0]
    exp = expected_chords[i]['chord'] if i < len(expected_chords) else '?'
    cum = sum(midi_seq[j][1] for j in range(i))
    print(f"{i+1:<4} {cum:<8.1f} {exp:<15} {str(notes_names):<35}")

# VERDICT
print("\n" + "="*70)
if errors == 0:
    print("üéâ ALL TESTS PASSED! STUPIDITY SCORE: 0/100")
    print(f"üéµ Listen to: {output_path}")
else:
    print(f"‚ö†Ô∏è  {errors} ERRORS - STUPIDITY SCORE: {errors * 33}/100")
print("="*70)

TESTING 'SOMETHING' - COMPLETE VALIDATION

1. Loading dataset...


  0%|          | 0/4005 [00:00<?, ?it/s]

(4005,) (4005,) (4005,)
   ‚úì Found 'Something' at index 3986

2. Reading expected chords from XML...
   ‚úì Found 55 chords (18 slash chords)

3. Processing through voicing pipeline...


  0%|          | 0/1 [00:00<?, ?it/s]

   ‚úì Generated 55 chords

4. VALIDATION:
----------------------------------------------------------------------
   ‚úì Chord count: 55 (expected 55)
   ‚úì All chords have 3+ notes
   ‚úì Timing: All correct

5. Exporting MIDI...
‚úì MIDI file created: TESTING_Something.mid
   ‚úì MIDI: 207 notes, 55 chords, 124.0s

6. FIRST 10 CHORDS:
----------------------------------------------------------------------
#    Time     Expected        Notes                              
----------------------------------------------------------------------
1    0.0      Fb              ['F3', 'A3', 'C4']                 
2    2.0      Eb              ['D#3', 'G3', 'A#3']               
3    3.0      Gb7/D           ['D3', 'G3', 'B3', 'F4']           
4    4.0      Cb              ['C3', 'G3', 'C4', 'E4']           
5    8.0      Cbmaj7          ['C3', 'G3', 'B3', 'E4']           
6    12.0     Cb7             ['C3', 'G3', 'A#3', 'E4']          
7    16.0     Fb              ['F3', 'C3', 'A3', 'F4']  

In [19]:
# ============================================================================
# SLASH CHORD DIAGNOSTIC - Check each slash chord in detail
# ============================================================================
print("="*80)
print("SLASH CHORD ANALYSIS")
print("="*80)

# Get expected slash chords from XML
expected_slash_chords = [c for c in expected_chords if c['is_slash']]
print(f"\nExpected {len(expected_slash_chords)} slash chords from XML:")
for i, c in enumerate(expected_slash_chords):
    print(f"  {i+1}. {c['chord']}")

# Find slash chords in our processed sequence
print("\n" + "-"*80)
print("SLASH CHORDS IN TOKEN SEQUENCE:")
print("-"*80)

# Look for '/' in the processed tokens
slash_positions = []
for i, token in enumerate(processed):
    if token == '/':
        # Get context around slash
        start = max(0, i-5)
        end = min(len(processed), i+3)
        context = processed[start:end]
        slash_positions.append((i, context))
        print(f"  Position {i}: {context}")

print(f"\nFound {len(slash_positions)} slash markers in tokens")

# Now check what we generated
print("\n" + "-"*80)
print("GENERATED SLASH CHORDS:")
print("-"*80)

# Find chords that should be slash chords (based on position in sequence)
# Each DOT starts a chord, so let's check which DOTs are followed by slash
dot_positions = [i for i, elem in enumerate(processed) if elem == '.']

slash_chord_results = []
for dot_idx, dot_pos in enumerate(dot_positions):
    # Find end of this chord
    end_pos = dot_positions[dot_idx + 1] if dot_idx + 1 < len(dot_positions) else len(processed)
    chord_tokens = processed[dot_pos:end_pos]
    
    # Check if this chord has a slash
    if '/' in chord_tokens:
        slash_idx = chord_tokens.index('/')
        bass_note = chord_tokens[slash_idx + 1] if slash_idx + 1 < len(chord_tokens) else '?'
        root_note = None
        nature = None
        
        # Find root and nature
        for t in chord_tokens:
            if t in v.all_notes and root_note is None:
                root_note = t
            if t in v.natures:
                nature = t
        
        # Get the generated MIDI for this chord
        if dot_idx < len(midi_seq):
            midi_notes, dur, label = midi_seq[dot_idx]
            notes = [n for n in midi_notes if n > 0]
            note_names = [pretty_midi.note_number_to_name(n) for n in notes]
            
            slash_chord_results.append({
                'index': dot_idx + 1,
                'root': root_note,
                'nature': nature,
                'bass': bass_note,
                'expected': f"{root_note}{nature or 'maj'}/{bass_note}",
                'midi': notes,
                'names': note_names
            })

print(f"\nGenerated {len(slash_chord_results)} slash chords:\n")
print(f"{'#':<4} {'Expected':<15} {'Bass':<6} {'Root':<6} {'Generated Notes':<40}")
print("-"*80)

for sc in slash_chord_results:
    # Check if bass is correct (should be lowest note)
    bass_midi = v.all_notes.get(sc['bass'], 0)
    actual_bass = sc['midi'][0] if sc['midi'] else 0
    
    # Check if original root is present (should be moved up)
    root_midi = v.all_notes.get(sc['root'], 0)
    root_in_chord = any(n % 12 == root_midi % 12 for n in sc['midi'][1:]) if len(sc['midi']) > 1 else False
    
    # Status
    bass_ok = "‚úì" if actual_bass % 12 == bass_midi % 12 else "‚ùå"
    root_ok = "‚úì" if root_in_chord else "‚ùå"
    
    print(f"{sc['index']:<4} {sc['expected']:<15} {bass_ok}{sc['bass']:<5} {root_ok}{sc['root']:<5} {sc['names']}")

# Summary
print("\n" + "="*80)
print("SUMMARY:")
bass_correct = sum(1 for sc in slash_chord_results 
                   if sc['midi'] and sc['midi'][0] % 12 == v.all_notes.get(sc['bass'], 0) % 12)
root_preserved = sum(1 for sc in slash_chord_results 
                     if len(sc['midi']) > 1 and any(n % 12 == v.all_notes.get(sc['root'], 0) % 12 for n in sc['midi'][1:]))

print(f"  Bass note correct: {bass_correct}/{len(slash_chord_results)}")
print(f"  Root preserved:    {root_preserved}/{len(slash_chord_results)}")
print("="*80)

SLASH CHORD ANALYSIS

Expected 18 slash chords from XML:
  1. Gb7/D
  2. Fb/E
  3. Gb7/A
  4. Gb7/B
  5. Abm7/G
  6. Gb7/D
  7. Ab/G
  8. Ab/F
  9. Ab/E
  10. Ab/G
  11. Ab/F
  12. Ab/E
  13. Fb/E
  14. Gb7/A
  15. Gb7/B
  16. Abm7/G
  17. Gb7/D
  18. Gb7/D

--------------------------------------------------------------------------------
SLASH CHORDS IN TOKEN SEQUENCE:
--------------------------------------------------------------------------------
  Position 16: ['maj', '.', '1.0', 'G', 'dom7', '/', 'D', 'Form_Segno']
  Position 43: ['maj', '.', '1.0', 'F', 'maj', '/', 'E', '|']
  Position 59: ['dom7', '.', '1.0', 'G', 'dom7', '/', 'A', '.']
  Position 65: ['A', '.', '2.0', 'G', 'dom7', '/', 'B', 'Form_B']
  Position 83: ['|', '.', '2.0', 'A', 'm7', '/', 'G', '.']
  Position 104: ['maj', '.', '1.0', 'G', 'dom7', '/', 'D', ':|']
  Position 122: ['maj', '.', '2.0', 'A', 'maj', '/', 'G#', '|']
  Position 129: ['|', '.', '2.0', 'A', 'maj', '/', 'F#', '.']
  Position 135: ['F#', '.', '2.0'