In [20]:
import os
import json
import numpy as np
from pathlib import Path
from mido import MidiFile, MidiTrack, Message, MetaMessage
from tqdm import tqdm
import random
#from utils import get_project_root

## 1. Load 53-TET Reference Data

The `53_reference_notes.json` contains pre-calculated mappings between 53-TET notes and:
- MIDI note numbers (12-TET)
- Pitch bend values (in semitones, normalized 0-1 range)
- Frequencies
- Note names

In [22]:
# Load 53-TET reference notes
reference_path = '../53_reference_notes.json'

with open(reference_path, 'r') as f:
    ref_data = json.load(f)

reference_notes = ref_data['notes']
print(f"Loaded {len(reference_notes)} 53-TET reference notes")

# Display sample entries
print("\nSample entries (around middle C):")
for note in reference_notes[115:125]:
    print(f"  ref={note['reference']:3d} | MIDI={note['MIDI']:3d} | bend={note['bend']:.4f} | {note['noteName']}")

Loaded 423 53-TET reference notes

Sample entries (around middle C):
  ref= 62 | MIDI= 47 | bend=0.0943 | B2
  ref= 63 | MIDI= 47 | bend=0.3208 | ^B2
  ref= 64 | MIDI= 47 | bend=0.5472 | ^^B2
  ref= 65 | MIDI= 47 | bend=0.7736 | vC2
  ref= 66 | MIDI= 48 | bend=0.0000 | C2
  ref= 67 | MIDI= 48 | bend=0.2264 | ^C3
  ref= 68 | MIDI= 48 | bend=0.4528 | ^^C3
  ref= 69 | MIDI= 48 | bend=0.6792 | vvC#3
  ref= 70 | MIDI= 48 | bend=0.9057 | vC#3
  ref= 71 | MIDI= 49 | bend=0.1321 | C#3


## 2. Define 12-TET to 53-TET Mapping

In 53-TET, each 12-TET semitone spans approximately 53/12 ≈ 4.42 steps.
We map common intervals to their "just intonation approximation" equivalents:

| 12-TET | Semitones | 53-TET Steps | Interval Name |
|--------|-----------|--------------|---------------|
| Unison | 0 | 0 | Perfect Unison |
| m2 | 1 | 5 | Minor Second |
| M2 | 2 | 9 | Major Second |
| m3 | 3 | 14 | Minor Third |
| M3 | 4 | 17 | Major Third (5-limit) |
| P4 | 5 | 22 | Perfect Fourth |
| tritone | 6 | 26-27 | Tritone |
| P5 | 7 | 31 | Perfect Fifth |
| m6 | 8 | 36 | Minor Sixth |
| M6 | 9 | 39 | Major Sixth |
| m7 | 10 | 44 | Minor Seventh |
| M7 | 11 | 48 | Major Seventh |
| Octave | 12 | 53 | Perfect Octave |

In [23]:
# 12-TET semitones to 53-TET steps mapping
# Using "just" intonation approximations in 53-TET
SEMITONE_TO_53TET = {
    0: 0,    # Unison
    1: 5,    # Minor second
    2: 9,    # Major second
    3: 14,   # Minor third (pure minor third = 13-14 steps)
    4: 17,   # Major third (5-limit = 17 steps, closer to 5/4)
    5: 22,   # Perfect fourth
    6: 27,   # Tritone (augmented fourth)
    7: 31,   # Perfect fifth
    8: 36,   # Minor sixth
    9: 39,   # Major sixth
    10: 44,  # Minor seventh (7-limit = 43, 5-limit = 44)
    11: 48,  # Major seventh
}

# Extended for multiple octaves
def semitones_to_53tet(semitones):
    """
    Convert 12-TET semitones to 53-TET steps.
    Handles any number of octaves.
    """
    octaves = semitones // 12
    remainder = semitones % 12
    return octaves * 53 + SEMITONE_TO_53TET.get(remainder, round(remainder * 53 / 12))

# Test the conversion
print("12-TET to 53-TET conversion test:")
for st in range(13):
    print(f"  {st:2d} semitones -> {semitones_to_53tet(st):2d} 53-TET steps")

12-TET to 53-TET conversion test:
   0 semitones ->  0 53-TET steps
   1 semitones ->  5 53-TET steps
   2 semitones ->  9 53-TET steps
   3 semitones -> 14 53-TET steps
   4 semitones -> 17 53-TET steps
   5 semitones -> 22 53-TET steps
   6 semitones -> 27 53-TET steps
   7 semitones -> 31 53-TET steps
   8 semitones -> 36 53-TET steps
   9 semitones -> 39 53-TET steps
  10 semitones -> 44 53-TET steps
  11 semitones -> 48 53-TET steps
  12 semitones -> 53 53-TET steps


## 3. Build Note Lookup Table

Create a fast lookup from 53-TET reference index to (MIDI_note, pitch_bend) pairs.

In [24]:
# Build lookup table: 53-TET reference -> (MIDI, bend)
# The reference values go from -53 to ~370 (covering the full MIDI range)

ref_to_midi_bend = {}
for note in reference_notes:
    ref_idx = note['reference']
    midi_note = note['MIDI']
    bend = note['bend']  # This is in semitones (0.0 to ~0.99)
    ref_to_midi_bend[ref_idx] = (midi_note, bend)

print(f"Built lookup table with {len(ref_to_midi_bend)} entries")
print(f"Reference range: {min(ref_to_midi_bend.keys())} to {max(ref_to_midi_bend.keys())}")

# Find the reference index for MIDI note 60 (middle C)
c4_refs = [ref for ref, (midi, bend) in ref_to_midi_bend.items() if midi == 60 and bend < 0.1]
print(f"\nMiddle C (MIDI 60) reference indices with low bend: {c4_refs}")

Built lookup table with 423 entries
Reference range: -53 to 369

Middle C (MIDI 60) reference indices with low bend: [119]


In [25]:
def midi_to_53tet_reference(midi_note):
    """
    Convert a 12-TET MIDI note to its 53-TET reference index.
    
    In the reference file:
    - MIDI 60 (C4) maps to reference 119 (which has noteName 'C3' in their numbering)
    - Reference 0 corresponds to A1 (MIDI 33)
    
    Formula: ref_index = (midi_note - 33) * 53 / 12 + small_offset
    But simpler: find the closest reference with matching MIDI and minimal bend
    """
    # Find references that map to this MIDI note
    matching_refs = [(ref, bend) for ref, (midi, bend) in ref_to_midi_bend.items() 
                     if midi == midi_note]
    
    if not matching_refs:
        # Fallback: estimate based on A440 = MIDI 69 = reference ~159
        return round((midi_note - 69) * 53/12) + 159
    
    # Return the one with smallest bend (closest to the 12-TET note)
    best_ref = min(matching_refs, key=lambda x: abs(x[1]))[0]
    return best_ref

# Test: find 53-TET reference for common MIDI notes
print("MIDI to 53-TET reference mapping:")
for midi in [48, 52, 55, 60, 64, 67, 69, 72]:
    ref = midi_to_53tet_reference(midi)
    midi_back, bend = ref_to_midi_bend.get(ref, (None, None))
    print(f"  MIDI {midi} -> ref {ref} -> MIDI {midi_back}, bend {bend:.4f}")

MIDI to 53-TET reference mapping:
  MIDI 48 -> ref 66 -> MIDI 48, bend 0.0000
  MIDI 52 -> ref 84 -> MIDI 52, bend 0.0755
  MIDI 55 -> ref 97 -> MIDI 55, bend 0.0189
  MIDI 60 -> ref 119 -> MIDI 60, bend 0.0000
  MIDI 64 -> ref 137 -> MIDI 64, bend 0.0755
  MIDI 67 -> ref 150 -> MIDI 67, bend 0.0189
  MIDI 69 -> ref 159 -> MIDI 69, bend 0.0566
  MIDI 72 -> ref 172 -> MIDI 72, bend 0.0000


## 4. Pitch Bend Conversion for MPE

MIDI pitch bend:
- Range: -8192 to +8191 (14-bit)
- Default sensitivity: ±2 semitones
- For MPE microtonal: We'll use ±48 semitones range

The bend values in the reference file are in semitones (0.0 to ~1.0).
We need to convert this to MIDI pitch bend values.

In [26]:
def cents_to_pitch_bend(cents, pitch_bend_range=2):
    """
    Convert cents deviation to MIDI pitch bend value.
    
    Args:
        cents: Deviation in cents (100 cents = 1 semitone)
        pitch_bend_range: Pitch bend sensitivity in semitones (default ±2)
    
    Returns:
        MIDI pitch bend value (-8192 to +8191)
    """
    # Convert cents to semitones
    semitones = cents / 100.0
    # Calculate bend value
    # Full range (8192) = pitch_bend_range semitones
    bend = int((semitones / pitch_bend_range) * 8192)
    return max(-8192, min(8191, bend))

def semitones_to_pitch_bend(semitones, pitch_bend_range=2):
    """
    Convert semitones deviation to MIDI pitch bend value.
    
    Args:
        semitones: Deviation in semitones
        pitch_bend_range: Pitch bend sensitivity in semitones (default ±2)
    
    Returns:
        MIDI pitch bend value (-8192 to +8191)
    """
    bend = int((semitones / pitch_bend_range) * 8192)
    return max(-8192, min(8191, bend))

# Test
print("Pitch bend conversion test (default ±2 semitones range):")
for semitones in [0, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0]:
    bend = semitones_to_pitch_bend(semitones)
    print(f"  {semitones:.2f} semitones -> bend value {bend:5d}")

Pitch bend conversion test (default ±2 semitones range):
  0.00 semitones -> bend value     0
  0.25 semitones -> bend value  1024
  0.50 semitones -> bend value  2048
  0.75 semitones -> bend value  3072
  1.00 semitones -> bend value  4096
  1.50 semitones -> bend value  6144
  2.00 semitones -> bend value  8191


## 5. MIDI File Processing Functions

In [27]:
def convert_note_to_53tet(midi_note, conversion_mode='just'):
    """
    Convert a 12-TET MIDI note to 53-TET (MIDI note + pitch bend).
    
    Args:
        midi_note: Original 12-TET MIDI note number (0-127)
        conversion_mode: 'just' for just intonation approximation,
                        'equal' for equal-tempered 53-TET spacing
    
    Returns:
        tuple: (new_midi_note, pitch_bend_value, bend_semitones)
    """
    # Get the 53-TET reference for this note
    ref_idx = midi_to_53tet_reference(midi_note)
    
    if ref_idx in ref_to_midi_bend:
        new_midi, bend_semitones = ref_to_midi_bend[ref_idx]
        pitch_bend = semitones_to_pitch_bend(bend_semitones, pitch_bend_range=2)
        return (new_midi, pitch_bend, bend_semitones)
    else:
        # Fallback: no pitch bend
        return (midi_note, 0, 0.0)

def apply_microtonal_offset(midi_note, offset_53tet):
    """
    Apply a microtonal offset in 53-TET steps to a MIDI note.
    
    Args:
        midi_note: Original MIDI note
        offset_53tet: Offset in 53-TET steps (can be positive or negative)
    
    Returns:
        tuple: (new_midi_note, pitch_bend_value, bend_semitones)
    """
    # Get base 53-TET reference
    base_ref = midi_to_53tet_reference(midi_note)
    # Apply offset
    new_ref = base_ref + offset_53tet
    
    if new_ref in ref_to_midi_bend:
        new_midi, bend_semitones = ref_to_midi_bend[new_ref]
        pitch_bend = semitones_to_pitch_bend(bend_semitones, pitch_bend_range=2)
        return (new_midi, pitch_bend, bend_semitones)
    else:
        # Fallback: calculate approximate values
        step_in_semitones = 12.0 / 53.0
        offset_semitones = offset_53tet * step_in_semitones
        new_midi = midi_note + int(round(offset_semitones))
        bend_semitones = offset_semitones - round(offset_semitones)
        pitch_bend = semitones_to_pitch_bend(bend_semitones, pitch_bend_range=2)
        return (max(0, min(127, new_midi)), pitch_bend, bend_semitones)

# Test the conversion
print("Note conversion test:")
for note in [60, 64, 67, 72]:
    result = convert_note_to_53tet(note)
    print(f"  MIDI {note} -> MIDI {result[0]}, bend {result[1]:5d} ({result[2]:.3f} semitones)")

Note conversion test:
  MIDI 60 -> MIDI 60, bend     0 (0.000 semitones)
  MIDI 64 -> MIDI 64, bend   309 (0.075 semitones)
  MIDI 67 -> MIDI 67, bend    77 (0.019 semitones)
  MIDI 72 -> MIDI 72, bend     0 (0.000 semitones)


In [28]:
def get_chord_interval_offsets(chord_notes):
    """
    Calculate 53-TET offsets for notes in a chord based on their intervals from the bass.
    
    This applies just intonation-inspired adjustments:
    - Major 3rd: -1 step (14 cents flatter for pure 5/4)
    - Minor 7th: -1 step (for septimal 7th)
    - Other intervals: keep standard 53-TET mapping
    
    Args:
        chord_notes: List of MIDI note numbers in the chord
    
    Returns:
        List of 53-TET step offsets for each note
    """
    if len(chord_notes) == 0:
        return []
    
    bass = min(chord_notes)
    offsets = []
    
    # Just intonation inspired offsets in 53-TET steps
    # These are deviations from equal temperament
    INTERVAL_ADJUSTMENTS = {
        4: -1,   # Major 3rd: 1 step flat for pure 5/4 ratio
        10: -1,  # Minor 7th: 1 step flat for septimal 7th (7/4)
        # 3: 0,   # Minor 3rd: keep as is (already close to 6/5)
        # 7: 0,   # Perfect 5th: keep as is (already close to 3/2)
    }
    
    for note in chord_notes:
        interval = (note - bass) % 12
        adjustment = INTERVAL_ADJUSTMENTS.get(interval, 0)
        offsets.append(adjustment)
    
    return offsets

# Test
test_chord = [48, 52, 55, 58]  # Cmaj7
offsets = get_chord_interval_offsets(test_chord)
print(f"Chord {test_chord}")
print(f"Intervals from bass: {[(n - min(test_chord)) % 12 for n in test_chord]}")
print(f"53-TET adjustments: {offsets}")

Chord [48, 52, 55, 58]
Intervals from bass: [0, 4, 7, 10]
53-TET adjustments: [0, -1, 0, -1]


## 6. MIDI File Converter

In [29]:
def convert_midi_to_53tet(input_path, output_path, conversion_level='full'):
    """
    Convert a 12-TET MIDI file to 53-TET using MPE pitch bend.
    
    Args:
        input_path: Path to input 12-TET MIDI file
        output_path: Path for output 53-TET MIDI file
        conversion_level: 'full' (100% microtonal), 'hybrid' (50%), 'sparse' (10%)
    
    Returns:
        bool: Success status
    """
    try:
        # Read input MIDI
        mid = MidiFile(input_path)
        
        # Create output MIDI with same settings
        out_mid = MidiFile(type=mid.type, ticks_per_beat=mid.ticks_per_beat)
        
        # Track notes currently on (for MPE channel management)
        active_notes = {}  # note -> channel
        next_channel = 1  # Channels 1-15 for notes, 0 for global
        
        for track_idx, track in enumerate(mid.tracks):
            out_track = MidiTrack()
            out_mid.tracks.append(out_track)
            
            # Collect notes in current chord (for interval analysis)
            chord_notes = []
            chord_time = 0
            
            for msg in track:
                if msg.type == 'note_on' and msg.velocity > 0:
                    # Allocate MPE channel for this note
                    channel = next_channel
                    next_channel = (next_channel % 15) + 1
                    active_notes[msg.note] = channel
                    
                    # Get 53-TET conversion
                    new_midi, pitch_bend, bend_semi = convert_note_to_53tet(msg.note)
                    
                    # Decide whether to apply microtonal adjustment
                    apply_microtonal = True
                    if conversion_level == 'hybrid':
                        apply_microtonal = random.random() < 0.5
                    elif conversion_level == 'sparse':
                        apply_microtonal = random.random() < 0.1
                    
                    if apply_microtonal and pitch_bend != 0:
                        # Add pitch bend before note on
                        out_track.append(Message('pitchwheel', 
                                                 channel=channel, 
                                                 pitch=pitch_bend, 
                                                 time=msg.time))
                        # Note on with time=0 (immediately after bend)
                        out_track.append(Message('note_on', 
                                                 note=new_midi,
                                                 velocity=msg.velocity, 
                                                 channel=channel, 
                                                 time=0))
                    else:
                        out_track.append(Message('note_on', 
                                                 note=new_midi,
                                                 velocity=msg.velocity, 
                                                 channel=channel, 
                                                 time=msg.time))
                
                elif msg.type == 'note_off' or (msg.type == 'note_on' and msg.velocity == 0):
                    # Find the channel this note was on
                    channel = active_notes.pop(msg.note, msg.channel)
                    new_midi, _, _ = convert_note_to_53tet(msg.note)
                    
                    out_track.append(Message('note_off', 
                                             note=new_midi,
                                             velocity=msg.velocity, 
                                             channel=channel, 
                                             time=msg.time))
                    
                    # Reset pitch bend on this channel
                    out_track.append(Message('pitchwheel', 
                                             channel=channel, 
                                             pitch=0, 
                                             time=0))
                
                elif msg.type in ['program_change', 'control_change']:
                    # Replicate control messages to all active channels
                    out_track.append(msg.copy())
                
                else:
                    # Copy other messages as-is (tempo, time signature, etc.)
                    out_track.append(msg.copy())
        
        # Save output
        out_mid.save(output_path)
        return True
        
    except Exception as e:
        print(f"Error converting {input_path}: {e}")
        return False

## 7. Batch Processing Setup

In [30]:
# Define paths
input_dir = Path('../dataset/midi_files/mpe')
output_dir = Path('../dataset/midi_files/mpe53')

# Create output directory
output_dir.mkdir(parents=True, exist_ok=True)

print(f"Input directory: {input_dir}")
print(f"Output directory: {output_dir}")

# List input files
input_files = list(input_dir.glob('*.mid'))
print(f"\nFound {len(input_files)} MIDI files to convert")

# Show sample
print("\nSample files:")
for f in input_files[:5]:
    print(f"  {f.name}")

Input directory: ../dataset/midi_files/mpe
Output directory: ../dataset/midi_files/mpe53

Found 48062 MIDI files to convert

Sample files:
  44372_Limbo_Ab_major.mid
  29335_La Vem Você - AB AC AB A_G_major.mid
  37263_Speevy_Eb_major.mid
  03671_Boca Sem Dente  - AAB AB_B_major.mid
  32439_Vai Saudade 1_Eb_major.mid


## 8. Test Single File Conversion

In [31]:
# Test conversion on a single file
if len(input_files) > 0:
    test_file = input_files[0]
    test_output = output_dir / f"53TET_{test_file.name}"
    
    print(f"Testing conversion:")
    print(f"  Input: {test_file.name}")
    print(f"  Output: {test_output.name}")
    
    success = convert_midi_to_53tet(str(test_file), str(test_output), 'full')
    
    if success:
        print("  ✅ Conversion successful!")
        
        # Compare file sizes
        orig_size = test_file.stat().st_size
        new_size = test_output.stat().st_size
        print(f"  Original size: {orig_size} bytes")
        print(f"  New size: {new_size} bytes")
    else:
        print("  ❌ Conversion failed")

Testing conversion:
  Input: 44372_Limbo_Ab_major.mid
  Output: 53TET_44372_Limbo_Ab_major.mid
  ✅ Conversion successful!
  Original size: 2086 bytes
  New size: 3694 bytes


## 9. Batch Convert All Files

In [33]:
def batch_convert_to_53tet(input_dir, output_dir, conversion_level='full', limit=None):
    """
    Convert all MIDI files in a directory to 53-TET.
    
    Args:
        input_dir: Path to input directory
        output_dir: Path to output directory
        conversion_level: 'full', 'hybrid', or 'sparse'
        limit: Maximum number of files to process (None for all)
    
    Returns:
        tuple: (success_count, error_count, errors_list)
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    
    input_files = list(input_path.glob('*.mid'))
    if limit:
        input_files = input_files[:limit]
    
    success = 0
    errors = []
    
    for f in tqdm(input_files, desc="Converting to 53-TET"):
        output_file = output_path / f"53TET_{f.name}"
        
        if convert_midi_to_53tet(str(f), str(output_file), conversion_level):
            success += 1
        else:
            errors.append(f.name)
    
    return success, len(errors), errors

# Show what we're about to process
print(f"Ready to convert {len(input_files)} files")
print(f"Conversion mode: full (100% microtonal)")

Ready to convert 48062 files
Conversion mode: full (100% microtonal)


In [34]:
# Run the batch conversion
# Set limit=None to process all files, or a number for testing

success, error_count, errors = batch_convert_to_53tet(
    input_dir, 
    output_dir, 
    conversion_level='full',
    limit=None  # Process all files
)

print(f"\n{'='*50}")
print(f"Conversion Complete!")
print(f"{'='*50}")
print(f"✅ Successfully converted: {success} files")
print(f"❌ Errors: {error_count} files")

if errors:
    print(f"\nFailed files:")
    for e in errors[:10]:
        print(f"  - {e}")
    if len(errors) > 10:
        print(f"  ... and {len(errors) - 10} more")

Converting to 53-TET: 100%|██████████| 48062/48062 [35:20<00:00, 22.67it/s] 


Conversion Complete!
✅ Successfully converted: 48062 files
❌ Errors: 0 files





## 10. Verification and Summary

In [35]:
# Verify output
output_files = list(output_dir.glob('*.mid'))
print(f"Output directory contains {len(output_files)} files")

# Calculate total sizes
input_size = sum(f.stat().st_size for f in input_files) / (1024 * 1024)
output_size = sum(f.stat().st_size for f in output_files) / (1024 * 1024)

print(f"\nDataset sizes:")
print(f"  Input (12-TET): {input_size:.2f} MB")
print(f"  Output (53-TET): {output_size:.2f} MB")
print(f"  Size increase: {((output_size/input_size) - 1) * 100:.1f}%")

Output directory contains 48062 files

Dataset sizes:
  Input (12-TET): 188.76 MB
  Output (53-TET): 348.65 MB
  Size increase: 84.7%


In [36]:
# Analyze a converted file
if len(output_files) > 0:
    sample_file = output_files[0]
    mid = MidiFile(str(sample_file))
    
    print(f"Analysis of: {sample_file.name}")
    print(f"  Tracks: {len(mid.tracks)}")
    print(f"  Ticks per beat: {mid.ticks_per_beat}")
    
    # Count pitch bend messages
    pitch_bends = 0
    notes = 0
    for track in mid.tracks:
        for msg in track:
            if msg.type == 'pitchwheel':
                pitch_bends += 1
            elif msg.type == 'note_on' and msg.velocity > 0:
                notes += 1
    
    print(f"  Note events: {notes}")
    print(f"  Pitch bend events: {pitch_bends}")
    print(f"  Ratio (bends/notes): {pitch_bends/notes:.2f}")

Analysis of: 53TET_37931_Fato Consumado 1_B_major.mid
  Tracks: 2
  Ticks per beat: 960
  Note events: 276
  Pitch bend events: 534
  Ratio (bends/notes): 1.93


## Next Steps

1. **Listen to converted files** - Use a synthesizer that supports MPE and ±2 semitone pitch bend range
2. **Validate tuning** - Check that intervals sound more "pure" than 12-TET
3. **Generate hybrid datasets** - Run with `conversion_level='hybrid'` for 50% microtonal
4. **Create sparse dataset** - Run with `conversion_level='sparse'` for 10% microtonal
5. **Tokenize for training** - Process these files with the tokenizer for GPT-2 training