In [1]:
import sys
import pandas as pd
from pathlib import Path
from IPython.display import display

# Add project root to Python path
PROJECT_ROOT = Path.cwd().resolve().parent.parent
BACKEND_ROOT = PROJECT_ROOT / "backend"
sys.path.append(str(BACKEND_ROOT))

import ml.preprocessing.loader as loader
import ml.preprocessing.cleaner as cleaner
import ml.preprocessing.feature_extractor as feature_extractor
from ml.preprocessing.tokenizer import Tokenizer

datasets = loader.load()
datasets = cleaner.clean(datasets)
features = feature_extractor.extract_features(datasets)
print(datasets)
tokenizer = Tokenizer()

{'melody':         melid       onset  pitch  duration  bar
0           1    4.000000   65.0  0.500000    1
1           1    5.041667   63.0  0.625000    1
2           1    5.750000   58.0  0.291667    1
3           1    6.083333   61.0  0.875000    1
4           1    7.041667   63.0  0.458333    1
...       ...         ...    ...       ...  ...
180564    456  213.375000   57.0  0.625000   53
180565    456  214.000000   55.0  0.333333   54
180566    456  214.291667   57.0  0.708333   54
180567    456  215.208333   59.0  1.458333   54
180568    456  216.708333   52.0  5.208333   54

[180569 rows x 5 columns], 'beats':         melid  onset  bar  beat  \
0           1    0.0    0     1   
1           1    1.0    0     2   
2           1    2.0    0     3   
3           1    3.0    0     4   
4           1    4.0    1     1   
...       ...    ...  ...   ...   
118075    456  219.0   55     2   
118076    456  220.0   55     3   
118077    456  221.0   55     4   
118078    456  222.0   56 

In [2]:
import json
from collections import Counter

# ------ Analyze current datasets -------

beats = datasets["beats"]
melody = datasets["melody"]
sections = datasets["sections"]
solo_info = datasets["solo_info"]

# ---------------------------------------

all_sequences = []
for melid in features['melody']['melid'].unique():
    melid_melody = features['melody'][features['melody']['melid'] == melid]
    melid_beats = features['beats'][features['beats']['melid'] == melid]   
    melid_solo_info = features['solo_info'][features['solo_info']['melid'] == melid]
    
    tokens = tokenizer.encode_solo(melid_melody, melid_beats, melid_solo_info)
    if tokens is not None:
        all_sequences.append({
            'melid': melid,
            'encoder': tokens['encoder_input'],
            'decoder': tokens['decoder_target'],
            'solo_info': melid_solo_info
        })

print(f"Total sequences: {len(all_sequences)}")

# ============================================
# 1. Find all KEY and STYLE tokens in training
# ============================================
all_keys = set()
all_styles = set()
all_chords = set()

for seq in all_sequences:
    for tok in seq['encoder']:
        if tok.startswith('KEY_'):
            all_keys.add(tok)
        elif tok.startswith('STYLE_'):
            all_styles.add(tok)
        elif tok.startswith('CHORD_'):
            all_chords.add(tok)

print("\n=== KEYS in training ===")
print(sorted(all_keys))

print("\n=== STYLES in training ===")
print(sorted(all_styles))

print("\n=== CHORDS in training ===")
print(sorted(all_chords))

# ============================================
# 2. Find Minor Swing-like sequences (A min, D min, E dom)
# ============================================
print("\n=== Sequences with A_MIN + D_MIN + E_DOM ===")
minor_swing_like = []
for seq in all_sequences:
    enc = seq['encoder']
    if 'CHORD_A_MIN' in enc and 'CHORD_D_MIN' in enc and 'CHORD_E_DOM' in enc:
        minor_swing_like.append(seq)

print(f"Found {len(minor_swing_like)} similar sequences")

if minor_swing_like:
    seq = minor_swing_like[0]
    print(f"\nExample - melid {seq['melid']}:")
    print(f"KEY: {seq['solo_info']['key'].values[0]}")
    print(f"STYLE: {seq['solo_info']['style'].values[0]}")
    print(f"TEMPO: {seq['solo_info']['avgtempo'].values[0]}")
    print(f"\nFULL ENCODER TOKENS:")
    print(seq['encoder'])
    print(f"\nDECODER TOKENS (first 100):")
    print(seq['decoder'][:100])

# ============================================
# 3. Compare to YOUR generation input
# ============================================
print("\n" + "="*50)
print("COMPARISON: Your input vs Training data")
print("="*50)

# Your generation input
your_solo_info = {
    'key': 'A_MINOR',
    'style': 'POSTBOP',
    'avgtempo': 290
}

your_beats = [
    {'bar': 0, 'beat': 1, 'root': 'A', 'quality_class': 'MIN'},
    {'bar': 1, 'beat': 1, 'root': 'A', 'quality_class': 'MIN'},
    {'bar': 2, 'beat': 1, 'root': 'D', 'quality_class': 'MIN'},
    {'bar': 3, 'beat': 1, 'root': 'D', 'quality_class': 'MIN'},
    {'bar': 4, 'beat': 1, 'root': 'E', 'quality_class': 'DOM'},
]

your_encoder = tokenizer.encode_chord_timeline(your_beats, your_solo_info)
print(f"\nYOUR ENCODER TOKENS:")
print(your_encoder)

# Check for unknown tokens
print(f"\n=== TOKEN VALIDITY CHECK ===")
for tok in your_encoder:
    if tok.startswith('KEY_') and tok not in all_keys:
        print(f"❌ KEY NOT IN TRAINING: {tok}")
        print(f"   Available keys: {all_keys}")
    elif tok.startswith('STYLE_') and tok not in all_styles:
        print(f"❌ STYLE NOT IN TRAINING: {tok}")
        print(f"   Available styles: {all_styles}")
    elif tok.startswith('CHORD_') and tok not in all_chords:
        print(f"❌ CHORD NOT IN TRAINING: {tok}")

# ============================================
# 4. Check note distribution in training for A minor context
# ============================================
if minor_swing_like:
    print("\n=== NOTE DISTRIBUTION in A minor sequences ===")
    all_notes = []
    for seq in minor_swing_like:
        for tok in seq['decoder']:
            if tok.startswith('NOTE_'):
                pitch = int(tok.split('_')[1])
                all_notes.append(pitch)
    
    note_counts = Counter(all_notes)
    print("Most common pitches:")
    for pitch, count in note_counts.most_common(15):
        note_name = ['C','C#','D','D#','E','F','F#','G','G#','A','A#','B'][pitch % 12]
        print(f"  {pitch} ({note_name}): {count}")

Skipping melid 34 due to length: 119 3135
Skipping melid 42 due to length: 613 3766
Skipping melid 43 due to length: 610 3778
Skipping melid 79 due to length: 127 4212
Skipping melid 81 due to length: 836 5519
Skipping melid 82 due to length: 204 4357
Skipping melid 84 due to length: 131 3076
Skipping melid 106 due to length: 515 3141
Skipping melid 113 due to length: 310 3824
Skipping melid 116 due to length: 243 4079
Skipping melid 124 due to length: 298 3359
Skipping melid 176 due to length: 541 3288
Skipping melid 198 due to length: 476 3841
Skipping melid 206 due to length: 350 3074
Skipping melid 210 due to length: 473 3035
Skipping melid 211 due to length: 1008 4864
Skipping melid 215 due to length: 610 2299
Skipping melid 218 due to length: 233 3173
Skipping melid 222 due to length: 760 3749
Skipping melid 224 due to length: 1132 17165
Skipping melid 225 due to length: 495 6861
Skipping melid 226 due to length: 361 3927
Skipping melid 229 due to length: 802 6095
Skipping melid 