In [5]:
len(file_paths)

1094620

In [None]:
import os
import numpy as np
import pandas as pd
import ast
from symusic import Score
from loops_nomml.note_set import compute_note_sets
import loops_nomml.corr_mat as corr
from loops_nomml.corr_mat import get_valid_loops
from joblib import Parallel, delayed
from tqdm.notebook import tqdm

# ─── Patch safe get_duration_beats ────────────────────────────────────────
def safe_get_duration_beats(start: int, end: int, ticks_beats: list[int]) -> float:
    """Return the number of whole beat boundaries between two tick positions.

    Each position is mapped to the highest index in ``ticks_beats`` whose tick
    is less than or equal to it, and the difference of the two indices is
    returned as a float.

    Args:
        start: Tick position where the span begins.
        end: Tick position where the span ends.
        ticks_beats: Tick value of each beat.

    Returns:
        ``float(index_for_end - index_for_start)``. When no beat is at or
        before ``start`` the start index is 0; when no beat is at or before
        ``end`` the end index falls back to the start index, giving 0.0.
    """
    first_beat = 0
    for position, tick in enumerate(ticks_beats):
        if tick <= start:
            # Indices only grow, so the last hit is the largest one.
            first_beat = position

    last_beat = first_beat
    for position, tick in enumerate(ticks_beats):
        if tick <= end:
            last_beat = position

    return float(last_beat - first_beat)
# Monkey-patch: rebind the `get_duration_beats` attribute of loops_nomml.corr_mat
# to the safe variant defined above.
# NOTE(review): this assignment only runs in the process that executes this
# module-level statement — confirm it also takes effect wherever
# get_valid_loops is actually called (e.g. joblib worker processes).
corr.get_duration_beats = safe_get_duration_beats

# ─── Constants ─────────────────────────────────────────────────────────────
# General MIDI instrument families in program-number order. Family k covers
# programs 8*k .. 8*k + 7, so a program maps to GM_GROUPS[program // 8].
GM_GROUPS = [
    'Piano',                 # programs   0-7
    'Chromatic Percussion',  # programs   8-15
    'Organ',                 # programs  16-23
    'Guitar',                # programs  24-31
    'Bass',                  # programs  32-39
    'Strings',               # programs  40-47
    'Ensemble',              # programs  48-55
    'Brass',                 # programs  56-63
    'Reed',                  # programs  64-71
    'Pipe',                  # programs  72-79
    'Synth Lead',            # programs  80-87
    'Synth Pad',             # programs  88-95
    'Synth Effects',         # programs  96-103
    'Ethnic',                # programs 104-111
    'Percussive',            # programs 112-119
    'Sound Effects',         # programs 120-127
]

# Group label for drum tracks, which are not one of the 16 families above.
DRUM_GROUP = 'Drums'

# ─── similarity + soft-count ──────────────────────────────────────────────
def note_similarity(a, b, v_a, v_b, w_p=0.5, w_v=0.3, w_t=0.2, max_time_diff=0.05):
    """Return a weighted similarity between two note sets.

    Three terms are combined linearly:

    * pitch: size of the pitch intersection over the size of the pitch union
      (0 when both pitch sets are empty);
    * velocity: ``1 - |v_a - v_b| / 127``;
    * timing: ``exp(-|a.start - b.start| / max_time_diff)``.

    Args:
        a, b: Objects exposing a ``pitches`` set and a ``start`` position.
        v_a, v_b: Representative velocity of ``a`` and ``b``.
        w_p, w_v, w_t: Weights of the pitch, velocity and timing terms.
        max_time_diff: Decay scale of the timing term, in the same unit as
            ``start``.

    Returns:
        ``w_p * pitch + w_v * velocity + w_t * timing``.
    """
    shared_pitches = a.pitches & b.pitches
    all_pitches = a.pitches | b.pitches
    # max(1, ...) keeps the ratio defined when neither set has any pitch.
    pitch_term = len(shared_pitches) / max(1, len(all_pitches))

    velocity_term = 1 - abs(v_a - v_b) / 127

    onset_gap = abs(a.start - b.start)
    timing_term = np.exp(-onset_gap / max_time_diff)

    return w_p * pitch_term + w_v * velocity_term + w_t * timing_term

def calc_correlation_soft_count(ns, vel_means, tau):
    """Build the upper-triangular run-length matrix of similar note sets.

    Entry ``[i, j]`` (with ``i < j``) is the length of the run of consecutive
    pairs ``(i-k, j-k), ..., (i, j)`` whose ``note_similarity`` is at least
    ``tau``. A run may only begin at a position where ``ns[i].is_barline()``
    is true; otherwise it must extend a run already counted at
    ``[i-1, j-1]``.

    Args:
        ns: Sequence of note sets accepted by ``note_similarity`` that also
            provide ``is_barline()``.
        vel_means: Representative velocity for each entry of ``ns``.
        tau: Minimum similarity for two note sets to count as matching.

    Returns:
        An ``(N, N)`` integer array, ``N = len(ns)``; entries that are not
        part of a run stay 0.
    """
    size = len(ns)
    counts = np.zeros((size, size), dtype=int)

    def is_match(row, col):
        score = note_similarity(ns[row], ns[col], vel_means[row], vel_means[col])
        return score >= tau

    # Row 0 has no diagonal predecessor, so it can only ever start a run.
    for col in range(1, size):
        if is_match(0, col) and ns[0].is_barline():
            counts[0, col] = 1

    for row in range(1, size - 1):
        for col in range(row + 1, size):
            if not is_match(row, col):
                continue
            run_so_far = counts[row - 1, col - 1]
            # Either extend the run ending on the previous diagonal cell, or
            # open a new one when this row sits on a barline.
            if run_so_far > 0 or ns[row].is_barline():
                counts[row, col] = run_so_far + 1

    return counts

# ─── loopability score ───────────────────────────────────────────────────
def score_loopability(ns, vel_means, tau, alpha=0.7, beta=0.3):
    """Score how loop-like a sequence of note sets is.

    The score blends two statistics of the matrix returned by
    ``calc_correlation_soft_count``:

    * the largest entry divided by ``N`` (longest matching run, relative to
      the sequence length);
    * the sum of all entries divided by the number of ``i < j`` pairs,
      ``N * (N - 1) / 2``.

    Args:
        ns: Sequence of note sets.
        vel_means: Representative velocity for each entry of ``ns``.
        tau: Similarity threshold forwarded to the correlation routine.
        alpha: Weight of the longest-run statistic.
        beta: Weight of the density statistic.

    Returns:
        ``alpha * longest_run + beta * density``, or ``0.0`` when ``ns`` has
        fewer than two entries.
    """
    count = len(ns)
    run_matrix = calc_correlation_soft_count(ns, vel_means, tau)
    if count < 2:
        return 0.0

    pair_total = count * (count - 1) / 2
    longest_run = run_matrix.max() / count
    density = run_matrix.sum() / pair_total
    return alpha * longest_run + beta * density

# ─── Process one file ─────────────────────────────────────────────────────
def process_file(path, melodic_tau=0.3, drum_tau=0.1):
    """Detect loops in every track of one MIDI file.

    For each track the notes are grouped into note sets, a loopability score
    is computed, and ``get_valid_loops`` extracts loop endpoints from the
    soft-count correlation matrix.

    Args:
        path: Path of the MIDI file, passed to ``symusic.Score``.
        melodic_tau: Similarity threshold used for non-drum tracks.
        drum_tau: Similarity threshold used for drum tracks.

    Returns:
        A list with one dict per detected loop, holding the keys
        ``track_idx``, ``MIDI program number``, ``instrument_group``,
        ``loopability``, ``start_tick``, ``end_tick``, ``duration_beats`` and
        ``note_density``. If anything raises while the file is processed, the
        error is printed and the loops collected so far are returned.
    """
    # Re-apply the patch inside the function: with joblib's process-based
    # backends this code runs in worker processes that import
    # loops_nomml.corr_mat on their own, so an assignment made only at module
    # level in the parent process does not reach them.
    corr.get_duration_beats = safe_get_duration_beats

    loops = []
    try:
        score = Score(path, ttype='tick')
        try:
            beat_ticks = score.beat_ticks()
        except Exception:
            # Fall back to a uniform grid with one beat per quarter note.
            ppq = getattr(score, 'ticks_per_quarter', getattr(score, 'ppq', 480))
            beat_ticks = list(range(0, score.end()+1, ppq))
        # Every fourth beat is taken as a bar start (assumes 4 beats per bar).
        bars = [beat_ticks[i] for i in range(0, len(beat_ticks), 4)]

        drums_only_file = "drums-only" in path

        for ti, track in enumerate(score.tracks):
            # symusic tracks mark percussion with `is_drum`; the channel test
            # is kept for track objects that expose a MIDI channel instead.
            is_drum = (
                bool(getattr(track, 'is_drum', False))
                or getattr(track, 'channel', None) == 9
            )
            tau = drum_tau if is_drum else melodic_tau

            prog = getattr(track, 'program', None)
            if drums_only_file or is_drum:
                group = DRUM_GROUP
            elif prog is not None:
                group = GM_GROUPS[prog // 8]
            else:
                group = 'Unknown'

            ns = compute_note_sets(track.notes, bars)
            if len(ns) < 2:
                continue

            # Index velocities by (start, end) once instead of rescanning all
            # notes for every note set.
            velocities_by_span = {}
            for note in track.notes:
                velocities_by_span.setdefault((note.start, note.end), []).append(note.velocity)

            vel_means = []
            for nset in ns:
                matching = velocities_by_span.get((nset.start, nset.end))
                # Note sets with no exactly matching note get velocity 0.0.
                vel_means.append(float(np.mean(matching)) if matching else 0.0)

            # score_loopability builds its own correlation matrix internally;
            # C is computed separately because get_valid_loops needs it.
            loopability = score_loopability(ns, vel_means, tau)
            C = calc_correlation_soft_count(ns, vel_means, tau)
            try:
                _, endpoints = get_valid_loops(
                    ns, C, beat_ticks,
                    min_rep_notes=0,
                    min_rep_beats=1.0 if not is_drum else 0.5,
                    min_beats=1.0 if not is_drum else 0.5,
                    max_beats=32.0,
                    min_loop_note_density=0.0
                )
            except IndexError:
                # Deliberate best-effort: skip this track, keep the others.
                continue

            for start, end, dur, dens in endpoints:
                loops.append({
                    'track_idx': ti,
                    'MIDI program number': prog,
                    'instrument_group': group,
                    'loopability': loopability,
                    'start_tick': start,
                    'end_tick': end,
                    'duration_beats': dur,
                    'note_density': dens
                })
    except Exception as e:
        # Top-level boundary for one file: report and return what we have so
        # a single bad file cannot abort the whole batch.
        print(f"[Error] {os.path.basename(path)}: {e}")
    return loops

# ─── 1) Load CSV & select only rows whose NOMML list contains a 12 ──────────
df_input = pd.read_csv(
    "Final_GigaMIDI_Loop_V2_path-instrument-NOMML-type.csv",
    converters={'NOMML': ast.literal_eval},
)

# Keep only rows whose parsed NOMML value is a list/tuple containing a 12;
# any other parsed value is dropped.
has_nomml_12 = df_input['NOMML'].apply(
    lambda value: isinstance(value, (list, tuple)) and 12 in value
)
df_input = df_input[has_nomml_12]

file_paths = df_input['file_path'].tolist()

# ─── 2) Number of files per checkpoint ──────────────────────────────────────
chunk_size = 100000

# Per-loop fields produced by process_file; each becomes a list-valued column
# holding one entry per detected loop of the file.
loop_fields = (
    'track_idx',
    'MIDI program number',
    'instrument_group',
    'loopability',
    'start_tick',
    'end_tick',
    'duration_beats',
    'note_density',
)

# ─── 3) Work through the files chunk by chunk, saving after every chunk ────
all_rows = []
for idx in range(0, len(file_paths), chunk_size):
    chunk = file_paths[idx: idx + chunk_size]
    results = Parallel(n_jobs=-1, backend='loky')(
        delayed(process_file)(p)
        for p in tqdm(chunk, desc=f"Files {idx+1}-{idx+len(chunk)}")
    )

    # One row per file: the file's loops are transposed into parallel lists.
    rows = [
        {
            'file_path': path,
            **{field: [loop[field] for loop in loops] for field in loop_fields},
        }
        for path, loops in zip(chunk, results)
    ]
    df_chunk = pd.DataFrame(rows)

    # Write this chunk on its own so finished work survives a later failure.
    checkpoint = f"loops_checkpoint_{idx//chunk_size + 1}.csv"
    df_chunk.to_csv(checkpoint, index=False)
    print(f"Saved checkpoint: {checkpoint}")

    all_rows += rows

# ─── 4) Combine every chunk into a single DataFrame ─────────────────────────
df_all = pd.DataFrame(all_rows)

# ─── 5) Write the combined result to CSV ────────────────────────────────────
df_all.to_csv("loops_full_output.csv", index=False)
print("Saved full output: loops_full_output.csv")
