# Loading Data to VM and setting up the notebook

In [2]:
from pathlib import Path
import pandas as pd, numpy as np, matplotlib.pyplot as plt, tqdm, json, os, math

# Directory that is searched recursively for MIDI files.
ROOT = Path("data/lmd_matched").expanduser()

# All "*.mid" files (sorted) followed by all "*.midi" files (sorted).
midi_files = [
    midi_path
    for pattern in ("*.mid", "*.midi")
    for midi_path in sorted(ROOT.rglob(pattern))
]

print("Number of MIDIs:", len(midi_files))
print("Here are three examples:", midi_files[:3])

Number of MIDIs: 116189
Here are three examples: [PosixPath('data/lmd_matched/A/A/A/TRAAAGR128F425B14B/1d9d16a9da90c090809c153754823c2b.mid'), PosixPath('data/lmd_matched/A/A/A/TRAAAGR128F425B14B/5dd29e99ed7bd3cc0c5177a6e9de22ea.mid'), PosixPath('data/lmd_matched/A/A/A/TRAAAGR128F425B14B/b97c529ab9ef783a849b896816001748.mid')]


# Basic integrity

## A glimpse into the type of music

The following graphs show how the tracks are distributed across genre terms and MusicBrainz tags. Models trained on this data will be biased toward the most frequent genres.

In [3]:
terms_distro = pd.read_csv("data/term_distro.csv", delimiter=",")
mbtags_distro = pd.read_csv("data/mbtag_distro.csv", delimiter=",")

# Number of genres shown in each panel.
TOP_N = 20


def plot_genre_bars(ax, distro, label_col, title, xlabel, top_n=TOP_N):
    """Draw a bar chart of the first `top_n` rows of `distro` on `ax`.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    distro : DataFrame with a label column and an "n_tracks" column.
    label_col : name of the column holding the bar labels.
    title, xlabel : text for the panel title and x axis.
    top_n : number of leading rows to plot.
    """
    # head() is positional; a label slice such as .loc[:20] is end-inclusive
    # and would return 21 rows on the default index.
    top = distro.head(top_n)
    ax.bar(top[label_col], top["n_tracks"])
    ax.set_title(title, fontweight="bold")
    ax.set_ylabel("Number of tracks")
    ax.set_xlabel(xlabel)
    ax.tick_params(axis="x", labelrotation=45)
    # right-align the rotated labels so they end under their bar
    for tick in ax.get_xticklabels():
        tick.set_ha("right")


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7), tight_layout=True)
plot_genre_bars(ax1, terms_distro, "genre_term", "Distribution of genre terms", "Genre term")
plot_genre_bars(ax2, mbtags_distro, "genre_tag", "Distribution of musicbrainz tags", "Genre tag")

# savefig does not create missing directories, so make sure the target exists.
Path("data_reports").mkdir(parents=True, exist_ok=True)
fig.savefig("data_reports/genre_distribution.png", dpi=300)
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: 'data/term_distro.csv'

## Some statistics on a sample from the dataset

In [None]:
import pretty_midi, random, statistics
from tqdm import tqdm

# Fixed seed so the sampled files, and every statistic derived from them,
# are identical on each Restart & Run All.
SEED = 42
# Upper bound on the number of files analysed in the cells below.
SAMPLE_SIZE = 5000

rng = random.Random(SEED)
# random.sample raises ValueError when k exceeds the population size,
# so cap k at the number of files that were actually found.
sample = rng.sample(midi_files, k=min(SAMPLE_SIZE, len(midi_files)))


def safe_read(p):
    """Parse the MIDI file at `p` with pretty_midi.

    Returns the resulting PrettyMIDI object, or None if loading raised any
    Exception, so callers can skip the file instead of aborting the loop.
    """
    try:
        return pretty_midi.PrettyMIDI(str(p))
    except Exception:
        # deliberate best-effort: every failure is reported as None
        return None


# One summary record per sampled file. Files that fail to parse are kept in
# the table with ok=False, a NaN duration and zero counts.
rows = []
for midi_path in tqdm(sample):
    midi = safe_read(midi_path)
    parsed = midi is not None

    if parsed:
        duration = midi.get_end_time()
        tempo_values = list(midi.get_tempo_changes()[1])
        signatures = [
            (sig.numerator, sig.denominator) for sig in midi.time_signature_changes
        ]
        note_count = sum(len(inst.notes) for inst in midi.instruments)
        # number of distinct program numbers, not number of instrument tracks
        program_count = len({inst.program for inst in midi.instruments})
    else:
        duration = np.nan
        tempo_values, signatures = [], []
        note_count = program_count = 0

    rows.append(
        {
            "path": str(midi_path),
            "ok": parsed,
            "duration": duration,
            "notes": note_count,
            "instruments": program_count,
            "tempi": len(tempo_values),
            "time_sigs": len(signatures),
        }
    )

df_basic = pd.DataFrame(rows)

In [None]:
from ydata_profiling import ProfileReport

# Profile of the per-file summary table built above.
profile = ProfileReport(df_basic, title="Data Report", explorative=True)
profile.to_notebook_iframe()  # renders the report inline, inside the notebook

# Also keep a standalone HTML copy; make sure the output folder exists first.
Path("data_reports").mkdir(parents=True, exist_ok=True)
profile.to_file("data_reports/basic_info.html")


In [None]:
from collections import Counter

# Accumulators over the whole sample.
prog_ctr, drum_ctr = Counter(), 0  # non-drum program histogram / number of drum tracks
# These lists must be created here: the loop below appends to them, and the
# cell would otherwise raise NameError on a fresh kernel.
poly_vals, dens_vals, pitch_min, pitch_max = [], [], [], []

for p in tqdm(sample):
    pm = safe_read(p)
    if not pm:
        continue

    # (time, +1) for every note start and (time, -1) for every note end
    active_times = []
    for inst in pm.instruments:
        if inst.is_drum:
            drum_ctr += 1
        else:
            prog_ctr[inst.program] += 1
        for n in inst.notes:
            active_times.append((n.start, +1))
            active_times.append((n.end, -1))
            # both lists receive every pitch; they are concatenated for describe() below
            pitch_min.append(n.pitch)
            pitch_max.append(n.pitch)

    # crude polyphony estimate: sweep over the events in time order and track
    # how many notes are sounding; at equal times -1 sorts before +1, so a note
    # ending exactly when another starts is not counted as an overlap
    active_times.sort()
    cur, max_poly = 0, 0
    for _, delta in active_times:
        cur += delta
        max_poly = max(max_poly, cur)
    poly_vals.append(max_poly)

    # clamp the duration so empty / zero-length files do not divide by zero
    dur = max(pm.get_end_time(), 1e-6)
    total_notes = sum(len(i.notes) for i in pm.instruments)
    dens_vals.append(total_notes / dur)

pd.Series(poly_vals).hist(bins=50)
plt.title("Max polyphony (sample)")
plt.show()

pd.Series(dens_vals).hist(bins=50)
plt.title("Note density (notes/sec)")
plt.show()

pd.Series(pitch_min + pitch_max).describe(), prog_ctr.most_common(10), drum_ctr


In [None]:
from collections import Counter
import muspy

def safe_measure_resolution(m):
    """Return the length of one measure of `m`, in units of `m.resolution` per quarter note.

    Uses the first time signature whose numerator and denominator are both
    positive and falls back to 4/4 when there is none. The result is always
    at least 1 so it can safely be used as a divisor.
    """
    num, den = 4, 4  # fallback if none/invalid
    for ts in m.time_signatures:
        ts_num = getattr(ts, "numerator", 0) or 0
        ts_den = getattr(ts, "denominator", 0) or 0
        if ts_num > 0 and ts_den > 0:
            num, den = ts_num, ts_den
            break
    # float division then round keeps the intent even if den does not divide
    # evenly; clamp to 1 because a very large denominator can round to 0
    return max(int(round(m.resolution * num * 4 / den)), 1)


def overlaps_same_pitch(track):
    """Count notes in `track` that start before an earlier note of the same pitch has ended."""
    spans_by_pitch = {}
    for n in sorted(track.notes, key=lambda x: (x.pitch, x.time, x.end)):
        spans_by_pitch.setdefault(n.pitch, []).append((n.time, n.end))
    bad = 0
    for spans in spans_by_pitch.values():
        last_end = -1
        for t0, t1 in spans:
            if t0 < last_end:
                bad += 1
            last_end = max(last_end, t1)
    return bad


# Accumulators over the whole sample.
prog_ctr, drum_ctr = Counter(), 0  # non-drum program histogram / number of drum tracks
pitch_min, pitch_max = [], []  # both receive every pitch; concatenated for describe() below
rows = []
for p in tqdm(sample):
    pm = safe_read(p)
    if not pm:
        continue

    # --- per-instrument bookkeeping ---
    for inst in pm.instruments:
        if inst.is_drum:
            drum_ctr += 1
        else:
            prog_ctr[inst.program] += 1
        for n in inst.notes:
            pitch_min.append(n.pitch)
            pitch_max.append(n.pitch)

    m = muspy.from_pretty_midi(pm)
    measure_resolution = safe_measure_resolution(m)

    # --- note-density calculation ---
    # duration comes from pretty_midi; `or 1e-6` avoids a zero divisor
    dur = pm.get_end_time() or 1e-6

    beat = m.resolution
    grid16 = max(beat // 4, 1)  # a quarter of m.resolution, at least 1

    total_notes = sum(len(t.notes) for t in m.tracks)
    non_drum_notes = [n for t in m.tracks if not t.is_drum for n in t.notes]
    drum_notes = [n for t in m.tracks if t.is_drum for n in t.notes]

    # A) validity & hygiene
    same_pitch_overlap_count = sum(
        overlaps_same_pitch(t) for t in m.tracks if not t.is_drum
    )
    ts_changes = max(len(m.time_signatures) - 1, 0)
    parsable_nonempty = int(total_notes > 0 and dur > 0)

    # B) rhythm & timing
    bpms = [tm.qpm for tm in m.tempos]
    tempo_mean_bpm = (sum(bpms) / len(bpms)) if bpms else float("nan")
    # population standard deviation; 0.0 when there is at most one tempo
    tempo_std_bpm = (
        ((sum((x - tempo_mean_bpm) ** 2 for x in bpms) / len(bpms)) ** 0.5)
        if len(bpms) > 1
        else 0.0
    )
    tempo_changes = max(len(bpms) - 1, 0)

    # share of onsets that do not fall on a multiple of grid16
    off = sum((n.time % grid16) != 0 for t in m.tracks for n in t.notes)
    offgrid_rate_16 = off / max(total_notes, 1)

    # measure counts for density variance and length
    length = max((t.get_end_time() for t in m.tracks), default=0)
    n_measures = int(length // measure_resolution) + 1 if length > 0 else 0
    # tally onsets per measure in a single pass instead of rescanning all
    # notes once per measure
    onsets_by_measure = Counter(
        n.time // measure_resolution for t in m.tracks for n in t.notes
    )
    counts_per_measure = [onsets_by_measure[k] for k in range(n_measures)]
    if counts_per_measure:
        mean_c = sum(counts_per_measure) / len(counts_per_measure)
        onset_density_var_meas = sum(
            (c - mean_c) ** 2 for c in counts_per_measure
        ) / len(counts_per_measure)
    else:
        onset_density_var_meas = float("nan")

    # C) dynamics & articulation
    vels = [n.velocity for n in non_drum_notes]
    if vels:
        v_mean = sum(vels) / len(vels)
        velocity_mean = v_mean
        velocity_std = (sum((v - v_mean) ** 2 for v in vels) / len(vels)) ** 0.5
        # index-based approximation of the 5th and 95th percentiles
        sv = sorted(vels)
        p5 = sv[int(0.05 * len(sv))]
        p95 = sv[max(int(0.95 * len(sv)) - 1, 0)]
        velocity_p95_minus_p5 = p95 - p5
    else:
        velocity_mean = velocity_std = velocity_p95_minus_p5 = float("nan")

    # mean note duration expressed in multiples of `beat`
    articulation_ratio = (
        (
            sum(n.duration for t in m.tracks for n in t.notes)
            / max(total_notes * beat, 1)
        )
        if total_notes
        else float("nan")
    )

    # D) instrumentation & meta
    track_count = len(m.tracks)
    drum_note_ratio = len(drum_notes) / max(total_notes, 1)
    pitch_register_mean = (
        (sum(n.pitch for n in non_drum_notes) / len(non_drum_notes))
        if non_drum_notes
        else float("nan")
    )
    length_measures = n_measures

    rows.append(
        dict(
            path=str(p),
            polyphony=muspy.metrics.polyphony(m),
            polyphony_rate=muspy.metrics.polyphony_rate(m),
            notes_density=total_notes / max(dur, 1e-6),
            empty_beat_rate=muspy.metrics.empty_beat_rate(m),
            empty_measure_rate=muspy.metrics.empty_measure_rate(m, measure_resolution),
            groove_consistency=muspy.metrics.groove_consistency(m, measure_resolution),
            n_pitch_classes_used=muspy.metrics.n_pitch_classes_used(m),
            n_pitches_used=muspy.metrics.n_pitches_used(m),
            pitch_class_entropy=muspy.metrics.pitch_class_entropy(m),
            pitch_entropy=muspy.metrics.pitch_entropy(m),
            pitch_range=muspy.metrics.pitch_range(m),
            scale_consistency=muspy.metrics.scale_consistency(m),
            same_pitch_overlap_count=same_pitch_overlap_count,
            time_signature_changes=ts_changes,
            parsable_nonempty=parsable_nonempty,
            tempo_mean_bpm=tempo_mean_bpm,
            tempo_std_bpm=tempo_std_bpm,
            tempo_changes=tempo_changes,
            offgrid_rate_16=offgrid_rate_16,
            onset_density_var_meas=onset_density_var_meas,
            velocity_mean=velocity_mean,
            velocity_std=velocity_std,
            velocity_p95_minus_p5=velocity_p95_minus_p5,
            articulation_ratio=articulation_ratio,
            drum_note_ratio=drum_note_ratio,
            pitch_register_mean=pitch_register_mean,
            length_measures=length_measures,
        )
    )
df_metrics = pd.DataFrame(rows)

pd.Series(pitch_min + pitch_max).describe(), prog_ctr.most_common(10), drum_ctr

In [None]:
# Profile of the per-file metrics table built above.
profile = ProfileReport(df_metrics, title="Metrics Report", explorative=True)
profile.to_notebook_iframe()  # renders the report inline, inside the notebook

# Also keep a standalone HTML copy; make sure the output folder exists first.
Path("data_reports").mkdir(parents=True, exist_ok=True)
profile.to_file("data_reports/metrics.html")

## Correlation matrix

### 🔥 Strongest Correlations

#### Near-duplicates
- **Tempo-related**
  - `tempo_changes ↔ tempo_std_bpm` **0.954**  
    More tempo changes → higher tempo variability.
- **Dynamics-related**
  - `velocity_p95_minus_p5 ↔ velocity_std` **0.951**  
    Both measure spread of dynamics.
- **Sparsity-related**
  - `empty_beat_rate ↔ empty_measure_rate` **0.863**  
    Sparse beats go with sparse measures.
- **Pitch/tonal clarity**
  - `pitch_class_entropy ↔ scale_consistency` **−0.848**  
    More tonal ambiguity → less scale consistency.
- **Pitch diversity**
  - `n_pitches_used ↔ pitch_entropy` **0.798**  
    More unique pitches → higher pitch entropy.
- **Texture**
  - `polyphony ↔ polyphony_rate` **0.790**  
    Essentially the same concept.

---

### 🎵 Pitch & Harmony Cluster
- `n_pitch_classes_used ↔ n_pitches_used` **0.764**
- `pitch_class_entropy ↔ pitch_entropy` **0.759**
- `n_pitch_classes_used ↔ pitch_class_entropy` **0.693**
- `n_pitches_used ↔ pitch_class_entropy` **0.692**
- `n_pitches_used ↔ scale_consistency` **−0.650**
- `n_pitches_used ↔ pitch_range` **0.638**
- `pitch_entropy ↔ scale_consistency` **−0.635**

**Interpretation:**  
Greater pitch variety leads to higher entropy, wider pitch range, and weaker adherence to a single scale.

---

### 🥁 Rhythm & Tightness
- `groove_consistency ↔ offgrid_rate_16` **−0.721**  
  More off-grid 16ths → lower groove consistency.
- `notes_density ↔ onset_density_var_meas` **0.622**  
  Denser textures → greater onset variance.
- `articulation_ratio ↔ notes_density` **−0.527**  
  Dense notes → shorter articulations.

---

### ⏱️ Tempo & Length
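- `length_measures ↔ tempo_mean_bpm` **0.536**  
  More measures → higher mean BPM. A likely explanation is that at a faster tempo the same playing time spans more measures, so this need not mean that fast pieces last longer (dataset-specific effect; worth checking against duration in seconds).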
- `notes_density ↔ tempo_mean_bpm` **0.442**  
  Faster tempos → denser textures.

---

### 🎼 Polyphony & Articulation
- `articulation_ratio ↔ polyphony_rate` **0.506**
- `articulation_ratio ↔ polyphony` **0.425**  

**Interpretation:**  
Higher note overlap (polyphony) is associated with longer articulations.

---

### 📌 Practical Takeaways

#### Feature Selection
To reduce **multicollinearity**, drop one from each of these pairs:
- `tempo_changes` vs. `tempo_std_bpm`
- `velocity_p95_minus_p5` vs. `velocity_std`
- `empty_beat_rate` vs. `empty_measure_rate`
- `polyphony` vs. `polyphony_rate`
- Among pitch/tonal measures, keep a compact subset (e.g., `n_pitch_classes_used`, `scale_consistency`, `pitch_entropy`).

#### Latent Factors (useful for PCA or clustering)
- **Tonal variety / ambiguity:** (`pitch_entropy`, `pitch_class_entropy`, `n_pitches_used`, inverse of `scale_consistency`)
- **Rhythmic tightness vs looseness:** (`groove_consistency` vs. `offgrid_rate_16`)
- **Density / energy:** (`notes_density`, `onset_density_var_meas`, `tempo_mean_bpm`)
- **Dynamics spread:** (`velocity_std` or `velocity_p95_minus_p5`)
- **Texture:** (`polyphony`, `polyphony_rate`, linked to `articulation_ratio`)

#### Sanity Check
- Off-grid timing decreases groove consistency (expected).
- More pitch classes reduce tonal stability (expected).
- Denser passages encourage staccato articulation (musically intuitive).

---
