In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [61]:
# Load the Chordonomicon dump and keep only the song id and its chord string.
# low_memory=False is forwarded to pandas unchanged (per the pandas docs it parses
# the file in a single pass, which avoids mixed-dtype inference on large columns).
df = pd.read_csv('chordonomicon_v2.csv', low_memory=False)[['id', 'chords']]

# Peek at the first few rows.
df.head()

Unnamed: 0,id,chords
0,1,<intro_1> C <verse_1> F C E7 Amin C F C G7 C F...
1,2,<intro_1> E D A/Cs E D A/Cs <verse_1> E D A/Cs...
2,3,<intro_1> Csmin <verse_1> A Csmin A Csmin A Cs...
3,4,<intro_1> D Dmaj7 D Dmaj7 <verse_1> Emin A D G...
4,5,<intro_1> C <verse_1> G C G C <chorus_1> F Dmi...


# Chord Tokens
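We extract every chord token from each song's chord string into a list. The section labels (e.g. `<verse_1>`) can either be discarded, giving one flat list per song, or used to split the tokens into one list per section. Both variants are implemented below.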

Note: the section-separated version is strictly more informative — the flat list can always be recovered from it by concatenating the per-section lists.

In [62]:
# Chord tokens without sections

import re, numpy as np
from typing import List

SECTION_TAG = re.compile(r"<[^>]+>")
CHORD_PATTERN = re.compile(r'^([A-G](?:[#b]|s)?)([^/\s]*)(?:/([A-G](?:[#b]|s)?))?$', re.I)


def extract_chord_tokens(text: str) -> List[str]:
    """
    Return the chord tokens of `text` as one flat list, ignoring <section> tags.

    Tags are replaced by a space, the remainder is split on commas/whitespace, and a
    token is kept when either the whole token or the part before its first '/'
    matches CHORD_PATTERN (so a token with an unrecognised bass note is still kept,
    unchanged). Non-string input yields an empty list.
    """
    if not isinstance(text, str):
        return []

    untagged = SECTION_TAG.sub(" ", text).strip()
    return [
        tok
        for tok in re.split(r"[,\s]+", untagged)
        if tok
        and (CHORD_PATTERN.fullmatch(tok) or CHORD_PATTERN.fullmatch(tok.split("/")[0]))
    ]

#---------------------------------
# Chord tokens separated by section
import re
from typing import List, Tuple

SECTION_TAG = re.compile(r"<[^>]+>")
CHORD_PATTERN = re.compile(r'^([A-G](?:[#b]|s)?)([^/\s]*)(?:/([A-G](?:[#b]|s)?))?$', re.I)


def _is_chord(tok: str) -> bool:
    """True when `tok`, or the part of it before the first '/', matches CHORD_PATTERN."""
    if not tok:
        return False
    candidates = (tok, tok.split("/", 1)[0])
    return any(CHORD_PATTERN.fullmatch(candidate) is not None for candidate in candidates)


def extract_sectioned_tokens(text: str) -> List[List[str]]:
    """
    Split a chord string on its <section> tags and return one list of chord tokens
    per section, in order of appearance.

    Section names are discarded, and sections that contain no chord token are
    dropped, so the result does not line up one-to-one with the tags in `text`.
    Text before the first tag counts as a section of its own. Non-string or blank
    input yields an empty list.

    Example:
      '<verse_1> E D A/Cs <chorus> C G Am F <end> D D/C# Bm'
      -> [['E','D','A/Cs'], ['C','G','Am','F'], ['D','D/C#','Bm']]
    """
    if not (isinstance(text, str) and text.strip()):
        return []

    # With a single capturing group, re.split alternates text, tag, text, tag, ...
    # so the even positions are exactly the stretches of text between tags.
    bodies = re.split(r'(<[^>]+>)', text)[::2]

    sections: List[List[str]] = []
    for body in bodies:
        # tokenize on commas or whitespace and keep only chord-like tokens
        chords = [tok for tok in re.split(r"[,\s]+", body.strip()) if tok and _is_chord(tok)]
        if chords:
            sections.append(chords)
    return sections

In [63]:
# Apply to df: parse every song's chord string into per-section token lists.
# Each cell of 'chord_tokens_sep' is a list of lists (one inner list per section
# that contains at least one chord token).
df['chord_tokens_sep'] = df['chords'].apply(extract_sectioned_tokens) 

# Feature Collection

Let $\mathcal{C}=\{\text{All possible chords}\}$, which is somewhat ill-defined. We define the **song space** as $$\mathcal{S}=\coprod_{l=1}^\infty \mathcal{C}^l.$$
We define the **harmonic space** as $\mathcal{H}=\mathcal{S}/(\mathbb{Z}/12)$, where $\mathbb{Z}/12$ acts on $\mathcal{S}$ via modulation.

Given a sample $\Omega \subset \mathcal{S}$ of songs, we want to define a function $f:\Omega\rightarrow \mathcal{F}$ that captures the 'key harmonic features' of a song $s\in \Omega$. These features should be invariant under modulation, i.e. $f$ should factor through the quotient as $f:\Omega/(\mathbb{Z}/12)\rightarrow \mathcal{F}$.

An idea is to have $\mathcal{F}=\mathcal{H}$, where $f:\Omega\rightarrow \mathcal{H}$ captures a string of 'important chords'. For example, $$f([\text{C, Amin, F, G, C, Amin, F, G, F, G}])= [\text{C, Amin, F, G}]\text{ mod } \mathbb{Z}/12.$$

An example with a different codomain is $\operatorname{rank}:\Omega\rightarrow \mathbb{N}$, where $\operatorname{rank}(s)$ is the rank of the song matrix.

Here's a funny example: take $f$ to be the inclusion $\iota:\Omega \hookrightarrow \mathcal{S}$ followed by the modulation projection $\mathcal{S}\rightarrow \mathcal{H}$. Below we give a frequency-based $f$.



### most_common_progression(tokens, min_len=3)
Finds the most frequent chord progression (n-gram) in a list of chord tokens.  
Returns the most common progression (as a tuple) and how many times it appears.  
Example: `['Dmin', 'G', 'C', 'Dmin', 'G', 'C', 'Bdim', 'C', 'G', 'Bdim'] → (('Dmin', 'G', 'C'), 2)`

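Mathematically, for a chord sequence $s=(c_1,\dots,c_n)$ this computes $f(s)=\operatorname*{arg\,max}_{\omega}\,\text{count}_s(\omega)$, where $\omega=(c_i,c_{i+1},\cdots,c_{i+k-1})$ ranges over the contiguous subsequences of $s$ of length $k\geq \text{min\_len}$ and $\text{count}_s(\omega)$ is the number of positions at which $\omega$ occurs in $s$. Ties are broken in favour of the longer progression.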


In [64]:
from collections import Counter
from typing import List, Tuple, Optional

def most_common_progression(tokens: List[str], min_len: int = 3, max_len: Optional[int] = None
) -> Tuple[Optional[Tuple[str, ...]], int]:
    """
    Find the most frequent contiguous chord progression (n-gram) in `tokens`.

    Every contiguous window whose length lies in [min_len, max_len] is counted;
    overlapping occurrences count separately. The winner is the window with the
    highest count, ties going to the longer window and then to the one seen first.

    Args:
        tokens: chord tokens in playing order.
        min_len: smallest window length considered; must be at least 1.
        max_len: largest window length considered. None, or any value larger than
            len(tokens), means len(tokens).

    Returns:
        (progression, count). (None, 0) when `tokens` is shorter than min_len or
        when max_len < min_len leaves no window to count.

    Raises:
        ValueError: if min_len < 1 (a zero-length window would otherwise be counted
            and reported as the "most common progression").

    Notes:
        * Because ties prefer length, a sequence in which nothing repeats returns
          its longest window with count 1 (the whole sequence when max_len is None).
        * The number of windows grows quadratically with len(tokens) when max_len
          is None; pass max_len to bound the work on long sequences.

    Example:
        ['Dmin', 'G', 'C', 'Dmin', 'G', 'C', 'Bdim', 'C', 'G', 'Bdim']
        -> (('Dmin', 'G', 'C'), 2)
    """
    if min_len < 1:
        raise ValueError("min_len must be at least 1")

    # Convert once so every window below is a cheap tuple slice (already hashable).
    seq = tuple(tokens)
    n = len(seq)
    if n < min_len:
        return None, 0

    if max_len is None or max_len > n:
        max_len = n

    counts: Counter = Counter()
    # collect all windows (n-grams) of each admissible length
    for size in range(min_len, max_len + 1):
        counts.update(seq[start:start + size] for start in range(n - size + 1))

    if not counts:
        # only reachable when max_len < min_len
        return None, 0

    # Highest count wins; ties go to the longer window. max() keeps the first of
    # any remaining equals, i.e. the window that was inserted (seen) first.
    best, freq = max(counts.items(), key=lambda kv: (kv[1], len(kv[0])))
    return best, freq

# Per-section wrapper around most_common_progression
def most_common_progression_by_section(
    sections: List[List[str]],
    min_len: int = 3,
    max_len: Optional[int] = None,
) -> List[Tuple[Optional[Tuple[str, ...]], int]]:
    """
    Run most_common_progression on every section and collect the results.

    Returns one (progression, count) pair per entry of `sections`, in the same
    order. A falsy `sections` (None or an empty list) gives an empty list.
    """
    per_section = []
    for section_tokens in (sections or []):
        progression, count = most_common_progression(
            section_tokens, min_len=min_len, max_len=max_len
        )
        per_section.append((progression, count))
    return per_section
 

In [65]:
# A progression is a tuple of chord tokens; a result pairs it with its count.
Prog = Tuple[str, ...]
Res = Tuple[Optional[Prog], int]

def pick_global_progression(
    results: List[Res],
    tie_break: str = "longest",  # "longest" or "shortest"
) -> Res:
    """
    Merge per-section (progression, count) results into one overall winner.

    Equal progressions have their counts summed; entries whose progression is
    empty/None or whose count is not positive are ignored.
    Ranking: higher total count; then length according to `tie_break`; then the
    progression that appears earliest in `results`.

    Args:
        results: (progression, count) pairs, e.g. one per section.
        tie_break: "longest" prefers longer progressions among equal totals,
            "shortest" prefers shorter ones.

    Returns:
        (progression, total_count), or (None, 0) when `results` is empty or holds
        no usable entry.

    Raises:
        ValueError: if `tie_break` is neither "longest" nor "shortest". This is
            checked up front, so a bad value is reported even for empty input.
    """
    if tie_break not in {"shortest", "longest"}:
        raise ValueError("tie_break must be 'shortest' or 'longest'")

    if not results:
        return (None, 0)

    # total count per progression + index of its first appearance (stable tie-break)
    first_idx: dict = {}
    tally: Counter = Counter()

    for idx, (p, c) in enumerate(results):
        if not p or c <= 0:
            continue
        prog = tuple(p)
        tally[prog] += int(c)
        first_idx.setdefault(prog, idx)

    if not tally:
        return (None, 0)

    # +1 ranks longer progressions higher, -1 ranks shorter ones higher
    length_sign = 1 if tie_break == "longest" else -1

    def rank(prog):
        # negated index: an earlier first appearance ranks higher under max()
        return (tally[prog], length_sign * len(prog), -first_idx[prog])

    best = max(tally, key=rank)
    return best, tally[best]

In [66]:
# For every song, compute one (progression, count) pair per section.
# Series.apply forwards the extra keyword arguments to the function on each call.
df['most_common_progressions'] = df['chord_tokens_sep'].apply(
    most_common_progression_by_section, min_len=3, max_len=None
)


In [67]:
# 1. Collapse the per-section results into a single (progression, count) per song.
#    Ties on total count prefer the longest progression, then the earliest one.
df['most_common_progression'] = df['most_common_progressions'].apply(
    pick_global_progression, tie_break="longest"
)

# 2. Keep just the progression tuple (the count is dropped) in its own column.
df['most_common_progression_only'] = df['most_common_progression'].apply(
    lambda pair: pair[0]
)

In [68]:
# Inspect the first rows, now including the derived progression columns.
df.head()

Unnamed: 0,id,chords,chord_tokens_sep,most_common_progressions,most_common_progression,most_common_progression_only
0,1,<intro_1> C <verse_1> F C E7 Amin C F C G7 C F...,"[[C], [F, C, E7, Amin, C, F, C, G7, C, F, C, E...","[(None, 0), ((F, C, E7, Amin, C, F), 2), ((F, ...","((F, C, E7, Amin, C, F), 4)","(F, C, E7, Amin, C, F)"
1,2,<intro_1> E D A/Cs E D A/Cs <verse_1> E D A/Cs...,"[[E, D, A/Cs, E, D, A/Cs], [E, D, A/Cs, E, D, ...","[((E, D, A/Cs), 2), ((E, D, A/Cs, E, D), 3), (...","((E, G, D, A), 9)","(E, G, D, A)"
2,3,<intro_1> Csmin <verse_1> A Csmin A Csmin A Cs...,"[[Csmin], [A, Csmin, A, Csmin, A, Csmin, A, B]...","[(None, 0), ((A, Csmin, A), 3), ((Csmin, A, Fs...","((Csmin, A, Csmin, A), 3)","(Csmin, A, Csmin, A)"
3,4,<intro_1> D Dmaj7 D Dmaj7 <verse_1> Emin A D G...,"[[D, Dmaj7, D, Dmaj7], [Emin, A, D, G, Emin, A...","[((D, Dmaj7, D, Dmaj7), 1), ((Emin, A, D), 4),...","((Emin, A, D, G), 16)","(Emin, A, D, G)"
4,5,<intro_1> C <verse_1> G C G C <chorus_1> F Dmi...,"[[C], [G, C, G, C], [F, Dmin, G, Dmin, G, C], ...","[(None, 0), ((G, C, G, C), 1), ((F, Dmin, G, D...","((F, Dmin, G, Dmin, G, C), 3)","(F, Dmin, G, Dmin, G, C)"


In [69]:
# Persist the frame (without the index) for downstream use.
# NOTE(review): the list/tuple-valued columns are presumably written as their string
# representations, so reading this CSV back would give strings rather than Python
# objects — confirm that downstream consumers parse them.
df.to_csv("common_chord_progression.csv", index=False)