In [1]:
import pandas as pd
import re
import numpy as np

# =========================
# 1️⃣ 读取数据
# =========================
df = pd.read_csv('corrected_transposed_JUNE24.csv')

# =========================
# 2️⃣ Section 列
# =========================
section_cols = {
    "Intro": "intro_chordsTransposed",
    "Verse": "verse_chordsTransposed",
    "Pre-Chorus": "pre_chorus_chordsTransposed",
    "Chorus": "chorus_chordsTransposed",
    "Post-Chorus": "post_chorus_chordsTransposed",
    "Bridge": "bridge_chordsTransposed",
    "Outro": "outro_chordsTransposed",
}

# =========================
# 3️⃣ 音高映射
# =========================
PC = {
    "C":0,"C#":1,"Db":1,"D":2,"D#":3,"Eb":3,"E":4,
    "F":5,"F#":6,"Gb":6,"G":7,"G#":8,"Ab":8,
    "A":9,"A#":10,"Bb":10,"B":11
}

ROMANS = ["I","II","III","IV","V","VI","VII"]
MAJOR_SCALE = [0,2,4,5,7,9,11]
MINOR_SCALE = [0,2,3,5,7,8,10]

# =========================
# 4️⃣ 工具函数
# =========================
def split_chords(chord_string):
    if pd.isna(chord_string):
        return []
    s = re.sub(r"[,\|;]", " ", str(chord_string))
    s = s.replace("-", " ")
    s = re.sub(r"\s+", " ", s).strip()
    return s.split()

def parse_chord(sym):
    m = re.match(r"^([A-G])([#b]?)(.*)$", sym)
    if not m:
        return None, "maj"
    root = m.group(1) + (m.group(2) or "")
    rest = m.group(3).lower()
    quality = "min" if rest.startswith("m") and not rest.startswith("maj") else "maj"
    return root, quality

def infer_key(chords):
    if not chords:
        return None, None
    root, quality = parse_chord(chords[0])
    mode = "minor" if quality == "min" else "major"
    return root, mode

def interval_to_roman(interval, mode, quality):
    scale = MAJOR_SCALE if mode == "major" else MINOR_SCALE
    diffs = [(abs(interval - deg), i) for i, deg in enumerate(scale)]
    _, idx = min(diffs)
    numeral = ROMANS[idx]
    if quality == "min":
        numeral = numeral.lower()
    return numeral

def convert_to_roman(chords, key_root, key_mode):
    if key_root not in PC:
        return []
    key_pc = PC[key_root]
    romans = []
    for sym in chords:
        root, quality = parse_chord(sym)
        if root not in PC:
            continue
        interval = (PC[root] - key_pc) % 12
        romans.append(interval_to_roman(interval, key_mode, quality))
    return romans

# =========================
# 5️⃣ 构建 long format
# =========================
rows = []

for _, r in df.iterrows():
    for section_name, col in section_cols.items():
        if col not in df.columns:
            continue

        chords = split_chords(r[col])
        if not chords:
            continue

        key_root, key_mode = infer_key(chords)
        roman_prog = convert_to_roman(chords, key_root, key_mode)

        if not roman_prog:
            continue

        rows.append({
            "song_title": r["song_title"],
            "section": section_name,
            "artist": "Taylor Swift",
            "key": key_root,
            "key_mode": key_mode,
            "roman_progression": roman_prog,
            "chord_progression_raw": chords
        })

long_df = pd.DataFrame(rows)

# =========================
# 6️⃣ 取 toy size = 50
# =========================
toy_df = long_df.head(50).copy()
toy_df["snippet_id"] = ["TS_{:03d}".format(i+1) for i in range(len(toy_df))]

# list 转字符串
toy_df["roman_progression"] = toy_df["roman_progression"].apply(lambda x: str(x))
toy_df["chord_progression_raw"] = toy_df["chord_progression_raw"].apply(lambda x: str(x))

# =========================
# 7️⃣ 输出 toy CSV
# =========================
toy_df = toy_df[[
    "snippet_id",
    "song_title",
    "section",
    "artist",
    "key",
    "key_mode",
    "roman_progression",
    "chord_progression_raw"
]]

toy_df.to_csv("toy_smashchords_tswift_50.csv", index=False)

print("Toy dataset created successfully!")


Toy dataset created successfully!


In [3]:
import ast, math, random
from collections import defaultdict
import pandas as pd

# ======================
# 0) CONFIG — only change these
# ======================
CSV_PATH = "toy_smashchords_tswift_50.csv"   # your local path
ALPHA = 1.0                                  # add-alpha smoothing
EXCLUDE_SAME_SONG = True                     # mashup: don't recommend same song
MID_LO, MID_HI = 0.2, 0.8                    # search cut/start inside "middle"
N_STEPS = 3                                  # how many chords of B to score after the bridge
TOP_K = 5                                    # how many recommendations to show

RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# ======================
# 1) LOAD DATA
# ======================
df = pd.read_csv(CSV_PATH)
df["chords"] = df["chord_progression_raw"].apply(ast.literal_eval)
seqs = df["chords"].tolist()

# ======================
# 2) TRAIN 2nd-ORDER MARKOV
# ======================
counts = defaultdict(lambda: defaultdict(int))  # (prev2, prev1) -> next -> count
totals = defaultdict(int)                       # (prev2, prev1) -> total
vocab = set()

for s in seqs:
    vocab.update(s)
    for t in range(2, len(s)):
        ctx = (s[t-2], s[t-1])
        nxt = s[t]
        counts[ctx][nxt] += 1
        totals[ctx] += 1

vocab = sorted(vocab)
V = len(vocab)

def prob_next(prev2, prev1, nxt, alpha=ALPHA):
    """P(nxt | prev2, prev1) with add-alpha smoothing"""
    ctx = (prev2, prev1)
    num = counts[ctx].get(nxt, 0) + alpha
    den = totals.get(ctx, 0) + alpha * V
    if den == 0:
        return 1.0 / max(1, V)
    return num / den

def logp(prev2, prev1, nxt):
    return math.log(max(prob_next(prev2, prev1, nxt), 1e-12))

# ======================
# 3) HELPERS: MID-RANGE INDICES
# ======================
def mid_i_candidates(L, lo=MID_LO, hi=MID_HI):
    # i is cut index in A, context=(A[i-1],A[i]), so i>=1
    a = max(1, int(lo * L))
    b = min(L - 2, int(hi * L))
    if b < a:
        a, b = 1, max(1, L - 2)
    return range(a, b + 1)

def mid_j_candidates(L, lo=MID_LO, hi=MID_HI):
    a = int(lo * L)
    b = max(a, int(hi * L))
    b = min(b, L - 1)
    return range(a, b + 1)

# ======================
# 4) SCORE: context + n-step continuation into B
# ======================
def bridge_sequence_score(context, B, start_j, n_steps=N_STEPS):
    """
    Score = sum_{k=0..n-1} log P(B[j+k] | prev2, prev1), rolling forward.
    """
    prev2, prev1 = context
    total = 0.0
    for k in range(n_steps):
        idx = start_j + k
        if idx >= len(B):
            break
        nxt = B[idx]
        total += logp(prev2, prev1, nxt)
        prev2, prev1 = prev1, nxt
    return total

# ======================
# 5) RANDOMLY PICK AN A SNIPPET + SEARCH BEST B RECOMMENDATIONS
# ======================
def recommend_next_snippets(random_A=True, A_ROW=None):
    if random_A:
        # pick a snippet with >=3 chords so mid-cut exists
        candidates = [i for i,s in enumerate(seqs) if len(s) >= 3]
        A_ROW = random.choice(candidates)
    else:
        if A_ROW is None:
            raise ValueError("Provide A_ROW if random_A=False.")

    A = seqs[A_ROW]
    a_song = df.loc[A_ROW, "song_title"]

    # choose best cut i in A middle (we’ll evaluate multiple cuts)
    cand_i = list(mid_i_candidates(len(A)))

    results = []  # (score, A_ROW, i, B_ROW, j)

    for B_ROW in range(len(seqs)):
        if B_ROW == A_ROW:
            continue
        if EXCLUDE_SAME_SONG and df.loc[B_ROW, "song_title"] == a_song:
            continue
        B = seqs[B_ROW]
        if len(B) < 1:
            continue

        cand_j = list(mid_j_candidates(len(B)))

        for i in cand_i:
            context = (A[i-1], A[i])
            for j in cand_j:
                s = bridge_sequence_score(context, B, j, n_steps=N_STEPS)
                results.append((s, A_ROW, i, B_ROW, j))

    results.sort(reverse=True, key=lambda x: x[0])
    return results[:TOP_K], A_ROW

top, chosen_A_row = recommend_next_snippets(random_A=True)

# ======================
# 6) PRINT REPORT-FRIENDLY OUTPUT
# ======================
A = seqs[chosen_A_row]
print("=== Randomly chosen A snippet ===")
print(df.loc[chosen_A_row, ["snippet_id", "song_title", "section", "key", "key_mode"]])
print("A chords:", A)

print(f"\n=== Top {TOP_K} recommendations (mid→mid) ===")
for rank, (score, A_ROW, i, B_ROW, j) in enumerate(top, start=1):
    B = seqs[B_ROW]
    context = (A[i-1], A[i])
    shown_B = B[j:j+max(1, N_STEPS)]
    print(f"\n[{rank}] Score={score:.3f}")
    print(f"  Cut A at i={i} (0-based)  context={context}   A_part: {A[:i+1]} | ...")
    print(f"  B target: {df.loc[B_ROW, 'snippet_id']}  {df.loc[B_ROW, 'song_title']} [{df.loc[B_ROW, 'section']}]  key={df.loc[B_ROW,'key']} {df.loc[B_ROW,'key_mode']}")
    print(f"  Start B at j={j} (0-based)  B_start_chords={shown_B}")

=== Randomly chosen A snippet ===
snippet_id            TS_045
song_title    stay beautiful
section           Pre-Chorus
key                       A#
key_mode               major
Name: 44, dtype: object
A chords: ['A#', 'C#', 'D#']

=== Top 5 recommendations (mid→mid) ===

[1] Score=-2.565
  Cut A at i=1 (0-based)  context=('A#', 'C#')   A_part: ['A#', 'C#'] | ...
  B target: TS_031  mad woman [Intro]  key=F minor
  Start B at j=2 (0-based)  B_start_chords=['D#']

[2] Score=-2.565
  Cut A at i=1 (0-based)  context=('A#', 'C#')   A_part: ['A#', 'C#'] | ...
  B target: TS_033  mad woman [Pre-Chorus]  key=G# major
  Start B at j=4 (0-based)  B_start_chords=['D#']

[3] Score=-3.258
  Cut A at i=1 (0-based)  context=('A#', 'C#')   A_part: ['A#', 'C#'] | ...
  B target: TS_001  seven [Verse]  key=G major
  Start B at j=2 (0-based)  B_start_chords=['C']

[4] Score=-3.258
  Cut A at i=1 (0-based)  context=('A#', 'C#')   A_part: ['A#', 'C#'] | ...
  B target: TS_002  seven [Chorus]  key=D minor