In [None]:
import pandas as pd
import re
import numpy as np

# =========================
# 1) Load data
# =========================
# Raw per-song chord sheet, one row per song. Later code in this cell reads the
# "song_title" column and whichever "*_chordsTransposed" section columns exist.
# The path is relative, so it resolves against the kernel's working directory.
df = pd.read_csv('raw/corrected_transposed_JUNE24.csv')

# =========================
# 2) Section columns
# =========================
# Display label -> source column. Insertion order is the order in which a
# song's sections are emitted into the long-format table.
section_cols = dict([
    ("Intro", "intro_chordsTransposed"),
    ("Verse", "verse_chordsTransposed"),
    ("Pre-Chorus", "pre_chorus_chordsTransposed"),
    ("Chorus", "chorus_chordsTransposed"),
    ("Post-Chorus", "post_chorus_chordsTransposed"),
    ("Bridge", "bridge_chordsTransposed"),
    ("Outro", "outro_chordsTransposed"),
])

# =========================
# 3) Pitch class mapping
# =========================
# Note name -> pitch class (semitones above C, 0-11). Enharmonic spellings share
# a value. The rarer spellings Cb / Fb / E# / B# are listed too, so a chord or
# key written that way is converted instead of being dropped by an `in PC` test.
PC = {
    "C":0,"B#":0,
    "C#":1,"Db":1,
    "D":2,
    "D#":3,"Eb":3,
    "E":4,"Fb":4,
    "F":5,"E#":5,
    "F#":6,"Gb":6,
    "G":7,
    "G#":8,"Ab":8,
    "A":9,
    "A#":10,"Bb":10,
    "B":11,"Cb":11
}

# Scale-degree labels, indexed in parallel with the two scale tables below.
ROMANS = ["I","II","III","IV","V","VI","VII"]
# Semitone offsets of the seven scale degrees above the tonic.
MAJOR_SCALE = [0,2,4,5,7,9,11]
MINOR_SCALE = [0,2,3,5,7,8,10]

# =========================
# 4) Utility functions
# =========================
def split_chords(chord_string):
    """Tokenise one raw chord cell into a list of chord symbols.

    Missing values and the literal text "NONE" (any case, surrounding
    whitespace ignored) yield an empty list. Commas, pipes, semicolons,
    hyphens and whitespace are all treated as separators.
    """
    if pd.isna(chord_string):
        return []
    text = str(chord_string)
    if text.strip().upper() == "NONE":
        return []
    # Turn every explicit separator into a space, then normalise whitespace.
    spaced = re.sub(r"[,\|;]", " ", text).replace("-", " ")
    return re.sub(r"\s+", " ", spaced).strip().split()

def parse_chord(sym):
    """Split a chord symbol into (root, quality).

    Parameters
    ----------
    sym : str
        Chord symbol such as "C", "F#m7", "Bbmaj7" or "G/B".

    Returns
    -------
    (root, quality)
        ``root`` is the letter plus an optional single "#"/"b", or ``None``
        when the symbol does not start with a note letter A-G. ``quality`` is
        "min" or "maj"; everything that is not recognisably minor is "maj".
    """
    m = re.match(r"^([A-G])([#b]?)(.*)$", sym)
    if not m:
        return None, "maj"
    root = m.group(1) + m.group(2)
    suffix = m.group(3)
    folded = suffix.lower()
    if folded.startswith("maj"):
        quality = "maj"
    elif folded.startswith("min"):
        quality = "min"
    elif suffix.startswith("m"):
        # Only a lower-case "m" means minor: an upper-case "M" (as in "CM7")
        # denotes a major chord, so the case must be inspected before folding.
        quality = "min"
    else:
        quality = "maj"
    return root, quality

def interval_to_roman(interval, mode, quality):
    """Label a root interval (semitones above the tonic) with a Roman numeral.

    The interval is snapped to the closest degree of the major scale when
    ``mode == "major"`` and of the minor scale otherwise. Distance is the plain
    absolute difference in semitones; on a tie the lower degree wins. Chromatic
    roots therefore come back as a neighbouring diatonic degree with no
    accidental. The numeral is lower-cased when ``quality == "min"``.
    """
    degrees = MINOR_SCALE if mode != "major" else MAJOR_SCALE
    nearest = min(
        range(len(degrees)),
        key=lambda k: (abs(interval - degrees[k]), k),
    )
    label = ROMANS[nearest]
    return label.lower() if quality == "min" else label

def convert_to_roman(chords, key_root, key_mode):
    """Translate chord symbols into Roman numerals relative to a key.

    Returns an empty list when ``key_root`` is not a known note name. Chords
    whose root cannot be parsed or is not in ``PC`` are left out, so the result
    may be shorter than ``chords``.
    """
    tonic_pc = PC.get(key_root)
    if tonic_pc is None:
        return []
    parsed = (parse_chord(symbol) for symbol in chords)
    return [
        interval_to_roman((PC[chord_root] - tonic_pc) % 12, key_mode, chord_quality)
        for chord_root, chord_quality in parsed
        if chord_root in PC
    ]

# =========================
# 5) Per-song key inference
#    Priority: Chorus -> Intro -> any other section
#    All sections of a song share the same key.
# =========================
# Columns consulted for key inference, most trusted first: the chorus, then the
# intro, then the remaining sections as a fallback.
KEY_PRIORITY = [
    "chorus_chordsTransposed",
    "intro_chordsTransposed",
] + [
    "verse_chordsTransposed",
    "pre_chorus_chordsTransposed",
    "post_chorus_chordsTransposed",
    "bridge_chordsTransposed",
    "outro_chordsTransposed",
]

def infer_song_key(row):
    """Guess a song's key from the first chord of its highest-priority section.

    Sections are tried in ``KEY_PRIORITY`` order; columns missing from ``row``
    and sections with no chords are skipped. The first chord whose root is a
    known note name decides the result: its root becomes the key and a minor
    chord means "minor", anything else "major". Returns ``(None, None)`` when
    no section qualifies.
    """
    available = (col for col in KEY_PRIORITY if col in row.index)
    for col in available:
        tokens = split_chords(row[col])
        if tokens:
            root, quality = parse_chord(tokens[0])
            if root and root in PC:
                return root, ("minor" if quality == "min" else "major")
    return None, None

# =========================
# 6) Build long-format dataset (all songs, all sections)
# =========================
rows = []

# Column order of the long-format table. Passing it to the DataFrame
# constructor keeps the schema stable even when no song produces a row, so
# later column lookups do not fail on an empty frame.
LONG_COLUMNS = [
    "song_title",
    "section",
    "artist",
    "key",
    "key_mode",
    "roman_progression",
    "chord_progression_raw",
]

# Which section columns exist is a property of the frame, not of a song, so it
# is resolved once instead of once per row.
available_sections = [
    (section_name, col)
    for section_name, col in section_cols.items()
    if col in df.columns
]

for _, r in df.iterrows():
    # One key per song; every section of the song is analysed against it.
    song_key_root, song_key_mode = infer_song_key(r)
    if song_key_root is None:
        continue  # no section offered a usable first chord

    for section_name, col in available_sections:
        chords = split_chords(r[col])
        if not chords:
            continue

        roman_prog = convert_to_roman(chords, song_key_root, song_key_mode)
        if not roman_prog:
            continue  # none of the chord roots were recognised

        rows.append({
            "song_title": r["song_title"],
            "section": section_name,
            "artist": "Taylor Swift",
            "key": song_key_root,
            "key_mode": song_key_mode,
            "roman_progression": roman_prog,
            "chord_progression_raw": chords
        })

long_df = pd.DataFrame(rows, columns=LONG_COLUMNS)

# =========================
# 7) Assign snippet IDs and export
# =========================
# Sequential, 1-based identifiers: TS_001, TS_002, ...
long_df["snippet_id"] = ["TS_{:03d}".format(n) for n in range(1, len(long_df) + 1)]

# The list-valued columns are written as their Python repr; the later cells
# turn that text back into lists with parse_roman_list.
for list_col in ("roman_progression", "chord_progression_raw"):
    long_df[list_col] = long_df[list_col].apply(str)

export_columns = [
    "snippet_id",
    "song_title",
    "section",
    "artist",
    "key",
    "key_mode",
    "roman_progression",
    "chord_progression_raw"
]
out_df = long_df[export_columns]

out_df.to_csv("processed/toy_smashchords_tswift.csv", index=False)

print(f"Dataset created: {len(out_df)} snippets from {long_df['song_title'].nunique()} songs")
print(out_df.head())


In [None]:
import pandas as pd
import ast
from collections import Counter, defaultdict

# =========================
# 1) Load dataset
# =========================
# Snippet table exported by the first cell (same relative path as its to_csv).
# List columns arrive as text and are parsed back below.
toy = pd.read_csv("processed/toy_smashchords_tswift.csv")

def parse_roman_list(s):
    """Parse a serialised Roman-numeral progression into a list of strings.

    Accepted spellings:
      * a Python list/tuple repr with either quote style,
        e.g. "['I', 'V']" or '["I", "V"]';
      * a bare bracketed list without quotes, e.g. "[I, V, vi]";
      * anything else is split on commas as a best effort.

    Missing values give ``[]``. The result is always a list of strings with no
    surrounding whitespace or quote characters.
    """
    if pd.isna(s):
        return []
    text = str(s).strip()

    def _split_bare(raw):
        # Comma split that also discards stray quotes left by a malformed repr.
        pieces = (p.strip().strip("'\"").strip() for p in raw.strip("[]").split(","))
        return [p for p in pieces if p]

    # No quotes of either kind: the text is not a Python literal, so it must be
    # tokenised by hand. Checking only one quote style would send
    # double-quoted lists here and leave the quotes inside the tokens.
    if text.startswith("[") and "'" not in text and '"' not in text:
        return _split_bare(text)
    try:
        value = ast.literal_eval(text)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return _split_bare(text)
    if isinstance(value, (list, tuple)):
        return [str(item).strip() for item in value]
    # A scalar literal is not a progression; fall back to plain tokenising so
    # callers can always rely on getting a list.
    return _split_bare(text)

toy["roman_list"] = toy["roman_progression"].apply(parse_roman_list)

# =========================
# 2) Count bigram transitions
# =========================
# transition_counts[(a, b)] : how often chord b directly follows chord a
# state_counts[a]           : how often a appears as the source of a transition
transition_counts = Counter()
state_counts = Counter()

for prog in toy["roman_list"]:
    if prog and len(prog) >= 2:
        transition_counts.update(zip(prog, prog[1:]))
        state_counts.update(prog[:-1])

# Every symbol seen on either side of a transition, in sorted order.
states = sorted({symbol for pair in transition_counts for symbol in pair})

# =========================
# 3) Transition probability matrix (Laplace smoothing)
# =========================
# Add-alpha smoothing: every (a, b) pair gets `alpha` pseudo-counts, so pairs
# that were never observed still receive a non-zero probability.
alpha = 1.0
V = len(states)

prob = defaultdict(dict)
for a in states:
    denom = state_counts[a] + alpha * V
    prob[a] = {
        b: ((transition_counts.get((a, b), 0) + alpha) / denom if denom > 0 else 1.0 / V)
        for b in states
    }

# =========================
# 4) Export counts & probs as CSV matrices
# =========================
# Square state-by-state tables: rows are the source chord, columns the target.
counts_mat = pd.DataFrame(0, index=states, columns=states, dtype=int)
for (a, b), c in transition_counts.items():
    counts_mat.at[a, b] = c

probs_mat = pd.DataFrame(0.0, index=states, columns=states, dtype=float)
for a, row_probs in prob.items():
    for b, p in row_probs.items():
        probs_mat.at[a, b] = p

counts_mat.to_csv("analysis/transition_counts.csv")
probs_mat.to_csv("analysis/transition_probs.csv")

print("Saved: analysis/transition_counts.csv, analysis/transition_probs.csv")
print("Num states:", V)
print("Top transitions:", transition_counts.most_common(10))

# =========================
# 5) Markov transition score helper
# =========================
def markov_score(from_prog, to_prog):
    """Probability of moving from the last chord of one progression to the first of another.

    Returns ``None`` when either progression is empty. When either chord is
    absent from the transition table the uniform value ``1 / V`` is returned
    instead of a table lookup.
    """
    if not (from_prog and to_prog):
        return None
    src, dst = from_prog[-1], to_prog[0]
    known = src in prob and dst in prob[src]
    return prob[src][dst] if known else 1.0 / V

# =========================
# 6) Demo: score a pair of snippets
# =========================
# Score the hand-off from the first snippet in the table to the second one.
A = toy["roman_list"].iloc[0]
B = toy["roman_list"].iloc[1]
print("Example A:", A)
print("Example B:", B)
print("Markov score P(B0|A_last) =", markov_score(A, B))


In [None]:
import pandas as pd
import ast
import math
from collections import defaultdict

# =========================
# 0) Load dataset
# =========================
# Snippet table exported by the first cell (same relative path as its to_csv).
# Reloaded here so this cell does not depend on frames built by earlier cells.
toy = pd.read_csv("processed/toy_smashchords_tswift.csv")

def parse_roman_list(s):
    """Turn the text form of a progression back into a list.

    Missing values give ``[]``. Bracketed text containing no quote characters
    is split on commas. Everything else is evaluated as a Python literal, with
    the same comma split as a fallback when evaluation fails.
    """
    if pd.isna(s):
        return []
    text = str(s).strip()

    def split_bare(raw):
        tokens = []
        for piece in raw.strip("[]").split(","):
            piece = piece.strip()
            if piece:
                tokens.append(piece)
        return tokens

    has_quotes = "'" in text or '"' in text
    if text.startswith("[") and not has_quotes:
        return split_bare(text)
    try:
        return ast.literal_eval(text)
    except Exception:
        return split_bare(text)

toy["roman_list"] = toy["roman_progression"].apply(parse_roman_list)
# Keep only rows whose progression parsed to a non-empty list, then renumber.
has_chords = toy["roman_list"].map(lambda x: isinstance(x, list) and len(x) > 0)
toy = toy[has_chords].reset_index(drop=True)

# =========================
# 1) Normalize Roman symbols
# =========================
def norm_roman(r):
    """Reduce a Roman-numeral symbol to its bare form.

    Surrounding whitespace is trimmed and the diminished ("\u00b0") and
    augmented ("+") markers are removed, so e.g. "vii\u00b0" and "vii" compare
    equal. The value is converted with ``str`` first.
    """
    r = str(r).strip()
    # "\u00c2\u00b0" is what the degree sign turns into after a UTF-8 -> Latin-1
    # mis-decode. It is removed first so no orphaned "\u00c2" is left behind,
    # then the real degree sign and the plus sign are stripped. Escapes are used
    # so the literals cannot be corrupted by a future re-encoding of this file.
    for marker in ("\u00c2\u00b0", "\u00b0", "+"):
        r = r.replace(marker, "")
    return r

# Normalised copy of each progression (norm_roman applied symbol by symbol);
# the un-normalised "roman_list" column is kept alongside it.
toy["roman_list_norm"] = toy["roman_list"].apply(lambda xs: [norm_roman(x) for x in xs])

# =========================
# 2) Rotation-invariant (canonical) representation
# =========================
def all_rotations(seq):
    """Return every cyclic rotation of ``seq`` as a tuple, starting with ``seq`` itself.

    An empty sequence has no rotations and gives ``[]``.
    """
    rotations = []
    for start in range(len(seq)):
        rotations.append(tuple(seq[start:] + seq[:start]))
    return rotations

def canonical_rotation(seq):
    """Pick the lexicographically smallest rotation of ``seq`` as its canonical form.

    Two loops that differ only in their starting chord map to the same tuple.
    An empty input gives an empty tuple.
    """
    return min(all_rotations(list(seq))) if seq else tuple()

# One canonical rotation per snippet, so progressions that are the same loop
# started on a different chord end up with identical values in this column.
toy["canon_loop"] = toy["roman_list_norm"].apply(canonical_rotation)

def bigrams(seq):
    """Return the set of adjacent (chord, next_chord) pairs in ``seq``.

    Sequences with fewer than two items have no pairs and give an empty set.
    Repeated pairs collapse because the result is a set.
    """
    return {(seq[pos], seq[pos + 1]) for pos in range(len(seq) - 1)}

# Set of adjacent chord pairs per snippet, taken from the normalised progression.
toy["bigrams"] = toy["roman_list_norm"].apply(bigrams)

# =========================
# 3) Similarity metrics
# =========================
def jaccard(a, b):
    """Jaccard similarity |a & b| / |a | b| of two sets.

    Two empty sets count as identical (1.0). The trailing 0.0 is a defensive
    fallback: once the first check has passed, the union cannot be empty.
    """
    if not (a or b):
        return 1.0
    shared = len(a & b)
    total = len(a | b)
    if total > 0:
        return shared / total
    return 0.0

def loop_match_score(canonA, canonB):
    """Return 1.0 when both canonical loops are equal and non-empty, else 0.0."""
    if canonA == canonB and len(canonA) > 0:
        return 1.0
    return 0.0

def length_compatibility(lenA, lenB):
    """Score in (0, 1] that is 1.0 for equal lengths and falls off as 1 / (1 + difference)."""
    return 1.0 / (1.0 + abs(lenA - lenB))

# =========================
# 4) Key proximity (circle of fifths)
# =========================
# Twelve key names in circle-of-fifths order; list position is what
# circle_dist measures distance on.
CIRCLE = [
    "C", "G", "D", "A", "E", "B",
    "F#", "C#", "Ab", "Eb", "Bb", "F",
]

# Alternative spellings -> the spelling used in CIRCLE.
ENHARMONIC = {
    "Db": "C#",
    "Gb": "F#",
    "Cb": "B",
    "Fb": "E",
    "E#": "F",
    "B#": "C",
    "A#": "Bb",
    "D#": "Eb",
    "G#": "Ab",
}

def norm_key(k):
    """Return the key name in the spelling used by ``CIRCLE``.

    ``None`` and missing values give ``None``. Other values are converted to a
    trimmed string and passed through ``ENHARMONIC``; names without an entry
    there are returned unchanged.
    """
    if k is None or pd.isna(k):
        return None
    name = str(k).strip()
    if name in ENHARMONIC:
        return ENHARMONIC[name]
    return name

def circle_dist(k1, k2):
    """Number of steps between two keys around the circle of fifths (0-6).

    Returns ``None`` when either key, after normalisation, is not in ``CIRCLE``.
    Only the key name is compared; no mode is taken into account.
    """
    first, second = norm_key(k1), norm_key(k2)
    if not (first in CIRCLE and second in CIRCLE):
        return None
    steps = abs(CIRCLE.index(first) - CIRCLE.index(second))
    # The circle wraps around, so take the shorter way.
    return min(steps, len(CIRCLE) - steps)

def key_similarity(k1, k2):
    """Map circle-of-fifths distance to a similarity in [0, 1].

    Identical keys score 1.0 and keys six steps apart score 0.0. When the
    distance cannot be computed, the neutral value 0.5 is returned.
    """
    steps = circle_dist(k1, k2)
    return 0.5 if steps is None else 1.0 - steps / 6.0

# =========================
# 5) Compatibility score
# =========================
def compatibility_score(A, B, w_loop=0.45, w_bigram=0.45, w_len=0.10, w_key=0.00):
    """Weighted sum of four similarity terms between two snippet rows.

    ``A`` and ``B`` must provide "canon_loop", "bigrams", "roman_list_norm" and
    "key". The key term is only evaluated when ``w_key`` is positive. The
    weights are used as given and are not re-normalised.
    """
    loop_term = w_loop * loop_match_score(A["canon_loop"], B["canon_loop"])
    bigram_term = w_bigram * jaccard(A["bigrams"], B["bigrams"])
    length_term = w_len * length_compatibility(
        len(A["roman_list_norm"]), len(B["roman_list_norm"])
    )
    if w_key > 0:
        key_term = w_key * key_similarity(A["key"], B["key"])
    else:
        key_term = w_key * 0.0
    return loop_term + bigram_term + length_term + key_term

# =========================
# 6) Top similar pairs
# =========================
pairs = []
n = len(toy)
REQUIRE_DIFFERENT_SONGS = True
W_KEY = 0.05

# Fixed schema for the pair table, so an empty result (fewer than two songs)
# still has a "score" column to sort on instead of raising KeyError.
PAIR_COLUMNS = [
    "from_snippet", "from_song", "from_section", "from_key", "from_prog",
    "to_snippet", "to_song", "to_section", "to_key", "to_prog",
    "score",
]

# Materialise each row once; calling toy.iloc[...] inside the double loop would
# build a new Series n*n times.
snippet_rows = [toy.iloc[i] for i in range(n)]

# Ordered pairs: both (A -> B) and (B -> A) are recorded. All four terms of
# compatibility_score are symmetric, so the two directions get the same score.
for i, A in enumerate(snippet_rows):
    for j, B in enumerate(snippet_rows):
        if i == j:
            continue
        if REQUIRE_DIFFERENT_SONGS and A["song_title"] == B["song_title"]:
            continue
        score = compatibility_score(A, B, w_key=W_KEY)
        pairs.append({
            "from_snippet": A["snippet_id"],
            "from_song": A["song_title"],
            "from_section": A["section"],
            "from_key": A["key"],
            "from_prog": A["roman_progression"],
            "to_snippet": B["snippet_id"],
            "to_song": B["song_title"],
            "to_section": B["section"],
            "to_key": B["key"],
            "to_prog": B["roman_progression"],
            "score": score
        })

pairs_df = pd.DataFrame(pairs, columns=PAIR_COLUMNS).sort_values("score", ascending=False)
pairs_df.head(50).to_csv("analysis/top_pairs_medley.csv", index=False)
print("Saved: analysis/top_pairs_medley.csv (top 50 pairs)")

# =========================
# 7) Medley chain via beam search
# =========================
def build_medley_chain(toy_df, K=5, beam_width=30, w_key=W_KEY, max_neighbors=40):
    """Search for a high-scoring chain of snippets, each from a different song.

    Parameters
    ----------
    toy_df : pandas.DataFrame
        Snippet table providing the columns read by ``compatibility_score``
        plus "snippet_id", "song_title", "section", "key" and
        "roman_progression".
    K : int
        Target chain length. The chain is shorter if no path can be extended.
    beam_width : int
        Number of partial chains kept after each extension step.
    w_key : float
        Key-similarity weight forwarded to ``compatibility_score``.
    max_neighbors : int
        How many best-scoring successors are kept per snippet.

    Returns
    -------
    (float, pandas.DataFrame)
        Sum of the pairwise scores along the best chain, and the chain itself
        with one row per snippet in playing order. An empty ``toy_df`` gives
        ``(0.0, <empty frame with the same columns>)``.
    """
    chain_columns = ["snippet_id", "song_title", "section", "key", "roman_progression"]
    rows = [toy_df.iloc[i] for i in range(len(toy_df))]
    if not rows:
        # Nothing to chain; avoid calling max() on an empty beam below.
        return 0.0, pd.DataFrame(columns=chain_columns)

    # Pre-compute, for every snippet, its best-scoring successors.
    neighbors = defaultdict(list)
    for i, A in enumerate(rows):
        scored = []
        for j, B in enumerate(rows):
            if i == j:
                continue
            if REQUIRE_DIFFERENT_SONGS and A["song_title"] == B["song_title"]:
                continue
            scored.append((compatibility_score(A, B, w_key=w_key), j))
        scored.sort(reverse=True, key=lambda x: x[0])
        neighbors[i] = scored[:max_neighbors]

    # Every snippet is a candidate start. All seeds score 0.0, so cutting this
    # list to beam_width here would just keep the first rows of the table and
    # make the result depend on row order; pruning starts after the first
    # extension, when the scores actually differ.
    beam = [(0.0, [i], {rows[i]["song_title"]}) for i in range(len(rows))]

    for _ in range(1, K):
        new_beam = []
        for total, path, used_songs in beam:
            for s, j in neighbors[path[-1]]:
                title = rows[j]["song_title"]
                if title in used_songs:
                    continue  # each song may appear only once per chain
                new_beam.append((total + s, path + [j], used_songs | {title}))
        if not new_beam:
            break  # no chain can be extended further
        new_beam.sort(reverse=True, key=lambda x: x[0])
        beam = new_beam[:beam_width]

    best_total, best_path, _ = max(beam, key=lambda x: x[0])
    chain_rows = [
        {column: rows[idx][column] for column in chain_columns}
        for idx in best_path
    ]
    return best_total, pd.DataFrame(chain_rows, columns=chain_columns)

# Run the search for a five-snippet medley and persist the winning chain.
best_score, best_chain = build_medley_chain(
    toy_df=toy,
    K=5,
    beam_width=40,
    w_key=W_KEY,
)
best_chain.to_csv("analysis/best_medley_k5.csv", index=False)

print("Saved: analysis/best_medley_k5.csv")
print("Best chain total score:", round(best_score, 4))
print(best_chain)
