# Spotify ‚Üí SoundCloud Playlist Matching Prototype

**Goal**: Match Spotify Release Radar tracks to SoundCloud using multi-factor scoring.

**Strategy**:
1. Fetch Spotify playlist tracks (title, artist, album, duration)
2. Search SoundCloud for each track
3. Score candidates using TF-IDF + duration matching
4. Generate CSV report for manual review
5. Tune weights and iterate

## 1. Setup - Initialize Providers

In [114]:
%load_ext autoreload
%autoreload 2
import sys

sys.path.insert(0, "/home/kevin/coding/music-minion-cli/src")

from music_minion.domain.library.providers import spotify, soundcloud
from music_minion.domain.library.provider import ProviderConfig
from music_minion.domain.library.deduplication import normalize_string

from datetime import datetime, timedelta
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from dataclasses import dataclass
from typing import Optional
import time
import csv
from pathlib import Path
import requests
import re


API_BASE = "https://api.spotify.com/v1"

# Initialize Spotify provider
print("Initializing Spotify provider...")
spotify_config = ProviderConfig(name="spotify", enabled=True)
spotify_state = spotify.init_provider(spotify_config)
print(f"‚úì Spotify authenticated: {spotify_state.authenticated}")

if not spotify_state.authenticated:
    raise Exception("Not authenticated with Spotify. Run in CLI: library auth spotify")

# Initialize SoundCloud provider
print("\nInitializing SoundCloud provider...")
soundcloud_config = ProviderConfig(name="soundcloud", enabled=True)
soundcloud_state = soundcloud.init_provider(soundcloud_config)
print(f"‚úì SoundCloud authenticated: {soundcloud_state.authenticated}")

if not soundcloud_state.authenticated:
    raise Exception(
        "Not authenticated with SoundCloud. Run in CLI: library auth soundcloud"
    )

[32m2025-11-22 14:45:47.143[0m | [34m[1mDEBUG   [0m | [36mmusic_minion.domain.library.providers.spotify[0m:[36minit_provider[0m:[36m47[0m - [34m[1mInitializing Spotify provider[0m
[32m2025-11-22 14:45:47.146[0m | [34m[1mDEBUG   [0m | [36mmusic_minion.domain.library.providers.spotify[0m:[36minit_provider[0m:[36m66[0m - [34m[1mFound Spotify tokens in database[0m
[32m2025-11-22 14:45:47.146[0m | [1mINFO    [0m | [36mmusic_minion.domain.library.providers.spotify[0m:[36minit_provider[0m:[36m70[0m - [1mSpotify token expired, attempting refresh[0m


[32m2025-11-22 14:45:47.148[0m | [1mINFO    [0m | [36mmusic_minion.core.config[0m:[36mget_config_path[0m:[36m224[0m - [1mUsing project config: /home/kevin/coding/music-minion-cli/config.toml[0m
[32m2025-11-22 14:45:47.269[0m | [34m[1mDEBUG   [0m | [36mmusic_minion.domain.library.providers.spotify.auth[0m:[36m_save_user_tokens[0m:[36m336[0m - [34m[1mSaved Spotify tokens to /home/kevin/.local/share/music-minion/spotify/user_tokens.json[0m
[32m2025-11-22 14:45:47.270[0m | [1mINFO    [0m | [36mmusic_minion.domain.library.providers.spotify.auth[0m:[36mrefresh_token[0m:[36m402[0m - [1mSpotify token refreshed successfully, expires: 2025-11-22 15:45:47.269496[0m
[32m2025-11-22 14:45:47.304[0m | [1mINFO    [0m | [36mmusic_minion.domain.library.providers.spotify[0m:[36minit_provider[0m:[36m78[0m - [1mSpotify token refreshed successfully[0m
[32m2025-11-22 14:45:47.306[0m | [1mINFO    [0m | [36mmusic_minion.core.config[0m:[36mget_config_path

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Initializing Spotify provider...
‚úì Spotify authenticated: True

Initializing SoundCloud provider...
‚úì SoundCloud authenticated: True


## 2. Fetch Spotify Playlist Tracks

In [115]:
def search(state, q):
    """Fetch tracks for specific playlist.

    Args:
        state: Provider state
        playlist_id: Spotify playlist ID or URN (spotify:playlists:{id})

    Returns:
        (updated_state, track_list)
    """
    state, token = spotify._ensure_valid_token(state)
    if not token:
        return state, []

    # Handle URN format: spotify:playlists:{id}

    tracks = []
    url = f"{API_BASE}/search"
    params = {"q": "playlist:release radar", "type": ["playlist"]}
    headers = {"Authorization": f"Bearer {token['access_token']}"}

    try:
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        print(f"Playlist data: {data}")

        for item in data["tracks"]["items"]:
            if not item.get("track"):  # Skip local files and removed tracks
                continue
            if item["track"].get("is_local"):  # Skip local files
                continue

            track_id = item["track"]["id"]
            metadata = spotify._normalize_spotify_track(item["track"])
            tracks.append((track_id, metadata))

        url = data.get("next")
        params = {}

        return state, tracks

    except Exception:
        print("Error fetching playlist tracks:", sys.exc_info()[1])
        return state, []


search(spotify_state, "")


Playlist data: {'playlists': {'href': 'https://api.spotify.com/v1/search?offset=0&limit=20&query=playlist%3Arelease%20radar&type=playlist', 'limit': 20, 'next': 'https://api.spotify.com/v1/search?offset=20&limit=20&query=playlist%3Arelease%20radar&type=playlist', 'offset': 0, 'previous': None, 'total': 895, 'items': [None, {'collaborative': False, 'description': 'Easily discover the latest music from all genres of EDM. Updated weekly', 'external_urls': {'spotify': 'https://open.spotify.com/playlist/7c7tO96FdmW1jwQozBIHtZ'}, 'href': 'https://api.spotify.com/v1/playlists/7c7tO96FdmW1jwQozBIHtZ', 'id': '7c7tO96FdmW1jwQozBIHtZ', 'images': [{'height': None, 'url': 'https://image-cdn-ak.spotifycdn.com/image/ab67706c0000da84db6e613aac938696e321a696', 'width': None}], 'name': 'EDM Release Radar', 'owner': {'display_name': 'Madi Paine', 'external_urls': {'spotify': 'https://open.spotify.com/user/ljjq7mz5d148fp0akm1bxexa0'}, 'href': 'https://api.spotify.com/v1/users/ljjq7mz5d148fp0akm1bxexa0', '

(ProviderState(config=ProviderConfig(name='spotify', enabled=True, cache_duration_hours=24), authenticated=True, last_sync=None, cache={'token_data': {'access_token': 'BQAmuCp4fKFdQlBw1WJ_C2fBd3j63ASD51kyVIF-5ta36RR1yO8eM_i98Vvt1pNixCg2QgWHR6fNFtdAL3NYnGYDcNjTQTpUp9-KfISrXMnhFcEQ4jOSDXBCrs27pby-lzzSHNE9-1cSV9ztJjVGc_xAb-V-Ign3ufWoWfbRB6l9X3Tb7Ui67rsxiKqnzoxccnqAi7stYyMwV49fJDy8iUr0zTWhTzJ5icr1V8YyM4iGfF8PW2Jlw1eDD9nv6G1uqjI6ePN5aBAElqtemzxCQYSDGs0inVW8WLSSMQ1TU_JGqQgW89XIM9nHs2A8c8nAstVuyRieYmxHhuBH4ZPVqdlFBl3TkIzsoQm1o-261ESBShr5Ag', 'token_type': 'Bearer', 'expires_in': 3600, 'refresh_token': 'AQAP_Laf82pnteGUua4Te8a0TbQxcnTFD4Rh5nGP9ZBlDNkiYr-qR3bYJS0b24gpA2XGEv_U76kbBzeEqIZZXRIKkwBPCk5SjbsN8HnEKqbZl2AyIt1IOImdN6wPoSYu0c_U6A', 'scope': 'playlist-read-private playlist-read-collaborative user-follow-read playlist-modify-private user-read-email user-read-private streaming user-follow-modify user-modify-playback-state user-library-read user-library-modify playlist-modify-public user-rea

In [116]:
# Your Release Radar playlist ID - get from Spotify URL
SPOTIFY_PLAYLIST_ID = "3R77UjTxG7SjGvtjxGLGD0"  # Replace with your playlist ID
# https://open.spotify.com/playlist/3R77UjTxG7SjGvtjxGLGD0?si=04d08f6e51ca4d64
print(f"Fetching Spotify playlist: {SPOTIFY_PLAYLIST_ID}")
spotify_state, track_list = spotify.get_playlist_tracks(
    spotify_state, SPOTIFY_PLAYLIST_ID
)
print(f"‚úì Fetched {len(track_list)} tracks\n")

# Convert to structured format
spotify_tracks = []
for track_id, metadata in track_list:
    print(f"Track metadata: {metadata}")
    spotify_tracks.append(
        {
            "id": track_id,
            "title": metadata["title"],
            "artist": metadata["artist"],
            "album": metadata.get("album", ""),
            "duration_ms": int(metadata["duration"] * 1000)
            if metadata.get("duration")
            else 0,
            "year": metadata.get("year"),
            "release_date": metadata.get("release_date"),
            "top_level_artist": metadata.get("top_level_artist", ""),
        }
    )

print("First 3 tracks:")
for track in spotify_tracks[:3]:
    print(
        f"  {track['artist']} - {track['title']} ({track['duration_ms'] / 1000:.1f}s) released on {track['release_date']}"
    )

Fetching Spotify playlist: 3R77UjTxG7SjGvtjxGLGD0


[32m2025-11-22 14:45:48.740[0m | [34m[1mDEBUG   [0m | [36mmusic_minion.domain.library.providers.spotify.api[0m:[36mget_playlist_tracks[0m:[36m586[0m - [34m[1mFetched 58 tracks for playlist 3R77UjTxG7SjGvtjxGLGD0[0m


Playlist data: {'collaborative': False, 'description': '', 'external_urls': {'spotify': 'https://open.spotify.com/playlist/3R77UjTxG7SjGvtjxGLGD0'}, 'followers': {'href': None, 'total': 0}, 'href': 'https://api.spotify.com/v1/playlists/3R77UjTxG7SjGvtjxGLGD0', 'id': '3R77UjTxG7SjGvtjxGLGD0', 'images': [{'height': 640, 'url': 'https://mosaic.scdn.co/640/ab67616d00001e02477215d62384a7b666d947d5ab67616d00001e029ea1ddfa69f09b864eed7d91ab67616d00001e02f452fcb1c838e5faaa314d69ab67616d00001e02fcb5a76207196c90dc5066ea', 'width': 640}, {'height': 300, 'url': 'https://mosaic.scdn.co/300/ab67616d00001e02477215d62384a7b666d947d5ab67616d00001e029ea1ddfa69f09b864eed7d91ab67616d00001e02f452fcb1c838e5faaa314d69ab67616d00001e02fcb5a76207196c90dc5066ea', 'width': 300}, {'height': 60, 'url': 'https://mosaic.scdn.co/60/ab67616d00001e02477215d62384a7b666d947d5ab67616d00001e029ea1ddfa69f09b864eed7d91ab67616d00001e02f452fcb1c838e5faaa314d69ab67616d00001e02fcb5a76207196c90dc5066ea', 'width': 60}], 'name': 'Re

## 3. Define Scoring Algorithm

In [117]:
@dataclass
class MatchCandidate:
    soundcloud_id: str
    soundcloud_title: str
    soundcloud_artist: str
    soundcloud_duration: float
    title_similarity: float
    artist_similarity: float
    duration_match: float
    confidence_score: float


WEIGHTS = {"title": 0.40, "artist": 0.30, "duration": 0.30}


def calculate_text_similarity(text1: str, text2: str) -> float:
    norm1, norm2 = normalize_string(text1), normalize_string(text2)
    if not norm1 or not norm2:
        return 0.0
    try:
        vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        tfidf_matrix = vectorizer.fit_transform([norm1, norm2])
        return float(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0])
    except:
        return 0.0


def calculate_confidence_score(
    spotify_track: dict, soundcloud_track: dict
) -> MatchCandidate:
    title_sim = calculate_text_similarity(
        spotify_track["title"],
        f"{soundcloud_track['artist']} - {soundcloud_track['title']}",
    )
    artist_sim = calculate_text_similarity(
        spotify_track["top_level_artist"], soundcloud_track["artist"]
    )

    spotify_duration = spotify_track["duration_ms"] / 1000.0
    sc_duration = soundcloud_track["duration"]
    duration_diff = abs(spotify_duration - sc_duration)
    duration_score = (
        1.0 if duration_diff <= 3 else (0.5 if duration_diff <= 10 else 0.0)
    )

    confidence = (
        title_sim * WEIGHTS["title"]
        + artist_sim * WEIGHTS["artist"]
        + duration_score * WEIGHTS["duration"]
    )

    return MatchCandidate(
        soundcloud_id=str(soundcloud_track["id"]),
        soundcloud_title=soundcloud_track["title"],
        soundcloud_artist=soundcloud_track["artist"],
        soundcloud_duration=sc_duration,
        title_similarity=title_sim,
        artist_similarity=artist_sim,
        duration_match=duration_score,
        confidence_score=confidence,
    )


print(f"‚úì Scoring algorithm defined (weights: {WEIGHTS})")

‚úì Scoring algorithm defined (weights: {'title': 0.4, 'artist': 0.3, 'duration': 0.3})


---

## ‚ö° QUICK START - Run Cells in This Order:

To test the improved matching approach:

1. ‚úÖ **Initialize** (cells 1-2): Run once at start
2. ‚úÖ **Fetch playlist** (cell 5): Load your Spotify tracks
3. üÜï **Define improved functions** (2 cells below): Ensemble scoring + multi-query
4. üÜï **Run improved matching** (1 cell): Creates `improved_match_results`
5. üÜï **View comparison** (1 cell): Summary and analysis

**Scroll down to find the cells with these names:**
- `from rapidfuzz import fuzz, distance` ‚Üê Run this first
- `def generate_query_variants` ‚Üê Run this second
- `improved_match_results = []` ‚Üê Run this third (THE MATCHING LOOP)
- `# Get scores from improved results` ‚Üê Run this last (SUMMARY)

---

## 6. Next Steps

### To Test the Improved Approach:

1. **Run cells 1-2**: Initialize providers
2. **Run cell 5**: Fetch your Spotify playlist (58 tracks)
3. **Run the new cells** (3.1 and 4): Define improved functions and run matching
4. **Review results** in cell 5 comparison

### Key Improvements:

- **Ensemble Scoring**: Combines 4 different metrics instead of just TF-IDF
- **Substring Penalty**: Prevents false positives like "By Your Side" ‚Üí "Wake Up By Your Side"
- **Adaptive Duration**: Different tolerances for remixes (¬±20s), extended (¬±60s), originals (¬±10s)
- **Multi-Query**: Tries 3 different search formats per track
- **Better Artist Handling**: More lenient matching for collaborators like "TVBOO" vs "TVBOO, AHEE"

### Tuning Parameters:

```python
# In ensemble_text_similarity():
method="balanced"  # Options: "balanced", "strict" (fewer false positives), "lenient" (more matches)

# In IMPROVED_WEIGHTS:
"title": 0.50     # Increase if titles are reliable
"artist": 0.25    # Decrease if artist fields are noisy
"duration": 0.25  # Increase if durations are accurate

# In matching cell:
MIN_CONFIDENCE_THRESHOLD = 0.60  # Lower to catch more matches, raise to be more selective
```

### If Results Are Good:

Port these functions to `/src/music_minion/domain/library/deduplication.py` to use in production.

In [118]:
# --- 4. MULTI-QUERY STRATEGY ---


def generate_query_variants(spotify_track: dict) -> list[tuple[str, str]]:
    """
    Generate multiple search query formats to handle different SoundCloud title patterns.

    Returns:
        List of (query_string, query_description) tuples
    """
    title = spotify_track["title"]
    artist = spotify_track["top_level_artist"]

    # Clean title: remove common noise for fallback queries
    clean_title = re.sub(
        r"\s*[\(\[]?(feat\.?|ft\.?|featuring)[^\)\]]*[\)\]]?", "", title, flags=re.I
    )
    clean_title = re.sub(
        r"\s*[\(\[]?(remix|vip|extended|edit)[^\)\]]*[\)\]]?",
        "",
        clean_title,
        flags=re.I,
    )
    clean_title = clean_title.strip()

    variants = [
        # Primary query (current approach)
        (f"{title} - {artist}", "standard"),
        # Reverse order (some SoundCloud tracks have "Title Artist" format)
        (f"{title} {artist}", "no separator"),
        # Artist first (for artist-heavy tracks)
        (f"{artist} {title}", "artist first"),
        # Title only (fallback when artist field is too noisy)
        (title, "title only"),
    ]

    # Add clean title variant if different from original
    if clean_title and clean_title != title:
        variants.append((f"{clean_title} {artist}", "clean title"))

    return variants


def search_with_multi_query(
    spotify_track: dict, soundcloud_state, max_variants: int = 3, verbose: bool = False
) -> tuple[list, str]:
    """
    Try multiple query formats and return combined unique results.

    Returns:
        (combined_results, best_query_format)
    """
    variants = generate_query_variants(spotify_track)[:max_variants]

    all_results = {}  # Dict to deduplicate by soundcloud_id
    best_query = variants[0][0]  # Default

    for query, query_type in variants:
        if verbose:
            print(f"    Trying: {query} ({query_type})")

        _, sc_results = soundcloud.search(soundcloud_state, query)

        if sc_results:
            # Track which query found the most results
            if len(sc_results) > len(all_results):
                best_query = query

            # Merge results (deduplicate by ID)
            for sc_id, meta in sc_results:
                if sc_id not in all_results:
                    all_results[sc_id] = meta

        time.sleep(0.1)  # Rate limiting

    # Convert back to list format
    combined = [(sc_id, meta) for sc_id, meta in all_results.items()]

    if verbose and len(all_results) > 0:
        print(
            f"    ‚Üí Found {len(all_results)} unique candidates across {len(variants)} queries"
        )

    return combined, best_query


print(f"‚úì Multi-query strategy defined")
print(f"  Functions: generate_query_variants, search_with_multi_query")

‚úì Multi-query strategy defined
  Functions: generate_query_variants, search_with_multi_query


In [None]:
from rapidfuzz import fuzz, distance
import re

# --- 1. ENSEMBLE TEXT SIMILARITY ---


def calculate_token_subset_penalty(norm1: str, norm2: str) -> float:
    """
    Penalize when one string's tokens are a small subset of another's.

    Prevents: "with your love" matching "marmalade for your love with tommy villiers"
    The tokens overlap but it's clearly a different song.
    """
    if not norm1 or not norm2:
        return 0.0

    tokens1 = set(norm1.split())
    tokens2 = set(norm2.split())

    if not tokens1 or not tokens2:
        return 0.0

    # Check overlap ratios
    intersection = tokens1 & tokens2
    union = tokens1 | tokens2

    overlap_ratio1 = len(intersection) / len(tokens1)  # What % of query tokens found?
    overlap_ratio2 = len(intersection) / len(
        tokens2
    )  # What % of candidate tokens match?
    jaccard = len(intersection) / len(union)  # Overall overlap

    # If ALL query tokens are found but they're a small part of candidate
    # Example: "with your love" (3 tokens) vs "flourish tommy villiers marmalade for your love with tommy villiers" (10 tokens)
    # overlap_ratio1 = 1.0 (all 3 found), overlap_ratio2 = 0.3 (only 3/10 match)
    if overlap_ratio1 >= 0.9 and overlap_ratio2 < 0.5:
        # Query is fully contained but represents small part of candidate
        # This is likely a false positive unless titles are very similar
        return 0.2  # Heavy penalty

    # If only some tokens match and they're scattered
    if jaccard < 0.4:  # Less than 40% overlap
        return 0.5  # Moderate penalty

    # If most tokens match on both sides, it's probably good
    if overlap_ratio1 >= 0.8 and overlap_ratio2 >= 0.6:
        return 1.0  # No penalty

    # Gradual penalty based on jaccard similarity
    return 0.5 + (jaccard * 0.5)  # Range: 0.5 to 1.0


def ensemble_text_similarity(text1: str, text2: str, method: str = "balanced") -> float:
    """
    Combines multiple similarity metrics for robust matching.

    Args:
        text1, text2: Strings to compare
        method: "balanced" (default), "strict" (avoid false positives), "lenient" (catch more variations)

    Returns:
        Similarity score 0.0-1.0
    """
    norm1, norm2 = normalize_string(text1), normalize_string(text2)
    if not norm1 or not norm2:
        return 0.0

    # Metric 1: TF-IDF (existing approach - good for semantic similarity)
    try:
        vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        tfidf_matrix = vectorizer.fit_transform([norm1, norm2])
        tfidf_score = float(
            cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
        )
    except:
        tfidf_score = 0.0

    # Metric 2: RapidFuzz token_set_ratio (handles collaborators, word order)
    # "TVBOO" vs "TVBOO, AHEE" ‚Üí high score
    fuzzy_score = fuzz.token_set_ratio(norm1, norm2) / 100.0

    # Metric 3: Jaro-Winkler (good for typos, abbreviations, prefix matching)
    jaro_score = distance.JaroWinkler.similarity(norm1, norm2)

    # Metric 4: Character substring penalty (avoid false positives)
    substring_penalty = calculate_substring_penalty(norm1, norm2)

    # Metric 5: Token subset penalty (NEW - catches "with your love" cases)
    token_penalty = calculate_token_subset_penalty(norm1, norm2)

    if method == "strict":
        # Require all metrics to agree - avoid false positives
        weights = {
            "tfidf": 0.20,
            "fuzzy": 0.15,
            "jaro": 0.15,
            "substring": 0.25,
            "token": 0.25,
        }
        score = (
            tfidf_score * weights["tfidf"]
            + fuzzy_score * weights["fuzzy"]
            + jaro_score * weights["jaro"]
            + substring_penalty * weights["substring"]
            + token_penalty * weights["token"]
        )
    elif method == "lenient":
        # Prioritize fuzzy matching - catch more variations
        weights = {
            "tfidf": 0.25,
            "fuzzy": 0.35,
            "jaro": 0.20,
            "substring": 0.10,
            "token": 0.10,
        }
        score = (
            tfidf_score * weights["tfidf"]
            + fuzzy_score * weights["fuzzy"]
            + jaro_score * weights["jaro"]
            + substring_penalty * weights["substring"]
            + token_penalty * weights["token"]
        )
    else:  # balanced (default)
        # Balanced approach with both substring and token penalties
        weights = {
            "tfidf": 0.25,
            "fuzzy": 0.20,
            "jaro": 0.15,
            "substring": 0.20,
            "token": 0.20,
        }
        score = (
            tfidf_score * weights["tfidf"]
            + fuzzy_score * weights["fuzzy"]
            + jaro_score * weights["jaro"]
            + substring_penalty * weights["substring"]
            + token_penalty * weights["token"]
        )

    return score


def calculate_substring_penalty(norm1: str, norm2: str) -> float:
    """
    STRICTER penalty for character substring matches.

    Prevents: "By Your Side" matching "Wake Up By Your Side"
    Allows: "Selector 2025 Remake" matching "Selector (2025 Remake)"
    """
    if norm1 == norm2:
        return 1.0  # Perfect match

    # Check if either is a substring
    if norm1 in norm2:
        position = norm2.index(norm1)
        length_ratio = len(norm1) / len(norm2)

        # STRICTER: Reject if query is small portion OR not at start
        if length_ratio < 0.7:  # Increased from 0.6
            return 0.0  # Changed from 0.3 - REJECT small substrings
        elif position > 0:
            return 0.0  # Changed from 0.5 - REJECT non-prefix substrings
        else:
            # At start and substantial portion
            return 0.85 * length_ratio  # Slight penalty even for prefix

    elif norm2 in norm1:
        # Symmetric check
        position = norm1.index(norm2)
        length_ratio = len(norm2) / len(norm1)

        if length_ratio < 0.7:
            return 0.0
        elif position > 0:
            return 0.0
        else:
            return 0.85 * length_ratio

    return 1.0  # Not a substring - no penalty


# --- 2. ADAPTIVE DURATION SCORING ---


def calculate_duration_score_adaptive(
    spotify_track: dict, soundcloud_track: dict
) -> float:
    """
    Adaptive duration matching with different tolerances for different track types.

    Extended mixes can be 30-60s longer
    Remixes/VIPs have 10-20s variation
    Original tracks should be within 3-10s
    """
    sp_duration = spotify_track["duration_ms"] / 1000.0
    sc_duration = soundcloud_track["duration"]
    diff = abs(sp_duration - sc_duration)

    # Check track type from both Spotify and SoundCloud titles
    combined_title = f"{spotify_track['title']} {soundcloud_track['title']}".lower()

    is_extended = "extended" in combined_title or "ext" in combined_title
    is_remix = "remix" in combined_title or "rework" in combined_title
    is_vip = "vip" in combined_title
    is_edit = "edit" in combined_title or "radio edit" in combined_title

    if is_extended:
        # Extended versions: very lenient (up to 60s difference)
        return 1.0 if diff <= 30 else (0.7 if diff <= 60 else 0.3)
    elif is_remix or is_vip:
        # Remixes/VIPs: moderate tolerance (up to 20s)
        return 1.0 if diff <= 10 else (0.6 if diff <= 20 else 0.2)
    elif is_edit:
        # Radio edits: tighter tolerance (within 15s)
        return 1.0 if diff <= 5 else (0.7 if diff <= 15 else 0.3)
    else:
        # Original tracks: strict tolerance (within 10s)
        return (
            1.0 if diff <= 1 else (0.7 if diff <= 3 else (0.4 if diff <= 10 else 0.0))
        )


# --- 3. IMPROVED SCORING WEIGHTS ---

# ADJUSTED: Reduced duration weight even more, increased title weight
IMPROVED_WEIGHTS = {
    "title": 0.65,  # Increased from 0.50 (title is most reliable)
    "artist": 0.25,  # Same (often has extra collaborators)
    "duration": 0.10,  # Decreased from 0.25 (causing too many false positives)
}


def calculate_confidence_score_improved(
    spotify_track: dict, soundcloud_track: dict, title_method: str = "balanced"
) -> MatchCandidate:
    """
    Improved confidence scoring with ensemble similarity and adaptive duration.

    Args:
        spotify_track: Spotify track metadata
        soundcloud_track: SoundCloud track metadata
        title_method: "balanced", "strict", or "lenient"
    """
    # Title similarity: Compare Spotify title with SoundCloud's full title
    # (SoundCloud often has "Artist - Title" or "Artist, Artist - Title" format)
    title_sim = ensemble_text_similarity(
        spotify_track["title"],
        f"{soundcloud_track['artist']} - {soundcloud_track['title']}",
        method=title_method,
    )

    # Artist similarity: Use ensemble matching for better collaborator handling
    artist_sim = ensemble_text_similarity(
        spotify_track["top_level_artist"],
        soundcloud_track["artist"],
        method="lenient",  # More forgiving for "TVBOO" vs "TVBOO, AHEE"
    )

    # Duration: Adaptive scoring based on track type
    duration_score = calculate_duration_score_adaptive(spotify_track, soundcloud_track)

    # Weighted confidence score
    confidence = (
        title_sim * IMPROVED_WEIGHTS["title"]
        + artist_sim * IMPROVED_WEIGHTS["artist"]
        + duration_score * IMPROVED_WEIGHTS["duration"]
    )

    # STRICTER FILTER: Require minimum title similarity
    # If title similarity is too low, reject even if artist/duration match
    if title_sim < 0.70:
        confidence *= 0.6  # Heavy penalty for low title similarity

    # ADDITIONAL FILTER: Require minimum thresholds for both title AND artist
    # If BOTH are too low, reject even if duration matches
    if title_sim < 0.65 and artist_sim < 0.65:
        confidence *= 0.5  # Penalize matches where both are weak

    return MatchCandidate(
        soundcloud_id=str(soundcloud_track["id"]),
        soundcloud_title=soundcloud_track["title"],
        soundcloud_artist=soundcloud_track["artist"],
        soundcloud_duration=soundcloud_track["duration"],
        title_similarity=title_sim,
        artist_similarity=artist_sim,
        duration_match=duration_score,
        confidence_score=confidence,
    )


print(f"‚úì Improved scoring functions defined (STRICTER VERSION)")
print(f"  Weights: {IMPROVED_WEIGHTS}")
print(f"  Changes:")
print(f"    - NEW: Token subset penalty (catches 'with your love' false positives)")
print(f"    - Substring penalty: Stricter (0.0 for most cases)")
print(f"    - Duration weight: 0.25 ‚Üí 0.10 (reduced by 60%)")
print(f"    - Title weight: 0.50 ‚Üí 0.65 (increased)")
print(f"    - Title filter: < 0.70 gets 40% penalty")
print(f"    - Dual-threshold filter: both < 0.65 gets 50% penalty")

‚úì Improved scoring functions defined (STRICTER VERSION)
  Weights: {'title': 0.65, 'artist': 0.25, 'duration': 0.1}
  Changes:
    - NEW: Token subset penalty (catches 'with your love' false positives)
    - Substring penalty: Stricter (0.0 for most cases)
    - Duration weight: 0.25 ‚Üí 0.10 (reduced by 60%)
    - Title weight: 0.50 ‚Üí 0.65 (increased)
    - Title filter: < 0.70 gets 40% penalty
    - Dual-threshold filter: both < 0.65 gets 50% penalty


In [120]:
tracks = [spotify_tracks[5], spotify_tracks[31]]

In [121]:
improved_match_results = []
MIN_CANDIDATES = 3
MIN_CONFIDENCE_THRESHOLD = 0.60  # Lowered from 0.70 to catch more matches

spotify_tracks = tracks

print(f"üöÄ Running IMPROVED matching on {len(spotify_tracks)} tracks...")
print(f"   Min confidence: {MIN_CONFIDENCE_THRESHOLD}")
print(f"   Multi-query: Enabled (3 variants per track)")
print(f"   Ensemble scoring: TF-IDF + RapidFuzz + Jaro-Winkler + Substring penalty")
print()

for i, sp_track in enumerate(spotify_tracks, 1):
    print(
        f"[{i}/{len(spotify_tracks)}] {sp_track['title']} - {sp_track['top_level_artist']}"
    )

    # Multi-query search
    sc_results, best_query = search_with_multi_query(
        sp_track, soundcloud_state, max_variants=3, verbose=False
    )

    print(f"  üîé Query: {best_query}")

    if not sc_results:
        print(f"  ‚ùå No results")
        improved_match_results.append(
            {"spotify_track": sp_track, "candidates": [], "best_query": best_query}
        )
        print()
        continue

    # Parse SoundCloud results
    sc_tracks_parsed = [
        {
            "id": sc_id,
            "title": meta["title"],
            "artist": meta["artist"],
            "duration": meta["duration"],
        }
        for sc_id, meta in sc_results
    ]

    # Score all candidates with IMPROVED scoring
    candidates = [
        calculate_confidence_score_improved(sp_track, sc_track, title_method="balanced")
        for sc_track in sc_tracks_parsed
    ]

    # Sort by confidence and filter by threshold
    candidates.sort(key=lambda x: x.confidence_score, reverse=True)
    top_candidates = [
        c
        for c in candidates[:MIN_CANDIDATES]
        if c.confidence_score >= MIN_CONFIDENCE_THRESHOLD
    ]

    if top_candidates:
        best = top_candidates[0]

        # Visual feedback based on confidence tier
        if best.confidence_score >= 0.90:
            status = "‚úì"  # Excellent
        elif best.confidence_score >= 0.75:
            status = "‚úì"  # Good
        elif best.confidence_score >= 0.60:
            status = "‚ö†"  # Fair - manual review recommended
        else:
            status = "‚ùå"  # Poor

        print(
            f"  {status} {best.soundcloud_artist} - {best.soundcloud_title} ({best.confidence_score:.3f}, ({best.duration_match:.3f})"
        )
        print(
            f"     title_sim: {best.title_similarity:.3f}, artist_sim: {best.artist_similarity:.3f}, duration_match: {best.duration_match:.3f}"
        )
    else:
        # No candidates above threshold
        if candidates:
            best = candidates[0]
            print(f"  ‚ùå No good match (best score: {best.confidence_score:.3f})")
            print(f"best match details:")
            print(f"title: {best.soundcloud_title}, artist: {best.soundcloud_artist}")
            print(
                f"     title_sim: {best.title_similarity:.3f}, artist_sim: {best.artist_similarity:.3f}, duration_match: {best.duration_match:.3f}"
            )
        else:
            print(f"  ‚ùå No results")

    improved_match_results.append(
        {
            "spotify_track": sp_track,
            "candidates": top_candidates,
            "best_query": best_query,
        }
    )

    print()
    print("--------------------------------")
    print()

# Summary
matched_count = sum(1 for r in improved_match_results if r["candidates"])
print(f"\n‚úì Complete! {matched_count}/{len(improved_match_results)} matched")

üöÄ Running IMPROVED matching on 2 tracks...
   Min confidence: 0.6
   Multi-query: Enabled (3 variants per track)
   Ensemble scoring: TF-IDF + RapidFuzz + Jaro-Winkler + Substring penalty

[1/2] By Your Side - G Jones
  üîé Query: By Your Side - G Jones
  ‚ö† G - By Your Side (0.648, (0.000)
     title_sim: 0.776, artist_sim: 0.574, duration_match: 0.000

--------------------------------

[2/2] With Your Love - ILLENIUM
  üîé Query: With Your Love - ILLENIUM
  ‚ùå No good match (best score: 0.383)
best match details:
title: ILLENIUM - In Your Arms (with X Ambassadors) [Alan Walker Remix], artist: ILLENIUM
     title_sim: 0.566, artist_sim: 1.000, duration_match: 0.200

--------------------------------


‚úì Complete! 1/2 matched


In [90]:
# Get scores from improved results
improved_scores = [
    r["candidates"][0].confidence_score
    for r in improved_match_results
    if r["candidates"]
]

improved_matched = sum(1 for r in improved_match_results if r["candidates"])

print("=" * 70)
print("IMPROVED MATCHING SUMMARY")
print("=" * 70)
print(f"Total tracks: {len(improved_match_results)}")
# print(f"Matched: {improved_matched} ({improved_matched / len(improved_match_results) * 100:.1f}%)")
print()

if improved_scores:
    print(f"Confidence Scores:")
    print(f"  Average: {sum(improved_scores) / len(improved_scores):.3f}")
    print(f"  Min: {min(improved_scores):.3f}")
    print(f"  Max: {max(improved_scores):.3f}")
    print()

    print("Confidence Tiers:")
    print(f"  Excellent (‚â•0.90): {sum(1 for s in improved_scores if s >= 0.90)} tracks")
    print(
        f"  Good (0.75-0.89): {sum(1 for s in improved_scores if 0.75 <= s < 0.90)} tracks"
    )
    print(
        f"  Fair (0.60-0.74): {sum(1 for s in improved_scores if 0.60 <= s < 0.75)} tracks"
    )
    print(f"  Poor (<0.60): {sum(1 for s in improved_scores if s < 0.60)} tracks")
    print()

    # Show problematic matches
    results_sorted = sorted(
        [(r, r["candidates"][0]) for r in improved_match_results if r["candidates"]],
        key=lambda x: x[1].confidence_score,
    )

    print("‚ö†Ô∏è  Bottom 5 matches (review recommended):")
    for r, m in results_sorted[:5]:
        sp = r["spotify_track"]
        print(f"  {m.confidence_score:.3f} | {sp['artist']} - {sp['title']}")
        print(f"         ‚Üí {m.soundcloud_artist} - {m.soundcloud_title}")
        print(
            f"         (title: {m.title_similarity:.2f}, artist: {m.artist_similarity:.2f}, duration: {m.duration_match:.2f})"
        )
        print()

    print("üèÜ Top 5 matches (highest confidence):")
    for r, m in results_sorted[-5:][::-1]:
        sp = r["spotify_track"]
        print(f"  {m.confidence_score:.3f} | {sp['artist']} - {sp['title']}")
        print(f"         ‚Üí {m.soundcloud_artist} - {m.soundcloud_title}")
        print()

print("=" * 70)

# Show failed matches
failed = [r for r in improved_match_results if not r["candidates"]]
if failed:
    print()
    print(f"‚ùå Failed to match {len(failed)} tracks:")
    for r in failed:
        sp = r["spotify_track"]
        print(f"  - {sp['artist']} - {sp['title']}")

IMPROVED MATCHING SUMMARY
Total tracks: 58

Confidence Scores:
  Average: 0.750
  Min: 0.639
  Max: 0.889

Confidence Tiers:
  Excellent (‚â•0.90): 0 tracks
  Good (0.75-0.89): 25 tracks
  Fair (0.60-0.74): 27 tracks
  Poor (<0.60): 0 tracks

‚ö†Ô∏è  Bottom 5 matches (review recommended):
  0.639 | G Jones, Eprom - By Your Side
         ‚Üí Graham Jones - Chiddy Bang- By Your Side (The Kid G Remix)
         (title: 0.60, artist: 0.76, duration: 0.60)

  0.653 | ILLENIUM, Ryan Tedder - With Your Love
         ‚Üí Flourish & Tommy Villiers - Marmalade (For Your Love) [with tommy villiers]
         (title: 0.65, artist: 0.31, duration: 1.00)

  0.666 | Excision, Sullivan King, From Ashes to New - Adrenaline
         ‚Üí Excision, Sullivan King, From Ashes To New - Excision & Sullivan King - Adrenaline ft. From Ashes To New
         (title: 0.49, artist: 0.68, duration: 1.00)

  0.666 | Skrillex, Varg2‚Ñ¢, Eurohead, LOAM, swedm¬Æ, Virtual Riot - voltage (see you again)
         ‚Üí SKYFVLL

from rapidfuzz import fuzz, distance
import re

# --- 1. ENSEMBLE TEXT SIMILARITY ---

def calculate_token_subset_penalty(norm1: str, norm2: str) -> float:
    """
    Penalize when one string's tokens are a small subset of another's.
    
    Prevents: "with your love" matching "marmalade for your love with tommy villiers"
    The tokens overlap but it's clearly a different song.
    """
    if not norm1 or not norm2:
        return 0.0
    
    tokens1 = set(norm1.split())
    tokens2 = set(norm2.split())
    
    if not tokens1 or not tokens2:
        return 0.0
    
    # Check overlap ratios
    intersection = tokens1 & tokens2
    union = tokens1 | tokens2
    
    overlap_ratio1 = len(intersection) / len(tokens1)  # What % of query tokens found?
    overlap_ratio2 = len(intersection) / len(tokens2)  # What % of candidate tokens match?
    jaccard = len(intersection) / len(union)  # Overall overlap
    
    # If ALL query tokens are found but they're a small part of candidate
    # Example: "with your love" (3 tokens) vs "flourish tommy villiers marmalade for your love with tommy villiers" (10 tokens)
    # overlap_ratio1 = 1.0 (all 3 found), overlap_ratio2 = 0.3 (only 3/10 match)
    if overlap_ratio1 >= 0.9 and overlap_ratio2 < 0.5:
        # Query is fully contained but represents small part of candidate
        # This is likely a false positive unless titles are very similar
        return 0.2  # Heavy penalty
    
    # If only some tokens match and they're scattered
    if jaccard < 0.4:  # Less than 40% overlap
        return 0.5  # Moderate penalty
    
    # If most tokens match on both sides, it's probably good
    if overlap_ratio1 >= 0.8 and overlap_ratio2 >= 0.6:
        return 1.0  # No penalty
    
    # Gradual penalty based on jaccard similarity
    return 0.5 + (jaccard * 0.5)  # Range: 0.5 to 1.0


def ensemble_text_similarity(text1: str, text2: str, method: str = "balanced") -> float:
    """
    Combines multiple similarity metrics for robust matching.
    
    Args:
        text1, text2: Strings to compare
        method: "balanced" (default), "strict" (avoid false positives), "lenient" (catch more variations)
    
    Returns:
        Similarity score 0.0-1.0
    """
    norm1, norm2 = normalize_string(text1), normalize_string(text2)
    if not norm1 or not norm2:
        return 0.0
    
    # Metric 1: TF-IDF (existing approach - good for semantic similarity)
    try:
        vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        tfidf_matrix = vectorizer.fit_transform([norm1, norm2])
        tfidf_score = float(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0])
    except:
        tfidf_score = 0.0
    
    # Metric 2: RapidFuzz token_set_ratio (handles collaborators, word order)
    # "TVBOO" vs "TVBOO, AHEE" ‚Üí high score
    fuzzy_score = fuzz.token_set_ratio(norm1, norm2) / 100.0
    
    # Metric 3: Jaro-Winkler (good for typos, abbreviations, prefix matching)
    jaro_score = distance.JaroWinkler.similarity(norm1, norm2)
    
    # Metric 4: Character substring penalty (avoid false positives)
    substring_penalty = calculate_substring_penalty(norm1, norm2)
    
    # Metric 5: Token subset penalty (NEW - catches "with your love" cases)
    token_penalty = calculate_token_subset_penalty(norm1, norm2)
    
    if method == "strict":
        # Require all metrics to agree - avoid false positives
        weights = {"tfidf": 0.20, "fuzzy": 0.15, "jaro": 0.15, "substring": 0.25, "token": 0.25}
        score = (
            tfidf_score * weights["tfidf"] +
            fuzzy_score * weights["fuzzy"] +
            jaro_score * weights["jaro"] +
            substring_penalty * weights["substring"] +
            token_penalty * weights["token"]
        )
    elif method == "lenient":
        # Prioritize fuzzy matching - catch more variations
        weights = {"tfidf": 0.25, "fuzzy": 0.35, "jaro": 0.20, "substring": 0.10, "token": 0.10}
        score = (
            tfidf_score * weights["tfidf"] +
            fuzzy_score * weights["fuzzy"] +
            jaro_score * weights["jaro"] +
            substring_penalty * weights["substring"] +
            token_penalty * weights["token"]
        )
    else:  # balanced (default)
        # Balanced approach with both substring and token penalties
        weights = {"tfidf": 0.25, "fuzzy": 0.20, "jaro": 0.15, "substring": 0.20, "token": 0.20}
        score = (
            tfidf_score * weights["tfidf"] +
            fuzzy_score * weights["fuzzy"] +
            jaro_score * weights["jaro"] +
            substring_penalty * weights["substring"] +
            token_penalty * weights["token"]
        )
    
    return score


def calculate_substring_penalty(norm1: str, norm2: str) -> float:
    """
    STRICTER penalty for character substring matches.
    
    Prevents: "By Your Side" matching "Wake Up By Your Side"
    Allows: "Selector 2025 Remake" matching "Selector (2025 Remake)"
    """
    if norm1 == norm2:
        return 1.0  # Perfect match
    
    # Check if either is a substring
    if norm1 in norm2:
        position = norm2.index(norm1)
        length_ratio = len(norm1) / len(norm2)
        
        # STRICTER: Reject if query is small portion OR not at start
        if length_ratio < 0.7:  # Increased from 0.6
            return 0.0  # Changed from 0.3 - REJECT small substrings
        elif position > 0:
            return 0.0  # Changed from 0.5 - REJECT non-prefix substrings
        else:
            # At start and substantial portion
            return 0.85 * length_ratio  # Slight penalty even for prefix
    
    elif norm2 in norm1:
        # Symmetric check
        position = norm1.index(norm2)
        length_ratio = len(norm2) / len(norm1)
        
        if length_ratio < 0.7:
            return 0.0
        elif position > 0:
            return 0.0
        else:
            return 0.85 * length_ratio
    
    return 1.0  # Not a substring - no penalty


# --- 2. ADAPTIVE DURATION SCORING ---

def calculate_duration_score_adaptive(spotify_track: dict, soundcloud_track: dict) -> float:
    """
    Adaptive duration matching with different tolerances for different track types.
    
    Extended mixes can be 30-60s longer
    Remixes/VIPs have 10-20s variation
    Original tracks should be within 3-10s
    """
    sp_duration = spotify_track["duration_ms"] / 1000.0
    sc_duration = soundcloud_track["duration"]
    diff = abs(sp_duration - sc_duration)
    
    # Check track type from both Spotify and SoundCloud titles
    combined_title = f"{spotify_track['title']} {soundcloud_track['title']}".lower()
    
    is_extended = "extended" in combined_title or "ext" in combined_title
    is_remix = "remix" in combined_title or "rework" in combined_title
    is_vip = "vip" in combined_title
    is_edit = "edit" in combined_title or "radio edit" in combined_title
    
    if is_extended:
        # Extended versions: very lenient (up to 60s difference)
        return 1.0 if diff <= 30 else (0.7 if diff <= 60 else 0.3)
    elif is_remix or is_vip:
        # Remixes/VIPs: moderate tolerance (up to 20s)
        return 1.0 if diff <= 10 else (0.6 if diff <= 20 else 0.2)
    elif is_edit:
        # Radio edits: tighter tolerance (within 15s)
        return 1.0 if diff <= 5 else (0.7 if diff <= 15 else 0.3)
    else:
        # Original tracks: strict tolerance (within 10s)
        return 1.0 if diff <= 1 else (0.7 if diff <= 3 else (0.4 if diff <= 10 else 0.0))


# --- 3. IMPROVED SCORING WEIGHTS ---

# ADJUSTED: Reduced duration weight even more, increased title weight
IMPROVED_WEIGHTS = {
    "title": 0.65,    # Increased from 0.60 (title is most reliable)
    "artist": 0.25,   # Same (often has extra collaborators)
    "duration": 0.10  # Decreased from 0.15 (causing too many false positives)
}


def calculate_confidence_score_improved(
    spotify_track: dict, 
    soundcloud_track: dict,
    title_method: str = "balanced"
) -> MatchCandidate:
    """
    Improved confidence scoring with ensemble similarity and adaptive duration.
    
    Args:
        spotify_track: Spotify track metadata
        soundcloud_track: SoundCloud track metadata
        title_method: "balanced", "strict", or "lenient"
    """
    # Title similarity: Compare Spotify title with SoundCloud's full title
    # (SoundCloud often has "Artist - Title" or "Artist, Artist - Title" format)
    title_sim = ensemble_text_similarity(
        spotify_track["title"],
        f"{soundcloud_track['artist']} - {soundcloud_track['title']}",
        method=title_method
    )
    
    # Artist similarity: Use ensemble matching for better collaborator handling
    artist_sim = ensemble_text_similarity(
        spotify_track["top_level_artist"],
        soundcloud_track["artist"],
        method="lenient"  # More forgiving for "TVBOO" vs "TVBOO, AHEE"
    )
    
    # Duration: Adaptive scoring based on track type
    duration_score = calculate_duration_score_adaptive(spotify_track, soundcloud_track)
    
    # Weighted confidence score
    confidence = (
        title_sim * IMPROVED_WEIGHTS["title"] +
        artist_sim * IMPROVED_WEIGHTS["artist"] +
        duration_score * IMPROVED_WEIGHTS["duration"]
    )
    
    # STRICTER FILTER: Require minimum title similarity
    # If title similarity is too low, reject even if artist/duration match
    if title_sim < 0.70:
        confidence *= 0.6  # Heavy penalty for low title similarity
    
    # ADDITIONAL FILTER: Require minimum thresholds for both title AND artist
    # If BOTH are too low, reject even if duration matches
    if title_sim < 0.65 and artist_sim < 0.65:
        confidence *= 0.5  # Penalize matches where both are weak
    
    return MatchCandidate(
        soundcloud_id=str(soundcloud_track["id"]),
        soundcloud_title=soundcloud_track["title"],
        soundcloud_artist=soundcloud_track["artist"],
        soundcloud_duration=soundcloud_track["duration"],
        title_similarity=title_sim,
        artist_similarity=artist_sim,
        duration_match=duration_score,
        confidence_score=confidence,
    )


print(f"‚úì Improved scoring functions defined (EVEN STRICTER VERSION)")
print(f"  Weights: {IMPROVED_WEIGHTS}")
print(f"  Changes:")
print(f"    - NEW: Token subset penalty (catches 'with your love' false positives)")
print(f"    - Substring penalty weight: 0.20 (character-level)")
print(f"    - Token penalty weight: 0.20 (token-level)")
print(f"    - Duration weight: 0.15 ‚Üí 0.10 (further reduced)")
print(f"    - Title weight: 0.60 ‚Üí 0.65 (further increased)")
print(f"    - Added title similarity filter: < 0.70 gets 40% penalty")
print(f"    - Dual-threshold filter: both title AND artist < 0.65 gets 50% penalty")

## 4. Improved Matching - Test on Full Playlist

**Strategy:**
1. Use multi-query search to get more candidates
2. Apply ensemble scoring to all candidates
3. Use adaptive confidence thresholds
4. Compare with original approach

## 3.1 Improved Scoring - Ensemble Approach

**New Features:**
1. **Ensemble scoring**: Combines TF-IDF, RapidFuzz, and Jaro-Winkler
2. **Substring position penalty**: Avoids false matches like "By Your Side" ‚Üí "Wake Up By Your Side"
3. **Adaptive duration scoring**: Different tolerances for remixes/extended versions
4. **Multi-query strategy**: Try multiple search formats to handle artist field variations

In [8]:
query = "PEPPA POT Major Lazer"
print(f"  üîé Query: {query}")
_, sc_results = soundcloud.search(soundcloud_state, query)
print(sc_results)

  üîé Query: PEPPA POT Major Lazer
[('2208665093', {'title': 'PEPPA POT', 'artist': 'Major Lazer, Diplo', 'genre': 'Ragga / Dancehall / Reggaeton', 'duration': 144.718, 'year': None, 'bpm': None}), ('2216757365', {'title': 'Major Lazer - Peppa Pot (ULTRA REMIX)', 'artist': 'EL√âOKH√ÉO', 'genre': 'REMIX', 'duration': 128.183, 'year': None, 'bpm': None}), ('2216756642', {'title': 'Major Lazer - PEPPA POT (DEAN flip)', 'artist': 'DEAN', 'genre': 'bassline', 'duration': 171.52, 'year': None, 'bpm': None})]


## 4. Search SoundCloud and Match Tracks

In [None]:
all_match_results = []
MIN_CANDIDATES = 3

tracks = spotify_tracks.copy()[5:10]

print(tracks[0])


print(f"üîç Searching SoundCloud for {len(tracks)} tracks...\n")


def remove_duplicate_words(text: str) -> str:
    return " ".join(dict.fromkeys(text.split()))


for i, sp_track in enumerate(tracks, 1):
    print(f"[{i}/{len(tracks)}] {sp_track['title']} - {sp_track['top_level_artist']}")

    query = f"{sp_track['title']} - {sp_track['top_level_artist']}"

    print(f"  üîé Query: {query}")
    _, sc_results = soundcloud.search(soundcloud_state, query)

    if not sc_results:
        print(f"  ‚ùå No results")
        all_match_results.append({"spotify_track": sp_track, "candidates": []})
        continue

    sc_tracks_parsed = [
        {
            "id": sc_id,
            "title": meta["title"],
            "artist": meta["artist"],
            "duration": meta["duration"],
        }
        for sc_id, meta in sc_results
    ]

    candidates = [
        calculate_confidence_score(sp_track, sc_track) for sc_track in sc_tracks_parsed
    ]
    candidates.sort(key=lambda x: x.confidence_score, reverse=True)
    top_candidates = candidates[:MIN_CANDIDATES]

    if top_candidates:
        best = top_candidates[0]
        print(
            f"  ‚úì {best.soundcloud_artist} - {best.soundcloud_title} ({best.confidence_score:.3f}, ({best.duration_match:.3f})"
        )
        for candidate in top_candidates:
            print(
                f"    - {candidate.soundcloud_artist} - {candidate.soundcloud_title} ({candidate.confidence_score:.3f}, duration_match: {candidate.duration_match:.3f})"
            )
        print("")
        print("--------------------------------")
        print("")
        # print(
        #     f"    (title_sim: {best.title_similarity:.3f}, artist_sim: {best.artist_similarity:.3f}, duration_match: {best.duration_match:.3f})"
        # )

    all_match_results.append({"spotify_track": sp_track, "candidates": top_candidates})
    time.sleep(0.3)

print(
    f"\n‚úì Complete! {sum(1 for r in all_match_results if r['candidates'])}/{len(all_match_results)} matched"
)

{'id': '2OE8b4XUccHrareZNJRSLc', 'title': 'By Your Side', 'artist': 'G Jones, Eprom', 'album': 'Disk Utility', 'duration_ms': 192814, 'year': 2025, 'release_date': '2025-11-21', 'top_level_artist': 'G Jones'}
üîç Searching SoundCloud for 5 tracks...

[1/5] By Your Side - G Jones
  üîé Query: By Your Side - G Jones
  ‚úì g_e_r_b - Wake Up By Your Side (0.549, (1.000)
    - g_e_r_b - Wake Up By Your Side (0.549, duration_match: 1.000)
    - C Y G N - C Y G N - By your Side (0.400, duration_match: 0.000)
    - G - By Your Side (0.400, duration_match: 0.000)

--------------------------------

[2/5] voltage (see you again) - Skrillex
  üîé Query: voltage (see you again) - Skrillex
  ‚úì Skrillex, Varg2‚Ñ¢, Eurohead, LOAM, swedm¬Æ, Virtual Riot - Skrillex, Varg2‚Ñ¢, Eurohead, LOAM, swedm¬Æ, Virtual Riot - voltage (see you again)         /released (0.470, (0.000)
    - Skrillex, Varg2‚Ñ¢, Eurohead, LOAM, swedm¬Æ, Virtual Riot - Skrillex, Varg2‚Ñ¢, Eurohead, LOAM, swedm¬Æ, Virtual Riot - vo

## 5. Generate CSV Report

In [None]:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
report_path = Path.cwd() / f"spotify_soundcloud_matches_{timestamp}.csv"

with open(report_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(
        [
            "spotify_id",
            "spotify_title",
            "spotify_artist",
            "spotify_album",
            "spotify_duration",
            "sc_match1_id",
            "sc_match1_title",
            "sc_match1_artist",
            "sc_match1_score",
            "sc_match2_id",
            "sc_match2_title",
            "sc_match2_score",
            "sc_match3_id",
            "sc_match3_title",
            "sc_match3_score",
            "correct_sc_id",
            "notes",
            "status",
        ]
    )

    for result in all_match_results:
        sp = result["spotify_track"]
        cands = result["candidates"]

        row = [
            sp["id"],
            sp["title"],
            sp["artist"],
            sp["album"],
            f"{sp['duration_ms'] / 1000:.1f}s",
        ]

        for i in range(3):
            if i < len(cands):
                m = cands[i]
                row.extend(
                    [
                        m.soundcloud_id,
                        m.soundcloud_title,
                        m.soundcloud_artist if i == 0 else "",
                        f"{m.confidence_score:.3f}",
                    ]
                )
            else:
                row.extend(["", "", "", ""] if i == 0 else ["", "", ""])

        row.extend(["", "", ""])
        writer.writerow(row)

print(f"‚úÖ Report: {report_path}\n")
print("Next: Open CSV, review matches, fill 'correct_sc_id' and 'status' columns")

‚úÖ Report: /home/kevin/coding/music-minion-cli/spotify_soundcloud_matches_20251122_105246.csv

Next: Open CSV, review matches, fill 'correct_sc_id' and 'status' columns


## 6. Results Summary

In [None]:
matched = sum(1 for r in all_match_results if r["candidates"])
scores = [
    r["candidates"][0].confidence_score for r in all_match_results if r["candidates"]
]

print("=" * 60)
print("MATCHING SUMMARY")
print("=" * 60)
print(
    f"Total: {len(all_match_results)} | Matched: {matched} ({matched / len(all_match_results) * 100:.1f}%)"
)

if scores:
    print(
        f"\nScores: avg={sum(scores) / len(scores):.3f} min={min(scores):.3f} max={max(scores):.3f}"
    )
    print(f"Excellent (‚â•0.90): {sum(1 for s in scores if s >= 0.90)}")
    print(f"Good (0.80-0.89): {sum(1 for s in scores if 0.80 <= s < 0.90)}")
    print(f"Fair (0.70-0.79): {sum(1 for s in scores if 0.70 <= s < 0.80)}")
    print(f"Poor (<0.70): {sum(1 for s in scores if s < 0.70)}")

    results_sorted = sorted(
        [(r, r["candidates"][0]) for r in all_match_results if r["candidates"]],
        key=lambda x: x[1].confidence_score,
    )
    print("\n‚ö†Ô∏è  Bottom 3 (review first):")
    for r, m in results_sorted[:3]:
        print(
            f"  {m.confidence_score:.3f} | {r['spotify_track']['artist']} - {r['spotify_track']['title']}"
        )
        print(f"         ‚Üí {m.soundcloud_artist} - {m.soundcloud_title}")

print("=" * 60)

MATCHING SUMMARY


ZeroDivisionError: division by zero