# SoundCloud Matching Accuracy Test

Test the accuracy of matching SoundCloud tracks to local tracks using metadata similarity.

**Goal**: Verify ~170/200 tracks match correctly before implementing incremental sync.

## 1. Setup & Imports

In [1]:
import sys

sys.path.insert(0, "/home/kevin/coding/music-minion-cli/src")

from music_minion.domain.library.providers import soundcloud
from music_minion.domain.library.provider import ProviderConfig
from music_minion.domain.library.deduplication import (
    normalize_string,
    find_best_matches_tfidf,
    apply_manual_corrections,
)
from music_minion.core import database

import pandas as pd
import time
from typing import List, Dict, Any, Tuple

## 2. Fetch SoundCloud Tracks (First 200)

In [2]:
# Initialize SoundCloud provider
config = ProviderConfig(name="soundcloud", enabled=True)
state = soundcloud.init_provider(config)

print(f"Authenticated: {state.authenticated}")

if not state.authenticated:
    print("\n‚ö†Ô∏è  Token expired or not authenticated")
    print("Re-authenticating...")

    # Try to get token from database and refresh if expired
    db_state = database.load_provider_state("soundcloud")

    if db_state and db_state.get("auth_data"):
        auth_data = db_state["auth_data"]

        # Try to refresh token
        new_token_data = soundcloud._refresh_token(auth_data)

        if new_token_data:
            print("‚úì Token refreshed successfully!")

            # Save to database
            config_data = db_state.get("config_data", {})
            database.save_provider_state("soundcloud", new_token_data, config_data)

            # Reinitialize with new token
            state = soundcloud.init_provider(config)
            print(f"‚úì Re-authenticated: {state.authenticated}")
        else:
            raise Exception("Token refresh failed. Run in CLI: library auth soundcloud")
    else:
        raise Exception(
            "Not authenticated with SoundCloud. Run in CLI: library auth soundcloud"
        )

[32m2025-11-21 19:42:53.614[0m | [1mINFO    [0m | [36mmusic_minion.core.config[0m:[36mget_config_path[0m:[36m224[0m - [1mUsing project config: /home/kevin/coding/music-minion-cli/config.toml[0m


Authenticated: True


In [3]:
token_data = state.cache.get("token_data")
access_token = token_data["access_token"]

# Single API request for first page
import requests

url = "https://api.soundcloud.com/me/activities"
headers = {"Authorization": f"OAuth {access_token}"}
params = {
    "limit": 1000,
}

response = requests.get(url, params, headers=headers, timeout=30)
response.raise_for_status()
data = response.json()

In [None]:
data["collection"][0]

{'type': 'track',
 'created_at': '2025/11/22 03:41:24 +0000',
 'origin': {'kind': 'track',
  'id': 2216944880,
  'urn': 'soundcloud:tracks:2216944880',
  'created_at': '2025/11/22 03:39:32 +0000',
  'duration': 121887,
  'commentable': True,
  'comment_count': 0,
  'sharing': 'public',
  'tag_list': ' ',
  'streamable': True,
  'embeddable_by': 'me',
  'purchase_url': None,
  'purchase_title': None,
  'genre': None,
  'title': 'Take me Dancing (dirty bird af) .m4a',
  'description': '',
  'label_name': '',
  'release': None,
  'key_signature': None,
  'isrc': None,
  'bpm': None,
  'release_year': None,
  'release_month': None,
  'release_day': None,
  'license': 'all-rights-reserved',
  'uri': 'https://api.soundcloud.com/tracks/soundcloud:tracks:2216944880',
  'user': {'avatar_url': 'https://i1.sndcdn.com/avatars-BxX02pw6sIYdqSlP-gSJooQ-large.jpg',
   'id': 185609189,
   'urn': 'soundcloud:users:185609189',
   'kind': 'user',
   'permalink_url': 'https://soundcloud.com/mportaudio?utm_

In [42]:
df = pd.json_normalize(data["collection"])
df.type.value_counts()

type
track:repost    906
track            71
playlist          6
Name: count, dtype: int64

In [29]:
df.head()

Unnamed: 0,type,created_at,origin.kind,origin.id,origin.urn,origin.created_at,origin.duration,origin.commentable,origin.comment_count,origin.sharing,...,origin.user_id,origin.user_urn,origin.last_modified,origin.playlist_type,origin.type,origin.likes_count,origin.repost_count,origin.tags,origin.ean,origin.tracks_uri
0,track:repost,2025/11/21 19:34:11 +0000,track,2164518843,soundcloud:tracks:2164518843,2025/09/04 09:01:35 +0000,160078,True,0.0,public,...,,,,,,,,,,
1,playlist,2025/11/21 19:32:27 +0000,playlist,2117969744,soundcloud:playlists:2117969744,2025/11/21 19:32:27 +0000,235076,,,public,...,7498196.0,soundcloud:users:7498196,2025/11/21 19:32:27 +0000,PLAYLIST,PLAYLIST,0.0,0.0,,,https://api.soundcloud.com/playlists/soundclou...
2,track:repost,2025/11/21 19:31:33 +0000,track,2216723663,soundcloud:tracks:2216723663,2025/11/21 18:35:47 +0000,306495,True,4.0,public,...,,,,,,,,,,
3,track:repost,2025/11/21 19:31:26 +0000,track,2216747603,soundcloud:tracks:2216747603,2025/11/21 19:23:04 +0000,221858,True,3.0,public,...,,,,,,,,,,
4,track:repost,2025/11/21 19:30:12 +0000,track,2166646716,soundcloud:tracks:2166646716,2025/09/08 00:50:22 +0000,150909,True,8.0,public,...,,,,,,,,,,


In [23]:
df.type.value_counts()

type
track:repost    48
playlist         1
track            1
Name: count, dtype: int64

In [10]:
df.iloc[0].to_dict()

{'duration': 3294485,
 'genre': '',
 'release_day': None,
 'permalink': 'long-tracks',
 'permalink_url': 'https://soundcloud.com/kevinbigfoot/sets/long-tracks/s-D9RU9imJ0Ir?utm_medium=api&utm_campaign=social_sharing&utm_source=id_318266',
 'release_month': None,
 'release_year': None,
 'description': None,
 'uri': 'https://api.soundcloud.com/playlists/soundcloud:playlists:2114893679?secret_token=s-D9RU9imJ0Ir',
 'label_name': None,
 'label_id': None,
 'label': None,
 'tag_list': '',
 'track_count': 1,
 'user_id': 15999805,
 'user_urn': 'soundcloud:users:15999805',
 'last_modified': '2025/11/15 21:13:44 +0000',
 'license': 'all-rights-reserved',
 'playlist_type': 'PLAYLIST',
 'type': 'PLAYLIST',
 'id': 2114893679,
 'urn': 'soundcloud:playlists:2114893679',
 'downloadable': None,
 'likes_count': 0,
 'repost_count': 0,
 'sharing': 'private',
 'created_at': '2025/11/15 21:13:44 +0000',
 'release': None,
 'tags': '',
 'kind': 'playlist',
 'title': 'Long Tracks',
 'purchase_title': None,
 'e

In [12]:
# Fetch first 200 tracks directly (one API call)
print("Fetching first 200 SoundCloud tracks...")

token_data = state.cache.get("token_data")
access_token = token_data["access_token"]

# Single API request for first page
import requests

url = "https://api.soundcloud.com/me/likes/tracks"
headers = {"Authorization": f"OAuth {access_token}"}
params = {
    "limit": 200,
    "linked_partitioning": True,
}

response = requests.get(url, params=params, headers=headers, timeout=30)
response.raise_for_status()
data = response.json()

# Parse tracks
sc_tracks = []
if "collection" in data:
    for item in data["collection"]:
        # Filter to only tracks (API may return other kinds)
        if not item or item.get("kind") != "track":
            continue

        track_id = str(item["id"])
        metadata = soundcloud._normalize_soundcloud_track(item)
        sc_tracks.append((track_id, metadata))

# Filter Out First 31
sc_tracks = sc_tracks[31:]

print(f"‚úì Fetched {len(sc_tracks)} tracks from first page")
print(f"\nFirst 3:")
for track_id, metadata in sc_tracks[:3]:
    print(f"  {metadata.get('artist')} - {metadata.get('title')} (ID: {track_id})")

Fetching first 200 SoundCloud tracks...
‚úì Fetched 169 tracks from first page

First 3:
  Wakaan - Capochino - Hypnotic (ID: 2172046062)
  Shroom - Young Miko - Wassup (Shroom x UrBoiN8 Flip) (ID: 2176339092)
  Kompany - Jackpot (Space Wizard Remix) (ID: 2192107907)


In [None]:
df = pd.DataFrame(data["collection"])

print(df.iloc[0])

kind                                                                   track
id                                                                2211704240
urn                                             soundcloud:tracks:2211704240
created_at                                         2025/11/13 07:20:43 +0000
duration                                                              167471
commentable                                                             True
comment_count                                                             14
sharing                                                               public
tag_list                                              drumstep "Drum & Bass"
streamable                                                              True
embeddable_by                                                            all
purchase_url                    https://hypeddit.com/thelivingproof/feelthis
purchase_title                                                          None

## 4. Load Local Tracks (Without SoundCloud ID)

In [13]:
# Get all tracks from database
all_tracks = database.get_all_tracks()

# Filter to only local tracks WITHOUT soundcloud_id
local_tracks = [
    t for t in all_tracks if t.get("local_path") and not t.get("soundcloud_id")
]

print(f"Total tracks in DB: {len(all_tracks)}")
print(f"Local tracks without SoundCloud ID: {len(local_tracks)}")
print(f"\nFirst 3 local tracks:")
for track in local_tracks[:3]:
    print(f"  {track.get('artist')} - {track.get('title')}")

Total tracks in DB: 5415
Local tracks without SoundCloud ID: 5415

First 3 local tracks:
  None - MergeFX Sample Sound 202
  None - MergeFX Sample Sound 203
  None - MergeFX Sample Sound 204


## 5. Run Matching Process

### 5.1 Test Case: BAWDY Track (Should Now Match!)

### 5.2 Full Matching with Scores

In [15]:
# TF-IDF batch matching - MUCH faster!
print("=" * 80)
print("TF-IDF SEARCH-BASED MATCHING")
print("=" * 80)
print(
    f"Matching {len(sc_tracks)} SC tracks against {len(local_tracks)} local tracks..."
)
print()

# Time the matching
start_time = time.time()

# Call TF-IDF matcher (batch operation)
tfidf_results = find_best_matches_tfidf(sc_tracks, local_tracks, min_score=0.70)

elapsed = time.time() - start_time

# Process results
matches_tfidf = []
no_matches_tfidf = []

for sc_id, best_match, score in tfidf_results:
    sc_metadata = next(meta for tid, meta in sc_tracks if tid == sc_id)

    if best_match:
        matches_tfidf.append(
            {
                "sc_id": sc_id,
                "sc_title": sc_metadata.get("title"),
                "sc_artist": sc_metadata.get("artist"),
                "local_id": best_match["id"],
                "local_title": best_match.get("title"),
                "local_artist": best_match.get("artist"),
                "score": score,
            }
        )
    else:
        no_matches_tfidf.append(
            {
                "sc_id": sc_id,
                "title": sc_metadata.get("title"),
                "artist": sc_metadata.get("artist"),
                "genre": sc_metadata.get("genre"),
                "best_score": score,
            }
        )

print(f"‚úì Matching complete in {elapsed:.2f} seconds!")
print()
print(f"  Matched (>= 0.70): {len(matches_tfidf)}")
print(f"  No match (< 0.70):  {len(no_matches_tfidf)}")
print()

if matches_tfidf:
    scores = [m["score"] for m in matches_tfidf]
    print(f"Score distribution:")
    print(f"  Average:  {sum(scores) / len(scores):.3f}")
    print(f"  Min:      {min(scores):.3f}")
    print(f"  Max:      {max(scores):.3f}")
    print()
    print(f"  High (0.9-1.0):      {sum(1 for s in scores if s >= 0.9)} tracks")
    print(f"  Good (0.8-0.89):     {sum(1 for s in scores if 0.8 <= s < 0.9)} tracks")
    print(f"  Moderate (0.7-0.79): {sum(1 for s in scores if 0.7 <= s < 0.8)} tracks")

TF-IDF SEARCH-BASED MATCHING
Matching 169 SC tracks against 5415 local tracks...

‚úì Matching complete in 0.31 seconds!

  Matched (>= 0.70): 149
  No match (< 0.70):  20

Score distribution:
  Average:  0.923
  Min:      0.702
  Max:      0.995

  High (0.9-1.0):      113 tracks
  Good (0.8-0.89):     27 tracks
  Moderate (0.7-0.79): 9 tracks


# Apply manual corrections if CSV has been edited
corrections_file = '/home/kevin/coding/music-minion-cli/soundcloud_matches_review.csv'

# Convert matches_tfidf list to dict format for correction function
matches_dict = matches_tfidf.copy()

# Apply corrections
corrected_matches = apply_manual_corrections(matches_dict, corrections_file)

# Count changes
corrected_count = sum(1 for m in corrected_matches if m.get('corrected', False))
removed_count = len(matches_dict) - len(corrected_matches)

if corrected_count > 0 or removed_count > 0:
    print(f"\nüìä Correction summary:")
    if corrected_count > 0:
        print(f"  ‚úì {corrected_count} matches corrected (replaced with correct track ID)")
    if removed_count > 0:
        print(f"  ‚úó {removed_count} matches removed (marked as 'None' - no valid match)")
    
    print(f"\n  Total matches: {len(matches_dict)} ‚Üí {len(corrected_matches)}")
    
    if corrected_count > 0:
        print("\nCorrected matches:")
        for m in corrected_matches:
            if m.get('corrected'):
                print(f"  {m['sc_artist']} - {m['sc_title']}")
                print(f"    ‚Üí {m['local_artist']} - {m['local_title']} (ID: {m['local_id']})")
else:
    print("\n‚ÑπÔ∏è  No corrections applied (CSV not edited or no correct_id values filled)")

In [None]:
# Apply manual corrections if CSV has been edited
corrections_file = "/home/kevin/coding/music-minion-cli/soundcloud_matches_review.csv"

# Convert matches_tfidf list to dict format for correction function
matches_dict = matches_tfidf.copy()

# Apply corrections
corrected_matches = apply_manual_corrections(matches_dict, corrections_file)

# Count how many were corrected
corrected_count = sum(1 for m in corrected_matches if m.get("corrected", False))

if corrected_count > 0:
    print(f"\n‚úì {corrected_count} matches were corrected based on CSV")
    print("\nCorrected matches:")
    for m in corrected_matches:
        if m.get("corrected"):
            print(f"  {m['sc_artist']} - {m['sc_title']}")
            print(
                f"    ‚Üí {m['local_artist']} - {m['local_title']} (ID: {m['local_id']})"
            )
else:
    print("\n‚ÑπÔ∏è  No corrections applied (CSV not edited or no correct_id values filled)")

In [None]:
# Export matches to CSV for manual review and correction
df_review = pd.DataFrame(matches_tfidf)

# Add empty column for manual corrections
df_review["correct_id"] = ""
df_review["notes"] = ""

# Sort by score (lowest first) - these need the most attention
df_review_sorted = df_review.sort_values("score")

# Save to CSV
output_file = "/home/kevin/coding/music-minion-cli/soundcloud_matches_review.csv"
df_review_sorted.to_csv(output_file, index=False)

print(f"‚úì Exported {len(df_review)} matches to:")
print(f"  {output_file}")
print()
print("Review instructions:")
print("1. Open CSV and sort by 'score' (lowest first)")
print("2. For incorrect matches, look up the correct track ID")
print("3. Fill in 'correct_id' column with the right ID")
print("4. Add notes explaining why it was wrong (optional)")
print("5. Save and re-run notebook to apply corrections")
print()
print(f"Bottom 10 matches (need review):")
print()
display(
    df_review_sorted[
        ["sc_artist", "sc_title", "local_artist", "local_title", "score", "correct_id"]
    ].head(10)
)

In [None]:
# Check if BAWDY track matched
bawdy_matches = [m for m in matches_tfidf if m["sc_artist"] == "BAWDY"]

print("=" * 80)
print("BAWDY TRACK TEST - TF-IDF Results")
print("=" * 80)
print()

if bawdy_matches:
    print(f"‚úÖ Found {len(bawdy_matches)} BAWDY match(es)!")
    print()
    for match in bawdy_matches:
        print(f"SoundCloud: {match['sc_artist']} - {match['sc_title']}")
        print(f"Local:      {match['local_artist']} - {match['local_title']}")
        print(f"Score:      {match['score']:.3f}")
        print()
else:
    print("‚ùå BAWDY track did NOT match")
    print()
    # Show what it got instead
    bawdy_in_no_matches = [nm for nm in no_matches_tfidf if nm["artist"] == "BAWDY"]
    if bawdy_in_no_matches:
        for nm in bawdy_in_no_matches:
            print(f"  {nm['artist']} - {nm['title']}")
            print(
                f"  Best score: {nm.get('best_score', 0.0):.3f} (below 0.70 threshold)"
            )

In [None]:
track

('2211704240', {'title': 'FEEL THIS', 'artist': 'The Living Proof', 'genre': 'Dubstep', 'duration': 167.471})


In [53]:
df = pd.DataFrame(local_tracks)
display(df.head(1))

display(df[df.title.str.contains("MY NECK MY BACK")])

Unnamed: 0,id,file_path,title,artist,album,genre,year,duration,key_signature,bpm,...,file_mtime,last_synced_at,remix_artist,local_path,soundcloud_id,spotify_id,youtube_id,soundcloud_synced_at,spotify_synced_at,youtube_synced_at
0,6039,/home/kevin/Music/PioneerDJ/Sampler/MERGE FX/M...,MergeFX Sample Sound 202,,,,,1.838073,,,...,1602181000.0,2025-11-19 01:22:49,,/home/kevin/Music/PioneerDJ/Sampler/MERGE FX/M...,,,,,,


Unnamed: 0,id,file_path,title,artist,album,genre,year,duration,key_signature,bpm,...,file_mtime,last_synced_at,remix_artist,local_path,soundcloud_id,spotify_id,youtube_id,soundcloud_synced_at,spotify_synced_at,youtube_synced_at
321,3457,/home/kevin/Music/EDM/2020/Aug 20/MY NECK MY B...,MY NECK MY BACK - (BAD TASTES TOO WOOK TO WALK...,BADTASTES OFFICIAL,Aug 20,Trap,2018.0,158.9792,G#m,73.0,...,1751567000.0,2025-11-19 01:22:49,,/home/kevin/Music/EDM/2020/Aug 20/MY NECK MY B...,,,,,,
333,5896,/home/kevin/Music/EDM/2025/Sept 25/MY NECK MY ...,MY NECK MY BACK (BAWDY Flip),BAWDY,Sept 25,Trap,2025.0,138.087506,Cm,100.0,...,1759638000.0,2025-11-19 01:22:49,BAWDY,/home/kevin/Music/EDM/2025/Sept 25/MY NECK MY ...,,,,,,


In [None]:
df = pd.DataFrame([t[1] for t in sc_tracks])
df.insert(0, "id", [t[0] for t in sc_tracks])
display(df[df.artist == "BAWDY"].tail(1))

Unnamed: 0,id,title,artist,genre,duration
196,2091674085,MY NECK MY BACK (BAWDY Flip) (FREE DL),BAWDY,100bpm,138.136


In [64]:
print(len(sc_tracks))

200


In [None]:
all_playlists.extend(data)
# Find target playlists
target_names = ["Nov 25", "Oct 25", "Sept 25"]
target_playlists = {}

for playlist in all_playlists:
    name = playlist.get("title", "")

    if name in target_names:
        target_playlists[name] = {
            "id": str(playlist["id"]),
            "track_count": playlist.get("track_count", 0),
            "tracks": playlist.get("tracks", []),
        }

print(f"\nFound target playlists:")
for name in target_names:
    if name in target_playlists:
        pl = target_playlists[name]
        print(f"  ‚úì {name}: {pl['track_count']} tracks (ID: {pl['id']})")
    else:
        print(f"  ‚úó {name}: Not found")


Found target playlists:
  ‚úì Nov 25: 31 tracks (ID: 2107077439)
  ‚úì Oct 25: 71 tracks (ID: 2089957919)
  ‚úì Sept 25: 154 tracks (ID: 2076808602)


### Compare: Likes NOT in Playlists

## 5. Results Analysis

### 5.1 Matched Tracks Table (Top 20)

### 5.2 Borderline Matches (0.8-0.85) - Needs Manual Verification

In [None]:
if matches:
    df_borderline = df_matches[
        (df_matches["score"] >= 0.8) & (df_matches["score"] < 0.85)
    ]

    print(f"Borderline matches (0.8-0.85): {len(df_borderline)} tracks")
    print("These should be manually verified:")
    print()

    if len(df_borderline) > 0:
        display(
            df_borderline[
                ["sc_artist", "sc_title", "local_artist", "local_title", "score"]
            ]
        )
    else:
        print("‚úì No borderline matches - all matches are high confidence!")

### 5.3 Unmatched Tracks (Genuinely New)

In [50]:
if no_matches:
    df_no_matches = pd.DataFrame(no_matches)

    print(f"Unmatched tracks: {len(df_no_matches)}")
    print("These are genuinely new tracks not in local library:")
    print()
    display(df_no_matches[["artist", "title", "genre"]].tail(30))
else:
    print("All SoundCloud tracks matched to local tracks!")

Unmatched tracks: 132
These are genuinely new tracks not in local library:



Unnamed: 0,artist,title,genre
102,slugzmusic,SLUGZ X PANTHER - SHELLSHOCK,Dubstep
103,Know Good,Paramore - Decode (Know Good Flip),
104,Seth David,Zeds Dead & Flux Pavilion - WAVES (SETH DAVID ...,Dubstep
105,DOMEOFDOOM,Jack Blom - Loud N Clear,Electronic
106,Outset,Au5 & Tasha Baxter - Snowblind (Outset Remix),Dubstep
107,$LUTCHK,$PIN 4 ME ($LUTCHK EDIT) (Spins FLO x Real 4 m...,Dubstep
108,Whethan,"Disco Lines, Tinashe - No Broke Boys (Whethan ...",Dubstep
109,Phrva,deadmau5 & Kaskade - I Remember (Phrva Flip),Dubstep
110,Jad≈´ Dala,Untitld - Fight Song (JAD≈™015),JAD≈™
111,NIGHTMODE,WINK & nikko - MERCY,Dance & EDM


### 5.4 Score Distribution Histogram

In [None]:
if matches:
    import matplotlib.pyplot as plt

    scores = [m["score"] for m in matches]

    plt.figure(figsize=(10, 6))
    plt.hist(scores, bins=20, edgecolor="black", alpha=0.7)
    plt.axvline(0.8, color="red", linestyle="--", label="Min threshold (0.8)")
    plt.xlabel("Similarity Score")
    plt.ylabel("Number of Tracks")
    plt.title("Distribution of Match Similarity Scores")
    plt.legend()
    plt.grid(alpha=0.3)
    plt.show()

In [None]:
# Check a few random matches to see the normalized strings
import random

if matches and len(matches) >= 3:
    print("Sample normalized comparisons:")
    print("=" * 80)

    samples = random.sample(matches, min(3, len(matches)))

    for i, match in enumerate(samples, 1):
        print(f"\nMatch {i} (score: {match['score']:.3f})")
        print(f"  SoundCloud: {match['sc_artist']} - {match['sc_title']}")
        print(f"  Local:      {match['local_artist']} - {match['local_title']}")
        print(f"  Normalized SC title:  '{normalize_string(match['sc_title'])}'")
        print(f"  Normalized LC title:  '{normalize_string(match['local_title'])}'")
        print(f"  Normalized SC artist: '{normalize_string(match['sc_artist'])}'")
        print(f"  Normalized LC artist: '{normalize_string(match['local_artist'])}'")

# Spotify ‚Üí SoundCloud Playlist Matching Prototype

**Goal**: Match Spotify Release Radar tracks to SoundCloud using multi-factor scoring.

**Strategy**:
1. Fetch Spotify playlist tracks (with ISRC, duration, release date)
2. Search SoundCloud for each track with filters
3. Score candidates using TF-IDF + duration + release date
4. Generate CSV report for manual review
5. Tune weights and iterate

---

## üéØ How to Use This Notebook

### First Run:
1. **Update playlist ID** in Section 2 (find it in your Spotify URL)
2. **Run all cells** from top to bottom
3. **Review the CSV** report generated in Section 5
4. **Fill in corrections**: Open the CSV, review matches, fill `correct_sc_id` and `status` columns

### Iterating to Improve:
1. **Analyze patterns** in the CSV - what's failing? (title vs artist vs duration mismatches?)
2. **Tune weights** in Section 3:
   - If titles often mismatch ‚Üí increase `'title'` weight
   - If artist names are inconsistent ‚Üí decrease `'artist'` weight  
   - If duration helps ‚Üí increase `'duration'` weight
3. **Re-run sections 4-6** to see if accuracy improves
4. **Repeat** until you're happy with the results!

### When Ready for Production:
Once you've tuned the algorithm and validated accuracy:
- Extract the scoring logic to `src/music_minion/domain/sync/spotify_soundcloud_sync.py`
- Create a CLI command: `music-minion sync-playlist spotify:<playlist_id>`
- Add caching to avoid re-searching the same tracks

---

## üí° Expected Accuracy

Based on research:
- **Good matches (‚â•0.80)**: 70-85% of tracks
- **Fair matches (0.70-0.79)**: 10-15% (manual review recommended)
- **Not found**: 5-15% (truly not on SoundCloud or very different names)

Your actual results will depend on:
- How well track names align between Spotify and SoundCloud
- Whether artists use consistent naming
- If tracks exist on SoundCloud at all

In [None]:
# Analyze matching results
matched_count = sum(1 for r in all_match_results if r['candidates'])
not_found_count = len(all_match_results) - matched_count

# Get all best match scores
best_scores = [r['candidates'][0].confidence_score for r in all_match_results if r['candidates']]

print("=" * 80)
print("MATCHING SUMMARY")
print("=" * 80)
print()
print(f"Total Spotify tracks: {len(all_match_results)}")
print(f"  ‚úì Found on SoundCloud: {matched_count} ({matched_count/len(all_match_results)*100:.1f}%)")
print(f"  ‚ùå Not found: {not_found_count} ({not_found_count/len(all_match_results)*100:.1f}%)")
print()

if best_scores:
    print("Score Distribution (best matches):")
    print(f"  Average: {sum(best_scores) / len(best_scores):.3f}")
    print(f"  Min:     {min(best_scores):.3f}")
    print(f"  Max:     {max(best_scores):.3f}")
    print()
    print(f"  Excellent (0.90-1.00): {sum(1 for s in best_scores if s >= 0.90)} tracks")
    print(f"  Good      (0.80-0.89): {sum(1 for s in best_scores if 0.80 <= s < 0.90)} tracks")
    print(f"  Fair      (0.70-0.79): {sum(1 for s in best_scores if 0.70 <= s < 0.80)} tracks")
    print(f"  Poor      (0.60-0.69): {sum(1 for s in best_scores if 0.60 <= s < 0.70)} tracks")
    print(f"  Very Poor (<0.60):     {sum(1 for s in best_scores if s < 0.60)} tracks")
    print()
    
    # Show worst matches (need manual review)
    results_with_scores = [(r, r['candidates'][0]) for r in all_match_results if r['candidates']]
    results_with_scores.sort(key=lambda x: x[1].confidence_score)
    
    print("‚ö†Ô∏è  Bottom 5 Matches (review these first):")
    print()
    for r, best_match in results_with_scores[:5]:
        sp = r['spotify_track']
        print(f"  Score: {best_match.confidence_score:.3f}")
        print(f"    Spotify:    {sp['artist']} - {sp['title']}")
        print(f"    SoundCloud: {best_match.soundcloud_artist} - {best_match.soundcloud_title}")
        print(f"    Breakdown: title={best_match.title_similarity:.2f} artist={best_match.artist_similarity:.2f} dur={best_match.duration_match:.1f}")
        print()

print("=" * 80)
print(f"‚úÖ Full report with top 3 candidates per track: {report_path}")
print("=" * 80)

## 6. Quick Results Analysis

See how well the matching performed before diving into the CSV.

In [None]:
from pathlib import Path

# Generate CSV report
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
report_path = Path.cwd() / f"spotify_soundcloud_matches_{timestamp}.csv"

with open(report_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    
    # Header
    writer.writerow([
        # Spotify info
        'spotify_id',
        'spotify_title',
        'spotify_artist',
        'spotify_album',
        'spotify_duration',
        'spotify_release_date',
        'spotify_isrc',
        
        # Top match (Match 1)
        'sc_match1_id',
        'sc_match1_title',
        'sc_match1_artist',
        'sc_match1_duration',
        'sc_match1_score',
        'sc_match1_title_sim',
        'sc_match1_artist_sim',
        'sc_match1_duration_match',
        
        # Alternative match 2
        'sc_match2_id',
        'sc_match2_title',
        'sc_match2_artist',
        'sc_match2_score',
        
        # Alternative match 3
        'sc_match3_id',
        'sc_match3_title',
        'sc_match3_artist',
        'sc_match3_score',
        
        # Manual review columns
        'correct_sc_id',  # User fills this
        'notes',          # User notes
        'status',         # CORRECT | WRONG | NOT_FOUND
    ])
    
    # Data rows
    for result in all_match_results:
        sp_track = result['spotify_track']
        candidates = result['candidates']
        
        row = [
            # Spotify info
            sp_track['id'],
            sp_track['title'],
            sp_track['artist'],
            sp_track['album'],
            f"{sp_track['duration_ms'] / 1000:.1f}s",
            sp_track.get('release_date'),
            sp_track.get('isrc'),
        ]
        
        # Match 1 (best match)
        if len(candidates) > 0:
            m1 = candidates[0]
            row.extend([
                m1.soundcloud_id,
                m1.soundcloud_title,
                m1.soundcloud_artist,
                f"{m1.soundcloud_duration:.1f}s",
                f"{m1.confidence_score:.3f}",
                f"{m1.title_similarity:.3f}",
                f"{m1.artist_similarity:.3f}",
                f"{m1.duration_match:.1f}",
            ])
        else:
            row.extend([''] * 8)
        
        # Match 2
        if len(candidates) > 1:
            m2 = candidates[1]
            row.extend([
                m2.soundcloud_id,
                m2.soundcloud_title,
                m2.soundcloud_artist,
                f"{m2.confidence_score:.3f}",
            ])
        else:
            row.extend([''] * 4)
        
        # Match 3
        if len(candidates) > 2:
            m3 = candidates[2]
            row.extend([
                m3.soundcloud_id,
                m3.soundcloud_title,
                m3.soundcloud_artist,
                f"{m3.confidence_score:.3f}",
            ])
        else:
            row.extend([''] * 4)
        
        # Manual review columns (empty for user to fill)
        row.extend(['', '', ''])
        
        writer.writerow(row)

print(f"‚úÖ Report saved to: {report_path}")
print()
print("üìã Next steps:")
print("1. Open the CSV in your spreadsheet app (Excel, Google Sheets, etc.)")
print("2. Review the 'sc_match1_*' columns - this is the best match")
print("3. If incorrect, check match2/match3 or search SoundCloud manually")
print("4. Fill 'correct_sc_id' with the right SoundCloud ID (or leave blank if not found)")
print("5. Fill 'status' column: CORRECT | WRONG | NOT_FOUND")
print("6. Save and analyze patterns to tune the algorithm")
print()
print("üí° Tips:")
print("  - Sort by 'sc_match1_score' (low to high) to review worst matches first")
print("  - Look at score breakdowns to see what's failing (title vs artist vs duration)")
print("  - Adjust WEIGHTS in section 3 based on what you learn!")

## 5. Generate CSV Report for Manual Review

Create a spreadsheet showing all matches with score breakdowns for your review.

In [None]:
# Re-initialize SoundCloud state (ensure we have the token)
sc_token_data = state.cache.get("token_data")
sc_access_token = sc_token_data["access_token"]

# Search and match each Spotify track
all_match_results = []
MIN_CANDIDATES = 3  # Keep top 3 candidates per track

print(f"üîç Searching SoundCloud for {len(spotify_tracks)} Spotify tracks...\n")

for i, sp_track in enumerate(spotify_tracks, 1):
    print(f"[{i}/{len(spotify_tracks)}] {sp_track['artist']} - {sp_track['title']}")
    
    # Build search query
    query = f"{sp_track['artist']} {sp_track['title']}"
    
    # Search SoundCloud (using existing provider function)
    _, sc_results = soundcloud.search(state, query)
    
    if not sc_results:
        print(f"  ‚ùå No results found on SoundCloud")
        all_match_results.append({
            'spotify_track': sp_track,
            'candidates': []
        })
        continue
    
    # Parse SoundCloud results (they come as tuples)
    sc_tracks_parsed = []
    for sc_id, sc_metadata in sc_results:
        sc_tracks_parsed.append({
            'id': sc_id,
            'title': sc_metadata['title'],
            'artist': sc_metadata['artist'],
            'duration': sc_metadata['duration'],
            'created_at': None,  # Not in search results, would need track fetch
            'isrc': None,  # SoundCloud doesn't expose in search
        })
    
    # Score each candidate
    candidates = []
    for sc_track in sc_tracks_parsed:
        candidate = calculate_confidence_score(sp_track, sc_track)
        candidates.append(candidate)
    
    # Sort by score (best first)
    candidates.sort(key=lambda x: x.confidence_score, reverse=True)
    
    # Keep top 3
    top_candidates = candidates[:MIN_CANDIDATES]
    
    # Show best match
    if top_candidates:
        best = top_candidates[0]
        print(f"  ‚úì Best: {best.soundcloud_artist} - {best.soundcloud_title}")
        print(f"    Score: {best.confidence_score:.3f} (title:{best.title_similarity:.2f} artist:{best.artist_similarity:.2f} dur:{best.duration_match:.1f})")
    
    all_match_results.append({
        'spotify_track': sp_track,
        'candidates': top_candidates
    })
    
    # Small delay to avoid rate limits
    time.sleep(0.3)

print(f"\n‚úì Matching complete!")
print(f"  Total tracks processed: {len(all_match_results)}")
print(f"  Tracks with matches: {sum(1 for r in all_match_results if r['candidates'])}")

## 4. Search SoundCloud and Match Tracks

For each Spotify track, search SoundCloud and score all candidates.

In [None]:
@dataclass
class MatchCandidate:
    """SoundCloud match candidate with score breakdown."""
    soundcloud_id: str
    soundcloud_title: str
    soundcloud_artist: str
    soundcloud_duration: float
    soundcloud_created_at: str
    soundcloud_isrc: Optional[str]
    
    # Score breakdown
    title_similarity: float
    artist_similarity: float
    duration_match: float
    isrc_match: float
    release_date_proximity: float
    
    # Final weighted score
    confidence_score: float


# Configurable weights (tune these!)
WEIGHTS = {
    'title': 0.40,
    'artist': 0.25,
    'duration': 0.20,
    'isrc': 0.10,
    'date': 0.05,
}

def calculate_text_similarity(text1: str, text2: str) -> float:
    """Calculate TF-IDF cosine similarity between two text strings."""
    # Normalize using existing function
    norm1 = normalize_string(text1)
    norm2 = normalize_string(text2)
    
    if not norm1 or not norm2:
        return 0.0
    
    # TF-IDF vectorization
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    try:
        tfidf_matrix = vectorizer.fit_transform([norm1, norm2])
        similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
        return float(similarity)
    except:
        return 0.0


def calculate_confidence_score(
    spotify_track: dict,
    soundcloud_track: dict
) -> MatchCandidate:
    """Multi-factor scoring algorithm."""
    
    # 1. Title similarity (TF-IDF)
    title_sim = calculate_text_similarity(
        spotify_track['title'],
        soundcloud_track['title']
    )
    
    # 2. Artist similarity (TF-IDF)
    artist_sim = calculate_text_similarity(
        spotify_track['artist'],
        soundcloud_track['artist']
    )
    
    # 3. Duration matching (¬±3 seconds tolerance)
    spotify_duration = spotify_track['duration_ms'] / 1000.0
    sc_duration = soundcloud_track['duration']
    duration_diff = abs(spotify_duration - sc_duration)
    
    if duration_diff <= 3:
        duration_score = 1.0
    elif duration_diff <= 10:
        duration_score = 0.5
    else:
        duration_score = 0.0
    
    # 4. ISRC matching (bonus if available)
    spotify_isrc = spotify_track.get('isrc')
    sc_isrc = soundcloud_track.get('isrc')
    
    isrc_score = 1.0 if (spotify_isrc and sc_isrc and spotify_isrc == sc_isrc) else 0.0
    
    # 5. Release date proximity (favor recent uploads)
    date_score = 0.2  # Default low score
    
    if spotify_track.get('release_date') and soundcloud_track.get('created_at'):
        try:
            # Parse dates
            spotify_release = datetime.fromisoformat(spotify_track['release_date'].replace('/', '-'))
            
            # SoundCloud format: "2025/11/21 18:35:47 +0000"
            sc_created_str = soundcloud_track['created_at'].split(' +')[0]
            sc_created = datetime.strptime(sc_created_str, "%Y/%m/%d %H:%M:%S")
            
            days_diff = abs((spotify_release - sc_created).days)
            
            if days_diff <= 30:
                date_score = 1.0
            elif days_diff <= 90:
                date_score = 0.5
            else:
                date_score = 0.2
        except:
            date_score = 0.2
    
    # Weighted final score
    confidence = (
        title_sim * WEIGHTS['title'] +
        artist_sim * WEIGHTS['artist'] +
        duration_score * WEIGHTS['duration'] +
        isrc_score * WEIGHTS['isrc'] +
        date_score * WEIGHTS['date']
    )
    
    return MatchCandidate(
        soundcloud_id=str(soundcloud_track['id']),
        soundcloud_title=soundcloud_track['title'],
        soundcloud_artist=soundcloud_track['artist'],
        soundcloud_duration=sc_duration,
        soundcloud_created_at=soundcloud_track.get('created_at', ''),
        soundcloud_isrc=sc_isrc,
        title_similarity=title_sim,
        artist_similarity=artist_sim,
        duration_match=duration_score,
        isrc_match=isrc_score,
        release_date_proximity=date_score,
        confidence_score=confidence
    )

print("‚úì Scoring algorithm defined")
print(f"  Weights: {WEIGHTS}")
print(f"  Adjust weights above and re-run to tune matching!")

## 3. Define Multi-Factor Scoring Algorithm

This algorithm scores SoundCloud candidates based on:
- **Title similarity** (TF-IDF, 40% weight)
- **Artist similarity** (TF-IDF, 25% weight)  
- **Duration match** (¬±3s = 1.0, ¬±10s = 0.5, 20% weight)
- **Release date proximity** (<30 days = 1.0, 5% weight)
- **ISRC match** (exact = 1.0, 10% weight, bonus if available)

You can tune these weights after seeing initial results!

In [None]:
# Your Release Radar playlist ID - get from Spotify URL
# Example: https://open.spotify.com/playlist/37i9dQZEVXcNL7TjxoGm0I
# The ID is the last part after /playlist/
SPOTIFY_PLAYLIST_ID = "37i9dQZEVXcNL7TjxoGm0I"  # Replace with your playlist ID

# Fetch playlist tracks using Spotify provider
print(f"Fetching Spotify playlist: {SPOTIFY_PLAYLIST_ID}")

# Get access token
token_data = spotify_state.cache.get("token_data")
sp_client = token_data.get("spotipy_client") if token_data else None

if not sp_client:
    raise Exception("Spotify client not initialized")

# Fetch playlist
playlist_data = sp_client.playlist(SPOTIFY_PLAYLIST_ID)
print(f"‚úì Playlist: {playlist_data['name']}")
print(f"  Total tracks: {playlist_data['tracks']['total']}")

# Fetch all tracks with full metadata
spotify_tracks = []
results = sp_client.playlist_tracks(SPOTIFY_PLAYLIST_ID)

while results:
    for item in results['items']:
        if not item['track']:
            continue
            
        track = item['track']
        
        # Extract full metadata
        track_metadata = {
            'id': track['id'],
            'title': track['name'],
            'artist': ', '.join([artist['name'] for artist in track['artists']]),
            'album': track['album']['name'],
            'duration_ms': track['duration_ms'],
            'release_date': track['album'].get('release_date'),
            'isrc': track.get('external_ids', {}).get('isrc'),
            'popularity': track.get('popularity'),
        }
        
        spotify_tracks.append(track_metadata)
    
    # Pagination
    results = sp_client.next(results) if results['next'] else None

print(f"‚úì Fetched {len(spotify_tracks)} tracks with full metadata\n")
print("First 3 tracks:")
for track in spotify_tracks[:3]:
    print(f"  {track['artist']} - {track['title']}")
    print(f"    ISRC: {track['isrc']}, Duration: {track['duration_ms']/1000:.1f}s")

## 2. Fetch Spotify Playlist Tracks

We'll fetch Release Radar (or any Spotify playlist) with full metadata including ISRC, duration, and release dates.

In [None]:
from music_minion.domain.library.providers import spotify
from datetime import datetime, timedelta
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from dataclasses import dataclass
from typing import List, Optional
import csv

# Initialize Spotify provider
spotify_config = ProviderConfig(name="spotify", enabled=True)
spotify_state = spotify.init_provider(spotify_config)

print(f"Spotify authenticated: {spotify_state.authenticated}")

if not spotify_state.authenticated:
    raise Exception("Not authenticated with Spotify. Run in CLI: library auth spotify")

## 1. Setup - Initialize Spotify Provider