# SoundCloud Matching Accuracy Test

Test the accuracy of matching SoundCloud tracks to local tracks using metadata similarity.

**Goal**: Verify ~170/200 tracks match correctly before implementing incremental sync.

## 1. Setup & Imports

In [2]:
import sys

sys.path.insert(0, "/home/kevin/coding/music-minion-cli/src")

from music_minion.domain.library.providers import soundcloud
from music_minion.domain.library.provider import ProviderConfig
from music_minion.domain.library.deduplication import (
    normalize_string,
    find_best_matches_tfidf,
    apply_manual_corrections,
)
from music_minion.core import database

import pandas as pd
import time
from typing import List, Dict, Any, Tuple

## 2. Fetch SoundCloud Tracks (First 200)

In [16]:
# Initialize SoundCloud provider
config = ProviderConfig(name="soundcloud", enabled=True)
state = soundcloud.init_provider(config)

print(f"Authenticated: {state.authenticated}")

if not state.authenticated:
    print("\n‚ö†Ô∏è  Token expired or not authenticated")
    print("Re-authenticating...")

    # Try to get token from database and refresh if expired
    db_state = database.load_provider_state("soundcloud")

    if db_state and db_state.get("auth_data"):
        auth_data = db_state["auth_data"]

        # Try to refresh token
        new_token_data = soundcloud._refresh_token(auth_data)

        if new_token_data:
            print("‚úì Token refreshed successfully!")

            # Save to database
            config_data = db_state.get("config_data", {})
            database.save_provider_state("soundcloud", new_token_data, config_data)

            # Reinitialize with new token
            state = soundcloud.init_provider(config)
            print(f"‚úì Re-authenticated: {state.authenticated}")
        else:
            raise Exception("Token refresh failed. Run in CLI: library auth soundcloud")
    else:
        raise Exception(
            "Not authenticated with SoundCloud. Run in CLI: library auth soundcloud"
        )

Authenticated: True


In [17]:
token_data = state.cache.get("token_data")
access_token = token_data["access_token"]

# Single API request for first page
import requests

url = "https://api.soundcloud.com/me/playlists"
headers = {"Authorization": f"OAuth {access_token}"}
params = {
    "limit": 200,
    "linked_partitioning": True,
}

response = requests.get(url, params=params, headers=headers, timeout=30)
response.raise_for_status()
data = response.json()

In [18]:
df = pd.json_normalize(data["collection"])

In [19]:
df.iloc[10].to_dict()

{'duration': 7238869,
 'genre': '',
 'release_day': None,
 'permalink': 'jul-25',
 'permalink_url': 'https://soundcloud.com/kevinbigfoot/sets/jul-25?utm_medium=api&utm_campaign=social_sharing&utm_source=id_318266',
 'release_month': None,
 'release_year': None,
 'description': '',
 'uri': 'https://api.soundcloud.com/playlists/soundcloud:playlists:2047400703',
 'label_name': None,
 'label_id': None,
 'label': None,
 'tag_list': '',
 'track_count': 41,
 'user_id': 15999805,
 'user_urn': 'soundcloud:users:15999805',
 'last_modified': '2025/07/30 01:00:13 +0000',
 'license': 'all-rights-reserved',
 'playlist_type': 'PLAYLIST',
 'type': 'PLAYLIST',
 'id': 2047400703,
 'urn': 'soundcloud:playlists:2047400703',
 'downloadable': None,
 'likes_count': 0,
 'repost_count': 0,
 'sharing': 'public',
 'created_at': '2025/07/06 19:44:04 +0000',
 'release': None,
 'tags': '',
 'kind': 'playlist',
 'title': 'Jul 25',
 'purchase_title': None,
 'ean': None,
 'streamable': True,
 'embeddable_by': 'all',
 

In [10]:
df.iloc[0].to_dict()

{'duration': 3294485,
 'genre': '',
 'release_day': None,
 'permalink': 'long-tracks',
 'permalink_url': 'https://soundcloud.com/kevinbigfoot/sets/long-tracks/s-D9RU9imJ0Ir?utm_medium=api&utm_campaign=social_sharing&utm_source=id_318266',
 'release_month': None,
 'release_year': None,
 'description': None,
 'uri': 'https://api.soundcloud.com/playlists/soundcloud:playlists:2114893679?secret_token=s-D9RU9imJ0Ir',
 'label_name': None,
 'label_id': None,
 'label': None,
 'tag_list': '',
 'track_count': 1,
 'user_id': 15999805,
 'user_urn': 'soundcloud:users:15999805',
 'last_modified': '2025/11/15 21:13:44 +0000',
 'license': 'all-rights-reserved',
 'playlist_type': 'PLAYLIST',
 'type': 'PLAYLIST',
 'id': 2114893679,
 'urn': 'soundcloud:playlists:2114893679',
 'downloadable': None,
 'likes_count': 0,
 'repost_count': 0,
 'sharing': 'private',
 'created_at': '2025/11/15 21:13:44 +0000',
 'release': None,
 'tags': '',
 'kind': 'playlist',
 'title': 'Long Tracks',
 'purchase_title': None,
 'e

In [12]:
# Fetch first 200 tracks directly (one API call)
print("Fetching first 200 SoundCloud tracks...")

token_data = state.cache.get("token_data")
access_token = token_data["access_token"]

# Single API request for first page
import requests

url = "https://api.soundcloud.com/me/likes/tracks"
headers = {"Authorization": f"OAuth {access_token}"}
params = {
    "limit": 200,
    "linked_partitioning": True,
}

response = requests.get(url, params=params, headers=headers, timeout=30)
response.raise_for_status()
data = response.json()

# Parse tracks
sc_tracks = []
if "collection" in data:
    for item in data["collection"]:
        # Filter to only tracks (API may return other kinds)
        if not item or item.get("kind") != "track":
            continue

        track_id = str(item["id"])
        metadata = soundcloud._normalize_soundcloud_track(item)
        sc_tracks.append((track_id, metadata))

# Filter Out First 31
sc_tracks = sc_tracks[31:]

print(f"‚úì Fetched {len(sc_tracks)} tracks from first page")
print(f"\nFirst 3:")
for track_id, metadata in sc_tracks[:3]:
    print(f"  {metadata.get('artist')} - {metadata.get('title')} (ID: {track_id})")

Fetching first 200 SoundCloud tracks...
‚úì Fetched 169 tracks from first page

First 3:
  Wakaan - Capochino - Hypnotic (ID: 2172046062)
  Shroom - Young Miko - Wassup (Shroom x UrBoiN8 Flip) (ID: 2176339092)
  Kompany - Jackpot (Space Wizard Remix) (ID: 2192107907)


In [None]:
df = pd.DataFrame(data["collection"])

print(df.iloc[0])

kind                                                                   track
id                                                                2211704240
urn                                             soundcloud:tracks:2211704240
created_at                                         2025/11/13 07:20:43 +0000
duration                                                              167471
commentable                                                             True
comment_count                                                             14
sharing                                                               public
tag_list                                              drumstep "Drum & Bass"
streamable                                                              True
embeddable_by                                                            all
purchase_url                    https://hypeddit.com/thelivingproof/feelthis
purchase_title                                                          None

## 4. Load Local Tracks (Without SoundCloud ID)

In [13]:
# Get all tracks from database
all_tracks = database.get_all_tracks()

# Filter to only local tracks WITHOUT soundcloud_id
local_tracks = [
    t for t in all_tracks if t.get("local_path") and not t.get("soundcloud_id")
]

print(f"Total tracks in DB: {len(all_tracks)}")
print(f"Local tracks without SoundCloud ID: {len(local_tracks)}")
print(f"\nFirst 3 local tracks:")
for track in local_tracks[:3]:
    print(f"  {track.get('artist')} - {track.get('title')}")

Total tracks in DB: 5415
Local tracks without SoundCloud ID: 5415

First 3 local tracks:
  None - MergeFX Sample Sound 202
  None - MergeFX Sample Sound 203
  None - MergeFX Sample Sound 204


## 5. Run Matching Process

### 5.1 Test Case: BAWDY Track (Should Now Match!)

### 5.2 Full Matching with Scores

In [15]:
# TF-IDF batch matching - MUCH faster!
print("=" * 80)
print("TF-IDF SEARCH-BASED MATCHING")
print("=" * 80)
print(
    f"Matching {len(sc_tracks)} SC tracks against {len(local_tracks)} local tracks..."
)
print()

# Time the matching
start_time = time.time()

# Call TF-IDF matcher (batch operation)
tfidf_results = find_best_matches_tfidf(sc_tracks, local_tracks, min_score=0.70)

elapsed = time.time() - start_time

# Process results
matches_tfidf = []
no_matches_tfidf = []

for sc_id, best_match, score in tfidf_results:
    sc_metadata = next(meta for tid, meta in sc_tracks if tid == sc_id)

    if best_match:
        matches_tfidf.append(
            {
                "sc_id": sc_id,
                "sc_title": sc_metadata.get("title"),
                "sc_artist": sc_metadata.get("artist"),
                "local_id": best_match["id"],
                "local_title": best_match.get("title"),
                "local_artist": best_match.get("artist"),
                "score": score,
            }
        )
    else:
        no_matches_tfidf.append(
            {
                "sc_id": sc_id,
                "title": sc_metadata.get("title"),
                "artist": sc_metadata.get("artist"),
                "genre": sc_metadata.get("genre"),
                "best_score": score,
            }
        )

print(f"‚úì Matching complete in {elapsed:.2f} seconds!")
print()
print(f"  Matched (>= 0.70): {len(matches_tfidf)}")
print(f"  No match (< 0.70):  {len(no_matches_tfidf)}")
print()

if matches_tfidf:
    scores = [m["score"] for m in matches_tfidf]
    print(f"Score distribution:")
    print(f"  Average:  {sum(scores) / len(scores):.3f}")
    print(f"  Min:      {min(scores):.3f}")
    print(f"  Max:      {max(scores):.3f}")
    print()
    print(f"  High (0.9-1.0):      {sum(1 for s in scores if s >= 0.9)} tracks")
    print(f"  Good (0.8-0.89):     {sum(1 for s in scores if 0.8 <= s < 0.9)} tracks")
    print(f"  Moderate (0.7-0.79): {sum(1 for s in scores if 0.7 <= s < 0.8)} tracks")

TF-IDF SEARCH-BASED MATCHING
Matching 169 SC tracks against 5415 local tracks...

‚úì Matching complete in 0.31 seconds!

  Matched (>= 0.70): 149
  No match (< 0.70):  20

Score distribution:
  Average:  0.923
  Min:      0.702
  Max:      0.995

  High (0.9-1.0):      113 tracks
  Good (0.8-0.89):     27 tracks
  Moderate (0.7-0.79): 9 tracks


# Apply manual corrections if CSV has been edited
corrections_file = '/home/kevin/coding/music-minion-cli/soundcloud_matches_review.csv'

# Convert matches_tfidf list to dict format for correction function
matches_dict = matches_tfidf.copy()

# Apply corrections
corrected_matches = apply_manual_corrections(matches_dict, corrections_file)

# Count changes
corrected_count = sum(1 for m in corrected_matches if m.get('corrected', False))
removed_count = len(matches_dict) - len(corrected_matches)

if corrected_count > 0 or removed_count > 0:
    print(f"\nüìä Correction summary:")
    if corrected_count > 0:
        print(f"  ‚úì {corrected_count} matches corrected (replaced with correct track ID)")
    if removed_count > 0:
        print(f"  ‚úó {removed_count} matches removed (marked as 'None' - no valid match)")
    
    print(f"\n  Total matches: {len(matches_dict)} ‚Üí {len(corrected_matches)}")
    
    if corrected_count > 0:
        print("\nCorrected matches:")
        for m in corrected_matches:
            if m.get('corrected'):
                print(f"  {m['sc_artist']} - {m['sc_title']}")
                print(f"    ‚Üí {m['local_artist']} - {m['local_title']} (ID: {m['local_id']})")
else:
    print("\n‚ÑπÔ∏è  No corrections applied (CSV not edited or no correct_id values filled)")

In [None]:
# Apply manual corrections if CSV has been edited
corrections_file = "/home/kevin/coding/music-minion-cli/soundcloud_matches_review.csv"

# Convert matches_tfidf list to dict format for correction function
matches_dict = matches_tfidf.copy()

# Apply corrections
corrected_matches = apply_manual_corrections(matches_dict, corrections_file)

# Count how many were corrected
corrected_count = sum(1 for m in corrected_matches if m.get("corrected", False))

if corrected_count > 0:
    print(f"\n‚úì {corrected_count} matches were corrected based on CSV")
    print("\nCorrected matches:")
    for m in corrected_matches:
        if m.get("corrected"):
            print(f"  {m['sc_artist']} - {m['sc_title']}")
            print(
                f"    ‚Üí {m['local_artist']} - {m['local_title']} (ID: {m['local_id']})"
            )
else:
    print("\n‚ÑπÔ∏è  No corrections applied (CSV not edited or no correct_id values filled)")

In [None]:
# Export matches to CSV for manual review and correction
df_review = pd.DataFrame(matches_tfidf)

# Add empty column for manual corrections
df_review["correct_id"] = ""
df_review["notes"] = ""

# Sort by score (lowest first) - these need the most attention
df_review_sorted = df_review.sort_values("score")

# Save to CSV
output_file = "/home/kevin/coding/music-minion-cli/soundcloud_matches_review.csv"
df_review_sorted.to_csv(output_file, index=False)

print(f"‚úì Exported {len(df_review)} matches to:")
print(f"  {output_file}")
print()
print("Review instructions:")
print("1. Open CSV and sort by 'score' (lowest first)")
print("2. For incorrect matches, look up the correct track ID")
print("3. Fill in 'correct_id' column with the right ID")
print("4. Add notes explaining why it was wrong (optional)")
print("5. Save and re-run notebook to apply corrections")
print()
print(f"Bottom 10 matches (need review):")
print()
display(
    df_review_sorted[
        ["sc_artist", "sc_title", "local_artist", "local_title", "score", "correct_id"]
    ].head(10)
)

In [None]:
# Check if BAWDY track matched
bawdy_matches = [m for m in matches_tfidf if m["sc_artist"] == "BAWDY"]

print("=" * 80)
print("BAWDY TRACK TEST - TF-IDF Results")
print("=" * 80)
print()

if bawdy_matches:
    print(f"‚úÖ Found {len(bawdy_matches)} BAWDY match(es)!")
    print()
    for match in bawdy_matches:
        print(f"SoundCloud: {match['sc_artist']} - {match['sc_title']}")
        print(f"Local:      {match['local_artist']} - {match['local_title']}")
        print(f"Score:      {match['score']:.3f}")
        print()
else:
    print("‚ùå BAWDY track did NOT match")
    print()
    # Show what it got instead
    bawdy_in_no_matches = [nm for nm in no_matches_tfidf if nm["artist"] == "BAWDY"]
    if bawdy_in_no_matches:
        for nm in bawdy_in_no_matches:
            print(f"  {nm['artist']} - {nm['title']}")
            print(
                f"  Best score: {nm.get('best_score', 0.0):.3f} (below 0.70 threshold)"
            )

In [None]:
track

('2211704240', {'title': 'FEEL THIS', 'artist': 'The Living Proof', 'genre': 'Dubstep', 'duration': 167.471})


In [53]:
df = pd.DataFrame(local_tracks)
display(df.head(1))

display(df[df.title.str.contains("MY NECK MY BACK")])

Unnamed: 0,id,file_path,title,artist,album,genre,year,duration,key_signature,bpm,...,file_mtime,last_synced_at,remix_artist,local_path,soundcloud_id,spotify_id,youtube_id,soundcloud_synced_at,spotify_synced_at,youtube_synced_at
0,6039,/home/kevin/Music/PioneerDJ/Sampler/MERGE FX/M...,MergeFX Sample Sound 202,,,,,1.838073,,,...,1602181000.0,2025-11-19 01:22:49,,/home/kevin/Music/PioneerDJ/Sampler/MERGE FX/M...,,,,,,


Unnamed: 0,id,file_path,title,artist,album,genre,year,duration,key_signature,bpm,...,file_mtime,last_synced_at,remix_artist,local_path,soundcloud_id,spotify_id,youtube_id,soundcloud_synced_at,spotify_synced_at,youtube_synced_at
321,3457,/home/kevin/Music/EDM/2020/Aug 20/MY NECK MY B...,MY NECK MY BACK - (BAD TASTES TOO WOOK TO WALK...,BADTASTES OFFICIAL,Aug 20,Trap,2018.0,158.9792,G#m,73.0,...,1751567000.0,2025-11-19 01:22:49,,/home/kevin/Music/EDM/2020/Aug 20/MY NECK MY B...,,,,,,
333,5896,/home/kevin/Music/EDM/2025/Sept 25/MY NECK MY ...,MY NECK MY BACK (BAWDY Flip),BAWDY,Sept 25,Trap,2025.0,138.087506,Cm,100.0,...,1759638000.0,2025-11-19 01:22:49,BAWDY,/home/kevin/Music/EDM/2025/Sept 25/MY NECK MY ...,,,,,,


In [None]:
df = pd.DataFrame([t[1] for t in sc_tracks])
df.insert(0, "id", [t[0] for t in sc_tracks])
display(df[df.artist == "BAWDY"].tail(1))

Unnamed: 0,id,title,artist,genre,duration
196,2091674085,MY NECK MY BACK (BAWDY Flip) (FREE DL),BAWDY,100bpm,138.136


In [64]:
print(len(sc_tracks))

200


In [None]:
all_playlists.extend(data)
# Find target playlists
target_names = ["Nov 25", "Oct 25", "Sept 25"]
target_playlists = {}

for playlist in all_playlists:
    name = playlist.get("title", "")

    if name in target_names:
        target_playlists[name] = {
            "id": str(playlist["id"]),
            "track_count": playlist.get("track_count", 0),
            "tracks": playlist.get("tracks", []),
        }

print(f"\nFound target playlists:")
for name in target_names:
    if name in target_playlists:
        pl = target_playlists[name]
        print(f"  ‚úì {name}: {pl['track_count']} tracks (ID: {pl['id']})")
    else:
        print(f"  ‚úó {name}: Not found")


Found target playlists:
  ‚úì Nov 25: 31 tracks (ID: 2107077439)
  ‚úì Oct 25: 71 tracks (ID: 2089957919)
  ‚úì Sept 25: 154 tracks (ID: 2076808602)


### Compare: Likes NOT in Playlists

## 5. Results Analysis

### 5.1 Matched Tracks Table (Top 20)

### 5.2 Borderline Matches (0.8-0.85) - Needs Manual Verification

In [None]:
if matches:
    df_borderline = df_matches[
        (df_matches["score"] >= 0.8) & (df_matches["score"] < 0.85)
    ]

    print(f"Borderline matches (0.8-0.85): {len(df_borderline)} tracks")
    print("These should be manually verified:")
    print()

    if len(df_borderline) > 0:
        display(
            df_borderline[
                ["sc_artist", "sc_title", "local_artist", "local_title", "score"]
            ]
        )
    else:
        print("‚úì No borderline matches - all matches are high confidence!")

### 5.3 Unmatched Tracks (Genuinely New)

In [50]:
if no_matches:
    df_no_matches = pd.DataFrame(no_matches)

    print(f"Unmatched tracks: {len(df_no_matches)}")
    print("These are genuinely new tracks not in local library:")
    print()
    display(df_no_matches[["artist", "title", "genre"]].tail(30))
else:
    print("All SoundCloud tracks matched to local tracks!")

Unmatched tracks: 132
These are genuinely new tracks not in local library:



Unnamed: 0,artist,title,genre
102,slugzmusic,SLUGZ X PANTHER - SHELLSHOCK,Dubstep
103,Know Good,Paramore - Decode (Know Good Flip),
104,Seth David,Zeds Dead & Flux Pavilion - WAVES (SETH DAVID ...,Dubstep
105,DOMEOFDOOM,Jack Blom - Loud N Clear,Electronic
106,Outset,Au5 & Tasha Baxter - Snowblind (Outset Remix),Dubstep
107,$LUTCHK,$PIN 4 ME ($LUTCHK EDIT) (Spins FLO x Real 4 m...,Dubstep
108,Whethan,"Disco Lines, Tinashe - No Broke Boys (Whethan ...",Dubstep
109,Phrva,deadmau5 & Kaskade - I Remember (Phrva Flip),Dubstep
110,Jad≈´ Dala,Untitld - Fight Song (JAD≈™015),JAD≈™
111,NIGHTMODE,WINK & nikko - MERCY,Dance & EDM


### 5.4 Score Distribution Histogram

In [None]:
if matches:
    import matplotlib.pyplot as plt

    scores = [m["score"] for m in matches]

    plt.figure(figsize=(10, 6))
    plt.hist(scores, bins=20, edgecolor="black", alpha=0.7)
    plt.axvline(0.8, color="red", linestyle="--", label="Min threshold (0.8)")
    plt.xlabel("Similarity Score")
    plt.ylabel("Number of Tracks")
    plt.title("Distribution of Match Similarity Scores")
    plt.legend()
    plt.grid(alpha=0.3)
    plt.show()

In [None]:
# Check a few random matches to see the normalized strings
import random

if matches and len(matches) >= 3:
    print("Sample normalized comparisons:")
    print("=" * 80)

    samples = random.sample(matches, min(3, len(matches)))

    for i, match in enumerate(samples, 1):
        print(f"\nMatch {i} (score: {match['score']:.3f})")
        print(f"  SoundCloud: {match['sc_artist']} - {match['sc_title']}")
        print(f"  Local:      {match['local_artist']} - {match['local_title']}")
        print(f"  Normalized SC title:  '{normalize_string(match['sc_title'])}'")
        print(f"  Normalized LC title:  '{normalize_string(match['local_title'])}'")
        print(f"  Normalized SC artist: '{normalize_string(match['sc_artist'])}'")
        print(f"  Normalized LC artist: '{normalize_string(match['local_artist'])}'")