In [21]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd

# 1. Read datasets
spotify_df = pd.read_csv("data/spotify_df_cleaned.csv")
recommendationInfo_df = pd.read_csv("data/recommendation_info.csv")

# 2. Inspect the data in DataFrames
print("Number of rows and columns:", spotify_df.shape)
print("Number of rows and columns:", recommendationInfo_df.shape)

print("Column names:", spotify_df.columns.tolist())
print("Column names:", recommendationInfo_df.columns.tolist())

Number of rows and columns: (32828, 21)
Number of rows and columns: (32828, 3)
Column names: ['track_popularity', 'playlist_name', 'playlist_subgenre', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'playlist_genre_edm', 'playlist_genre_latin', 'playlist_genre_pop', 'playlist_genre_r&b', 'playlist_genre_rap', 'playlist_genre_rock']
Column names: ['track_artist', 'track_name', 'track_album_name']


In [22]:
# 3. Fit KNN model
target_column = [
    'track_popularity',
    'key',
    'mode',
    'speechiness',
    'instrumentalness',
    'liveness',
    'tempo',
    'duration_ms',
                 ]

sorted_spotify_df = spotify_df.drop(columns=target_column)

X = spotify_df.values   
knn_model = NearestNeighbors(
    metric='cosine',  
    algorithm='brute', 
    n_neighbors=11 
)

knn_model.fit(X)
print("KNN model fitted successfully.")

KNN model fitted successfully.


In [23]:
def recommend_song(song_list, n_recs=10):
    seed_indices = []

    # Find indices of input songs
    for track_name, artist_name in song_list:
        t = track_name.lower()
        a = artist_name.lower()

        mask = (
            recommendationInfo_df["track_name"].str.lower() == t
        ) & (
            recommendationInfo_df["track_artist"].str.lower() == a
        )

        if mask.sum() > 0:
            idx = recommendationInfo_df[mask].index[0]
            seed_indices.append(idx)
        else:
            print(f"Song not found: {track_name} — {artist_name}")

    if len(seed_indices) == 0:
        return "No valid input songs found in dataset."

    # Create query vector by averaging seed song vectors
    query_vector = X[seed_indices].mean(axis=0).reshape(1, -1)

    # Get nearest neighbors
    distances, neighbors = knn_model.kneighbors(
        query_vector,
        n_neighbors=n_recs + len(seed_indices) + 20  
    )

    neighbors = neighbors[0]

    # Remove seed songs from recommendations
    filtered = [idx for idx in neighbors if idx not in seed_indices]

    # Remove duplicates while preserving order
    unique = []
    seen = set()
    for idx in filtered:
        song_id = (
            recommendationInfo_df.loc[idx, "track_artist"],
            recommendationInfo_df.loc[idx, "track_name"]
        )
        if song_id not in seen:
            seen.add(song_id)
            unique.append(idx)
        if len(unique) == n_recs:
            break

    return recommendationInfo_df.iloc[unique].reset_index(drop=True)


In [24]:
songs = [
    ("Don't You (Forget About Me) - Remastered", "Simple Minds"),
    ("Don't You Want Me", "The Human League"),
    ("The Promise", "When in Rome"),
    ("Take On Me", "a-ha"),
    ("Sweet Dreams (Are Made of This)", "Eurythmics"),
    ("Everybody Wants to Rule the World", "Tears for Fears"),
    ("Hungry Like the Wolf", "Duran Duran")
]

# Print input songs
print("Input Songs:")
for title, artist in songs:
    print(f"- {title} — {artist}")

print("\nRecommended Songs:")
recommended_song_list = recommend_song(songs, 10)
print(recommended_song_list)


Input Songs:
- Don't You (Forget About Me) - Remastered — Simple Minds
- Don't You Want Me — The Human League
- The Promise — When in Rome
- Take On Me — a-ha
- Sweet Dreams (Are Made of This) — Eurythmics
- Everybody Wants to Rule the World — Tears for Fears
- Hungry Like the Wolf — Duran Duran

Recommended Songs:
Song not found: Sweet Dreams (Are Made of This) — Eurythmics
Song not found: Sweet Dreams (Are Made of This) — Eurythmics
Song not found: Hungry Like the Wolf — Duran Duran
Song not found: Hungry Like the Wolf — Duran Duran
  track_artist                                         track_name  \
0    Daft Punk  Get Lucky (feat. Pharrell Williams & Nile Rodg...   
1    Paradisio                              Bailando - Video Edit   
2        Kabah                            La Calle De Las Sirenas   
3     Haddaway                                       What Is Love   
4         TOTO                                      Hold the Line   
5  Rick Astley                            Nev

In [25]:
#Cosine similarity score of the songs vector and recommended_song_list vector
from sklearn.metrics.pairwise import cosine_similarity
def cosine_similarity_score(song_list, recommended_song_list):
    seed_indices = []

    for track_name, artist_name in song_list:
        t = track_name.lower()
        a = artist_name.lower()

        mask = (
            recommendationInfo_df["track_name"].str.lower() == t
        ) & (
            recommendationInfo_df["track_artist"].str.lower() == a
        )

        if mask.sum() > 0:
            idx = recommendationInfo_df[mask].index[0]
            seed_indices.append(idx)

    query_vector = X[seed_indices].mean(axis=0).reshape(1, -1)
    rec_vectors = X[recommended_song_list.index]

    scores = cosine_similarity(query_vector, rec_vectors)

    return scores.flatten()

similarity_scores = cosine_similarity_score(songs, recommended_song_list)
print("\nCosine Similarity Scores:")
print(similarity_scores)


Cosine Similarity Scores:
[0.56595466 0.55715469 0.59460244 0.40029019 0.64759857 0.61708071
 0.45251308 0.54543894 0.30804676 0.66350995]


In [26]:
# Load spotify playlists data
import random
# Handle malformed CSV with robust error handling and quoting
playlists_df = pd.read_csv(
    "data/spotify_playlists.csv", 
    on_bad_lines='skip',
    quoting=1,  # QUOTE_ALL
    encoding='utf-8',
    encoding_errors='ignore',
    nrows=50000
)
print("Playlists data loaded:")
print(f"Total rows: {len(playlists_df)}")
print(f"Column names: {playlists_df.columns.tolist()}")
print(f"\nFirst few rows:\n{playlists_df.head()}")

# Strip quotes from column names if present
playlists_df.columns = playlists_df.columns.str.strip().str.replace('"', '')
print(f"\nCleaned column names: {playlists_df.columns.tolist()}")

if 'playlistname' in playlists_df.columns:
    print(f"\nUnique playlists: {playlists_df['playlistname'].nunique()}")
    print(f"\nSample playlists:\n{playlists_df['playlistname'].value_counts().head()}")
else:
    print("\nNote: 'playlistname' column not found. Check column names above.")

Playlists data loaded:
Total rows: 50000
Column names: ['user_id', ' "artistname"', ' "trackname"', ' "playlistname"']

First few rows:
                            user_id                      "artistname"  \
0  9cc0cfd4d7d7885102480dd99e7a90d6                    Elvis Costello   
1  9cc0cfd4d7d7885102480dd99e7a90d6  Elvis Costello & The Attractions   
2  9cc0cfd4d7d7885102480dd99e7a90d6                      Tiffany Page   
3  9cc0cfd4d7d7885102480dd99e7a90d6  Elvis Costello & The Attractions   
4  9cc0cfd4d7d7885102480dd99e7a90d6                    Elvis Costello   

                                         "trackname"  "playlistname"  
0               (The Angels Wanna Wear My) Red Shoes  HARD ROCK 2010  
1  (What's So Funny 'Bout) Peace, Love And Unders...  HARD ROCK 2010  
2                                   7 Years Too Late  HARD ROCK 2010  
3                              Accidents Will Happen  HARD ROCK 2010  
4                                             Alison  HARD ROCK 2010  

In [27]:
def evaluate_playlist(playlist_name, n_seed=5, n_recs=10):
    """
    Takes a playlist name, randomly selects n_seed songs from it that exist in our dataset,
    gets recommendations, and checks if any recommendations are from the original playlist.
    
    Parameters:
    - playlist_name: name of playlist in spotify_playlists.csv
    - n_seed: number of songs to use as input (default 5)
    - n_recs: number of recommendations to generate (default 10)
    
    Returns:
    - Dictionary with evaluation results
    """
    # Get all songs from the specified playlist
    playlist_songs = playlists_df[playlists_df['playlistname'] == playlist_name].copy()
    
    if len(playlist_songs) == 0:
        return {"error": f"Playlist '{playlist_name}' not found"}
    
    print(f"\n{'='*60}")
    print(f"Evaluating Playlist: {playlist_name}")
    print(f"{'='*60}")
    print(f"Total songs in playlist: {len(playlist_songs)}")
    
    # Find which playlist songs exist in our recommendation dataset
    matched_songs = []
    for idx, row in playlist_songs.iterrows():
        track = str(row['trackname']).lower().strip()
        artist = str(row['artistname']).lower().strip()
        
        mask = (
            recommendationInfo_df["track_name"].str.lower().str.strip() == track
        ) & (
            recommendationInfo_df["track_artist"].str.lower().str.strip() == artist
        )
        
        if mask.sum() > 0:
            dataset_idx = recommendationInfo_df[mask].index[0]
            matched_songs.append({
                'track': row['trackname'],
                'artist': row['artistname'],
                'dataset_idx': dataset_idx
            })
    
    print(f"Songs found in our dataset: {len(matched_songs)}/{len(playlist_songs)}")
    
    if len(matched_songs) < n_seed:
        return {
            "error": f"Not enough songs in dataset. Found {len(matched_songs)}, need at least {n_seed}",
            "matched_count": len(matched_songs),
            "total_count": len(playlist_songs)
        }
    
    # Randomly select n_seed songs as input
    random.seed(42)  # For reproducibility
    seed_songs = random.sample(matched_songs, n_seed)
    remaining_songs = [s for s in matched_songs if s not in seed_songs]
    
    print(f"\n--- Input Songs (randomly selected {n_seed}) ---")
    for i, song in enumerate(seed_songs, 1):
        print(f"{i}. {song['track']} — {song['artist']}")
    
    # Get recommendations using existing recommend_song() function
    seed_list = [(song['track'], song['artist']) for song in seed_songs]
    recs_df = recommend_song(seed_list, n_recs=n_recs)
    
    # Check if recommend_song returned an error
    if isinstance(recs_df, str):
        return {"error": recs_df}
    
    print(f"\n--- Recommendations ({len(recs_df)} songs) ---")
    remaining_indices = {s['dataset_idx'] for s in remaining_songs}
    hits = []
    
    for i, (idx, row) in enumerate(recs_df.iterrows(), 1):
        track = row['track_name']
        artist = row['track_artist']
        in_playlist = "✓ IN PLAYLIST" if idx in remaining_indices else ""
        print(f"{i}. {track} — {artist} {in_playlist}")
        
        if idx in remaining_indices:
            hits.append({'track': track, 'artist': artist, 'rank': i})
    
    # Calculate metrics
    hit_rate = 1 if len(hits) > 0 else 0
    precision = len(hits) / n_recs if n_recs > 0 else 0
    recall = len(hits) / len(remaining_songs) if len(remaining_songs) > 0 else 0
    
    print(f"\n--- Evaluation Results ---")
    print(f"Songs in playlist (in dataset): {len(matched_songs)}")
    print(f"Used as input: {n_seed}")
    print(f"Held-out (ground truth): {len(remaining_songs)}")
    print(f"Recommendations generated: {len(recs_df)}")
    print(f"Hits (songs from playlist recommended): {len(hits)}")
    print(f"Hit Rate (at least 1 hit): {hit_rate}")
    print(f"Precision@{n_recs}: {precision:.4f}")
    print(f"Recall@{n_recs}: {recall:.4f}")
    
    if hits:
        print(f"\nHit songs (from original playlist):")
        for hit in hits:
            print(f"  - Rank {hit['rank']}: {hit['track']} — {hit['artist']}")
    
    return {
        'playlist_name': playlist_name,
        'total_in_playlist': len(playlist_songs),
        'matched_in_dataset': len(matched_songs),
        'seed_count': n_seed,
        'held_out_count': len(remaining_songs),
        'n_recs': n_recs,
        'hits': len(hits),
        'hit_rate': hit_rate,
        'precision': precision,
        'recall': recall,
        'hit_songs': hits
    }

In [40]:
print(evaluate_playlist("Hit's 2014", n_seed=10, n_recs=10))


Evaluating Playlist: Hit's 2014
Total songs in playlist: 54
Songs found in our dataset: 20/54

--- Input Songs (randomly selected 10) ---
1. Hall of Fame — The Script
2. Blurred Lines — Robin Thicke
3. Let Her Go — Passenger
4. I Love It (feat. Charli XCX) - Original Version — Icona Pop
5. Wake Me Up — Avicii
6. Good Feeling — Flo Rida
7. S&M — Rihanna
8. Burn — Ellie Goulding
9. Pompeii — Bastille
10. We Found Love — Rihanna
Songs found in our dataset: 20/54

--- Input Songs (randomly selected 10) ---
1. Hall of Fame — The Script
2. Blurred Lines — Robin Thicke
3. Let Her Go — Passenger
4. I Love It (feat. Charli XCX) - Original Version — Icona Pop
5. Wake Me Up — Avicii
6. Good Feeling — Flo Rida
7. S&M — Rihanna
8. Burn — Ellie Goulding
9. Pompeii — Bastille
10. We Found Love — Rihanna

--- Recommendations (10 songs) ---
1. Found Me (feat. Maria Hazell) — The Him 
2. Who I Am — Nick Jonas & The Administration 
3. If I Can't Have You - Gryffin Remix — Shawn Mendes 
4. Blame It On Wa

In [46]:
print(evaluate_playlist('Liked from Radio', n_seed=65, n_recs=105))


Evaluating Playlist: Liked from Radio
Total songs in playlist: 778
Songs found in our dataset: 133/778

--- Input Songs (randomly selected 65) ---
1. Bitch, Don’t Kill My Vibe — Kendrick Lamar
2. Holocene — Bon Iver
3. Hell Yeah — Rev Theory
4. Dark Horse — Katy Perry
5. Battle Born — Five Finger Death Punch
6. Island In The Sun — Weezer
7. Rather Be (feat. Jess Glynne) — Clean Bandit
8. Stone Rollin' — Raphael Saadiq
9. Max Don´t Have Sex With Your Ex — E-Rotic
10. Rattle - Original Mix — Bingo Players
11. White Room — Cream
12. Heart Of Gold — Neil Young
13. Reptilia — The Strokes
14. Icky Thump — The White Stripes
15. Bad Religion — Godsmack
16. Can I Kick It? — A Tribe Called Quest
17. Cameras — Matt and Kim
18. Brown Eyed Girl — Van Morrison
19. All of Me — John Legend
20. It's You - Ron Basejam Remix — The White Lamp
21. Don't Bring Me Down — Electric Light Orchestra
22. Indestructible — Disturbed
23. Your Love — The Outfield
24. Hip To Be Square — Huey Lewis & The News
25. Monk

In [45]:
print(evaluate_playlist('Not Selena Gomez', n_seed=10, n_recs=100))


Evaluating Playlist: Not Selena Gomez
Total songs in playlist: 135
Songs found in our dataset: 14/135

--- Input Songs (randomly selected 10) ---
1. Sprawl II (Mountains Beyond Mountains) — Arcade Fire
2. Bloodbuzz Ohio — The National
3. Airplanes — Local Natives
4. I'm On Fire — Bruce Springsteen
5. GfC — Albert Hammond, Jr.
6. Should Have Known Better — Sufjan Stevens
7. Clocks — Coldplay
8. Just What I Needed — The Cars
9. Stuck on the puzzle — Alex Turner
10. Yellow — Coldplay

--- Recommendations (100 songs) ---
1. Holocene — Bon Iver 
2. Riding For The Feeling — Bill Callahan 
3. Drunk Drivers/Killer Whales — Car Seat Headrest 
4. Every Time the Sun Comes Up — Sharon Van Etten 
5. Downtown — Majical Cloudz 
6. 22 (OVER S∞∞N) — Bon Iver 
7. You Look Like Rain — Morphine 
8. Motion Sickness — Phoebe Bridgers 
9. Jubilee Street — Nick Cave & The Bad Seeds 
10. I Want to Know What Love Is - 1999 Remaster — Foreigner 
11. Amanda — Boston 
12. Pretty Pimpin — Kurt Vile 
13. Don't Wann

In [None]:
print(evaluate_playlist('I want kisses', n_seed=4, n_recs=100))


Evaluating Playlist: I want kisses
Total songs in playlist: 43
Songs found in our dataset: 6/43

--- Input Songs (randomly selected 4) ---
1. What Is Love — Haddaway
2. Hot N Cold — Katy Perry
3. Madness — Muse
4. Like A Prayer — Madonna

--- Recommendations (100 songs) ---
1. Heaven — Avicii 
2. Boulevard of Broken Dreams — Green Day 
3. Rio - 2009 Remaster — Duran Duran 
4. Red Lights — Tiësto 
5. Elastic Heart — Sia 
6. Total Eclipse Of The Heart (Glee Cast Version) (feat. Jonathan Groff) — Glee Cast 
7. Adventure of a Lifetime — Coldplay 
8. Believe — Cher 
9. All I Ever Need — Austin Mahone 
10. Heaven (featuring Do) — DJ Sammy 
11. Up — Olly Murs 
12. Malibu — Miley Cyrus 
13. Firework — Katy Perry 
14. Paradise — Coldplay 
15. Cool Kids — Echosmith 
16. Juke Box Hero — Foreigner 
17. Thunderstruck — AC/DC 
18. + — Aitana 
19. Big Girls Don't Cry (Personal) — Fergie 
20. Scream & Shout — will.i.am 
21. Pompeii — Bastille 
22. PICK ME — PRODUCE 48 
23. You Make Me — Avicii 
24. C