In [122]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd

# 1. Read datasets
spotify_df = pd.read_csv("data/spotify_df_cleaned.csv")
recommendationInfo_df = pd.read_csv("data/recommendation_info.csv")

# 2. Inspect the data in DataFrames
print("Number of rows and columns:", spotify_df.shape)
print("Number of rows and columns:", recommendationInfo_df.shape)

print("Column names:", spotify_df.columns.tolist())
print("Column names:", recommendationInfo_df.columns.tolist())

Number of rows and columns: (32828, 21)
Number of rows and columns: (32828, 3)
Column names: ['track_popularity', 'playlist_name', 'playlist_subgenre', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'playlist_genre_edm', 'playlist_genre_latin', 'playlist_genre_pop', 'playlist_genre_r&b', 'playlist_genre_rap', 'playlist_genre_rock']
Column names: ['track_artist', 'track_name', 'track_album_name']


In [123]:
# 3. Fit KNN model
target_column = [
    'track_popularity',
    'key',
    'mode',
    'speechiness',
    'instrumentalness',
    'liveness',
    'tempo',
    'duration_ms',
                 ]

sorted_spotify_df = spotify_df.drop(columns=target_column)

X = spotify_df.values   
knn_model = NearestNeighbors(
    metric='cosine',  
    algorithm='brute', 
    n_neighbors=11 
)

knn_model.fit(X)
print("KNN model fitted successfully.")

KNN model fitted successfully.


In [124]:
def recommend_song(song_list, n_recs=10):
    seed_indices = []

    # Find indices of input songs
    for track_name, artist_name in song_list:
        t = track_name.lower()
        a = artist_name.lower()

        mask = (
            recommendationInfo_df["track_name"].str.lower() == t
        ) & (
            recommendationInfo_df["track_artist"].str.lower() == a
        )

        if mask.sum() > 0:
            idx = recommendationInfo_df[mask].index[0]
            seed_indices.append(idx)
        else:
            print(f"Song not found: {track_name} — {artist_name}")

    if len(seed_indices) == 0:
        return "No valid input songs found in dataset."

    # Create query vector by averaging seed song vectors
    query_vector = X[seed_indices].mean(axis=0).reshape(1, -1)

    # Get nearest neighbors
    distances, neighbors = knn_model.kneighbors(
        query_vector,
        n_neighbors=n_recs + len(seed_indices) + 20  
    )

    neighbors = neighbors[0]

    # Remove seed songs from recommendations
    filtered = [idx for idx in neighbors if idx not in seed_indices]

    # Remove duplicates while preserving order
    unique = []
    seen = set()
    for idx in filtered:
        song_id = (
            recommendationInfo_df.loc[idx, "track_artist"],
            recommendationInfo_df.loc[idx, "track_name"]
        )
        if song_id not in seen:
            seen.add(song_id)
            unique.append(idx)
        if len(unique) == n_recs:
            break

    return recommendationInfo_df.iloc[unique].reset_index(drop=True)


In [125]:
songs = [
    ("Don't You (Forget About Me) - Remastered", "Simple Minds"),
    ("Don't You Want Me", "The Human League"),
    ("The Promise", "When in Rome"),
    ("Take On Me", "a-ha"),
    ("Sweet Dreams (Are Made of This)", "Eurythmics"),
    ("Everybody Wants to Rule the World", "Tears for Fears"),
    ("Hungry Like the Wolf", "Duran Duran")
]

# Print input songs
print("Input Songs:")
for title, artist in songs:
    print(f"- {title} — {artist}")

print("\nRecommended Songs:")
recommended_song_list = recommend_song(songs, 10)
print(recommended_song_list)


Input Songs:
- Don't You (Forget About Me) - Remastered — Simple Minds
- Don't You Want Me — The Human League
- The Promise — When in Rome
- Take On Me — a-ha
- Sweet Dreams (Are Made of This) — Eurythmics
- Everybody Wants to Rule the World — Tears for Fears
- Hungry Like the Wolf — Duran Duran

Recommended Songs:
Song not found: Sweet Dreams (Are Made of This) — Eurythmics
Song not found: Hungry Like the Wolf — Duran Duran
  track_artist                                         track_name  \
0    Daft Punk  Get Lucky (feat. Pharrell Williams & Nile Rodg...   
1    Paradisio                              Bailando - Video Edit   
2        Kabah                            La Calle De Las Sirenas   
3     Haddaway                                       What Is Love   
4         TOTO                                      Hold the Line   
5  Rick Astley                            Never Gonna Give You Up   
6    Whigfield                          Think of You - Radio Edit   
7         a-ha     

In [126]:
#Cosine similarity score of the songs vector and recommended_song_list vector
from sklearn.metrics.pairwise import cosine_similarity
def cosine_similarity_score(song_list, recommended_song_list):
    seed_indices = []

    for track_name, artist_name in song_list:
        t = track_name.lower()
        a = artist_name.lower()

        mask = (
            recommendationInfo_df["track_name"].str.lower() == t
        ) & (
            recommendationInfo_df["track_artist"].str.lower() == a
        )

        if mask.sum() > 0:
            idx = recommendationInfo_df[mask].index[0]
            seed_indices.append(idx)

    query_vector = X[seed_indices].mean(axis=0).reshape(1, -1)
    rec_vectors = X[recommended_song_list.index]

    scores = cosine_similarity(query_vector, rec_vectors)

    return scores.flatten()

similarity_scores = cosine_similarity_score(songs, recommended_song_list)
print("\nCosine Similarity Scores:")
print(similarity_scores)


Cosine Similarity Scores:
[0.56595466 0.55715469 0.59460244 0.40029019 0.64759857 0.61708071
 0.45251308 0.54543894 0.30804676 0.66350995]


In [127]:
# Load spotify playlists data
import random
# Handle malformed CSV with robust error handling and quoting
playlists_df = pd.read_csv(
    "data/spotify_playlists.csv", 
    on_bad_lines='skip',
    quoting=1,  # QUOTE_ALL
    encoding='utf-8',
    encoding_errors='ignore',
    nrows=120000
)
print("Playlists data loaded:")
print(f"Total rows: {len(playlists_df)}")
print(f"Column names: {playlists_df.columns.tolist()}")
print(f"\nFirst few rows:\n{playlists_df.head()}")

# Strip quotes from column names if present
playlists_df.columns = playlists_df.columns.str.strip().str.replace('"', '')
print(f"\nCleaned column names: {playlists_df.columns.tolist()}")

if 'playlistname' in playlists_df.columns:
    print(f"\nUnique playlists: {playlists_df['playlistname'].nunique()}")
    print(f"\nSample playlists:\n{playlists_df['playlistname'].value_counts().head()}")
else:
    print("\nNote: 'playlistname' column not found. Check column names above.")

Playlists data loaded:
Total rows: 120000
Column names: ['user_id', ' "artistname"', ' "trackname"', ' "playlistname"']

First few rows:
                            user_id                      "artistname"  \
0  9cc0cfd4d7d7885102480dd99e7a90d6                    Elvis Costello   
1  9cc0cfd4d7d7885102480dd99e7a90d6  Elvis Costello & The Attractions   
2  9cc0cfd4d7d7885102480dd99e7a90d6                      Tiffany Page   
3  9cc0cfd4d7d7885102480dd99e7a90d6  Elvis Costello & The Attractions   
4  9cc0cfd4d7d7885102480dd99e7a90d6                    Elvis Costello   

                                         "trackname"  "playlistname"  
0               (The Angels Wanna Wear My) Red Shoes  HARD ROCK 2010  
1  (What's So Funny 'Bout) Peace, Love And Unders...  HARD ROCK 2010  
2                                   7 Years Too Late  HARD ROCK 2010  
3                              Accidents Will Happen  HARD ROCK 2010  
4                                             Alison  HARD ROCK 2010 

In [128]:
def evaluate_playlist(playlist_name, n_seed=5, n_recs=10):
    """
    Takes a playlist name, randomly selects n_seed songs from it that exist in our dataset,
    gets recommendations, and checks if any recommendations are from the original playlist.
    
    Parameters:
    - playlist_name: name of playlist in spotify_playlists.csv
    - n_seed: number of songs to use as input (default 5)
    - n_recs: number of recommendations to generate (default 10)
    
    Returns:
    - Dictionary with evaluation results
    """
    # Get all songs from the specified playlist
    playlist_songs = playlists_df[playlists_df['playlistname'] == playlist_name].copy()
    
    if len(playlist_songs) == 0:
        return {"error": f"Playlist '{playlist_name}' not found"}
    
    print(f"\n{'='*60}")
    print(f"Evaluating Playlist: {playlist_name}")
    print(f"{'='*60}")
    print(f"Total songs in playlist: {len(playlist_songs)}")
    
    # Find which playlist songs exist in our recommendation dataset
    matched_songs = []
    for idx, row in playlist_songs.iterrows():
        track = str(row['trackname']).lower().strip()
        artist = str(row['artistname']).lower().strip()
        
        mask = (
            recommendationInfo_df["track_name"].str.lower().str.strip() == track
        ) & (
            recommendationInfo_df["track_artist"].str.lower().str.strip() == artist
        )
        
        if mask.sum() > 0:
            dataset_idx = recommendationInfo_df[mask].index[0]
            matched_songs.append({
                'track': row['trackname'],
                'artist': row['artistname'],
                'dataset_idx': dataset_idx
            })
    
    print(f"Songs found in our dataset: {len(matched_songs)}/{len(playlist_songs)}")
    
    if len(matched_songs) < n_seed:
        return {
            "error": f"Not enough songs in dataset. Found {len(matched_songs)}, need at least {n_seed}",
            "matched_count": len(matched_songs),
            "total_count": len(playlist_songs)
        }
    
    # Randomly select n_seed songs as input
    random.seed(42)  # For reproducibility
    seed_songs = random.sample(matched_songs, n_seed)
    remaining_songs = [s for s in matched_songs if s not in seed_songs]
    
    print(f"\n--- Input Songs (randomly selected {n_seed}) ---")
    for i, song in enumerate(seed_songs, 1):
        print(f"{i}. {song['track']} — {song['artist']}")
    
    # Get recommendations using existing recommend_song() function
    seed_list = [(song['track'], song['artist']) for song in seed_songs]
    recs_df = recommend_song(seed_list, n_recs=n_recs)
    
    # Check if recommend_song returned an error
    if isinstance(recs_df, str):
        return {"error": recs_df}
    
    print(f"\n--- Recommendations ({len(recs_df)} songs) ---")
    remaining_indices = {s['dataset_idx'] for s in remaining_songs}
    hits = []
    
    # Get the original indices from recommendationInfo_df
    for i in range(len(recs_df)):
        row = recs_df.iloc[i]
        track = row['track_name']
        artist = row['track_artist']
        
        # Find the original index in recommendationInfo_df
        mask = (
            recommendationInfo_df["track_name"].str.lower().str.strip() == track.lower().strip()
        ) & (
            recommendationInfo_df["track_artist"].str.lower().str.strip() == artist.lower().strip()
        )
        
        if mask.sum() > 0:
            original_idx = recommendationInfo_df[mask].index[0]
            in_playlist = "✓ IN PLAYLIST" if original_idx in remaining_indices else ""
            print(f"{i+1}. {track} — {artist} {in_playlist}")
            
            if original_idx in remaining_indices:
                hits.append({'track': track, 'artist': artist, 'rank': i+1})
        else:
            print(f"{i+1}. {track} — {artist}")
    
    # Calculate metrics
    hit_rate = 1 if len(hits) > 0 else 0
    precision = len(hits) / n_recs if n_recs > 0 else 0
    recall = len(hits) / len(remaining_songs) if len(remaining_songs) > 0 else 0
    
    print(f"\n--- Evaluation Results ---")
    print(f"Songs in playlist (in dataset): {len(matched_songs)}")
    print(f"Used as input: {n_seed}")
    print(f"Held-out (ground truth): {len(remaining_songs)}")
    print(f"Recommendations generated: {len(recs_df)}")
    print(f"Hits (songs from playlist recommended): {len(hits)}")
    print(f"Hit Rate (at least 1 hit): {hit_rate}")
    print(f"Precision@{n_recs}: {precision:.4f}")
    print(f"Recall@{n_recs}: {recall:.4f}")
    
    if hits:
        print(f"\nHit songs (from original playlist):")
        for hit in hits:
            print(f"  - Rank {hit['rank']}: {hit['track']} — {hit['artist']}")
    
    return {
        'playlist_name': playlist_name,
        'total_in_playlist': len(playlist_songs),
        'matched_in_dataset': len(matched_songs),
        'seed_count': n_seed,
        'held_out_count': len(remaining_songs),
        'n_recs': n_recs,
        'hits': len(hits),
        'hit_rate': hit_rate,
        'precision': precision,
        'recall': recall,
        'hit_songs': hits
    }

In [129]:
print(evaluate_playlist('Liked from Radio', n_seed=20, n_recs=100))


Evaluating Playlist: Liked from Radio
Total songs in playlist: 1999
Songs found in our dataset: 356/1999

--- Input Songs (randomly selected 20) ---
1. Coffee And TV — Blur
2. Battle Born — Five Finger Death Punch
3. Rill Rill — Sleigh Bells
4. More Than Words — Extreme
5. Manic Monday — The Bangles
6. White Room — Cream
7. Hip To Be Square — Huey Lewis & The News
8. Africa — Toto
9. Lovefool — The Cardigans
10. All Along The Watchtower — Jimi Hendrix
11. Nothing Else Matters — Metallica
12. So Good at Being in Trouble — Unknown Mortal Orchestra
13. P.D.A. (We Just Don't Care) — John Legend
14. You Don't Know How It Feels — Tom Petty
15. Ways To Go — Grouplove
16. Say It Ain't So — Weezer
17. This Is Gonna Hurt — Sixx:A.M.
18. Blue (Da Ba Dee) — Eiffel 65
19. Last Resort — Papa Roach
20. More Than a Feeling — Boston
Songs found in our dataset: 356/1999

--- Input Songs (randomly selected 20) ---
1. Coffee And TV — Blur
2. Battle Born — Five Finger Death Punch
3. Rill Rill — Sleigh Bel

In [130]:
print(evaluate_playlist('xfm top 1000', n_seed=20, n_recs=50))


Evaluating Playlist: xfm top 1000
Total songs in playlist: 869
Songs found in our dataset: 199/869

--- Input Songs (randomly selected 20) ---
1. The Pretender — Foo Fighters
2. Club Foot — Kasabian
3. All The Small Things — blink-182
4. When You Were Young — The Killers
5. I Still Haven't Found What I'm Looking For — U2
6. Heads Will Roll — Yeah Yeah Yeahs
7. God Save The Queen — Sex Pistols
8. Don't Look Back Into The Sun — The Libertines
9. When The Sun Goes Down — Arctic Monkeys
10. Clint Eastwood — Gorillaz
11. Under The Bridge — Red Hot Chili Peppers
12. Somewhere Only We Know — Keane
13. Buddy Holly — Weezer
14. Sweet Child O' Mine — Guns N' Roses
15. Munich — Editors
16. An End Has A Start — Editors
17. American Idiot — Green Day
18. Cannonball — Damien Rice
19. Girls And Boys — Blur
20. Grounds For Divorce — Elbow
Songs found in our dataset: 199/869

--- Input Songs (randomly selected 20) ---
1. The Pretender — Foo Fighters
2. Club Foot — Kasabian
3. All The Small Things — bl

In [131]:
print(evaluate_playlist('ZLX', n_seed=20, n_recs=50))


Evaluating Playlist: ZLX
Total songs in playlist: 239
Songs found in our dataset: 120/239

--- Input Songs (randomly selected 20) ---
1. Rocky Mountain Way — Joe Walsh
2. Break On Through [To The Other Side] - New Stereo Mix Advanced Resolution — The Doors
3. American Girl — Tom Petty And The Heartbreakers
4. Sunshine Of Your Love — Cream
5. Fool In The Rain — Led Zeppelin
6. Eminence Front — The Who
7. Dream On — Aerosmith
8. Can't You See — The Marshall Tucker Band
9. Born to Run — Bruce Springsteen
10. Show Me The Way - Live — Peter Frampton
11. Won't Get Fooled Again - Original Album Version — The Who
12. Peace of Mind — Boston
13. Black Magic Woman — Santana
14. Radar Love — Golden Earring
15. Listen To The Music — The Doobie Brothers
16. Another Brick In The Wall, Pt. 2 - 2011 Remastered Version — Pink Floyd
17. Don't Bring Me Down — Electric Light Orchestra
18. Dude (Looks Like A Lady) — Aerosmith
19. More Than a Feeling — Boston
20. Rebel Yell - 1999 - Remaster — Billy Idol

-

In [132]:
print(evaluate_playlist('Smiths inspired', 2, 50))


Evaluating Playlist: Smiths inspired
Total songs in playlist: 30
Songs found in our dataset: 13/30

--- Input Songs (randomly selected 2) ---
1. Town Called Malice — The Jam
2. Evil — Interpol

--- Recommendations (50 songs) ---
1. Happy When It Rains — The Jesus and Mary Chain 
2. April Skies — The Jesus and Mary Chain 
3. Sick As Our Secrets — Makes My Blood Dance 
4. Absolutely (Story of a Girl) - Radio Mix — Nine Days 
5. Television — The Academic 
6. Where Are You Now — Tiny Fighter 
7. Obstacle 1 — Interpol 
8. Transmission - 2010 Remaster — Joy Division 
9. The Last of the Famous International Playboys - 2010 Remaster — Morrissey 
Songs found in our dataset: 13/30

--- Input Songs (randomly selected 2) ---
1. Town Called Malice — The Jam
2. Evil — Interpol

--- Recommendations (50 songs) ---
1. Happy When It Rains — The Jesus and Mary Chain 
2. April Skies — The Jesus and Mary Chain 
3. Sick As Our Secrets — Makes My Blood Dance 
4. Absolutely (Story of a Girl) - Radio Mix — Ni