In [30]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd

# 1. Read datasets
spotify_df = pd.read_csv("data/spotify_df_cleaned.csv")
recommendationInfo_df = pd.read_csv("data/recommendation_info.csv")

# 2. Inspect the data in DataFrames
print("Number of rows and columns:", spotify_df.shape)
print("Number of rows and columns:", recommendationInfo_df.shape)

print("Column names:", spotify_df.columns.tolist())
print("Column names:", recommendationInfo_df.columns.tolist())

Number of rows and columns: (32828, 21)
Number of rows and columns: (32828, 3)
Column names: ['track_popularity', 'playlist_name', 'playlist_subgenre', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'playlist_genre_edm', 'playlist_genre_latin', 'playlist_genre_pop', 'playlist_genre_r&b', 'playlist_genre_rap', 'playlist_genre_rock']
Column names: ['track_artist', 'track_name', 'track_album_name']


In [None]:
# 3. Fit KNN model
X = spotify_df.values   
knn_model = NearestNeighbors(
    metric='cosine',  
    algorithm='brute', 
    n_neighbors=11 
)

knn_model.fit(X)
print("KNN model fitted successfully.")

KNN model fitted successfully.


In [32]:
def recommend_song(song_list, n_recs=10):
    seed_indices = []

    # Find indices of input songs
    for track_name, artist_name in song_list:
        t = track_name.lower()
        a = artist_name.lower()

        mask = (
            recommendationInfo_df["track_name"].str.lower() == t
        ) & (
            recommendationInfo_df["track_artist"].str.lower() == a
        )

        if mask.sum() > 0:
            idx = recommendationInfo_df[mask].index[0]
            seed_indices.append(idx)
        else:
            print(f"Song not found: {track_name} — {artist_name}")

    if len(seed_indices) == 0:
        return "No valid input songs found in dataset."

    # Create query vector by averaging seed song vectors
    query_vector = X[seed_indices].mean(axis=0).reshape(1, -1)

    # Get nearest neighbors
    distances, neighbors = knn_model.kneighbors(
        query_vector,
        n_neighbors=n_recs + len(seed_indices) + 20  
    )

    neighbors = neighbors[0]

    # Remove seed songs from recommendations
    filtered = [idx for idx in neighbors if idx not in seed_indices]

    # Remove duplicates while preserving order
    unique = []
    seen = set()
    for idx in filtered:
        song_id = (
            recommendationInfo_df.loc[idx, "track_artist"],
            recommendationInfo_df.loc[idx, "track_name"]
        )
        if song_id not in seen:
            seen.add(song_id)
            unique.append(idx)
        if len(unique) == n_recs:
            break

    return recommendationInfo_df.iloc[unique].reset_index(drop=True)


In [37]:
songs = [
    ("Blinding Lights", "The Weeknd"),
    ("Watermelon Sugar", "Harry Styles"),
    ("New Rules", "Dua Lipa"),
]

# Print input songs
print("Input Songs:")
for title, artist in songs:
    print(f"- {title} — {artist}")

print("\nRecommended Songs:")
recommended_song_list = recommend_song(songs, 10)
print(recommended_song_list)


Input Songs:
- Blinding Lights — The Weeknd
- Watermelon Sugar — Harry Styles
- New Rules — Dua Lipa

Recommended Songs:
    track_artist                       track_name  \
0  WALK THE MOON                Shut Up and Dance   
1        Ava Max                 Sweet but Psycho   
2         Avicii                       Wake Me Up   
3         Avicii                       The Nights   
4      Hot Shade                             Dive   
5       Galantis        Bones (feat. OneRepublic)   
6  The Offspring         You're Gonna Go Far, Kid   
7  Martin Garrix  These Are The Times (feat. JRM)   
8  One Direction                   Best Song Ever   
9       Vigiland                 Be Your Friend‬‬   

                     track_album_name  
0  TALKING IS HARD (Expanded Edition)  
1                    Sweet but Psycho  
2                                True  
3                   The Days / Nights  
4                                Dive  
5           Bones (feat. OneRepublic)  
6       Rise An

In [34]:
#Cosine similarity score of the songs vector and recommended_song_list vector
from sklearn.metrics.pairwise import cosine_similarity
def cosine_similarity_score(song_list, recommended_song_list):
    seed_indices = []

    for track_name, artist_name in song_list:
        t = track_name.lower()
        a = artist_name.lower()

        mask = (
            recommendationInfo_df["track_name"].str.lower() == t
        ) & (
            recommendationInfo_df["track_artist"].str.lower() == a
        )

        if mask.sum() > 0:
            idx = recommendationInfo_df[mask].index[0]
            seed_indices.append(idx)

    query_vector = X[seed_indices].mean(axis=0).reshape(1, -1)
    rec_vectors = X[recommended_song_list.index]

    scores = cosine_similarity(query_vector, rec_vectors)

    return scores.flatten()

similarity_scores = cosine_similarity_score(songs, recommended_song_list)
print("\nCosine Similarity Scores:")
print(similarity_scores)


Cosine Similarity Scores:
[0.7997063  0.65267513 0.82155908 0.76777669 0.86230338 0.78601038
 0.70696606 0.82691801 0.67162161 0.85704303]
