In [12]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd

# 1. Read datasets
spotify_df = pd.read_csv("data/spotify_df_cleaned.csv")
recommendationInfo_df = pd.read_csv("data/recommendation_info.csv")

# 2. Inspect the data in DataFrames
print("Number of rows and columns:", spotify_df.shape)
print("Number of rows and columns:", recommendationInfo_df.shape)

print("Column names:", spotify_df.columns.tolist())
print("Column names:", recommendationInfo_df.columns.tolist())

Number of rows and columns: (32828, 21)
Number of rows and columns: (32828, 3)
Column names: ['track_popularity', 'playlist_name', 'playlist_subgenre', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'playlist_genre_edm', 'playlist_genre_latin', 'playlist_genre_pop', 'playlist_genre_r&b', 'playlist_genre_rap', 'playlist_genre_rock']
Column names: ['track_artist', 'track_name', 'track_album_name']


In [13]:
# 3. Fit KNN model
target_column = [
    'track_popularity',
    'key',
    'mode',
    'speechiness',
    'instrumentalness',
    'liveness',
    'tempo',
    'duration_ms',
                 ]

sorted_spotify_df = spotify_df.drop(columns=target_column)

X = spotify_df.values   
knn_model = NearestNeighbors(
    metric='cosine',  
    algorithm='brute', 
    n_neighbors=11 
)

knn_model.fit(X)
print("KNN model fitted successfully.")

KNN model fitted successfully.


In [14]:
def recommend_song(song_list, n_recs=10):
    seed_indices = []

    # Find indices of input songs
    for track_name, artist_name in song_list:
        t = track_name.lower()
        a = artist_name.lower()

        mask = (
            recommendationInfo_df["track_name"].str.lower() == t
        ) & (
            recommendationInfo_df["track_artist"].str.lower() == a
        )

        if mask.sum() > 0:
            idx = recommendationInfo_df[mask].index[0]
            seed_indices.append(idx)
        else:
            print(f"Song not found: {track_name} — {artist_name}")

    if len(seed_indices) == 0:
        return "No valid input songs found in dataset."

    # Create query vector by averaging seed song vectors
    query_vector = X[seed_indices].mean(axis=0).reshape(1, -1)

    # Get nearest neighbors
    distances, neighbors = knn_model.kneighbors(
        query_vector,
        n_neighbors=n_recs + len(seed_indices) + 20  
    )

    neighbors = neighbors[0]

    # Remove seed songs from recommendations
    filtered = [idx for idx in neighbors if idx not in seed_indices]

    # Remove duplicates while preserving order
    unique = []
    seen = set()
    for idx in filtered:
        song_id = (
            recommendationInfo_df.loc[idx, "track_artist"],
            recommendationInfo_df.loc[idx, "track_name"]
        )
        if song_id not in seen:
            seen.add(song_id)
            unique.append(idx)
        if len(unique) == n_recs:
            break

    return recommendationInfo_df.iloc[unique].reset_index(drop=True)


In [15]:
songs = [
    ("Don't You (Forget About Me) - Remastered", "Simple Minds"),
    ("Don't You Want Me", "The Human League"),
    ("The Promise", "When in Rome"),
    ("Take On Me", "a-ha"),
    ("Sweet Dreams (Are Made of This)", "Eurythmics"),
    ("Everybody Wants to Rule the World", "Tears for Fears"),
    ("Hungry Like the Wolf", "Duran Duran")
]

# Print input songs
print("Input Songs:")
for title, artist in songs:
    print(f"- {title} — {artist}")

print("\nRecommended Songs:")
recommended_song_list = recommend_song(songs, 10)
print(recommended_song_list)


Input Songs:
- Don't You (Forget About Me) - Remastered — Simple Minds
- Don't You Want Me — The Human League
- The Promise — When in Rome
- Take On Me — a-ha
- Sweet Dreams (Are Made of This) — Eurythmics
- Everybody Wants to Rule the World — Tears for Fears
- Hungry Like the Wolf — Duran Duran

Recommended Songs:
Song not found: Sweet Dreams (Are Made of This) — Eurythmics
Song not found: Hungry Like the Wolf — Duran Duran
  track_artist                                         track_name  \
0    Daft Punk  Get Lucky (feat. Pharrell Williams & Nile Rodg...   
1    Paradisio                              Bailando - Video Edit   
2        Kabah                            La Calle De Las Sirenas   
3     Haddaway                                       What Is Love   
4         TOTO                                      Hold the Line   
5  Rick Astley                            Never Gonna Give You Up   
6    Whigfield                          Think of You - Radio Edit   
7         a-ha     

In [16]:
#Cosine similarity score of the songs vector and recommended_song_list vector
from sklearn.metrics.pairwise import cosine_similarity
def cosine_similarity_score(song_list, recommended_song_list):
    seed_indices = []

    for track_name, artist_name in song_list:
        t = track_name.lower()
        a = artist_name.lower()

        mask = (
            recommendationInfo_df["track_name"].str.lower() == t
        ) & (
            recommendationInfo_df["track_artist"].str.lower() == a
        )

        if mask.sum() > 0:
            idx = recommendationInfo_df[mask].index[0]
            seed_indices.append(idx)

    query_vector = X[seed_indices].mean(axis=0).reshape(1, -1)
    rec_vectors = X[recommended_song_list.index]

    scores = cosine_similarity(query_vector, rec_vectors)

    return scores.flatten()

similarity_scores = cosine_similarity_score(songs, recommended_song_list)
print("\nCosine Similarity Scores:")
print(similarity_scores)


Cosine Similarity Scores:
[0.56595466 0.55715469 0.59460244 0.40029019 0.64759857 0.61708071
 0.45251308 0.54543894 0.30804676 0.66350995]
