In [None]:
import pandas as pd
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Fetch the Spotify songs CSV from Kaggle and load it through the pandas adapter.
# The recorded output of this cell shows the file being downloaded, so network
# access is needed when it runs.
SPOTIFY_DATASET_HANDLE = "joebeachcapital/30000-spotify-songs"
SPOTIFY_CSV_NAME = "spotify_songs.csv"

dataset = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, SPOTIFY_DATASET_HANDLE, SPOTIFY_CSV_NAME)

Downloading from https://www.kaggle.com/api/v1/datasets/download/joebeachcapital/30000-spotify-songs?dataset_version_number=2&file_name=spotify_songs.csv...


100%|██████████| 3.01M/3.01M [00:00<00:00, 47.0MB/s]

Extracting zip of spotify_songs.csv...





In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# --- Recommender settings (tune here) ---
N_SAMPLE_SONGS = 10_000  # size of the working subset drawn from the full dataset
RANDOM_SEED = 42         # fixed seed so the same subset is drawn on every run
N_NEIGHBORS = 6          # neighbours returned per query; recommend_song shows all but one of them

# Randomly draw the working subset. Capping the size at len(dataset) keeps
# .sample() from raising ValueError when the source table has fewer rows than
# requested. reset_index gives rows the positions 0..n-1, which is what ties
# row i of df_rec to row i of X_rec_scaled below.
df_rec = (
    dataset
    .sample(min(N_SAMPLE_SONGS, len(dataset)), random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

# Audio features used to measure similarity (numeric columns only)
feature_cols = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                'instrumentalness', 'liveness', 'valence', 'tempo']

X_rec = df_rec[feature_cols]

# Standardize every feature so that columns on larger numeric scales do not
# dominate the distance computation.
scaler = StandardScaler()
X_rec_scaled = scaler.fit_transform(X_rec)

# Fit the neighbour index.
# metric='cosine' compares the angle between (standardized) feature vectors;
# algorithm='brute' is used because scikit-learn's tree-based indexes do not
# support the cosine metric.
nn_model = NearestNeighbors(n_neighbors=N_NEIGHBORS, algorithm='brute', metric='cosine')
nn_model.fit(X_rec_scaled)

In [None]:
def recommend_song(song_title, data, model, feature_matrix):
    """
    Print the songs whose feature vectors are closest to a given song.

    Parameters
    ----------
    song_title : str
        Title to look up. Matching ignores case and surrounding whitespace
        but is otherwise exact. If several rows share the title, the first
        one (in row order) is used.
    data : pandas.DataFrame
        Song table with the columns 'track_name', 'track_artist',
        'playlist_genre' and 'tempo'. Row position i must describe the same
        song as feature_matrix[i].
    model : object
        Fitted neighbour model exposing kneighbors(X) -> (distances, indices).
    feature_matrix : array-like
        The scaled feature matrix the model was fitted on, indexable by row
        position.

    Returns
    -------
    None
        Results are printed. If the title is not found, an error message is
        printed instead.
    """

    # Locate the song by row POSITION, not index label: both feature_matrix
    # and data.iloc below are positional, so a label would only be correct
    # when data happens to carry a clean 0..n-1 index.
    wanted = song_title.strip().lower()
    titles = data['track_name'].str.strip().str.lower()
    # Missing titles compare as "no match" rather than propagating NA
    is_match = (titles == wanted).to_numpy(dtype=bool, na_value=False)
    match_positions = is_match.nonzero()[0]
    if match_positions.size == 0:
        print(f"Error: Song: '{song_title}' not found!")
        return
    idx = int(match_positions[0])

    # Feature vector of the query song, shaped as a single-row 2D array
    song_vector = feature_matrix[idx].reshape(1, -1)

    # distances: how far each neighbour is from the query (0 = identical)
    # indices: row positions of the neighbours
    distances, indices = model.kneighbors(song_vector)

    # Drop the query song itself by position. It is usually the first hit,
    # but rows with an identical feature vector tie with it at distance 0 and
    # may be returned ahead of it, so "skip the first result" is not reliable.
    # The list is then cut to (returned neighbours - 1) so the number of
    # recommendations stays the same even if the query was not among the hits.
    n_recommendations = len(indices[0]) - 1
    neighbours = [
        (int(pos), float(dist))
        for pos, dist in zip(indices[0], distances[0])
        if int(pos) != idx
    ][:n_recommendations]

    # Show the results
    original_song = data.iloc[idx]
    print(f"--- Since you like: '{original_song['track_name']}' ({original_song['track_artist']}) ---")
    print(f"Genre: {original_song['playlist_genre']} | BPM: {original_song['tempo']:.0f}\n")
    print("You should listen:")
    print("-" * 50)

    for rank, (idx_rec, dist) in enumerate(neighbours, start=1):
        rec_song = data.iloc[idx_rec]

        print(f"{rank}. {rec_song['track_name']} - {rec_song['track_artist']}")
        print(f"   (Genre: {rec_song['playlist_genre']} | Distance: {dist:.4f})")
    print("-" * 50)


In [None]:
# --- TRY THE RECOMMENDER ---

# Show the first 50 track names of the sampled subset so there are valid
# titles to choose from.
sample_titles = df_rec['track_name'].head(50).values
print("Available songs example:", sample_titles)

# Ask for a title interactively and print its nearest neighbours.
# The title has to be a complete track name (case does not matter).
song_to_test = input("Choose a song:")
recommend_song(song_to_test, df_rec, nn_model, X_rec_scaled)

Available songs example: ['I Miss You' 'Who Are You' 'Happy' 'ONE' 'Palace/Curse' 'Hände hoch'
 "I'll Be Around - Remastered Version" 'Música' 'Trippie Redd'
 'Close Enough to Hurt' 'Aczino: Bzrp Freestyle Sessions, Vol. 8'
 'You Better Know' 'Tropical Forest' 'Party Rock Anthem'
 "I'm Blue - Club Mix" 'Video' 'Birds'
 'House Of Cards (feat. Sidnie Tipton)' 'mEnorme' "Buggin' - Edit"
 'Wherewithal' 'No Cap' 'We Can (feat. Tory Lanez)' 'Si Supieras'
 'Platinum Plus' 'Thick And Thin' 'Can We Talk' 'Saturday Night'
 'Stranger' 'Mr. Writer' 'Lost' 'With You' 'Asli Hip Hop'
 'Been Around the World (feat. The Notorious B.I.G. & Mase)' 'Trouble'
 'La Clase' "Brandy (You're a Fine Girl)" 'Soul Fifty'
 'Drifting - Tim Schaufert Remix' 'Closer (feat. Halsey)' 'Disciples'
 'Airplanes' 'Diabla - Official Remix' 'Attention' 'Lalu'
 'Doch in der Nacht' 'Visit By Cap Kendricks' 'Starships'
 'Boredom (feat. Rex Orange County & Anna of the North)'
 'Tutto apposto (feat. Capo Plaza)']
Choose a song:Part