# Spotify Recommendation

Before running the following code, ensure you have read the "Usage" section of the README.md file explaining how to use this notebook.

In [37]:
# Import necessary packages
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

In [38]:
# Returns a cleaned version of the given DataFrame (leaving the original unchanged)
def clean_data(df):
    
    # Create a deep copy of the DataFrame to ensure the original is unedited
    clean_df = df.copy()

    # Find duplicate tracks
    print('Finding duplicates...')
    dupe_mask = clean_df['track_id'].duplicated(keep=False)
    dupes = clean_df[dupe_mask].sort_values('track_id')
    print(f' - Total duplicated rows: {len(dupes)}')
    num_duplicated_songs = clean_df['track_id'].duplicated().sum()
    print(f' - Number of songs with duplicates: {num_duplicated_songs}')
    print('Done')

    # Remove duplicate tracks
    num_rows_before = len(clean_df)
    print(f'\nCurrent number of rows in DataFrame: {num_rows_before}')
    print('\nRemoving duplicates...')
    clean_df = clean_df.drop_duplicates(subset='track_id', keep='first')
    print('Done')
    num_rows_after = len(clean_df)
    print(f'\nNew number of rows in DataFrame: {num_rows_after}')
    print(f'Removed {num_rows_before - num_rows_after} duplicates')
    print(f'Number of unique track_ids: {clean_df['track_id'].nunique()}')

    # Drop unnecessary column
    print('\nDropping the unnecessary column "Unnamed: 0"...')
    clean_df = clean_df.drop('Unnamed: 0', axis=1)
    print('Done')

    return clean_df

In [39]:
# Create a DataFrame object from the dataset (in CSV format)
unclean_df = pd.read_csv('dataset.csv')

# Clean the data to remove duplicates and unnecessary column
clean_df = clean_data(unclean_df)
clean_df = clean_df.reset_index(drop=True) # Resets indices to account for dropped rows

# Separate numeric columns
numeric_cols = [
    'popularity', 'duration_ms', 'danceability', 
    'energy', 'loudness', 'speechiness',
    'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo'
]

# Scale the numeric data to give each feature equal importance
scaler = StandardScaler()
scaled_cols = scaler.fit_transform(clean_df[numeric_cols])
scaled_df = pd.DataFrame(scaled_cols, columns=numeric_cols)

# Separate categorical columns
categorical_cols = [ 'key', 'time_signature' ]
# Note: 'artists', 'album_name', 'track_name', and 'track_genre' were
#       excluded to avoid an unnecessarily high dimension DataFrame.

# Apply One-Hot Encoding to categorical data to include them as numeric features
encoded_df = pd.get_dummies(
    clean_df[categorical_cols], columns=categorical_cols, drop_first=True, dtype=int
)

# Separate binary columns
binary_cols = [ 'explicit', 'mode' ]
binary_df = clean_df[binary_cols].astype(int) # Casts bool values to 0 or 1

# Combine into one DataFrame
df = pd.concat([scaled_df, encoded_df, binary_df], axis=1)

# Show a random sample of 10 tracks to view the structure of the DataFrame
df.sample(10)

Finding duplicates...
 - Total duplicated rows: 40900
 - Number of songs with duplicates: 24259
Done

Current number of rows in DataFrame: 114000

Removing duplicates...
Done

New number of rows in DataFrame: 89741
Removed 24259 duplicates
Number of unique track_ids: 89741

Dropping the unnecessary column "Unnamed: 0"...
Done


Unnamed: 0,popularity,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,...,key_8,key_9,key_10,key_11,time_signature_1,time_signature_3,time_signature_4,time_signature_5,explicit,mode
19154,-1.613085,-0.66098,1.804483,-0.165461,0.689655,-0.485909,0.859281,-0.53544,-0.651003,0.949253,...,0,0,0,0,0,0,1,0,0,1
48187,-1.224372,-0.511937,0.134893,0.321672,-1.047982,-0.447066,-0.668863,2.141716,-0.54889,-1.639177,...,0,0,0,0,0,0,1,0,0,0
6455,-0.009642,0.06296,0.163191,-0.71105,0.162982,-0.41352,-0.006766,-0.535481,-0.533496,0.698171,...,0,1,0,0,0,0,1,0,0,1
71983,1.010731,-0.232638,1.068731,0.832187,0.993784,-0.191938,-0.867492,-0.535481,-0.793139,1.831845,...,0,0,0,0,0,0,1,0,0,1
85740,-0.787069,-0.534726,-0.878179,-1.007226,-0.268123,-0.480612,1.60414,-0.534431,-0.691026,0.38622,...,0,0,0,0,0,0,1,0,0,1
16804,-0.835658,0.562796,0.366938,1.101084,0.459834,-0.353489,-0.969987,1.412975,0.728798,-0.625717,...,0,0,0,0,0,0,1,0,0,1
75092,0.767785,-0.673517,-0.685752,0.473657,-0.086756,7.093778,-0.104307,-0.36503,-0.42574,-0.90343,...,0,0,0,0,0,1,0,0,1,1
47942,-1.078604,1.349111,0.780091,0.434686,-0.737149,-0.384387,0.034615,-0.509821,0.641566,0.835125,...,0,0,0,1,0,0,1,0,0,0
61468,0.47625,1.633651,0.214128,-0.863035,-0.51001,-0.447066,-0.919218,2.021288,-0.776719,-1.310487,...,0,0,0,0,0,0,1,0,0,0
19507,-1.613085,-0.397069,0.593323,0.871157,1.193345,0.045534,-0.861876,-0.535436,0.554334,0.652519,...,0,0,0,0,0,0,1,0,0,0


In [40]:
# Initialize the KNN model
knn = NearestNeighbors(n_neighbors=10, metric='euclidean')

# Fit the KNN model
knn.fit(df)

0,1,2
,n_neighbors,10
,radius,1.0
,algorithm,'auto'
,leaf_size,30
,metric,'euclidean'
,p,2
,metric_params,
,n_jobs,


In [41]:
# Read reference songs to recommend from
track_ids = []
print('Reading reference songs from "reference_songs.txt" file...')
try:
    with open('reference_songs.txt', 'r') as file:
        for line in file:
            track_ids.append(line.strip())
    print('Done')
except FileNotFoundError:
    print('Error: Could not find the "reference_songs.txt" file. Please create it and read "Usage" in the "README.md" file.')

# Use fitted KNN model to recommend similar songs
for track_id in track_ids:

    # Validate the track exists in the dataset
    if track_id not in clean_df['track_id'].values:
        print(f'Error: The track_id "{track_id}" was not found in the dataset.')
        continue

    # Find track information
    track = clean_df[clean_df['track_id'] == track_id]
    track_name = track['track_name']
    track_artists = track['artists']

    # Reshape for KNN model
    track_vector = df.iloc[track.index[0]].values.reshape(1, -1)

    # Find similar songs using the model
    distances, indices = knn.kneighbors(track_vector)
    neighbor_indices = indices[0][1:] # Skip the 0th index to prevent recommendation of the same track
    similar_tracks = clean_df.iloc[neighbor_indices]
    print(f'Here are some similar songs to "{track_name}" by {track_artists}')
    print('====================================================================================================')
    print(similar_tracks[['track_id', 'track_name', 'artists', 'album_name']])
    print('====================================================================================================')

Reading reference songs from "reference_songs.txt" file...
Done
