In [33]:
import pandas as pd
import numpy as np
#import numpy.linalg import eigh, norm
from sklearn.neighbors import NearestNeighbors #implement K-Nearest Neighbors for regression

In [34]:
df_history = pd.read_csv("spotify2023data.csv", encoding="utf-16") 
#load the first data set; we assume this represents the listening history of the user

In [35]:
df_history.isnull().sum() #check for null values
df_history.applymap(lambda x: x.lower() if isinstance(x, str) else x)

  df_history.applymap(lambda x: x.lower() if isinstance(x, str) else x)


Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,seven (feat. latto) (explicit ver.),"latto, jung kook",2,2023,7,14,553,147,141381703,43,...,125,b,major,80,89,83,31,0,8,4
1,lala,myke towers,1,2023,3,23,1474,48,133716286,48,...,92,c#,major,71,61,74,7,0,10,4
2,vampire,olivia rodrigo,1,2023,6,30,1397,113,140003974,94,...,138,f,major,51,32,53,17,0,31,6
3,cruel summer,taylor swift,1,2019,8,23,7858,100,800840817,116,...,170,a,major,55,58,72,11,0,11,15
4,where she goes,bad bunny,1,2023,5,18,3133,50,303236322,84,...,144,a,minor,65,23,80,14,63,11,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
948,my mind & me,selena gomez,1,2022,11,3,953,0,91473363,61,...,144,a,major,60,24,39,57,0,8,3
949,﻿bigger than the whole sky,taylor swift,1,2022,10,21,1180,0,121871870,4,...,166,f#,major,42,7,24,83,1,12,6
950,a veces (feat. feid),"feid, paulo londra",2,2022,11,3,573,0,73513683,2,...,92,c#,major,80,81,67,4,0,8,6
951,en la de ella,"feid, sech, jhayco",3,2022,10,20,1320,0,133895612,29,...,97,c#,major,82,67,77,8,0,12,5


In [36]:
df_history = df_history.dropna() #drop all null values in the dataframe

In [37]:
df_history.shape #get the dimensions of the dataframe
df_history = df_history.drop(['artist(s)_name', 'artist_count', 'released_year', 'released_month', 'released_day', 'in_spotify_playlists', 'in_spotify_charts', 'streams', 'in_apple_playlists', 'in_apple_charts', 'in_deezer_playlists', 'in_deezer_charts', 'in_shazam_charts', 'bpm', 'key', 'mode'], axis=1)
df_history.head(5)

Unnamed: 0,track_name,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Seven (feat. Latto) (Explicit Ver.),80,89,83,31,0,8,4
1,LALA,71,61,74,7,0,10,4
2,vampire,51,32,53,17,0,31,6
3,Cruel Summer,55,58,72,11,0,11,15
4,WHERE SHE GOES,65,23,80,14,63,11,6


In [38]:
#a lot of the columns have "%" at the end of them, so let's rename the columns to get rid of the "%"s
df_history.columns = ['track_name', 'danceability', 'valence', 'energy', 'acousticness', 'instrumentalness', 'liveness', 'speechiness']
print(df_history.columns)

Index(['track_name', 'danceability', 'valence', 'energy', 'acousticness',
       'instrumentalness', 'liveness', 'speechiness'],
      dtype='object')


In [39]:
#now, let's load the second dataset, the data that allows us to recommend songs to the user 
df_recommend = pd.read_csv("song-recommend.csv")

In [40]:
df_recommend.isnull().sum() #check for null values

Unnamed: 0          0
track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64

In [41]:
df_recommend = df_recommend.dropna()

In [42]:
df_recommend.shape #get the dimensions of the second dataset

(113999, 21)

In [43]:
#given our history dataset and our recommender dataset, let's look at the different variables that we may consider
#when recommending songs

#the two datasets have 7 columns in common that serve as good varables (these are the features/input): 
#danceability, valence, speechiness, acousticness, liveness, instrumentalness, and energy

#track name and artist name are the labels/output

In [44]:
#we have 7 total features
#Basic Idea of our Product: An advanced song recommendation system that allows users to get song recommendations 
#based on the most important song aspect to them (the 7 features). This is an imbalanced classification model. 

#define the features of the history dataset
features_df_history = ['track_name', 'danceability', 'instrumentalness', 'valence', 'speechiness', 'acousticness', 'liveness', 'energy'] 
#define the features of the recommendation dataset
features_df_recommend = ['danceability', 'instrumentalness', 'valence', 'speechiness', 'acousticness', 'liveness', 'energy']

#the features of the history dataset are in percentages while they're given as decimals in the recommendation data.
#Here, we convert the data points of the history dataset to decimal values.
for feature in features_df_history[1:]:
    df_history[feature] /= 100

In [45]:
#this function uses the history dataset to create a KNN model using the user's inputed primary feature and weightage
def weightage_train(primary_feature, primary_weight, data, n_neighbors):
    # Exclude 'Song Name' from the list of features
    features_to_use = [feature for feature in features_df_history[1:] if feature != 'track_name']
    
    other_features = [feature for feature in features_to_use if feature != primary_feature]
    weights = [primary_weight] + [(1 - primary_weight) / len(other_features)] * len(other_features)
    weighted_features = [weight * df_history[feature] for weight, feature in zip(weights, [primary_feature] + other_features)]
    X = pd.concat(weighted_features, axis=1)
    model = NearestNeighbors(n_neighbors = n_neighbors, algorithm='auto') #come back to this
    model.fit(X)
    return model

In [46]:
def get_song_features(song_name):
    return df_history.loc[df_history['track_name'] == song_name, features_df_history[1:]].values.flatten().tolist()

In [47]:
#this function recommends songs based on the customized metrics (primary feature and weightage) that the user 
#has inputed
def weightage_recommend(song_name, primary_feature, primary_weight, rec_count, n_neighbors):
    song_features = []
    song_features = get_song_features(song_name)
    model = weightage_train(primary_feature, primary_weight, df_history, n_neighbors)
    distances, indices = model.kneighbors([song_features])
    similar_songs = df_recommend.iloc[indices[0]][:rec_count] #this is the list of similar songs given
    #the number of songs to be recommended to the user
    return similar_songs[['track_name', 'artists', 'album_name']]

In [48]:
#song_features = [0.5, 0.3, 0.8, 0.2, 0.7, 0.6, 0.06]  #features of the song you want to find similar songs for

user_song_name = "vampire"  #user's input song that will be used to find similar songs
user_primary_feature = "liveness"  #user's chosen primary feature
user_primary_weight = 0.9  #user's chosen weight for the primary feature, 
user_count = 7 #Number of recommended songs to be printed out in the display
user_neighbors = 15  #Number of neighbors to be considered when recommending songs (carrying capacity of song 
#recommendations), fixed number

recommended_songs = weightage_recommend(user_song_name.lower(), user_primary_feature, round(user_primary_weight,1), int(user_count), user_neighbors)
print(recommended_songs)

                                 track_name                  artists  \
276                              Headphones               Jon Bryant   
526                      vs.2013 ~知覚と快楽の螺旋~        Masaharu Fukuyama   
23                         93 Million Miles               Jason Mraz   
779                        King of Anything           Sara Bareilles   
141                   Make You Feel My Love                JJ Heller   
434  Have Yourself a Merry Little Christmas            Tres Hermanas   
716                           See You Again  Boyce Avenue;Bea Miller   

                                 album_name  
276                              Headphones  
526                      vs.2013 ~知覚と快楽の螺旋~  
23                            Coffee Moment  
779                      Kaleidoscope Heart  
141                    I Dream of You: CALM  
434  Have Yourself a Merry Little Christmas  
716                  Cover Sessions, Vol. 4  


