In [69]:
# Import the required modules
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial import distance
from sklearn import preprocessing
from sklearn.neighbors import NearestNeighbors


# suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read in CSV
df = pd.read_csv("Resources/spotify_tracks.csv", encoding='latin1')
df.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [4]:
# Check for null values and data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode              114000 non-null  int64  
 13  speechiness       114000 non-null  float64
 14  acousticness      114000 non-null  float64
 15  instrumentalness  114000 non-null  float64
 16  liveness          11

In [5]:
# View the metadata for each column
df.describe()

Unnamed: 0.1,Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
count,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0
mean,56999.5,33.238535,228029.2,0.5668,0.641383,5.30914,-8.25896,0.637553,0.084652,0.31491,0.15605,0.213553,0.474068,122.147837,3.904035
std,32909.109681,22.305078,107297.7,0.173542,0.251529,3.559987,5.029337,0.480709,0.105732,0.332523,0.309555,0.190378,0.259261,29.978197,0.432621
min,0.0,0.0,0.0,0.0,0.0,0.0,-49.531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,28499.75,17.0,174066.0,0.456,0.472,2.0,-10.013,0.0,0.0359,0.0169,0.0,0.098,0.26,99.21875,4.0
50%,56999.5,35.0,212906.0,0.58,0.685,5.0,-7.004,1.0,0.0489,0.169,4.2e-05,0.132,0.464,122.017,4.0
75%,85499.25,50.0,261506.0,0.695,0.854,8.0,-5.003,1.0,0.0845,0.598,0.049,0.273,0.683,140.071,4.0
max,113999.0,100.0,5237295.0,0.985,1.0,11.0,4.532,1.0,0.965,0.996,1.0,1.0,0.995,243.372,5.0


In [6]:
null_rows = df.loc[df['album_name'].isnull()]
print(null_rows)

       Unnamed: 0                track_id artists album_name track_name  \
65900       65900  1kR4gIb7nGxHPI3D2ifs59     NaN        NaN        NaN   

       popularity  duration_ms  explicit  danceability  energy  key  loudness  \
65900           0            0     False         0.501   0.583    7     -9.46   

       mode  speechiness  acousticness  instrumentalness  liveness  valence  \
65900     0       0.0605          0.69           0.00396    0.0747    0.734   

         tempo  time_signature track_genre  
65900  138.391               4       k-pop  


In [7]:
df = df.dropna(subset=['album_name'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 113999 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        113999 non-null  int64  
 1   track_id          113999 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        113999 non-null  int64  
 6   duration_ms       113999 non-null  int64  
 7   explicit          113999 non-null  bool   
 8   danceability      113999 non-null  float64
 9   energy            113999 non-null  float64
 10  key               113999 non-null  int64  
 11  loudness          113999 non-null  float64
 12  mode              113999 non-null  int64  
 13  speechiness       113999 non-null  float64
 14  acousticness      113999 non-null  float64
 15  instrumentalness  113999 non-null  float64
 16  liveness          113999 

In [8]:
df.track_id.nunique()

89740

In [9]:
df.drop_duplicates(subset=['track_id'], keep='first', inplace = True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 89740 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        89740 non-null  int64  
 1   track_id          89740 non-null  object 
 2   artists           89740 non-null  object 
 3   album_name        89740 non-null  object 
 4   track_name        89740 non-null  object 
 5   popularity        89740 non-null  int64  
 6   duration_ms       89740 non-null  int64  
 7   explicit          89740 non-null  bool   
 8   danceability      89740 non-null  float64
 9   energy            89740 non-null  float64
 10  key               89740 non-null  int64  
 11  loudness          89740 non-null  float64
 12  mode              89740 non-null  int64  
 13  speechiness       89740 non-null  float64
 14  acousticness      89740 non-null  float64
 15  instrumentalness  89740 non-null  float64
 16  liveness          89740 non-null  float64
 1

In [10]:
df.to_csv('Resources/clean_spotify.csv', index=False)

In [112]:
df.track_genre.nunique()
# 

113

In [12]:
genre_df = df.groupby('track_genre')['track_id'].nunique().reset_index(name='song_count')
print(genre_df)

     track_genre  song_count
0       acoustic        1000
1       afrobeat         999
2       alt-rock         999
3    alternative         407
4        ambient         999
..           ...         ...
108       techno         416
109       trance         708
110     trip-hop         904
111      turkish         870
112  world-music         923

[113 rows x 2 columns]


In [13]:
genre_df = genre_df.sort_values(by='song_count', ascending=True)
print(genre_df)

   track_genre  song_count
89   reggaeton          74
56       indie         134
53       house         210
85        punk         226
71       metal         232
..         ...         ...
4      ambient         999
2     alt-rock         999
1     afrobeat         999
12    cantopop         999
0     acoustic        1000

[113 rows x 2 columns]


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 89740 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        89740 non-null  int64  
 1   track_id          89740 non-null  object 
 2   artists           89740 non-null  object 
 3   album_name        89740 non-null  object 
 4   track_name        89740 non-null  object 
 5   popularity        89740 non-null  int64  
 6   duration_ms       89740 non-null  int64  
 7   explicit          89740 non-null  bool   
 8   danceability      89740 non-null  float64
 9   energy            89740 non-null  float64
 10  key               89740 non-null  int64  
 11  loudness          89740 non-null  float64
 12  mode              89740 non-null  int64  
 13  speechiness       89740 non-null  float64
 14  acousticness      89740 non-null  float64
 15  instrumentalness  89740 non-null  float64
 16  liveness          89740 non-null  float64
 1

In [33]:
numeric_columns = ['popularity', 'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']

numeric_df = df.loc[:, numeric_columns]
numeric_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 89740 entries, 0 to 113999
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   popularity        89740 non-null  int64  
 1   duration_ms       89740 non-null  int64  
 2   explicit          89740 non-null  float64
 3   danceability      89740 non-null  float64
 4   energy            89740 non-null  float64
 5   key               89740 non-null  int64  
 6   loudness          89740 non-null  float64
 7   mode              89740 non-null  int64  
 8   speechiness       89740 non-null  float64
 9   acousticness      89740 non-null  float64
 10  instrumentalness  89740 non-null  float64
 11  liveness          89740 non-null  float64
 12  valence           89740 non-null  float64
 13  tempo             89740 non-null  float64
 14  time_signature    89740 non-null  int64  
dtypes: float64(10), int64(5)
memory usage: 13.0 MB


In [34]:
data_types_counts = df.dtypes.value_counts()
print(data_types_counts)

float64    10
int64       6
object      5
Name: count, dtype: int64


In [35]:
df_model['explicit'] = df_model['explicit'].astype(float)

In [36]:
# Assuming 'df' is your DataFrame
int_columns = numeric_df.select_dtypes(include=['int64']).columns
numeric_df[int_columns] = numeric_df[int_columns].astype('float64')

# Verify the data types after conversion
print(numeric_df.dtypes)

popularity          float64
duration_ms         float64
explicit            float64
danceability        float64
energy              float64
key                 float64
loudness            float64
mode                float64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature      float64
dtype: object


In [18]:
# initialize
scaler = StandardScaler()

# fit
scaler.fit(numeric_df)

# predict/transform
scaled_data = scaler.transform(numeric_df)
df_scaled = pd.DataFrame(scaled_data, columns=numeric_columns, index = df.track_id)

df_scaled.head()

Unnamed: 0_level_0,popularity,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
5SuOikwiRyPMVoIQDJUgSV,1.933925,0.013472,0.644253,-0.675975,-1.203275,0.335727,-1.324621,0.490458,-0.875166,-0.535482,0.723656,0.934047,-1.133599,0.226216
4qPNDBW1i3p13qLCt0Ki3A,1.059312,-0.704186,-0.804604,-1.825602,-1.203275,-1.673087,0.754933,-0.098364,1.76081,-0.535468,-0.595078,-0.770269,-1.479843,0.226216
1iJBSr7s7jYXzM8EGcbK5b,1.156491,-0.162188,-0.702731,-1.073473,-1.484183,-0.236524,0.754933,-0.280219,-0.349626,-0.535485,-0.512978,-1.329497,-1.518259,0.226216
6lfxq3CG4xtTiEg7opyCyx,1.836746,-0.240925,-1.676182,-2.240247,-1.484183,-1.918228,0.754933,-0.45148,1.70465,-0.535266,-0.436009,-1.241999,1.981635,-1.979174
5vjLSffimiIP26QG5WcN2K,2.371232,-0.268195,0.315996,-0.746122,-0.922368,-0.226373,0.754933,-0.307585,0.415925,-0.535485,-0.687954,-1.150696,-0.07003,0.226216


In [37]:
string_columns = ['track_id', 'track_name', 'album_name', 'artists' , 'track_genre']
string_df = df.loc[:, string_columns]
string_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 89740 entries, 0 to 113999
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   track_id     89740 non-null  object
 1   track_name   89740 non-null  object
 2   album_name   89740 non-null  object
 3   artists      89740 non-null  object
 4   track_genre  89740 non-null  object
dtypes: object(5)
memory usage: 6.1+ MB


In [21]:
track_data = df_concat.loc[df_concat['track_name'] == 'How to Save a Life']

# Print all columns for the specified track_name value
print(track_data)

                                track_name          album_name   artists  \
track_id                                                                   
5fVZC9GiM4e8vu99W0Xf6J  How to Save a Life  How To Save A Life  The Fray   

                       track_genre  popularity  duration_ms  danceability  \
track_id                                                                    
5fVZC9GiM4e8vu99W0Xf6J       piano         NaN          NaN           NaN   

                        energy  key  loudness  mode  speechiness  \
track_id                                                           
5fVZC9GiM4e8vu99W0Xf6J     NaN  NaN       NaN   NaN          NaN   

                        acousticness  instrumentalness  liveness  valence  \
track_id                                                                    
5fVZC9GiM4e8vu99W0Xf6J           NaN               NaN       NaN      NaN   

                        tempo  time_signature         track_id_column  
track_id                      

In [63]:
numerical_cols = df.select_dtypes(include=np.number).columns
data_df = pd.DataFrame(scaler.fit_transform(df[numerical_cols]), columns=numerical_cols, index=df['track_id'])

data_df.head()

Unnamed: 0_level_0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
5SuOikwiRyPMVoIQDJUgSV,1.933925,0.013472,-0.306447,0.644253,-0.675975,-1.203275,0.335727,-1.324621,0.490458,-0.875166,-0.535482,0.723656,0.934047,-1.133599,0.226216
4qPNDBW1i3p13qLCt0Ki3A,1.059312,-0.704186,-0.306447,-0.804604,-1.825602,-1.203275,-1.673087,0.754933,-0.098364,1.76081,-0.535468,-0.595078,-0.770269,-1.479843,0.226216
1iJBSr7s7jYXzM8EGcbK5b,1.156491,-0.162188,-0.306447,-0.702731,-1.073473,-1.484183,-0.236524,0.754933,-0.280219,-0.349626,-0.535485,-0.512978,-1.329497,-1.518259,0.226216
6lfxq3CG4xtTiEg7opyCyx,1.836746,-0.240925,-0.306447,-1.676182,-2.240247,-1.484183,-1.918228,0.754933,-0.45148,1.70465,-0.535266,-0.436009,-1.241999,1.981635,-1.979174
5vjLSffimiIP26QG5WcN2K,2.371232,-0.268195,-0.306447,0.315996,-0.746122,-0.922368,-0.226373,0.754933,-0.307585,0.415925,-0.535485,-0.687954,-1.150696,-0.07003,0.226216


In [107]:
# Check to see if song is in dataset
song= df[(df['track_name'] == 'See You Again')]
song

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
716,3XfMyT4Xf5LegDhvbrFEjp,Boyce Avenue;Bea Miller,"Cover Sessions, Vol. 4",See You Again,50,239802,0.0,0.57,0.413,9,-7.034,1,0.0345,0.535,0.0,0.076,0.278,152.032,4,acoustic
14329,7qdNE9DyIKk3invtcxdGG8,One Voice Children's Choir,Memories,See You Again,43,209881,0.0,0.64,0.406,10,-9.337,1,0.0476,0.787,0.0,0.0725,0.247,80.029,4,children
20696,62lmjlPu5Vwd3h18FMSz1G,Wiz Khalifa;Charlie Puth,Give You Love - Cozy Hits,See You Again,2,229525,0.0,0.69,0.48,10,-7.503,1,0.0816,0.369,0.0,0.0649,0.286,80.025,4,dance
20697,0FtOxBrDP67usYNbqOuy7T,Wiz Khalifa;Charlie Puth,On Chill - Rap & RnB,See You Again,0,229525,0.0,0.69,0.48,10,-7.503,1,0.0816,0.369,0.0,0.0649,0.286,80.025,4,dance
29795,4pXG8Q82L8WvypAm5Wo86y,Seven Lions;Jason Ross;Fiora,See You Again EP,See You Again,46,263200,0.0,0.248,0.588,8,-6.292,1,0.0459,0.0415,3.3e-05,0.123,0.0382,150.419,4,dubstep


In [101]:
trackNameListened = "2055"
track_id = df[(df['track_name'] == trackNameListened)][['track_id']]
track_id = track_id.values[0][0]

target_track = list(data_norm.loc[track_id])

In [102]:
data_result = pd.DataFrame()
data_result['euclidean'] = [distance.euclidean(obj, target_track) for index, obj in data_norm.iterrows()]
data_result['track_id'] = data_norm.index
data_result.head()

Unnamed: 0,euclidean,track_id
0,5.19557,5SuOikwiRyPMVoIQDJUgSV
1,6.632058,4qPNDBW1i3p13qLCt0Ki3A
2,6.354776,1iJBSr7s7jYXzM8EGcbK5b
3,8.336857,6lfxq3CG4xtTiEg7opyCyx
4,5.827444,5vjLSffimiIP26QG5WcN2K


In [103]:
data_rec = data_result.sort_values(by=['euclidean']).iloc[:6]

In [104]:
data_init = df.set_index(df.loc[:, 'track_id'])
track_list = pd.DataFrame()
for i in list(data_rec.loc[:, 'track_id']):
    if i in list(df.loc[:, 'track_id']):
        track_info = data_init.loc[[i], ['track_name', 'artists']]
        track_list = pd.concat([track_list, track_info], ignore_index=True)

In [105]:
recomended = track_list.values.tolist()
print(f"""You've just listened:  \n \t - {recomended[0][0]} - {recomended[0][1]} 
Now you might want to listen to : 
\n \t - '{recomended[1][0]} - {recomended[1][1]}'
Or maybe any of these:
\n \t - '{recomended[2][0]} - {recomended[2][1]}' 
\n \t - '{recomended[3][0]} - {recomended[3][1]}'
\n \t - '{recomended[4][0]} - {recomended[4][1]}'
\n \t - '{recomended[5][0]} - {recomended[5][1]}'  """)

You've just listened:  
 	 - 2055 - Sleepy Hallow 
Now you might want to listen to : 

 	 - 'Feel Good (feat. Lil Tjay) - Fresco Trey;Lil Tjay'
Or maybe any of these:

 	 - 'A Gangster's Wife - Ms Krazie;Chino Grande' 

 	 - 'Weight On Me - Sheff G;Sleepy Hallow'

 	 - 'Pantera - Rvfv;Duki'

 	 - 'Gemini - N i G H T S'  


In [67]:
df_model = data_df.reset_index()
df_model.head()

Unnamed: 0,track_id,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,5SuOikwiRyPMVoIQDJUgSV,1.933925,0.013472,-0.306447,0.644253,-0.675975,-1.203275,0.335727,-1.324621,0.490458,-0.875166,-0.535482,0.723656,0.934047,-1.133599,0.226216
1,4qPNDBW1i3p13qLCt0Ki3A,1.059312,-0.704186,-0.306447,-0.804604,-1.825602,-1.203275,-1.673087,0.754933,-0.098364,1.76081,-0.535468,-0.595078,-0.770269,-1.479843,0.226216
2,1iJBSr7s7jYXzM8EGcbK5b,1.156491,-0.162188,-0.306447,-0.702731,-1.073473,-1.484183,-0.236524,0.754933,-0.280219,-0.349626,-0.535485,-0.512978,-1.329497,-1.518259,0.226216
3,6lfxq3CG4xtTiEg7opyCyx,1.836746,-0.240925,-0.306447,-1.676182,-2.240247,-1.484183,-1.918228,0.754933,-0.45148,1.70465,-0.535266,-0.436009,-1.241999,1.981635,-1.979174
4,5vjLSffimiIP26QG5WcN2K,2.371232,-0.268195,-0.306447,0.315996,-0.746122,-0.922368,-0.226373,0.754933,-0.307585,0.415925,-0.535485,-0.687954,-1.150696,-0.07003,0.226216


In [89]:
df_model.drop(columns=['track_name', 'artists', 'album_name_x', 'album_name'], inplace=True)
song_df = df[['track_id', 'track_name', 'artists', 'album_name']]
df_model = pd.merge(df_model, song_df, on='track_id', how='left')
df_model.head()

Unnamed: 0,track_id,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_name,artists,album_name
0,5SuOikwiRyPMVoIQDJUgSV,1.933925,0.013472,-0.306447,0.644253,-0.675975,-1.203275,0.335727,-1.324621,0.490458,-0.875166,-0.535482,0.723656,0.934047,-1.133599,0.226216,Comedy,Gen Hoshino,Comedy
1,4qPNDBW1i3p13qLCt0Ki3A,1.059312,-0.704186,-0.306447,-0.804604,-1.825602,-1.203275,-1.673087,0.754933,-0.098364,1.76081,-0.535468,-0.595078,-0.770269,-1.479843,0.226216,Ghost - Acoustic,Ben Woodward,Ghost (Acoustic)
2,1iJBSr7s7jYXzM8EGcbK5b,1.156491,-0.162188,-0.306447,-0.702731,-1.073473,-1.484183,-0.236524,0.754933,-0.280219,-0.349626,-0.535485,-0.512978,-1.329497,-1.518259,0.226216,To Begin Again,Ingrid Michaelson;ZAYN,To Begin Again
3,6lfxq3CG4xtTiEg7opyCyx,1.836746,-0.240925,-0.306447,-1.676182,-2.240247,-1.484183,-1.918228,0.754933,-0.45148,1.70465,-0.535266,-0.436009,-1.241999,1.981635,-1.979174,Can't Help Falling In Love,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...
4,5vjLSffimiIP26QG5WcN2K,2.371232,-0.268195,-0.306447,0.315996,-0.746122,-0.922368,-0.226373,0.754933,-0.307585,0.415925,-0.535485,-0.687954,-1.150696,-0.07003,0.226216,Hold On,Chord Overstreet,Hold On


In [123]:
# define the number of nearest neighbors to consider
k = 6

# define a function to recommend songs based on a given song name
def recommend_song(track_name, data_df, numeric_columns):
    
    # initialize the model with the number of neighbors
    model = NearestNeighbors(n_neighbors=k)

    # fit the model to the data
    model.fit(df_model[numeric_columns])
    
    # get the track_id of the given track name
    track_id = df_model[df_model['track_name'] == track_name]['track_id'].iloc[0]
    
    # get the index of the tracks in the model dataframe
    idx = df_model[df_model['track_id'] == track_id].index[0]
    
    # get the features of the tracks
    track_features = df_model.loc[idx, numeric_columns].values.reshape(1, -1)
    
    # find the k nearest neighbors
    distances, indices = model.kneighbors(track_features)
    
    # get the track names of the nearest neighbors
    tracks = df_model.iloc[indices[0]]
    tracks["distance"] = distances[0]
    
    return tracks

In [124]:
recommend_song('2055', df_model, numeric_columns)

Unnamed: 0,track_id,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_name,artists,album_name,distance
71211,4XvcHTUfIlWfyJTRG0aqlo,2.322642,-0.931057,3.263203,1.510171,-0.477226,0.201262,0.504453,-1.324621,0.878886,0.483908,-0.535485,-0.497584,0.641118,-1.379502,0.226216,2055,Sleepy Hallow,Still Sleep?,0.0
16986,0Dfctnkk9xIikDKJ06uwzo,1.496618,-0.581268,3.263203,1.504511,-0.223918,0.201262,0.161447,-1.324621,0.622877,0.800178,-0.535485,-0.695651,0.4471,-1.185993,0.226216,She's So Nice,Pink Guy,Pink Season,1.125346
29843,39LJCSHy7nB1akjmhMQ7Y7,1.982515,-0.363923,3.263203,1.283787,-0.488917,0.201262,-0.024898,-1.324621,1.673399,0.581449,-0.535461,-0.63305,0.534598,-1.562187,0.226216,Pain,Josh A,Fearless,1.213574
57102,6QNphn62YKaHG1hrnPSoHi,2.031104,0.004388,3.263203,1.470554,-0.173256,0.201262,0.094608,-1.324621,0.817091,0.297693,-0.535485,-0.589947,-0.328973,-1.064602,0.226216,A Gangster's Wife,Ms Krazie;Chino Grande,Smile Now Cry Never,1.519623
14674,6GehOJs060WzAi78QArj7p,1.205081,-0.494969,3.263203,0.82536,-0.356417,0.482169,0.935556,-1.324621,0.719984,0.241533,-0.535326,-0.717715,0.644922,-1.065233,0.226216,Broken Love,GEMINI,Going,1.555207
75083,69FiPptnZd9ovg2tEJgBkN,1.25367,-0.699475,3.263203,1.091361,-0.348623,-0.079646,0.283826,-1.324621,0.366867,-0.228438,-0.535485,-0.512978,0.1808,-1.064303,0.226216,Feel Good (feat. Lil Tjay),Fresco Trey;Lil Tjay,Feel Good (feat. Lil Tjay),1.611501


In [120]:
# define the number of nearest neighbors to consider
k = 6

# define a function to recommend songs based on a given song name
def recommend_song_artist(track_name, artist, df_model, numeric_columns):
    
    train = df_model.loc[df_model.artists == artist]
    
    k = min(len(train), 6)
    
    # initialize the model with the number of neighbors
    model = NearestNeighbors(n_neighbors=k)

    # fit the model to the data
    model.fit(train[numeric_columns])
    
    # get the track_id of the given track name
    track_id = df_model[df_model['track_name'] == track_name]['track_id'].iloc[0]
    
    # get the index of the tracks in the model dataframe
    idx = df_model[df_model['track_id'] == track_id].index[0]
    
    # get the features of the tracks
    track_features = df_model.loc[idx, numeric_columns].values.reshape(1, -1)
    
    # find the k nearest neighbors
    distances, indices = model.kneighbors(track_features)
    
    # get the track names of the nearest neighbors
    tracks = train.iloc[indices[0]]
    tracks["distance"] = distances[0]
    
    return tracks

In [122]:
recommend_song_artist('The Hills', 'The Weeknd', df_model, numeric_columns)

Unnamed: 0,track_id,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_name,artists,album_name,distance
67642,4g7118rCobKcU4XQXSFlBS,-1.467349,0.116301,3.263203,0.016037,-0.325241,-1.484183,0.285741,-1.324621,-0.165456,-0.720578,-0.535485,-0.40009,-1.310476,-0.301424,0.226216,The Hills,The Weeknd,Drippy Drippy,0.0
67657,0MnmtOOke4uQVBMAjIgwVZ,-1.613118,1.175806,-0.306447,0.502763,-0.262888,-0.079646,0.489897,-1.324621,-0.492089,-0.694271,-0.504884,-0.477059,-1.26102,-0.435134,0.226216,After Hours,The Weeknd,Halloween Party 2022,4.034348
67846,7fBv7CLKzipRk6EC6TWHOB,2.662769,0.116062,3.263203,0.129229,-0.27458,-1.484183,0.275016,-1.324621,-0.317296,-0.772009,-0.535485,-0.420615,-1.264825,-0.30066,0.226216,The Hills,The Weeknd,Beauty Behind The Madness,4.135405
67658,52KDAbgFGCXZQVlOXy2XIQ,-1.370169,-0.132404,-0.306447,0.378251,0.500932,-1.484183,0.808197,-1.324621,-0.372029,-0.249129,-0.535485,0.487618,1.291649,-0.964062,0.226216,Out of Time,The Weeknd,LUGNA HITS,4.707816
67660,2eyXrWRxc7g5yx0D4J3WTS,-1.32158,-0.132404,-0.306447,0.378251,0.500932,-1.484183,0.808197,-1.324621,-0.372029,-0.249129,-0.535485,0.487618,1.291649,-0.964062,0.226216,Out of Time,The Weeknd,HÃST,4.70907
67672,5Dt9HFzeBVmFtrk3BTiDEn,-1.467349,-0.244113,-0.306447,-0.283921,0.629534,-1.203275,0.847267,0.754933,-0.202533,-0.965997,-0.534839,-0.632023,-0.477339,1.625497,0.226216,Blinding Lights,The Weeknd,Best R&B Tunes,4.807824
