In [1]:
import sys
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KDTree

In [2]:
# Load the two Spotify audio-feature snapshots (April 2019 first, then November 2018).
df_april_19, df_nov_18 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/SpotifyAudioFeaturesApril2019.csv',
        '../data/SpotifyAudioFeaturesNov2018.csv',
    )
)

In [3]:
# (rows, columns) of each snapshot, April 2019 first.
tuple(snapshot.shape for snapshot in (df_april_19, df_nov_18))

((130663, 17), (116372, 17))

In [4]:
# Stack both snapshots on top of each other under a fresh 0..n-1 index.
df = pd.concat(
    [df_april_19, df_nov_18],
    ignore_index=True,
)
print(df.shape)
# Sanity check: concatenation must not add or lose any rows.
assert len(df) == len(df_april_19) + len(df_nov_18)

(247035, 17)


In [5]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
0,YG,2RM4jf1Xa9zPgMGRDiht8O,"Big Bank feat. 2 Chainz, Big Sean, Nicki Minaj",0.00582,0.743,238373,0.339,0.0,1,0.0812,-7.678,1,0.409,203.927,4,0.118,15
1,YG,1tHDG53xJNGsItRA3vfVgs,BAND DRUM (feat. A$AP Rocky),0.0244,0.846,214800,0.557,0.0,8,0.286,-7.259,1,0.457,159.009,4,0.371,0
2,R3HAB,6Wosx2euFPMT14UXiWudMy,Radio Silence,0.025,0.603,138913,0.723,0.0,9,0.0824,-5.89,0,0.0454,114.966,4,0.382,56
3,Chris Cooq,3J2Jpw61sO7l6Hc7qdYV91,Lactose,0.0294,0.8,125381,0.579,0.912,5,0.0994,-12.118,0,0.0701,123.003,4,0.641,0
4,Chris Cooq,2jbYvQCyPgX3CdmAzeVeuS,Same - Original mix,3.5e-05,0.783,124016,0.792,0.878,7,0.0332,-10.277,1,0.0661,120.047,4,0.928,0


In [6]:
# Summarise column dtypes, non-null counts and memory usage of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247035 entries, 0 to 247034
Data columns (total 17 columns):
artist_name         247035 non-null object
track_id            247035 non-null object
track_name          247035 non-null object
acousticness        247035 non-null float64
danceability        247035 non-null float64
duration_ms         247035 non-null int64
energy              247035 non-null float64
instrumentalness    247035 non-null float64
key                 247035 non-null int64
liveness            247035 non-null float64
loudness            247035 non-null float64
mode                247035 non-null int64
speechiness         247035 non-null float64
tempo               247035 non-null float64
time_signature      247035 non-null int64
valence             247035 non-null float64
popularity          247035 non-null int64
dtypes: float64(9), int64(5), object(3)
memory usage: 32.0+ MB


In [7]:
# Show every row that repeats an earlier row exactly (the first occurrence is not flagged).
df.loc[df.duplicated(keep='first')]

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
130714,Dirty Projectors,7B1he5MaIXjreL2R4qOa6h,Blue Bird,0.520000,0.530,229333,0.521,0.028900,8,0.2090,-7.939,0,0.1520,117.259,3,0.5290,37
130862,Day6,5sUQ5UfwoQWlfVprZ8dNmv,원하니까 Still,0.076100,0.465,251854,0.725,0.000000,0,0.1410,-3.495,1,0.0380,179.872,4,0.3200,50
130874,Gloria Trevi,47Y8o48cWHjMR0YyVccX8J,Que Me Duela,0.096000,0.717,202320,0.814,0.000217,11,0.0631,-5.744,0,0.0665,93.021,4,0.5350,52
130895,Arcangel,7CvDGxMqOjJOrFtLmPA53H,En Su Boca,0.528000,0.845,189987,0.665,0.000131,11,0.1560,-4.958,1,0.2030,128.069,4,0.2510,45
130903,Poppy,1HExJgzh18hPEaZIUmboXC,Metal,0.000561,0.710,218621,0.746,0.294000,6,0.1140,-4.707,1,0.0353,125.016,4,0.8620,41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246868,The Revivalists,79FWUNlVtSJ67iR9MK4OBH,Next To You,0.077900,0.607,209760,0.767,0.000156,11,0.2710,-4.724,1,0.0238,94.991,4,0.8090,47
246911,Pale Waves,5StW9VA75z8AUtMnkY3ZnM,Eighteen,0.000710,0.569,180803,0.827,0.039100,7,0.1400,-4.742,1,0.0300,126.015,4,0.1480,45
246930,Alkaline,12LYkTkGduz0lXb1EJDc1T,Juggernaut,0.087000,0.774,180620,0.781,0.000000,6,0.6380,-3.239,1,0.0628,101.040,4,0.7140,54
246998,Appleby,6inzFkThegOcsjxWgn0AP8,Young Lost Love,0.945000,0.292,239658,0.321,0.389000,11,0.6910,-10.840,1,0.0394,75.959,4,0.0393,45


In [14]:
# Keep the first occurrence of each fully identical row; original index labels are preserved.
no_dup_df = df.drop_duplicates(keep='first')
# Guard against the source files changing underneath this notebook.
assert len(no_dup_df) == 239662

In [15]:
# Summarise column dtypes, non-null counts and memory usage after de-duplication.
no_dup_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 239662 entries, 0 to 247034
Data columns (total 17 columns):
artist_name         239662 non-null object
track_id            239662 non-null object
track_name          239662 non-null object
acousticness        239662 non-null float64
danceability        239662 non-null float64
duration_ms         239662 non-null int64
energy              239662 non-null float64
instrumentalness    239662 non-null float64
key                 239662 non-null int64
liveness            239662 non-null float64
loudness            239662 non-null float64
mode                239662 non-null int64
speechiness         239662 non-null float64
tempo               239662 non-null float64
time_signature      239662 non-null int64
valence             239662 non-null float64
popularity          239662 non-null int64
dtypes: float64(9), int64(5), object(3)
memory usage: 32.9+ MB


In [17]:
# Persist the de-duplicated frame; the index is not written to the file.
no_dup_df.to_csv(
    path_or_buf='../data/processed_df.csv',
    index=False,
)

In [18]:
# Wrap the whole nearest-neighbour search in one function so that different
# neighbour counts (and, later, different models) can be tried easily.
def find_nearest_songs(df, number_of_songs):
    """Find the most similar tracks for every track in ``df``.

    Every column other than ``track_id``, ``track_name`` and ``artist_name``
    is standardised with ``StandardScaler`` and indexed in a ``KDTree``; the
    tree is then queried with each row of that same scaled matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``track_id``, ``track_name`` and ``artist_name``. All
        remaining columns are used as features and must be numeric.
    number_of_songs : int
        Number of neighbours to return per track. Must be at least 1 and
        smaller than the number of rows in ``df``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` (with a default integer index) holding
        track IDs in the columns ``Searched_Song``, ``Nearest_Song1`` ...
        ``Nearest_Song<number_of_songs>``, ordered by increasing distance.

    Raises
    ------
    ValueError
        If ``number_of_songs`` is not in ``1 <= number_of_songs < len(df)``.
    """
    n_tracks = len(df)
    if not 1 <= number_of_songs < n_tracks:
        raise ValueError(
            'number_of_songs must be between 1 and {} (got {!r})'.format(
                n_tracks - 1, number_of_songs
            )
        )

    # Remove the identifying (non-numeric) columns; what is left are the features.
    features = df.drop(columns=['track_id', 'track_name', 'artist_name'])

    # Standardise so that no single feature dominates the distance purely
    # because of its scale (e.g. duration_ms versus the 0-1 audio features).
    features_scaled = StandardScaler().fit_transform(features)

    # Build the spatial index that is queried below.
    tree = KDTree(features_scaled)

    # Ask for one extra neighbour: every query row is itself in the tree at
    # distance 0 and fills the 'Searched_Song' column.
    # NOTE(review): rows with identical scaled features tie at distance 0, so
    # for such rows the first column may hold the twin's track ID instead.
    _, neighbor_idx = tree.query(features_scaled, k=number_of_songs + 1)

    columns = ['Searched_Song'] + [
        'Nearest_Song{}'.format(rank) for rank in range(1, number_of_songs + 1)
    ]

    # `neighbor_idx` holds positional row numbers, so index a plain array of
    # track IDs (not the DataFrame, whose index labels may have gaps).
    song_ids = np.asarray(df['track_id'])
    return pd.DataFrame(song_ids[neighbor_idx], columns=columns)

In [None]:
# this takes a good two to three minutes to process
# Search the de-duplicated frame: `df` still contains exact duplicate rows,
# which sit at distance 0 from each other and would crowd out real neighbours.
nearest_songs_df = find_nearest_songs(no_dup_df, 5)
nearest_songs_df.head()

In [None]:
# predict function



In [None]:
# From here, if we load both the new dataframe and the original one into a SQL database, we can simply
# run JOIN ... ON queries to match the song IDs with track_name, artist_name, and any other info we want to display