# Exploratory Data Analysis - Spotify Track Similarity

In [1]:
import pandas as pd
import pickle
import numpy as np

from ast import literal_eval
from sklearn.neighbors import KDTree
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Load dataset.
df = pd.read_csv('merged_with_genres.csv', converters={'genres': eval})

In [3]:
# Check dimensions.
df.shape

(130989, 19)

In [4]:
# Verify no duplicate tracks are included.
df[df.duplicated(subset='track_id', keep=False)]

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity,artist_id,genres


In [5]:
# Examine summary statistics.
df.describe(include='all')

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity,artist_id,genres
count,130989,130989,130989,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989,130989
unique,34582,130989,108697,,,,,,,,,,,,,,,34818,8784
top,Johann Sebastian Bach,20f5Dj5aFizOtfey91cuSa,Intro,,,,,,,,,,,,,,,5aIqB5nVVvmFsvSdExz408,[]
freq,3635,1,110,,,,,,,,,,,,,,,3633,37606
mean,,,,0.342467,0.581478,212676.7,0.569244,0.223863,5.232974,0.194919,-9.971764,0.607959,0.111966,119.467271,3.878891,0.439681,24.242608,,
std,,,,0.345645,0.190024,123103.6,0.260368,0.360274,3.602824,0.167789,6.545915,0.488208,0.124295,30.154507,0.514536,0.259076,19.727393,,
min,,,,0.0,0.0,3203.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,,
25%,,,,0.0316,0.459,164000.0,0.396,0.0,2.0,0.0975,-11.894,0.0,0.0389,96.014,4.0,0.224,7.0,,
50%,,,,0.203,0.605,201931.0,0.603,0.000146,5.0,0.124,-7.973,1.0,0.0558,120.026,4.0,0.42,22.0,,
75%,,,,0.636,0.727,241054.0,0.776,0.439,8.0,0.236,-5.681,1.0,0.129,139.621,4.0,0.638,38.0,,


In [6]:
# Check for null values.
df.isna().sum()

artist_name         0
track_id            0
track_name          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
speechiness         0
tempo               0
time_signature      0
valence             0
popularity          0
artist_id           0
genres              0
dtype: int64

In [7]:
# Display most common genre combinations.
# Note that [] is null-equivalent.
df['genres'].value_counts().head(30)

[]                                                                                                   37606
[baroque, classical, early music, german baroque]                                                     3638
[classical, classical era]                                                                            1877
[edm, progressive house, progressive trance, trance]                                                  1130
[classical, classical era, early romantic era]                                                        1071
[chillhop, lo-fi beats]                                                                                766
[lo-fi beats]                                                                                          664
[classical, early romantic era, polish classical]                                                      624
[k-pop, k-pop boy group]                                                                               615
[sleep]                              

In [8]:
df['genres_str'] = df['genres'].astype(str)

## High 'speechiness' tracks
Per the Spotify API: "Values above 0.66 describe tracks that are probably made entirely of spoken words."

In [9]:
# Explore genres for tracks with high speechiness values.
df['genres'][df['speechiness'] > 0.66].value_counts()

[]                                                                                                                                                                   319
[classical, classical era]                                                                                                                                            49
[edm, progressive house, progressive trance, trance]                                                                                                                  33
[spanish comedy]                                                                                                                                                      17
[heartland rock, mellow gold, permanent wave, rock, singer-songwriter]                                                                                                16
[comedy]                                                                                                                                                   

In [10]:
# Could consider tagging these with a 'spoken word' genre.
df[(df['speechiness'] > 0.66) & (df['genres_str'] == '[]')].sample(10)

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity,artist_id,genres,genres_str
62371,Santo Subito,70mRaXPHfZJZrffGMF6r8g,Rhythm Royal (Interlude),0.867,0.783,31067,0.246,0.0,1,0.123,-15.373,1,0.925,93.265,4,0.426,0,6WSbT4shbUEzP6aJBzeKwY,[],[]
58566,Disvstxr,5sEeuhYWHNczuPKek1zJeG,Tea Time,0.957,0.74,106684,0.65,0.874,10,0.151,-11.708,0,0.7,80.021,4,0.624,23,6K5Qi0w4g7bhoADUV83NiR,[],[]
48834,Cashmatics,16Mdyb2btVbRH4KYlRyq9b,Break It Down,0.264,0.873,226743,0.416,0.0,1,0.111,-9.224,1,0.667,139.954,4,0.389,0,316f7p2uxXrNKSsIDJ2IfQ,[],[]
47231,Samson Oxmoor,78iiLY9PIbcx2eNw2QqfKq,The Stag in the Ox Stall,0.339,0.693,77217,0.0422,0.0,11,0.233,-39.223,0,0.747,69.296,3,0.438,2,6dInHtNpbGVRh2BouvzrGW,[],[]
43940,DopemanBeats,35g5EkYJivixw7CGF4T8yQ,Diamond Link Im Rocking,0.00808,0.798,97907,0.841,0.986,0,0.0215,-7.398,0,0.75,123.651,4,0.76,0,1awhhcLGhrYXQoHzjfOqtY,[],[]
17781,Mickey Avalon,3FR4Z6gL9bylVmP2E6ueW7,There Was a Little Man,0.39,0.767,28615,0.543,0.0,11,0.273,-9.359,0,0.936,92.623,3,0.606,0,546WiMGysEqWZTzP8hJvB2,[],[]
20831,OG,6JfOly18Ndo4EUqgbV9bx3,News From OG (Skit),0.866,0.711,32232,0.0737,0.0,3,0.198,-22.795,1,0.869,128.943,3,0.696,0,2x8QwFVWgig0KGaWHtRxGh,[],[]
38841,Domani,1eSBNC7wbMNY2brxaTfxYl,Still Got Love,0.34,0.391,241659,0.487,0.0,3,0.309,-15.801,1,0.706,78.916,1,0.255,23,0uFa64czAQ5cVJa3V0PfRq,[],[]
85746,Woonun,1Pxp2oXhY6N9IuGWpNmqFQ,Aqua Frost,0.259,0.839,147122,0.624,0.0,4,0.0763,-9.325,0,0.675,159.856,4,0.428,0,0lzkwvrFf0VW9ISMcSH4GF,[],[]
94981,Teefy Bey,3l1tOHvPMCphNBVnWzMV2c,Smacking Rappers,0.283,0.667,210287,0.589,0.0,1,0.0669,-6.532,1,0.789,117.386,5,0.514,36,7uLZg5YzE6NnKqiuTORK3T,[],[]


## Candidates for broader/combination genre categories

In [11]:
# Christian
df['genres'][df['genres_str'].str.contains('christian')].value_counts()

[christian hip hop, christian trap]                                                                                                             228
[christian relaxative]                                                                                                                           97
[anthem worship, ccm, christian alternative rock, christian music, indiecoustica, world worship, worship]                                        91
[anthem worship, ccm, christian alternative rock, christian music, world worship, worship]                                                       89
[christian hip hop, christian pop, christian trap]                                                                                               84
[christian pop]                                                                                                                                  70
[anthem worship, ccm, christian music, deep ccm, world worship, worship]                                        

In [12]:
# Classical
df['genres'][df['genres_str'].str.contains('classical')].value_counts()

[baroque, classical, early music, german baroque]                                                                                                                  3638
[classical, classical era]                                                                                                                                         1877
[classical, classical era, early romantic era]                                                                                                                     1071
[classical, early romantic era, polish classical]                                                                                                                   624
[classical, post-romantic era]                                                                                                                                      479
[baroque, classical, early music, italian baroque]                                                                                                              

In [13]:
# Country
df['genres'][df['genres_str'].str.contains('country')].value_counts()

[country pop]                                                                                                                                                                                                         221
[country rap, redneck]                                                                                                                                                                                                174
[contemporary country, country, country road, modern country rock]                                                                                                                                                    173
[contemporary country]                                                                                                                                                                                                137
[contemporary country, country pop]                                                                                             

In [14]:
# House
df['genres'][df['genres_str'].str.contains('house')].value_counts()

[edm, progressive house, progressive trance, trance]                                                                                                                                           1130
[edm, progressive house, progressive trance, trance, uplifting trance]                                                                                                                          445
[progressive house, progressive trance, trance, uplifting trance]                                                                                                                               258
[progressive house, progressive trance, progressive uplifting trance, trance, uplifting trance]                                                                                                 169
[big room, dance pop, edm, electro house, pop, tropical house]                                                                                                                                   89
[deep uplifting tran

In [15]:
# Jazz
df['genres'][df['genres_str'].str.contains('jazz')].value_counts()

[chillhop, jazz boom bap, lo-fi beats]                                                                                                                                                                240
[chillhop, jazz boom bap]                                                                                                                                                                              82
[adult standards, jazz blues, soul, swing, vocal jazz]                                                                                                                                                 45
[jazz blues, neo soul, soul, soul jazz, torch song, vocal jazz]                                                                                                                                        41
[electro swing, nu jazz]                                                                                                                                                                        

In [16]:
# Hip hop
df['genres'][df['genres_str'].str.contains('hip hop')].value_counts()

[atl hip hop, dirty south rap, gangster rap, hip hop, pop rap, rap, southern hip hop, trap music]                                      273
[christian hip hop, christian trap]                                                                                                    228
[deep underground hip hop]                                                                                                             218
[dark trap, emo rap, underground hip hop]                                                                                              135
[latin, latin hip hop, reggaeton, reggaeton flow, tropical]                                                                            124
[underground hip hop]                                                                                                                  122
[chicago drill, chicago rap, drill, hip hop, pop rap, rap, southern hip hop, trap music, underground hip hop, vapor trap]              119
[underground hip hop, vapor

In [17]:
# Pop
df['genres'][df['genres_str'].str.contains('pop')].value_counts()

[k-pop, k-pop boy group]                                                                                                            615
[regional mexican pop]                                                                                                              563
[corrido, regional mexican pop]                                                                                                     401
[atl hip hop, dirty south rap, gangster rap, hip hop, pop rap, rap, southern hip hop, trap music]                                   273
[dance pop, pop, post-teen pop]                                                                                                     238
[k-pop, k-pop girl group]                                                                                                           234
[country pop]                                                                                                                       221
[pop]                                           

In [18]:
# Metal
df['genres'][df['genres_str'].str.contains('metal')].value_counts()

[melodic metalcore, metalcore, post-screamo, screamo]                                                                                                                                                         56
[metalcore]                                                                                                                                                                                                   37
[album rock, glam metal, hard rock, nwobhm, rock]                                                                                                                                                             33
[metallic hardcore]                                                                                                                                                                                           33
[alternative metal, nu metal, post-grunge, rock]                                                                                                                    

In [19]:
# Rap
df['genres'][df['genres_str'].str.contains('rap')].value_counts()

[emo rap]                                                                                                                    352
[atl hip hop, dirty south rap, gangster rap, hip hop, pop rap, rap, southern hip hop, trap music]                            273
[cali rap, west coast trap]                                                                                                  268
[vapor trap]                                                                                                                 254
[christian hip hop, christian trap]                                                                                          228
[country rap, redneck]                                                                                                       174
[cali rap]                                                                                                                   159
[cali rap, hyphy, west coast trap]                                                               

In [20]:
# Rock
df['genres'][df['genres_str'].str.contains('rock')].value_counts()

[contemporary country, country, country road, modern country rock]                                                                                                                                                                                                                     173
[adult standards, brill building pop, folk, folk rock, mellow gold, rock, singer-songwriter, soft rock, yacht rock]                                                                                                                                                                    138
[indie garage rock]                                                                                                                                                                                                                                                                    130
[album rock, classic rock, folk rock, heartland rock, mellow gold, rock, soft rock, yacht rock]                                                        

In [21]:
# Techno
df['genres'][df['genres_str'].str.contains('techno')].value_counts()

[big room, edm, german techno, progressive house, progressive trance, trance, uplifting trance]                                                                                                39
[edm, german techno, progressive house, progressive trance, trance, uplifting trance]                                                                                                          21
[acid house, chicago house, deep house, float house, hip house, techno]                                                                                                                        18
[electronica, frankfurt electronic, german techno, microhouse, minimal techno, tech house]                                                                                                     18
[ambient techno, electronica, float house, microhouse, minimal techno, shiver pop]                                                                                                             16
[ambient techno]              

## Scaling

In [22]:
features = ['acousticness', 
            'danceability', 
            'energy', 
            'instrumentalness', 
            'key', 
            'liveness',
            'loudness', 
            'mode', 
            'speechiness', 
            'tempo', 
            'time_signature', 
            'valence',
            'popularity']

In [23]:
scaler = MinMaxScaler()
df_scaled = df.copy()
df_scaled[features] = scaler.fit_transform(df[features])

## One-hot encoding

In [24]:
# One-hot encode default genres.
mlb = MultiLabelBinarizer()
df_encoded = df_scaled.join(pd.DataFrame(mlb.fit_transform(df_scaled.pop('genres')),
                                         columns=mlb.classes_,
                                         index=df_scaled.index))

In [25]:
# Add super genres.
df_encoded['blues_super'] = df['genres_str'].str.contains('blues').astype(int)
df_encoded['classical_super'] = df['genres_str'].str.contains('classical').astype(int)
df_encoded['country_super'] = df['genres_str'].str.contains('country').astype(int)
df_encoded['folk_super'] = df['genres_str'].str.contains('americana|bluegrass|folk').astype(int)
df_encoded['house_super'] = df['genres_str'].str.contains('house').astype(int)
df_encoded['indie_super'] = df['genres_str'].str.contains('indie').astype(int)
df_encoded['jazz_super'] = df['genres_str'].str.contains('jazz').astype(int)
df_encoded['latin_super'] = df['genres_str'].str.contains('banda|columbian|cumbia|'
                                                          'dominican|espanol|grupera|latin|'
                                                          'mexican|norteno|puerto rican|'
                                                          'ranchera|reggaeton|tejano|tex-mex|uruguayan').astype(int)
df_encoded['metal_super'] = df['genres_str'].str.contains('metal|screamo').astype(int)
df_encoded['rap_super'] = df['genres_str'].str.contains('hop|rap').astype(int)
df_encoded['rock_super'] = df['genres_str'].str.contains('rock').astype(int)
df_encoded['spoken_word_super'] = (df['speechiness'] > 0.66).astype(int)
df_encoded['techno_super'] = df['genres_str'].str.contains('techno').astype(int)
df_encoded['worship_super'] = df['genres_str'].str.contains('ccm|christian|gospel|worship').astype(int)

In [26]:
df_encoded.shape

(130989, 2589)

In [27]:
df_encoded.columns

Index(['artist_name', 'track_id', 'track_name', 'acousticness', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness',
       ...
       'house_super', 'indie_super', 'jazz_super', 'latin_super',
       'metal_super', 'rap_super', 'rock_super', 'spoken_word_super',
       'techno_super', 'worship_super'],
      dtype='object', length=2589)

## Feature reduction

In [28]:
genre_features = list(df_encoded.columns[19:])
print(genre_features)

['21st century classical', '432hz', '8-bit', 'a cappella', 'aarhus indie', 'aberdeen indie', 'abstract', 'abstract beats', 'abstract hip hop', 'abstract idm', 'abstractro', 'accordion', 'acid house', 'acid jazz', 'acid techno', 'acousmatic', 'acoustic blues', 'acoustic chill', 'acoustic opm', 'acoustic pop', 'acoustic punk', 'adelaide indie', 'adoracao', 'adoracion', 'adult standards', 'adventista', 'afghan pop', 'african electronic', 'african gospel', 'african percussion', 'african rock', 'afro dancehall', 'afro house', 'afro psych', 'afro-funk', 'afrobeat', 'afropop', 'aggrotech', 'alabama indie', 'alabama metal', 'alabama rap', 'alaska indie', 'albanian hip hop', 'albanian pop', 'albany ny indie', 'alberta country', 'alberta hip hop', 'album rock', 'albuquerque indie', 'alt-idol', 'alternative americana', 'alternative ccm', 'alternative country', 'alternative dance', 'alternative emo', 'alternative hip hop', 'alternative metal', 'alternative metalcore', 'alternative pop', 'alternati

In [29]:
# Keep only genres with at least 1000 tagged tracks.
df_reduced = df_encoded.drop([col for col, val in df_encoded[genre_features].sum().iteritems() if val < 1000], axis=1)

In [30]:
print(df_reduced.columns)

Index(['artist_name', 'track_id', 'track_name', 'acousticness', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness',
       'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence',
       'popularity', 'artist_id', 'genres_str', 'alternative r&b',
       'atl hip hop', 'banda', 'baroque', 'big room', 'brostep', 'cali rap',
       'ccm', 'chamber pop', 'chillhop', 'classical', 'classical era',
       'contemporary country', 'dance pop', 'early music',
       'early romantic era', 'edm', 'electro house', 'electropop', 'emo rap',
       'folk-pop', 'gangster rap', 'german baroque', 'grupera', 'hip hop',
       'indie folk', 'indie pop', 'indie poptimism', 'indie r&b', 'indie rock',
       'indie soul', 'indietronica', 'k-pop', 'latin', 'lo-fi beats',
       'mellow gold', 'melodic rap', 'modern rock', 'neo mellow', 'norteno',
       'pop', 'pop edm', 'pop rap', 'pop rock', 'post-teen pop',
       'progressive house', 'progressive trance', 

## Weight features

In [31]:
df_reduced['mode'] = df_encoded['mode']*100
df_reduced['valence'] = df_encoded['valence']*100
df_reduced['speechiness'] = df_encoded['speechiness']*100

In [32]:
super_genres = ['classical_super', 'country_super',
       'folk_super', 'house_super', 'jazz_super', 'latin_super', 'metal_super',
       'rap_super', 'rock_super', 'worship_super']
df_reduced[super_genres] = df_encoded[super_genres]*500

## Build tree

In [33]:
expanded_features = list(df_reduced.columns)

In [34]:
exclude_features = ['artist_id', 'artist_name', 'track_id', 'track_name', 'genres_str']
for feature in exclude_features:
    expanded_features.remove(feature)

In [35]:
for feature in expanded_features:
    print(feature)

acousticness
danceability
duration_ms
energy
instrumentalness
key
liveness
loudness
mode
speechiness
tempo
time_signature
valence
popularity
alternative r&b
atl hip hop
banda
baroque
big room
brostep
cali rap
ccm
chamber pop
chillhop
classical
classical era
contemporary country
dance pop
early music
early romantic era
edm
electro house
electropop
emo rap
folk-pop
gangster rap
german baroque
grupera
hip hop
indie folk
indie pop
indie poptimism
indie r&b
indie rock
indie soul
indietronica
k-pop
latin
lo-fi beats
mellow gold
melodic rap
modern rock
neo mellow
norteno
pop
pop edm
pop rap
pop rock
post-teen pop
progressive house
progressive trance
ranchera
rap
regional mexican
regional mexican pop
rock
sleep
soft rock
southern hip hop
stomp and holler
trance
trap music
tropical house
underground hip hop
uplifting trance
vapor trap
classical_super
country_super
folk_super
house_super
indie_super
jazz_super
latin_super
metal_super
rap_super
rock_super
worship_super


In [36]:
k_tree = KDTree(df_reduced[expanded_features], metric='euclidean', leaf_size=50)

### Test track 1

In [37]:
dist, ind = k_tree.query(np.array(df_reduced[expanded_features].loc[0]).reshape(1, -1), k=10)  

In [38]:
print(dist)

[[  0.         146.54020055 179.4839407  200.33967907 247.16559284
  306.00194885 320.00644406 561.73972463 566.83798089 566.83822606]]


In [39]:
print(ind)

[[     0 106304  46022 109402 113692  21871  31569  25228  30942  83448]]


In [40]:
df.loc[ind[0]][['artist_name', 'track_name', 'genres']]

Unnamed: 0,artist_name,track_name,genres
0,Frédéric Chopin,"バラード 第 1番 ト短調, 作品 23","[classical, early romantic era, polish classical]"
106304,Wolfgang Amadeus Mozart,"Don Giovanni, K. 527, Act II: Sola, sola in bu...","[classical, classical era]"
46022,Pyotr Ilyich Tchaikovsky,"18 Pieces, Op. 72: II. Berceuse","[classical, late romantic era, russian romanti..."
109402,Frédéric Chopin,"Piano Concerto No. 2 in F Minor, Op. 21: III. ...","[classical, early romantic era, polish classical]"
113692,Claude Debussy,"Fantaisie for Piano and Orchestra, L. 73: Fant...","[classical, post-romantic era]"
21871,William Walton,Walton: Viola Concerto: I. Andante comodo,"[british modern classical, classical, early mo..."
31569,Max Bruch,"Violin Concerto No. 1 in G Minor, Op. 26: I. V...","[classical, late romantic era]"
25228,Lena Raine,Golden,[video game music]
30942,Silent Knights,Open Kitchen Fire (Long With Fade),[sleep]
83448,Dan Forrest,Entreat Me Not to Leave You,[]


## Test track 2

In [41]:
dist, ind = k_tree.query(np.array(df_reduced[expanded_features].loc[2]).reshape(1, -1), k=10)  

In [42]:
df.loc[ind[0]][['artist_name', 'track_name', 'track_id', 'genres']]

Unnamed: 0,artist_name,track_name,track_id,genres
2,Neffex,Take Me Away,6HXfoTMOdKlN0IoaM9LkTa,[bass trap]
94530,Neffex,Take Me Away,2VcqW62pUjTP3f1XAkEh0h,[bass trap]
15263,OMB Peezy,Pressure (feat. TK Kravitz),4W9Vg3xIgxeORxIlKF3tKZ,"[alabama rap, baton rouge rap, cali rap, deep ..."
14321,B.o.B,Bdtdt,63GqKGTUvtkWRpes8L5wUO,"[atl hip hop, dance pop, hip hop, pop, pop rap..."
116409,Bailo,Tear It Down,5O0HssN2SXcwqFobAJRNu0,"[bass trap, electronic trap]"
3262,Lud Foe,Suffer,3OgyqiAwc4Ie5dPTFLzX2c,"[chicago rap, detroit trap, southern hip hop, ..."
124745,Gucci Mane,Kept Back (feat. Lil Pump) - Bonus Track,384Oc8MQzITdL2DGyCeivN,"[atl hip hop, dirty south rap, hip hop, pop, p..."
119897,Jaydayoungan,Ain't Want To,4rU63HINv0AYi9jm90WJaX,"[melodic rap, southern hip hop, trap music]"
126078,YoungBoy Never Broke Again,Big,4vm6LbfJ3SXVn9LmPH7khh,"[baton rouge rap, trap music]"
125276,Beau Young Prince,Let Go,5F2AuFzEiuAQD0JVNjVJQz,[dmv rap]


## Pickle tree

In [43]:
pickle.dump(k_tree, open('tree.p', 'wb'))

## Unpickle tree

In [44]:
loaded_tree = pickle.load(open('tree.p', 'rb'))

In [45]:
np.array(df_reduced[expanded_features].loc[6535])

array([9.93975904e-01, 3.16265060e-01, 8.87070000e+04, 3.24000000e-01,
       9.35000000e-01, 5.45454545e-01, 8.60860861e-02, 6.93039511e-01,
       1.00000000e+02, 5.52795031e+00, 4.32109383e-01, 8.00000000e-01,
       7.90000000e+00, 4.00000000e-02, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

In [46]:
np.array([9.93975904e-01, 3.16265060e-01, 8.87070000e+04, 3.24000000e-01,
       9.35000000e-01, 5.45454545e-01, 8.60860861e-02, 6.93039511e-01,
       1.00000000e+00, 5.52795031e-02, 4.32109383e-01, 8.00000000e-01,
       3.95000000e+00, 4.00000000e-02, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       2.50000000e+02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

array([9.93975904e-01, 3.16265060e-01, 8.87070000e+04, 3.24000000e-01,
       9.35000000e-01, 5.45454545e-01, 8.60860861e-02, 6.93039511e-01,
       1.00000000e+00, 5.52795031e-02, 4.32109383e-01, 8.00000000e-01,
       3.95000000e+00, 4.00000000e-02, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

In [47]:
dist, ind = loaded_tree.query(np.array(df_reduced[expanded_features].loc[6535]).reshape(1, -1), k=15)  

In [48]:
df.loc[ind[0]][['artist_name', 'track_name', 'track_id', 'genres']]

Unnamed: 0,artist_name,track_name,track_id,genres
6535,Franz Liszt,"2 Czárdás, S. 225: No. 1, Czárdás",1IVJDJy9rWFAynjhta7l2J,"[classical, late romantic era]"
93526,Wolfgang Amadeus Mozart,"Così fan tutte, K. 588, Act I: Non siate ritro...",5Lqrnw3WM05vFVktilKSsJ,"[classical, classical era]"
112951,Johann Sebastian Bach,"Auf, auf! Die rechte Zeit ist hier, BWV 440",2PrLmcL5GW9l1mPo9zCbbG,"[baroque, classical, early music, german baroque]"
21280,Andrei Krylov,Medieval Viking Sailor Ballad,6JTqAToiIIUONY78c1g5IX,"[classical guitar, sleep]"
103970,Johann Sebastian Bach,"Goldberg Variations, BWV 988: Var. 9, Canone a...",4wdVA9OvEIEGYbaK6L4Vhb,"[baroque, classical, early music, german baroque]"
66153,Johann Sebastian Bach,"Cello Suite No. 5 in C Minor, BWV 1011 (Arr. T...",2LmApl89Lf4IyyZWINB4VE,"[baroque, classical, early music, german baroque]"
6,Johann Sebastian Bach,"Violin Partita No. 3 in E Major, BWV 1006: VI....",4KQCFic25ZiqTlqfvTmAkI,"[baroque, classical, early music, german baroque]"
122435,Wolfgang Amadeus Mozart,"Allegro in B-Flat Major, K. 3",6Pvs7yUXznpDSSW90U4r0V,"[classical, classical era]"
9034,George Frideric Handel,"Theodora, HWV 68: No. 1a, Overture. Maestoso (...",5TKFyFtGZwW3HTP9VMZ7qR,"[baroque, classical, early music, english baro..."
113055,Johann Sebastian Bach,"Gelobet seist du, Jesu Christ, BWV 722a",0t3oq8eMrZNuosY4XjrU9u,"[baroque, classical, early music, german baroque]"


In [49]:
dist, ind = loaded_tree.query(np.array(df_reduced[expanded_features].loc[54875]).reshape(1, -1), k=15)  

In [50]:
df.loc[ind[0]][['artist_name', 'track_name', 'track_id', 'genres']]

Unnamed: 0,artist_name,track_name,track_id,genres
54875,Malcolm Anthony,Poppin',3CMA0HYJ8y2eSXXTpol5sv,[deep underground hip hop]
1857,Chief Keef,I Got,6bGbRJ6hgIZ7RRXDH65S3X,"[chicago drill, chicago rap, drill, hip hop, p..."
93659,Xavier Wulf,SPIKE WULF,2wGMQxtars4PUj9Q8uVjWs,"[dark trap, emo rap, underground hip hop, vapo..."
115176,Ashanti,Let It Snow,14LtnWDIKSWJQAX6Ofcs2P,"[dance pop, hip hop, hip pop, pop, pop rap, r&..."
51171,Goonew,Slide House,39MqmmAQ4sE8s0Kd7AUzHi,"[dmv rap, vapor trap]"
86269,gizmo,Uncle Ruckus,0IMnt5um7TgcbOmYWzfXDl,"[dark trap, emo rap, scream rap, underground h..."
88230,10cellphones,Work,69NfytkocCADcyIbdF70jF,"[underground hip hop, vapor trap]"
36149,convolk,a million messages; no response,5MyDwU6PTOKmKfkO7H5q7W,"[dark trap, emo rap, melodic rap, vapor trap]"
17167,Chief Keef,TT,06qRPvIu8omxjT5wNNkc5M,"[chicago drill, chicago rap, drill, hip hop, p..."
11605,BONES,ShameOnMe МнеСтыдно,2NqT772Av8nQYlhufquaLf,"[dark trap, emo rap, underground hip hop]"


## Populate database

In [51]:
from decouple import config
from flask import Flask
from flask_sqlalchemy import SQLAlchemy
import json

In [52]:
user = config('POSTGRES_USER')
pw = config('POSTGRES_PW')
url = config('POSTGRES_URL')
db = config('POSTGRES_DB')
DB_URL = f'postgresql+psycopg2://{user}:{pw}@{url}/{db}'

In [74]:
app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = DB_URL
DB = SQLAlchemy(app)

  'SQLALCHEMY_TRACK_MODIFICATIONS adds significant overhead and '


In [54]:
df_reduced['id'] = df_reduced.index

In [55]:
df_reduced = df_reduced[['id', 'track_id', 'track_name', 'artist_id', 'artist_name', 'genres_str',
                        'acousticness', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness',
       'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence',
       'popularity', 'alternative r&b',
       'atl hip hop', 'banda', 'baroque', 'big room', 'brostep', 'cali rap',
       'ccm', 'chamber pop', 'chillhop', 'classical', 'classical era',
       'contemporary country', 'dance pop', 'early music',
       'early romantic era', 'edm', 'electro house', 'electropop', 'emo rap',
       'folk-pop', 'gangster rap', 'german baroque', 'grupera', 'hip hop',
       'indie folk', 'indie pop', 'indie poptimism', 'indie r&b', 'indie rock',
       'indie soul', 'indietronica', 'k-pop', 'latin', 'lo-fi beats',
       'mellow gold', 'melodic rap', 'modern rock', 'neo mellow', 'norteno',
       'pop', 'pop edm', 'pop rap', 'pop rock', 'post-teen pop',
       'progressive house', 'progressive trance', 'ranchera', 'rap',
       'regional mexican', 'regional mexican pop', 'rock', 'sleep',
       'soft rock', 'southern hip hop', 'stomp and holler', 'trance',
       'trap music', 'tropical house', 'underground hip hop',
       'uplifting trance', 'vapor trap', 'classical_super', 'country_super',
       'folk_super', 'house_super', 'indie_super', 'jazz_super', 'latin_super',
       'metal_super', 'rap_super', 'rock_super', 'worship_super']]

In [56]:
df_reduced.columns = df_reduced.columns.str.replace(' ', '_').str.replace('&', 'n').str.replace('-', '_')

In [57]:
df_reduced.columns

Index(['id', 'track_id', 'track_name', 'artist_id', 'artist_name',
       'genres_str', 'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence', 'popularity',
       'alternative_rnb', 'atl_hip_hop', 'banda', 'baroque', 'big_room',
       'brostep', 'cali_rap', 'ccm', 'chamber_pop', 'chillhop', 'classical',
       'classical_era', 'contemporary_country', 'dance_pop', 'early_music',
       'early_romantic_era', 'edm', 'electro_house', 'electropop', 'emo_rap',
       'folk_pop', 'gangster_rap', 'german_baroque', 'grupera', 'hip_hop',
       'indie_folk', 'indie_pop', 'indie_poptimism', 'indie_rnb', 'indie_rock',
       'indie_soul', 'indietronica', 'k_pop', 'latin', 'lo_fi_beats',
       'mellow_gold', 'melodic_rap', 'modern_rock', 'neo_mellow', 'norteno',
       'pop', 'pop_edm', 'pop_rap', 'pop_rock', 'post_teen_pop',
       'progressive_house', 'progressive_tra

In [58]:
def load():
    """Initialize DB."""
    DB.drop_all()
    DB.create_all()
    df_reduced.to_sql(name='track', con=DB.engine, index=False, if_exists='replace')
    DB.session.commit()
    return 'Database initialized.'

In [59]:
load()

'Database initialized.'

In [75]:
# Designate primary key in pgAdmin before running this cell.
DB.Model.metadata.reflect(DB.engine)

In [76]:
class Track(DB.Model):
    __table__ = DB.Model.metadata.tables['track']
    
    def to_array(self):
        return np.array([self.acousticness, 
                         self.danceability, 
                         self.duration_ms,
                         self.energy, 
                         self.instrumentalness, 
                         self.key, 
                         self.liveness, 
                         self.loudness, 
                         self.mode, 
                         self.speechiness, 
                         self.tempo, 
                         self.time_signature, 
                         self.valence, 
                         self.popularity, 
                         self.alternative_rnb, 
                         self.atl_hip_hop,
                         self.banda,
                         self.baroque,
                         self.big_room,
                         self.brostep,
                         self.cali_rap,
                         self.ccm,
                         self.chamber_pop,
                         self.chillhop,
                         self.classical,
                         self.classical_era,
                         self.contemporary_country,
                         self.dance_pop,
                         self.early_music,
                         self.early_romantic_era,
                         self.edm,
                         self.electro_house,
                         self.electropop,
                         self.emo_rap,
                         self.folk_pop,
                         self.gangster_rap,
                         self.german_baroque,
                         self.grupera,
                         self.hip_hop,
                         self.indie_folk,
                         self.indie_pop,
                         self.indie_poptimism,
                         self.indie_rnb,
                         self.indie_rock,
                         self.indie_soul,
                         self.indietronica,
                         self.k_pop,
                         self.latin,
                         self.lo_fi_beats,
                         self.mellow_gold,
                         self.melodic_rap,
                         self.modern_rock,
                         self.neo_mellow,
                         self.norteno,
                         self.pop,
                         self.pop_edm,
                         self.pop_rap,
                         self.pop_rock,
                         self.post_teen_pop,
                         self.progressive_house,
                         self.progressive_trance,
                         self.ranchera,
                         self.rap,
                         self.regional_mexican,
                         self.regional_mexican_pop,
                         self.rock,
                         self.sleep,
                         self.soft_rock,
                         self.southern_hip_hop,
                         self.stomp_and_holler,
                         self.trance,
                         self.trap_music,
                         self.tropical_house,
                         self.underground_hip_hop,
                         self.uplifting_trance,
                         self.vapor_trap,
                         self.classical_super,
                         self.country_super,
                         self.folk_super,
                         self.house_super,
                         self.indie_super,
                         self.jazz_super,
                         self.latin_super,
                         self.metal_super,
                         self.rap_super,
                         self.rock_super,
                         self.worship_super])

    def to_dict(self):
        return {'track_id': self.track_id,
                'track_name': self.track_name,
                'artist_name': self.artist_name,
                'acousticness': self.acousticness,
                'danceability': self.danceability,
                'duration_ms': self.duration_ms,
                'energy': self.energy,
                'instrumentalness': self.instrumentalness,
                'key': self.key,
                'liveness': self.liveness,
                'loudness': self.loudness,
                'mode': self.mode,
                'speechiness': self.speechiness,
                'tempo': self.tempo,
                'time_signature': self.time_signature,
                'valence': self.valence,
                'popularity': self.popularity}

    def __repr__(self):
        return json.dumps(self.to_dict())

In [77]:
q1 = Track.query.filter(Track.id == 1).first()

In [78]:
print(q1.to_array())

[3.00200803e-02 6.09437751e-01 1.80000000e+05 8.00000000e-01
 0.00000000e+00 6.36363636e-01 7.07707708e-01 8.46179983e-01
 1.00000000e+02 9.97929607e+00 3.03972670e-01 8.00000000e-01
 6.80000000e+01 5.00000000e-01 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.000000

In [83]:
dist, ind = loaded_tree.query(q1.to_array().reshape(1, -1), k=15) 

In [110]:
indices = [val.item() for val in ind[0]]

In [114]:
q2 = Track.query.filter(Track.id.in_(indices)).all()

In [115]:
print(q2)

[{"track_id": "6aRYzvulEbltKobXPdWdcs", "track_name": "Alive", "artist_name": "Neffex", "acousticness": 0.030020080321285142, "danceability": 0.6094377510040161, "duration_ms": 180000, "energy": 0.8, "instrumentalness": 0.0, "key": 0.6363636363636364, "liveness": 0.7077077077077076, "loudness": 0.8461799825259685, "mode": 100.0, "speechiness": 9.979296066252589, "tempo": 0.3039726701415696, "time_signature": 0.8, "valence": 68.0, "popularity": 0.5}, {"track_id": "7lFIEkgHwkrSRXtpbO0YiS", "track_name": "A Rose", "artist_name": "Ace Hood", "acousticness": 0.11947791164658636, "danceability": 0.931726907630522, "duration_ms": 180000, "energy": 0.642, "instrumentalness": 0.0, "key": 0.09090909090909091, "liveness": 0.07777777777777778, "loudness": 0.8660647833543671, "mode": 100.0, "speechiness": 4.74120082815735, "tempo": 0.4960137289335675, "time_signature": 0.8, "valence": 49.20000000000001, "popularity": 0.33}, {"track_id": "5auUx7eZGeKqDfeO1uh8wD", "track_name": "Horizons Form From Th