# Exploratory Data Analysis - Spotify Track Similarity

In [1]:
import pandas as pd
import pickle
import numpy as np

from ast import literal_eval
from sklearn.neighbors import KDTree
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Load dataset.
df = pd.read_csv('merged_with_genres.csv', converters={'genres': eval})

In [3]:
# Check dimensions.
df.shape

(130989, 19)

In [4]:
# Verify no duplicate tracks are included.
df[df.duplicated(subset='track_id', keep=False)]

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity,artist_id,genres


In [5]:
# Examine summary statistics.
df.describe(include='all')

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity,artist_id,genres
count,130989,130989,130989,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989,130989
unique,34582,130989,108697,,,,,,,,,,,,,,,34818,8784
top,Johann Sebastian Bach,20YKNAhau4msP5OyHGaKRf,Intro,,,,,,,,,,,,,,,5aIqB5nVVvmFsvSdExz408,[]
freq,3635,1,110,,,,,,,,,,,,,,,3633,37606
mean,,,,0.342467,0.581478,212676.7,0.569244,0.223863,5.232974,0.194919,-9.971764,0.607959,0.111966,119.467271,3.878891,0.439681,24.242608,,
std,,,,0.345645,0.190024,123103.6,0.260368,0.360274,3.602824,0.167789,6.545915,0.488208,0.124295,30.154507,0.514536,0.259076,19.727393,,
min,,,,0.0,0.0,3203.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,,
25%,,,,0.0316,0.459,164000.0,0.396,0.0,2.0,0.0975,-11.894,0.0,0.0389,96.014,4.0,0.224,7.0,,
50%,,,,0.203,0.605,201931.0,0.603,0.000146,5.0,0.124,-7.973,1.0,0.0558,120.026,4.0,0.42,22.0,,
75%,,,,0.636,0.727,241054.0,0.776,0.439,8.0,0.236,-5.681,1.0,0.129,139.621,4.0,0.638,38.0,,


In [6]:
# Check for null values.
df.isna().sum()

artist_name         0
track_id            0
track_name          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
speechiness         0
tempo               0
time_signature      0
valence             0
popularity          0
artist_id           0
genres              0
dtype: int64

In [7]:
# Display most common genre combinations.
# Note that [] is null-equivalent.
df['genres'].value_counts().head(30)

[]                                                                                                   37606
[baroque, classical, early music, german baroque]                                                     3638
[classical, classical era]                                                                            1877
[edm, progressive house, progressive trance, trance]                                                  1130
[classical, classical era, early romantic era]                                                        1071
[chillhop, lo-fi beats]                                                                                766
[lo-fi beats]                                                                                          664
[classical, early romantic era, polish classical]                                                      624
[k-pop, k-pop boy group]                                                                               615
[sleep]                              

In [8]:
df['genres_str'] = df['genres'].astype(str)

## High 'speechiness' tracks
Per the Spotify API: "Values above 0.66 describe tracks that are probably made entirely of spoken words."

In [9]:
# Explore genres for tracks with high speechiness values.
df['genres'][df['speechiness'] > 0.66].value_counts()

[]                                                                                                                                                                   319
[classical, classical era]                                                                                                                                            49
[edm, progressive house, progressive trance, trance]                                                                                                                  33
[spanish comedy]                                                                                                                                                      17
[heartland rock, mellow gold, permanent wave, rock, singer-songwriter]                                                                                                16
[comedy]                                                                                                                                                   

In [10]:
# Could consider tagging these with a 'spoken word' genre.
df[(df['speechiness'] > 0.66) & (df['genres_str'] == '[]')].sample(10)

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity,artist_id,genres,genres_str
23110,Schranzen Danzen,0k5kUdaMt96TJMHv0y0cnG,Heavy Stone,0.502,0.609,198008,0.648,0.0,5,0.235,-12.67,1,0.902,131.415,4,0.37,0,0Ygu2DpwQuFpbCOwqDZtL4,[],[]
43177,Khan,39RP6zgUlV5qKeX4D7JG3B,Longhorn Anthem,0.246,0.713,161290,0.58,0.0,3,0.0793,-11.622,0,0.807,186.206,4,0.333,0,7IGfCYwRlHWCMqCSQyS2dD,[],[]
46670,Samson Oxmoor,5f3g8wrefnFysiTuik37kI,The Apes and the Two Travellers,0.769,0.704,71197,0.0893,0.0,10,0.168,-35.812,0,0.764,76.909,1,0.433,2,6dInHtNpbGVRh2BouvzrGW,[],[]
47191,Samson Oxmoor,42dRzi6XRFRmOL325nPi0X,The Mischievous Dog,0.151,0.704,65334,0.0397,0.0,5,0.162,-39.116,1,0.774,63.078,4,0.297,2,6dInHtNpbGVRh2BouvzrGW,[],[]
118055,Dave $tokes,1ecnKF1LDBxFlXM8hlrUv3,On It,0.288,0.414,76278,0.742,0.0,1,0.203,-11.355,1,0.674,81.193,1,0.499,48,0mP6wa81SOzUJIaAnLg7a7,[],[]
93771,99 Neighbors,6SyJPb54GQ0Vy6Yqe9DDJM,Champion,0.497,0.669,219111,0.562,0.0,0,0.252,-8.036,1,0.705,133.136,4,0.3,44,5uhqkMm8dyQvX83kl4Znq0,[],[]
74190,Swagged UP,30YG5i3bDbBnVDb2fknXea,Citrine,0.861,0.774,61308,0.427,0.0,6,0.339,-29.689,0,0.918,79.799,4,0.38,42,7Ga6pJ8JARnvFGZqBC1hIC,[],[]
76156,Bvss Jvnkie,1IsaP0m3SlZs6ZMZgNUhbv,Tears,0.961,0.972,177231,0.128,0.0226,11,0.108,-16.272,1,0.896,129.98,4,0.314,5,4xsugNlCcF32B9F57ujumi,[],[]
87643,Geoff Tate,7B6Lz9TRldY7vXIJLzwwHV,Religious Businesses,0.863,0.541,296480,0.404,0.0,1,0.81,-19.187,1,0.953,78.787,3,0.479,22,3eKlygZ2G8oK9KNjdbqbDA,[],[]
105018,Hectic,1yPyPHBoiYYjI6xiyxQY26,Foul 2da Game,0.154,0.9,184869,0.466,0.0,6,0.589,-10.349,1,0.851,100.043,4,0.353,1,6hm1KcTDSuxxjfg8JmjjaN,[],[]


## Candidates for broader/combination genre categories

In [11]:
# Christian
df['genres'][df['genres_str'].str.contains('christian')].value_counts()

[christian hip hop, christian trap]                                                                                                             228
[christian relaxative]                                                                                                                           97
[anthem worship, ccm, christian alternative rock, christian music, indiecoustica, world worship, worship]                                        91
[anthem worship, ccm, christian alternative rock, christian music, world worship, worship]                                                       89
[christian hip hop, christian pop, christian trap]                                                                                               84
[christian pop]                                                                                                                                  70
[anthem worship, ccm, christian music, deep ccm, world worship, worship]                                        

In [12]:
# Classical
df['genres'][df['genres_str'].str.contains('classical')].value_counts()

[baroque, classical, early music, german baroque]                                                                                                                  3638
[classical, classical era]                                                                                                                                         1877
[classical, classical era, early romantic era]                                                                                                                     1071
[classical, early romantic era, polish classical]                                                                                                                   624
[classical, post-romantic era]                                                                                                                                      479
[baroque, classical, early music, italian baroque]                                                                                                              

In [13]:
# Country
df['genres'][df['genres_str'].str.contains('country')].value_counts()

[country pop]                                                                                                                                                                                                         221
[country rap, redneck]                                                                                                                                                                                                174
[contemporary country, country, country road, modern country rock]                                                                                                                                                    173
[contemporary country]                                                                                                                                                                                                137
[contemporary country, country pop]                                                                                             

In [14]:
# House
df['genres'][df['genres_str'].str.contains('house')].value_counts()

[edm, progressive house, progressive trance, trance]                                                                                                                                           1130
[edm, progressive house, progressive trance, trance, uplifting trance]                                                                                                                          445
[progressive house, progressive trance, trance, uplifting trance]                                                                                                                               258
[progressive house, progressive trance, progressive uplifting trance, trance, uplifting trance]                                                                                                 169
[big room, dance pop, edm, electro house, pop, tropical house]                                                                                                                                   89
[deep uplifting tran

In [15]:
# Jazz
df['genres'][df['genres_str'].str.contains('jazz')].value_counts()

[chillhop, jazz boom bap, lo-fi beats]                                                                                                                                                                240
[chillhop, jazz boom bap]                                                                                                                                                                              82
[adult standards, jazz blues, soul, swing, vocal jazz]                                                                                                                                                 45
[jazz blues, neo soul, soul, soul jazz, torch song, vocal jazz]                                                                                                                                        41
[electro swing, nu jazz]                                                                                                                                                                        

In [16]:
# Hip hop
df['genres'][df['genres_str'].str.contains('hip hop')].value_counts()

[atl hip hop, dirty south rap, gangster rap, hip hop, pop rap, rap, southern hip hop, trap music]                                      273
[christian hip hop, christian trap]                                                                                                    228
[deep underground hip hop]                                                                                                             218
[dark trap, emo rap, underground hip hop]                                                                                              135
[latin, latin hip hop, reggaeton, reggaeton flow, tropical]                                                                            124
[underground hip hop]                                                                                                                  122
[chicago drill, chicago rap, drill, hip hop, pop rap, rap, southern hip hop, trap music, underground hip hop, vapor trap]              119
[underground hip hop, vapor

In [17]:
# Pop
df['genres'][df['genres_str'].str.contains('pop')].value_counts()

[k-pop, k-pop boy group]                                                                                                            615
[regional mexican pop]                                                                                                              563
[corrido, regional mexican pop]                                                                                                     401
[atl hip hop, dirty south rap, gangster rap, hip hop, pop rap, rap, southern hip hop, trap music]                                   273
[dance pop, pop, post-teen pop]                                                                                                     238
[k-pop, k-pop girl group]                                                                                                           234
[country pop]                                                                                                                       221
[pop]                                           

In [18]:
# Metal
df['genres'][df['genres_str'].str.contains('metal')].value_counts()

[melodic metalcore, metalcore, post-screamo, screamo]                                                                                                                                                         56
[metalcore]                                                                                                                                                                                                   37
[album rock, glam metal, hard rock, nwobhm, rock]                                                                                                                                                             33
[metallic hardcore]                                                                                                                                                                                           33
[alternative metal, nu metal, post-grunge, rock]                                                                                                                    

In [19]:
# Rap
df['genres'][df['genres_str'].str.contains('rap')].value_counts()

[emo rap]                                                                                                                    352
[atl hip hop, dirty south rap, gangster rap, hip hop, pop rap, rap, southern hip hop, trap music]                            273
[cali rap, west coast trap]                                                                                                  268
[vapor trap]                                                                                                                 254
[christian hip hop, christian trap]                                                                                          228
[country rap, redneck]                                                                                                       174
[cali rap]                                                                                                                   159
[cali rap, hyphy, west coast trap]                                                               

In [20]:
# Rock
df['genres'][df['genres_str'].str.contains('rock')].value_counts()

[contemporary country, country, country road, modern country rock]                                                                                                                                                                                                                     173
[adult standards, brill building pop, folk, folk rock, mellow gold, rock, singer-songwriter, soft rock, yacht rock]                                                                                                                                                                    138
[indie garage rock]                                                                                                                                                                                                                                                                    130
[album rock, classic rock, folk rock, heartland rock, mellow gold, rock, soft rock, yacht rock]                                                        

In [21]:
# Techno
df['genres'][df['genres_str'].str.contains('techno')].value_counts()

[big room, edm, german techno, progressive house, progressive trance, trance, uplifting trance]                                                                                                39
[edm, german techno, progressive house, progressive trance, trance, uplifting trance]                                                                                                          21
[acid house, chicago house, deep house, float house, hip house, techno]                                                                                                                        18
[electronica, frankfurt electronic, german techno, microhouse, minimal techno, tech house]                                                                                                     18
[ambient techno, electronica, float house, microhouse, minimal techno, shiver pop]                                                                                                             16
[ambient techno]              

## Scaling

In [22]:
features = ['acousticness', 
            'danceability', 
            'duration_ms',
            'energy', 
            'instrumentalness', 
            'key', 
            'liveness',
            'loudness', 
            'mode', 
            'speechiness', 
            'tempo', 
            'time_signature', 
            'valence',
            'popularity']

In [23]:
scaler = MinMaxScaler()
df_scaled = df[['track_id'] + features].copy()
df_scaled[features] = scaler.fit_transform(df[features])

In [24]:
df_scaled = pd.merge(df, df_scaled, on='track_id', suffixes=['', '_scaled'])

## One-hot encoding

In [25]:
# One-hot encode default genres.
mlb = MultiLabelBinarizer()
df_encoded = df_scaled.join(pd.DataFrame(mlb.fit_transform(df_scaled.pop('genres')),
                                         columns=mlb.classes_,
                                         index=df_scaled.index))

In [26]:
# Add super genres.
df_encoded['blues_super'] = df['genres_str'].str.contains('blues').astype(int)
df_encoded['classical_super'] = df['genres_str'].str.contains('classical').astype(int)
df_encoded['country_super'] = df['genres_str'].str.contains('country').astype(int)
df_encoded['folk_super'] = df['genres_str'].str.contains('americana|bluegrass|folk').astype(int)
df_encoded['house_super'] = df['genres_str'].str.contains('house').astype(int)
df_encoded['indian_super'] = df['genres_str'].str.contains('bangla|bollywood|ghazal|indian|'
                                                           'filmi|kannada|pakistani|tamil')
df_encoded['indie_super'] = df['genres_str'].str.contains('indie').astype(int)
df_encoded['japanese_super'] = df['genres_str'].str.contains('j-|japanese').astype(int)
df_encoded['jazz_super'] = df['genres_str'].str.contains('jazz').astype(int)
df_encoded['latin_super'] = df['genres_str'].str.contains('banda|columbian|cumbia|'
                                                          'dominican|espanol|grupera|latin|'
                                                          'mambo|mexican|norteno|puerto rican|'
                                                          'ranchera|reggaeton|salsa|tango|tejano|tex-mex|'
                                                          'uruguayan|vallenato|venezuelan').astype(int)
df_encoded['metal_super'] = df['genres_str'].str.contains('metal|screamo|thrash').astype(int)
df_encoded['rap_super'] = df['genres_str'].str.contains('crunk|drill|hop|rap').astype(int)
df_encoded['reggae_super'] = df['genres_str'].str.contains('reggae|rock steady|ska').astype(int)
df_encoded['rock_super'] = df['genres_str'].str.contains('rock').astype(int)
df_encoded['spoken_word_super'] = (df['speechiness'] > 0.66).astype(int)
df_encoded['techno_super'] = df['genres_str'].str.contains('techno').astype(int)
df_encoded['worship_super'] = df['genres_str'].str.contains('ccm|christian|gospel|'
                                                            'praise|worship').astype(int)

In [27]:
df_encoded.shape

(130989, 2606)

In [28]:
df_encoded.columns

Index(['artist_name', 'track_id', 'track_name', 'acousticness', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness',
       ...
       'japanese_super', 'jazz_super', 'latin_super', 'metal_super',
       'rap_super', 'reggae_super', 'rock_super', 'spoken_word_super',
       'techno_super', 'worship_super'],
      dtype='object', length=2606)

## Feature reduction

In [29]:
genre_features = list(df_encoded.columns[32:])
print(genre_features)

['popularity_scaled', '21st century classical', '432hz', '8-bit', 'a cappella', 'aarhus indie', 'aberdeen indie', 'abstract', 'abstract beats', 'abstract hip hop', 'abstract idm', 'abstractro', 'accordion', 'acid house', 'acid jazz', 'acid techno', 'acousmatic', 'acoustic blues', 'acoustic chill', 'acoustic opm', 'acoustic pop', 'acoustic punk', 'adelaide indie', 'adoracao', 'adoracion', 'adult standards', 'adventista', 'afghan pop', 'african electronic', 'african gospel', 'african percussion', 'african rock', 'afro dancehall', 'afro house', 'afro psych', 'afro-funk', 'afrobeat', 'afropop', 'aggrotech', 'alabama indie', 'alabama metal', 'alabama rap', 'alaska indie', 'albanian hip hop', 'albanian pop', 'albany ny indie', 'alberta country', 'alberta hip hop', 'album rock', 'albuquerque indie', 'alt-idol', 'alternative americana', 'alternative ccm', 'alternative country', 'alternative dance', 'alternative emo', 'alternative hip hop', 'alternative metal', 'alternative metalcore', 'alterna

In [30]:
# Keep only genres with at least 1000 tagged tracks.
df_reduced = df_encoded.drop([col for col, val in df_encoded[genre_features].sum().iteritems() if val < 1000], axis=1)

In [31]:
for column in df_reduced.columns:
    print(column)

artist_name
track_id
track_name
acousticness
danceability
duration_ms
energy
instrumentalness
key
liveness
loudness
mode
speechiness
tempo
time_signature
valence
popularity
artist_id
genres_str
acousticness_scaled
danceability_scaled
duration_ms_scaled
energy_scaled
instrumentalness_scaled
key_scaled
liveness_scaled
loudness_scaled
mode_scaled
speechiness_scaled
tempo_scaled
time_signature_scaled
valence_scaled
popularity_scaled
alternative r&b
atl hip hop
banda
baroque
big room
brostep
cali rap
ccm
chamber pop
chillhop
classical
classical era
contemporary country
dance pop
early music
early romantic era
edm
electro house
electropop
emo rap
folk-pop
gangster rap
german baroque
grupera
hip hop
indie folk
indie pop
indie poptimism
indie r&b
indie rock
indie soul
indietronica
k-pop
latin
lo-fi beats
mellow gold
melodic rap
modern rock
neo mellow
norteno
pop
pop edm
pop rap
pop rock
post-teen pop
progressive house
progressive trance
ranchera
rap
regional mexican
regional mexican pop
rock
s

## Weight features

In [32]:
df_reduced['mode'] = df_encoded['mode']*100
df_reduced['valence'] = df_encoded['valence']*100
df_reduced['speechiness'] = df_encoded['speechiness']*100

In [33]:
super_genres = ['classical_super', 'country_super',
       'folk_super', 'house_super', 'indian_super', 'jazz_super', 'latin_super', 'metal_super',
       'rap_super', 'reggae_super', 'rock_super', 'worship_super']
df_reduced[super_genres] = df_encoded[super_genres]*500

## Build tree

In [34]:
expanded_features = list(df_reduced.columns)

In [35]:
exclude_features = ['artist_id', 'artist_name', 'track_id', 'track_name', 'genres_str', 
                   'acousticness', 'danceability', 'duration_ms', 'energy' ,'instrumentalness',
                    'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 
                    'time_signature', 'valence', 'popularity']
for feature in exclude_features:
    expanded_features.remove(feature)

In [36]:
for feature in expanded_features:
    print(feature)

acousticness_scaled
danceability_scaled
duration_ms_scaled
energy_scaled
instrumentalness_scaled
key_scaled
liveness_scaled
loudness_scaled
mode_scaled
speechiness_scaled
tempo_scaled
time_signature_scaled
valence_scaled
popularity_scaled
alternative r&b
atl hip hop
banda
baroque
big room
brostep
cali rap
ccm
chamber pop
chillhop
classical
classical era
contemporary country
dance pop
early music
early romantic era
edm
electro house
electropop
emo rap
folk-pop
gangster rap
german baroque
grupera
hip hop
indie folk
indie pop
indie poptimism
indie r&b
indie rock
indie soul
indietronica
k-pop
latin
lo-fi beats
mellow gold
melodic rap
modern rock
neo mellow
norteno
pop
pop edm
pop rap
pop rock
post-teen pop
progressive house
progressive trance
ranchera
rap
regional mexican
regional mexican pop
rock
sleep
soft rock
southern hip hop
stomp and holler
trance
trap music
tropical house
underground hip hop
uplifting trance
vapor trap
classical_super
country_super
folk_super
house_super
indian_supe

In [37]:
k_tree = KDTree(df_reduced[expanded_features], metric='euclidean', leaf_size=50)

### Test track 1

In [38]:
dist, ind = k_tree.query(np.array(df_reduced[expanded_features].loc[0]).reshape(1, -1), k=10)  

In [39]:
print(dist)

[[0.         0.07843628 0.17387433 0.17704521 0.19832775 0.21146971
  0.21578746 0.21815295 0.22023184 0.24011737]]


In [40]:
print(ind)

[[     0   2251 104812 102932 110362 109327   2256  44064  81899  48254]]


In [41]:
df.loc[ind[0]][['artist_name', 'track_name', 'genres']]

Unnamed: 0,artist_name,track_name,genres
0,Frédéric Chopin,"バラード 第 1番 ト短調, 作品 23","[classical, early romantic era, polish classical]"
2251,Frédéric Chopin,バラード 第1番 ト短調 作品23,"[classical, early romantic era, polish classical]"
104812,Frédéric Chopin,"Polonaise-Fantaisie in A-Flat Major, Op. 61","[classical, early romantic era, polish classical]"
102932,Frédéric Chopin,"バラード 第 1番 ト短調, 作品 23","[classical, early romantic era, polish classical]"
110362,Frédéric Chopin,"Ballade No. 4 In F Minor, Op. 52","[classical, early romantic era, polish classical]"
109327,Frédéric Chopin,"12 Études, Op. 10: Étude No. 6 in E-Flat Minor","[classical, early romantic era, polish classical]"
2256,Frédéric Chopin,バラード 第4番 ヘ短調 作品52,"[classical, early romantic era, polish classical]"
44064,Frédéric Chopin,"Nocturne No. 2 in F-Sharp Minor, Op. 48","[classical, early romantic era, polish classical]"
81899,Robert Schumann,"Piano Concerto in A Minor, Op.54: 1. Allegro a...","[classical, early romantic era]"
48254,Frédéric Chopin,"Mazurka No. 4 in B-Flat Minor, Op. 24","[classical, early romantic era, polish classical]"


## Test track 2

In [42]:
dist, ind = k_tree.query(np.array(df_reduced[expanded_features].loc[2]).reshape(1, -1), k=10)  

In [43]:
df.loc[ind[0]][['artist_name', 'track_name', 'track_id', 'genres']]

Unnamed: 0,artist_name,track_name,track_id,genres
2,Neffex,Take Me Away,6HXfoTMOdKlN0IoaM9LkTa,[bass trap]
94530,Neffex,Take Me Away,2VcqW62pUjTP3f1XAkEh0h,[bass trap]
128811,ManMan Savage,She A Freak (feat. Ohgeesy & 03 Greedo),5gWmZ7eZULHN15xgiwHdsW,[atl trap]
13158,Dame Dot,Courtesy of the Mafia,20uYVhUCWdml0B8SuY5ILk,[detroit trap]
69456,Peled,Ma Ani Nira Lecha,2KWzg1vpya70bKCKewYJpn,"[israeli hip hop, israeli trap]"
88680,Bizzey,Rock Ya Body,0khETOCOY7Yvvyq97jnSUG,"[dutch hip hop, dutch pop]"
25922,Shahmen,Poison,57zLdUBNAJIVdwfV6sPIcs,[bass trap]
61324,Nafe Smallz,Gucci,1g1FuSRt9nPxwj0ojs2QTa,"[uk drill, uk hip hop]"
113751,HoodCelebrityy,Walking Trophy - Toddla T Remix,2k4JprD2WzbThquUdLPzCT,"[dancehall, trap queen]"
121070,SAMAHTA,graffiti,4Ngk1LdCZwntseAb8gMfMx,"[bass trap, traprun]"


## Pickle tree

In [44]:
pickle.dump(k_tree, open('tree.p', 'wb'))

## Unpickle tree

In [45]:
loaded_tree = pickle.load(open('tree.p', 'rb'))

In [46]:
np.array(df_reduced[expanded_features].loc[6535])

array([9.93975904e-01, 3.16265060e-01, 1.52500073e-02, 3.24000000e-01,
       9.35000000e-01, 5.45454545e-01, 8.60860861e-02, 6.93039511e-01,
       1.00000000e+00, 5.52795031e-02, 4.32109383e-01, 8.00000000e-01,
       7.90000000e-02, 4.00000000e-02, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

In [47]:
np.array([9.93975904e-01, 3.16265060e-01, 8.87070000e+04, 3.24000000e-01,
       9.35000000e-01, 5.45454545e-01, 8.60860861e-02, 6.93039511e-01,
       1.00000000e+00, 5.52795031e-02, 4.32109383e-01, 8.00000000e-01,
       3.95000000e+00, 4.00000000e-02, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       2.50000000e+02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

array([9.93975904e-01, 3.16265060e-01, 8.87070000e+04, 3.24000000e-01,
       9.35000000e-01, 5.45454545e-01, 8.60860861e-02, 6.93039511e-01,
       1.00000000e+00, 5.52795031e-02, 4.32109383e-01, 8.00000000e-01,
       3.95000000e+00, 4.00000000e-02, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

In [48]:
dist, ind = loaded_tree.query(np.array(df_reduced[expanded_features].loc[6535]).reshape(1, -1), k=15)  

In [49]:
df.loc[ind[0]][['artist_name', 'track_name', 'track_id', 'genres']]

Unnamed: 0,artist_name,track_name,track_id,genres
6535,Franz Liszt,"2 Czárdás, S. 225: No. 1, Czárdás",1IVJDJy9rWFAynjhta7l2J,"[classical, late romantic era]"
16181,Richard Wagner,"Der fliegende Holländer, WWV 63, Act II: Inter...",0KGpVEIclbYMUWkbaCj9b8,"[classical, german opera, late romantic era]"
113693,Claude Debussy,"Nocturnes, L. 91: No. 2. Fêtes (Arr. V. Leyetc...",4NPyeVK0Y8QBqDUw9QTham,"[classical, post-romantic era]"
45272,Philip Glass,Études: No. 3,3cFPZ9zhASW6kAkjig7AX4,"[american contemporary classical, classical, c..."
15966,Igor Stravinsky,Piano Sonata in F-Sharp Minor: II. Vivo,07u33ZyksMeJ9fkTxxewsl,"[classical, early modern classical, russian mo..."
30728,Dmitri Shostakovich,"Symphony No. 9 in E-Flat Major, Op. 70: III. P...",7kzsPLJJrCNbKIXuol1aGz,"[classical, compositional ambient, russian mod..."
85158,Pyotr Ilyich Tchaikovsky,"Symphony No. 2 in C Minor, Op. 17 ""Little Russ...",1BgGolRkWjpYfuwwV1ji00,"[classical, late romantic era, russian romanti..."
94246,Claude Debussy,12 のエチュード オクターヴのための,04KX9Ojf57yo6uZz3D8MB5,"[classical, post-romantic era]"
15963,Igor Stravinsky,"3 Movements from Petrushka: No. 2, Petrushka's...",2GVTrxpEX6FrMUoe9qrzOT,"[classical, early modern classical, russian mo..."
20046,Charles Gounod,"Quatuor en La Mineur, CG 564: I. Allegro",6MVo8ofQmqkbG5uy8FlrGI,"[classical, french opera, late romantic era]"


In [50]:
dist, ind = loaded_tree.query(np.array(df_reduced[expanded_features].loc[54875]).reshape(1, -1), k=15)  

In [51]:
df.loc[ind[0]][['artist_name', 'track_name', 'track_id', 'genres']]

Unnamed: 0,artist_name,track_name,track_id,genres
54875,Malcolm Anthony,Poppin',3CMA0HYJ8y2eSXXTpol5sv,[deep underground hip hop]
68140,Мот,Ча-Ча Ленд,2kWjzytmWLN5ng3DOu6Hq1,"[russian dance, russian dance pop, russian hip..."
69977,Bianca Bonnie,Yes I Know Ft. Chozus,7sphJvVXgfh7JU5Cq8IWk2,[trap queen]
34479,Mook,Flexin,6whIKHdT5E7SlET9Kc89a9,[chicago drill]
84138,WC no Beat,Favelado Chique,3pIu5YzfTyUkV1avxXKKK0,"[brazilian edm, brazilian hip hop, funk carioc..."
69245,Konshens,This Sex Was,7FJ7womodSrxkWUGExz7go,"[dancehall, rap kreyol]"
88303,July 7,Special,1wDzx4whIAAZzAFStOmbjM,[trap soul]
76700,Omy Alka,Bendecido Amen,2cXWWN43ABp95qSCndDcaU,[rap cristiano]
30051,Ygg Tay,No Competition,3QiR9aa6Ium08kbXj26wKp,[baltimore hip hop]
58767,Fast Cash Boyz,Whatever,3ZfZkpzvhBv8xURytcxYM1,[memphis hip hop]


## Populate database

In [52]:
from decouple import config
from flask import Flask
from flask_sqlalchemy import SQLAlchemy
import json

In [76]:
user = config('POSTGRES_USER')
pw = config('POSTGRES_PW')
url = config('POSTGRES_URL')
db = config('POSTGRES_DB')
DB_URL = f'postgresql+psycopg2://{user}:{pw}@{url}/{db}'

In [77]:
app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = DB_URL
DB = SQLAlchemy(app)

  'SQLALCHEMY_TRACK_MODIFICATIONS adds significant overhead and '


In [55]:
df_reduced['id'] = df_reduced.index

In [56]:
for column in df_reduced.columns:
    print (column)

artist_name
track_id
track_name
acousticness
danceability
duration_ms
energy
instrumentalness
key
liveness
loudness
mode
speechiness
tempo
time_signature
valence
popularity
artist_id
genres_str
acousticness_scaled
danceability_scaled
duration_ms_scaled
energy_scaled
instrumentalness_scaled
key_scaled
liveness_scaled
loudness_scaled
mode_scaled
speechiness_scaled
tempo_scaled
time_signature_scaled
valence_scaled
popularity_scaled
alternative r&b
atl hip hop
banda
baroque
big room
brostep
cali rap
ccm
chamber pop
chillhop
classical
classical era
contemporary country
dance pop
early music
early romantic era
edm
electro house
electropop
emo rap
folk-pop
gangster rap
german baroque
grupera
hip hop
indie folk
indie pop
indie poptimism
indie r&b
indie rock
indie soul
indietronica
k-pop
latin
lo-fi beats
mellow gold
melodic rap
modern rock
neo mellow
norteno
pop
pop edm
pop rap
pop rock
post-teen pop
progressive house
progressive trance
ranchera
rap
regional mexican
regional mexican pop
rock
s

In [57]:
df_reduced.shape

(130989, 109)

In [58]:
df_reduced = df_reduced[['id', 'track_id', 'track_name', 'artist_id', 'artist_name', 'genres_str',
                        'acousticness', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness',
       'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence',
       'popularity', 'acousticness_scaled', 'danceability_scaled', 'duration_ms_scaled', 'energy_scaled',
    'instrumentalness_scaled', 'key_scaled', 'liveness_scaled', 'loudness_scaled', 
                       'mode_scaled', 'speechiness_scaled', 'tempo_scaled', 'time_signature_scaled',
                       'valence_scaled', 'popularity_scaled', 'alternative r&b',
       'atl hip hop', 'banda', 'baroque', 'big room', 'brostep', 'cali rap',
       'ccm', 'chamber pop', 'chillhop', 'classical', 'classical era',
       'contemporary country', 'dance pop', 'early music',
       'early romantic era', 'edm', 'electro house', 'electropop', 'emo rap',
       'folk-pop', 'gangster rap', 'german baroque', 'grupera', 'hip hop',
       'indie folk', 'indie pop', 'indie poptimism', 'indie r&b', 'indie rock',
       'indie soul', 'indietronica', 'k-pop', 'latin', 'lo-fi beats',
       'mellow gold', 'melodic rap', 'modern rock', 'neo mellow', 'norteno',
       'pop', 'pop edm', 'pop rap', 'pop rock', 'post-teen pop',
       'progressive house', 'progressive trance', 'ranchera', 'rap',
       'regional mexican', 'regional mexican pop', 'rock', 'sleep',
       'soft rock', 'southern hip hop', 'stomp and holler', 'trance',
       'trap music', 'tropical house', 'underground hip hop',
       'uplifting trance', 'vapor trap', 'classical_super', 'country_super',
       'folk_super', 'house_super', 'indian_super', 'indie_super', 'jazz_super', 'latin_super',
       'metal_super', 'rap_super', 'reggae_super', 'rock_super', 'worship_super']]

In [59]:
df_reduced.shape

(130989, 109)

In [60]:
df_reduced.columns = df_reduced.columns.str.replace(' ', '_').str.replace('&', 'n').str.replace('-', '_')

In [61]:
df_reduced.columns

Index(['id', 'track_id', 'track_name', 'artist_id', 'artist_name',
       'genres_str', 'acousticness', 'danceability', 'duration_ms', 'energy',
       ...
       'house_super', 'indian_super', 'indie_super', 'jazz_super',
       'latin_super', 'metal_super', 'rap_super', 'reggae_super', 'rock_super',
       'worship_super'],
      dtype='object', length=109)

In [62]:
def load():
    """Initialize DB."""
    DB.drop_all()
    DB.create_all()
    df_reduced.to_sql(name='track', con=DB.engine, index=False, if_exists='replace')
    DB.session.commit()
    return 'Database initialized.'

In [63]:
load()

'Database initialized.'

In [78]:
# Designate primary key in pgAdmin before running this cell.
DB.Model.metadata.reflect(DB.engine)

In [79]:
class Track(DB.Model):
    __table__ = DB.Model.metadata.tables['track']
    
    def to_array(self):
        return np.array([self.acousticness_scaled, 
                         self.danceability_scaled, 
                         self.duration_ms_scaled,
                         self.energy_scaled, 
                         self.instrumentalness_scaled, 
                         self.key_scaled, 
                         self.liveness_scaled, 
                         self.loudness_scaled, 
                         self.mode_scaled, 
                         self.speechiness_scaled, 
                         self.tempo_scaled, 
                         self.time_signature_scaled, 
                         self.valence_scaled, 
                         self.popularity_scaled, 
                         self.alternative_rnb, 
                         self.atl_hip_hop,
                         self.banda,
                         self.baroque,
                         self.big_room,
                         self.brostep,
                         self.cali_rap,
                         self.ccm,
                         self.chamber_pop,
                         self.chillhop,
                         self.classical,
                         self.classical_era,
                         self.contemporary_country,
                         self.dance_pop,
                         self.early_music,
                         self.early_romantic_era,
                         self.edm,
                         self.electro_house,
                         self.electropop,
                         self.emo_rap,
                         self.folk_pop,
                         self.gangster_rap,
                         self.german_baroque,
                         self.grupera,
                         self.hip_hop,
                         self.indie_folk,
                         self.indie_pop,
                         self.indie_poptimism,
                         self.indie_rnb,
                         self.indie_rock,
                         self.indie_soul,
                         self.indietronica,
                         self.k_pop,
                         self.latin,
                         self.lo_fi_beats,
                         self.mellow_gold,
                         self.melodic_rap,
                         self.modern_rock,
                         self.neo_mellow,
                         self.norteno,
                         self.pop,
                         self.pop_edm,
                         self.pop_rap,
                         self.pop_rock,
                         self.post_teen_pop,
                         self.progressive_house,
                         self.progressive_trance,
                         self.ranchera,
                         self.rap,
                         self.regional_mexican,
                         self.regional_mexican_pop,
                         self.rock,
                         self.sleep,
                         self.soft_rock,
                         self.southern_hip_hop,
                         self.stomp_and_holler,
                         self.trance,
                         self.trap_music,
                         self.tropical_house,
                         self.underground_hip_hop,
                         self.uplifting_trance,
                         self.vapor_trap,
                         self.classical_super,
                         self.country_super,
                         self.folk_super,
                         self.house_super,
                         self.indian_super,
                         self.indie_super,
                         self.jazz_super,
                         self.latin_super,
                         self.metal_super,
                         self.rap_super,
                         self.reggae_super,
                         self.rock_super,
                         self.worship_super])

    def to_dict(self):
        return {'track_id': self.track_id,
                'track_name': self.track_name,
                'artist_name': self.artist_name,
                'acousticness': self.acousticness,
                'danceability': self.danceability,
                'duration_ms': self.duration_ms,
                'energy': self.energy,
                'instrumentalness': self.instrumentalness,
                'key': self.key,
                'liveness': self.liveness,
                'loudness': self.loudness,
                'mode': self.mode,
                'speechiness': self.speechiness,
                'tempo': self.tempo,
                'time_signature': self.time_signature,
                'valence': self.valence,
                'popularity': self.popularity}

    def __repr__(self):
        return json.dumps(self.to_dict())

In [80]:
q1 = Track.query.filter(Track.id == 1).first()

In [81]:
print(q1.to_array().shape)

(89,)


In [82]:
dist, ind = loaded_tree.query(q1.to_array().reshape(1, -1), k=15) 

In [83]:
indices = [val.item() for val in ind[0]]

In [84]:
q2 = Track.query.filter(Track.id.in_(indices)).all()

In [85]:
print(q2)

[{"track_id": "6aRYzvulEbltKobXPdWdcs", "track_name": "Alive", "artist_name": "Neffex", "acousticness": 0.0299, "danceability": 0.607, "duration_ms": 180000, "energy": 0.8, "instrumentalness": 0.0, "key": 7, "liveness": 0.7070000000000001, "loudness": -7.7010000000000005, "mode": 100, "speechiness": 9.64, "tempo": 75.988, "time_signature": 4, "valence": 68.0, "popularity": 50}, {"track_id": "7kH8cKU5vlnpIR9uBUpP06", "track_name": "Grapevine", "artist_name": "Junoflo", "acousticness": 0.17300000000000001, "danceability": 0.736, "duration_ms": 209579, "energy": 0.701, "instrumentalness": 0.0, "key": 7, "liveness": 0.353, "loudness": -6.331, "mode": 100, "speechiness": 17.6, "tempo": 95.012, "time_signature": 4, "valence": 71.9, "popularity": 48}, {"track_id": "5nXMMjMPYBeaYd0idselno", "track_name": "Conversation with the Streets", "artist_name": "Allstar JR", "acousticness": 0.0313, "danceability": 0.65, "duration_ms": 222694, "energy": 0.7909999999999999, "instrumentalness": 0.0, "key":