# Exploratory Data Analysis - Spotify Track Similarity

In [1]:
import pandas as pd
import numpy as np

from ast import literal_eval
from sklearn.neighbors import KDTree
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Load dataset.
df = pd.read_csv('merged_with_genres.csv', converters={'genres': eval})

In [3]:
# Check dimensions.
df.shape

(130989, 19)

In [4]:
# Verify no duplicate tracks are included.
df[df.duplicated(subset='track_id', keep=False)]

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity,artist_id,genres


In [5]:
# Examine summary statistics.
df.describe(include='all')

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity,artist_id,genres
count,130989,130989,130989,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989.0,130989,130989
unique,34582,130989,108697,,,,,,,,,,,,,,,34818,8784
top,Johann Sebastian Bach,12pSrdApPoh3596vZlpMLg,Intro,,,,,,,,,,,,,,,5aIqB5nVVvmFsvSdExz408,[]
freq,3635,1,110,,,,,,,,,,,,,,,3633,37606
mean,,,,0.342467,0.581478,212676.7,0.569244,0.223863,5.232974,0.194919,-9.971764,0.607959,0.111966,119.467271,3.878891,0.439681,24.242608,,
std,,,,0.345645,0.190024,123103.6,0.260368,0.360274,3.602824,0.167789,6.545915,0.488208,0.124295,30.154507,0.514536,0.259076,19.727393,,
min,,,,0.0,0.0,3203.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,,
25%,,,,0.0316,0.459,164000.0,0.396,0.0,2.0,0.0975,-11.894,0.0,0.0389,96.014,4.0,0.224,7.0,,
50%,,,,0.203,0.605,201931.0,0.603,0.000146,5.0,0.124,-7.973,1.0,0.0558,120.026,4.0,0.42,22.0,,
75%,,,,0.636,0.727,241054.0,0.776,0.439,8.0,0.236,-5.681,1.0,0.129,139.621,4.0,0.638,38.0,,


In [6]:
# Check for null values.
df.isna().sum()

artist_name         0
track_id            0
track_name          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
speechiness         0
tempo               0
time_signature      0
valence             0
popularity          0
artist_id           0
genres              0
dtype: int64

In [7]:
# Display most common genre combinations.
# Note that [] is null-equivalent.
df['genres'].value_counts().head(30)

[]                                                                                                   37606
[baroque, classical, early music, german baroque]                                                     3638
[classical, classical era]                                                                            1877
[edm, progressive house, progressive trance, trance]                                                  1130
[classical, classical era, early romantic era]                                                        1071
[chillhop, lo-fi beats]                                                                                766
[lo-fi beats]                                                                                          664
[classical, early romantic era, polish classical]                                                      624
[k-pop, k-pop boy group]                                                                               615
[sleep]                              

In [8]:
df['genres_str'] = df['genres'].astype(str)

## High 'speechiness' tracks
Per the Spotify API: "Values above 0.66 describe tracks that are probably made entirely of spoken words."

In [9]:
# Explore genres for tracks with high speechiness values.
df['genres'][df['speechiness'] > 0.66].value_counts()

[]                                                                                                                                                                   319
[classical, classical era]                                                                                                                                            49
[edm, progressive house, progressive trance, trance]                                                                                                                  33
[spanish comedy]                                                                                                                                                      17
[heartland rock, mellow gold, permanent wave, rock, singer-songwriter]                                                                                                16
[comedy]                                                                                                                                                   

In [10]:
# Could consider tagging these with a 'spoken word' genre.
df[(df['speechiness'] > 0.66) & (df['genres_str'] == '[]')].sample(10)

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity,artist_id,genres,genres_str
54731,El Fantasma,2P48smImTKmhxqUBh19zK2,The Life,0.636,0.662,152111,0.622,0.0,6,0.388,-11.175,1,0.782,173.84,4,0.851,13,3Cqv6HFqTnLcXBQVpVyXpv,[],[]
62658,Simone Felice,7J3R6BBgF8OvOFtyHxQ4Fe,They'd Hang Upon My Every Word,0.0941,0.594,228027,0.151,7e-06,7,0.212,-16.945,1,0.947,167.108,1,0.408,10,2tfqowGe1sXV4u2ZUZCW9Q,[],[]
30805,The One Shot Finalists,2yFmH0xYHALqQI5dxeT09r,E.V.I.L.,0.29,0.597,180640,0.581,0.0,3,0.108,-8.664,0,0.773,57.464,3,0.645,0,4KG18C8sBN6rNk54jdeuKA,[],[]
69678,Amazon Sounds,0XEwIj0Hq036g83CVQkpq4,Inspiring Nature Sounds,0.112,0.53,69329,0.0506,0.314,1,0.182,-36.994,1,0.852,95.872,3,0.619,8,61yxe8PrqKQm4pbzFI0sPx,[],[]
107503,Trisepta,4KLfQLtQKJ3S8WrcmoysdE,Envious,0.404,0.567,111260,0.48,7.8e-05,11,0.135,-9.016,1,0.905,176.044,4,0.146,0,1kxap3iMmTjc2T7jI3ADwr,[],[]
76594,Yung Xaw,1teoEv6nQm0wtETQ2HrhhW,Because I Can,0.114,0.705,158616,0.54,2.9e-05,9,0.143,-13.556,0,0.718,140.305,4,0.364,0,6Gq0JyfVhFiE50fke2Nuhv,[],[]
93771,99 Neighbors,6SyJPb54GQ0Vy6Yqe9DDJM,Champion,0.497,0.669,219111,0.562,0.0,0,0.252,-8.036,1,0.705,133.136,4,0.3,44,5uhqkMm8dyQvX83kl4Znq0,[],[]
68301,Amazon Sounds,6SQ7xWEHMWpCuYb2K5HavQ,Jungle Recordings,0.00691,0.501,62798,0.116,0.519,1,0.106,-38.288,1,0.776,87.611,5,0.647,8,61yxe8PrqKQm4pbzFI0sPx,[],[]
74271,Amazon Sounds,3t0E9VlYDmEeTqTk8oiYJE,Sounds of the Rainforest,0.0691,0.444,68362,0.0502,0.841,1,0.162,-37.183,1,0.907,66.62,1,0.617,7,61yxe8PrqKQm4pbzFI0sPx,[],[]
72085,Froggy Fresh,5eNsns1UBjHFmVWyciF7c8,Nightmare on My Street Intro,0.841,0.72,49041,0.424,0.0,2,0.0791,-11.544,0,0.716,69.432,3,0.192,16,0QsjXBgmv5j7WSHYfo7L82,[],[]


## Candidates for broader/combination genre categories

In [11]:
# Christian
df['genres'][df['genres_str'].str.contains('christian')].value_counts()

[christian hip hop, christian trap]                                                                                                             228
[christian relaxative]                                                                                                                           97
[anthem worship, ccm, christian alternative rock, christian music, indiecoustica, world worship, worship]                                        91
[anthem worship, ccm, christian alternative rock, christian music, world worship, worship]                                                       89
[christian hip hop, christian pop, christian trap]                                                                                               84
[christian pop]                                                                                                                                  70
[anthem worship, ccm, christian music, deep ccm, world worship, worship]                                        

In [12]:
# Classical
df['genres'][df['genres_str'].str.contains('classical')].value_counts()

[baroque, classical, early music, german baroque]                                                                                                                  3638
[classical, classical era]                                                                                                                                         1877
[classical, classical era, early romantic era]                                                                                                                     1071
[classical, early romantic era, polish classical]                                                                                                                   624
[classical, post-romantic era]                                                                                                                                      479
[baroque, classical, early music, italian baroque]                                                                                                              

In [13]:
# Country
df['genres'][df['genres_str'].str.contains('country')].value_counts()

[country pop]                                                                                                                                                                                                         221
[country rap, redneck]                                                                                                                                                                                                174
[contemporary country, country, country road, modern country rock]                                                                                                                                                    173
[contemporary country]                                                                                                                                                                                                137
[contemporary country, country pop]                                                                                             

In [14]:
# House
df['genres'][df['genres_str'].str.contains('house')].value_counts()

[edm, progressive house, progressive trance, trance]                                                                                                                                           1130
[edm, progressive house, progressive trance, trance, uplifting trance]                                                                                                                          445
[progressive house, progressive trance, trance, uplifting trance]                                                                                                                               258
[progressive house, progressive trance, progressive uplifting trance, trance, uplifting trance]                                                                                                 169
[big room, dance pop, edm, electro house, pop, tropical house]                                                                                                                                   89
[deep uplifting tran

In [15]:
# Jazz
df['genres'][df['genres_str'].str.contains('jazz')].value_counts()

[chillhop, jazz boom bap, lo-fi beats]                                                                                                                                                                240
[chillhop, jazz boom bap]                                                                                                                                                                              82
[adult standards, jazz blues, soul, swing, vocal jazz]                                                                                                                                                 45
[jazz blues, neo soul, soul, soul jazz, torch song, vocal jazz]                                                                                                                                        41
[electro swing, nu jazz]                                                                                                                                                                        

In [16]:
# Hip hop
df['genres'][df['genres_str'].str.contains('hip hop')].value_counts()

[atl hip hop, dirty south rap, gangster rap, hip hop, pop rap, rap, southern hip hop, trap music]                                      273
[christian hip hop, christian trap]                                                                                                    228
[deep underground hip hop]                                                                                                             218
[dark trap, emo rap, underground hip hop]                                                                                              135
[latin, latin hip hop, reggaeton, reggaeton flow, tropical]                                                                            124
[underground hip hop]                                                                                                                  122
[chicago drill, chicago rap, drill, hip hop, pop rap, rap, southern hip hop, trap music, underground hip hop, vapor trap]              119
[underground hip hop, vapor

In [17]:
# Pop
df['genres'][df['genres_str'].str.contains('pop')].value_counts()

[k-pop, k-pop boy group]                                                                                                            615
[regional mexican pop]                                                                                                              563
[corrido, regional mexican pop]                                                                                                     401
[atl hip hop, dirty south rap, gangster rap, hip hop, pop rap, rap, southern hip hop, trap music]                                   273
[dance pop, pop, post-teen pop]                                                                                                     238
[k-pop, k-pop girl group]                                                                                                           234
[country pop]                                                                                                                       221
[pop]                                           

In [18]:
# Metal
df['genres'][df['genres_str'].str.contains('metal')].value_counts()

[melodic metalcore, metalcore, post-screamo, screamo]                                                                                                                                                         56
[metalcore]                                                                                                                                                                                                   37
[album rock, glam metal, hard rock, nwobhm, rock]                                                                                                                                                             33
[metallic hardcore]                                                                                                                                                                                           33
[alternative metal, nu metal, post-grunge, rock]                                                                                                                    

In [19]:
# Rap
df['genres'][df['genres_str'].str.contains('rap')].value_counts()

[emo rap]                                                                                                                    352
[atl hip hop, dirty south rap, gangster rap, hip hop, pop rap, rap, southern hip hop, trap music]                            273
[cali rap, west coast trap]                                                                                                  268
[vapor trap]                                                                                                                 254
[christian hip hop, christian trap]                                                                                          228
[country rap, redneck]                                                                                                       174
[cali rap]                                                                                                                   159
[cali rap, hyphy, west coast trap]                                                               

In [20]:
# Rock
df['genres'][df['genres_str'].str.contains('rock')].value_counts()

[contemporary country, country, country road, modern country rock]                                                                                                                                                                                                                     173
[adult standards, brill building pop, folk, folk rock, mellow gold, rock, singer-songwriter, soft rock, yacht rock]                                                                                                                                                                    138
[indie garage rock]                                                                                                                                                                                                                                                                    130
[album rock, classic rock, folk rock, heartland rock, mellow gold, rock, soft rock, yacht rock]                                                        

In [21]:
# Techno
df['genres'][df['genres_str'].str.contains('techno')].value_counts()

[big room, edm, german techno, progressive house, progressive trance, trance, uplifting trance]                                                                                                39
[edm, german techno, progressive house, progressive trance, trance, uplifting trance]                                                                                                          21
[acid house, chicago house, deep house, float house, hip house, techno]                                                                                                                        18
[electronica, frankfurt electronic, german techno, microhouse, minimal techno, tech house]                                                                                                     18
[ambient techno, electronica, float house, microhouse, minimal techno, shiver pop]                                                                                                             16
[ambient techno]              

## Scaling

In [22]:
features = ['acousticness', 
            'danceability', 
            'energy', 
            'instrumentalness', 
            'key', 
            'liveness',
            'loudness', 
            'mode', 
            'speechiness', 
            'tempo', 
            'time_signature', 
            'valence',
            'popularity']

In [23]:
scaler = MinMaxScaler()
df_scaled = df.copy()
df_scaled[features] = scaler.fit_transform(df[features])

## One-hot encoding

In [24]:
mlb = MultiLabelBinarizer()
df_encoded = df_scaled.join(pd.DataFrame(mlb.fit_transform(df_scaled.pop('genres')),
                                         columns=mlb.classes_,
                                         index=df_scaled.index))

In [25]:
df_encoded.shape

(130989, 2575)

In [26]:
df_encoded.columns

Index(['artist_name', 'track_id', 'track_name', 'acousticness', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness',
       ...
       'zapstep', 'zen', 'zim hip hop', 'zim urban groove', 'zimdancehall',
       'zolo', 'zouglou', 'zouk', 'zouk riddim', 'zydeco'],
      dtype='object', length=2575)

## Feature reduction

In [27]:
genre_features = list(df_encoded.columns[19:])
print(genre_features)

['21st century classical', '432hz', '8-bit', 'a cappella', 'aarhus indie', 'aberdeen indie', 'abstract', 'abstract beats', 'abstract hip hop', 'abstract idm', 'abstractro', 'accordion', 'acid house', 'acid jazz', 'acid techno', 'acousmatic', 'acoustic blues', 'acoustic chill', 'acoustic opm', 'acoustic pop', 'acoustic punk', 'adelaide indie', 'adoracao', 'adoracion', 'adult standards', 'adventista', 'afghan pop', 'african electronic', 'african gospel', 'african percussion', 'african rock', 'afro dancehall', 'afro house', 'afro psych', 'afro-funk', 'afrobeat', 'afropop', 'aggrotech', 'alabama indie', 'alabama metal', 'alabama rap', 'alaska indie', 'albanian hip hop', 'albanian pop', 'albany ny indie', 'alberta country', 'alberta hip hop', 'album rock', 'albuquerque indie', 'alt-idol', 'alternative americana', 'alternative ccm', 'alternative country', 'alternative dance', 'alternative emo', 'alternative hip hop', 'alternative metal', 'alternative metalcore', 'alternative pop', 'alternati

In [28]:
# Keep only genres with at least 1,000 tagged tracks.
df_reduced = df_encoded.drop([col for col, val in df_encoded[genre_features].sum().iteritems() if val < 1000], axis=1)

In [29]:
df_reduced.columns

Index(['artist_name', 'track_id', 'track_name', 'acousticness', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness',
       'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence',
       'popularity', 'artist_id', 'genres_str', 'alternative r&b',
       'atl hip hop', 'banda', 'baroque', 'big room', 'brostep', 'cali rap',
       'ccm', 'chamber pop', 'chillhop', 'classical', 'classical era',
       'contemporary country', 'dance pop', 'early music',
       'early romantic era', 'edm', 'electro house', 'electropop', 'emo rap',
       'folk-pop', 'gangster rap', 'german baroque', 'grupera', 'hip hop',
       'indie folk', 'indie pop', 'indie poptimism', 'indie r&b', 'indie rock',
       'indie soul', 'indietronica', 'k-pop', 'latin', 'lo-fi beats',
       'mellow gold', 'melodic rap', 'modern rock', 'neo mellow', 'norteno',
       'pop', 'pop edm', 'pop rap', 'pop rock', 'post-teen pop',
       'progressive house', 'progressive trance', 

## Build tree

In [30]:
expanded_features = ['acousticness', 'danceability', 'duration_ms', 'energy', 
                     'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 
                     'speechiness', 'tempo', 'time_signature', 'valence', 
                     'popularity', 'alternative r&b',
                     'atl hip hop', 'banda', 'baroque', 'big room', 'brostep', 
                     'cali rap', 'ccm', 'chamber pop', 'chillhop', 'classical', 
                     'classical era', 'contemporary country', 'dance pop', 
                     'early music', 'early romantic era', 'edm', 'electro house', 
                     'electropop', 'emo rap', 'folk-pop', 'gangster rap', 
                     'german baroque', 'grupera', 'hip hop', 'indie folk', 
                     'indie pop', 'indie poptimism', 'indie r&b', 'indie rock',
                     'indie soul', 'indietronica', 'k-pop', 'latin', 'lo-fi beats',
                     'mellow gold', 'melodic rap', 'modern rock', 'neo mellow', 
                     'norteno', 'pop', 'pop edm', 'pop rap', 'pop rock', 
                     'post-teen pop', 'progressive house', 'progressive trance', 
                     'ranchera', 'rap', 'regional mexican', 'regional mexican pop', 
                     'rock', 'sleep', 'soft rock', 'southern hip hop', 
                     'stomp and holler', 'trance', 'trap music', 'tropical house', 
                     'underground hip hop', 'uplifting trance', 'vapor trap']

In [31]:
k_tree = KDTree(df_reduced[expanded_features], metric='euclidean', leaf_size=50)

### Test track 1

In [32]:
dist, ind = k_tree.query(np.array(df_reduced[expanded_features].loc[0]).reshape(1, -1), k=10)  

In [33]:
print(dist)

[[  0.         107.0185329  127.02201366 149.00694502 173.00510631
  226.00453596 256.00548287 267.00483716 267.00731315 267.01672006]]


In [34]:
print(ind)

[[     0 106304  64713  46022 109402 113692  25228  83448  30942  18472]]


In [35]:
df.loc[ind[0]][['artist_name', 'track_name', 'genres']]

Unnamed: 0,artist_name,track_name,genres
0,Frédéric Chopin,"バラード 第 1番 ト短調, 作品 23","[classical, early romantic era, polish classical]"
106304,Wolfgang Amadeus Mozart,"Don Giovanni, K. 527, Act II: Sola, sola in bu...","[classical, classical era]"
64713,Primordial,Upon Our Spiritual Deathbed,"[atmospheric black metal, avant-garde metal, b..."
46022,Pyotr Ilyich Tchaikovsky,"18 Pieces, Op. 72: II. Berceuse","[classical, late romantic era, russian romanti..."
109402,Frédéric Chopin,"Piano Concerto No. 2 in F Minor, Op. 21: III. ...","[classical, early romantic era, polish classical]"
113692,Claude Debussy,"Fantaisie for Piano and Orchestra, L. 73: Fant...","[classical, post-romantic era]"
25228,Lena Raine,Golden,[video game music]
83448,Dan Forrest,Entreat Me Not to Leave You,[]
30942,Silent Knights,Open Kitchen Fire (Long With Fade),[sleep]
18472,Arkham Knights,Existence - Extended Mix,"[progressive house, progressive trance, trance..."


## Test track 2

In [36]:
dist, ind = k_tree.query(np.array(df_reduced[expanded_features].loc[2]).reshape(1, -1), k=10)  

In [37]:
df.loc[ind[0]][['artist_name', 'track_name', 'track_id', 'genres']]

Unnamed: 0,artist_name,track_name,track_id,genres
2,Neffex,Take Me Away,6HXfoTMOdKlN0IoaM9LkTa,[bass trap]
94530,Neffex,Take Me Away,2VcqW62pUjTP3f1XAkEh0h,[bass trap]
93355,Claud,Never Meant To Call,5b5wCUEao71E5iME5fPl93,[bedroom pop]
52217,Dustin Hill,Too Much,7J8TSy1Rh67tSUBr1qYR49,[]
112151,Sir Lloyd,Come Thru,7Cb5Qp7nDGAdNpOSfioBax,[]
38371,Stylo G,10 Metric Ton,7hQKuQnanVziaLj1dHLBHm,"[dancehall, uk dancehall]"
66256,ILYAA,Brighter,1n12zXz2Oi7ng2M8SJMn3M,[]
54268,Noirre,holy grail,34lAWwOxuaOPPeyja8uA3D,[]
55147,Dj Cutman,Long Journey Ahead,2mA6yDk4a31OjmWRZBi9Lh,"[otacore, scorecore]"
91683,Code Black,Worlds Collide,70nJPkGjHKIqmUTGGkAvbW,"[euphoric hardstyle, hardstyle, rawstyle]"
