In [32]:
import pandas as pd
import numpy as np

In [39]:
# Read the per-genre table from disk, then print its schema summary
# (column names, non-null counts, dtypes, memory footprint).
data = pd.read_csv(filepath_or_buffer='data/data_by_genres.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2973 entries, 0 to 2972
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   mode              2973 non-null   int64  
 1   genres            2973 non-null   object 
 2   acousticness      2973 non-null   float64
 3   danceability      2973 non-null   float64
 4   duration_ms       2973 non-null   float64
 5   energy            2973 non-null   float64
 6   instrumentalness  2973 non-null   float64
 7   liveness          2973 non-null   float64
 8   loudness          2973 non-null   float64
 9   speechiness       2973 non-null   float64
 10  tempo             2973 non-null   float64
 11  valence           2973 non-null   float64
 12  popularity        2973 non-null   float64
 13  key               2973 non-null   int64  
dtypes: float64(11), int64(2), object(1)
memory usage: 325.3+ KB


In [40]:
# Discard the two integer columns and key every remaining row by its genre
# name, so rows can later be fetched with data.loc[<genre>].
# Note: this rebinds `data`, so running the cell a second time operates on the
# already-transformed frame, where 'mode', 'key' and 'genres' are no longer columns.
data = data.drop(columns=['mode', 'key']).set_index('genres')

In [43]:
# Downcast every column to the smallest float dtype that can hold its values,
# then print the schema again to confirm the new dtypes and memory footprint.
data = data.apply(pd.to_numeric, downcast='float')
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2973 entries, 21st century classical to zydeco
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   acousticness      2973 non-null   float32
 1   danceability      2973 non-null   float32
 2   duration_ms       2973 non-null   float32
 3   energy            2973 non-null   float32
 4   instrumentalness  2973 non-null   float32
 5   liveness          2973 non-null   float32
 6   loudness          2973 non-null   float32
 7   speechiness       2973 non-null   float32
 8   tempo             2973 non-null   float32
 9   valence           2973 non-null   float32
 10  popularity        2973 non-null   float32
dtypes: float32(11)
memory usage: 151.0+ KB


In [45]:
from scipy.spatial.distance import cosine

def get_genre_vector(row):
    """Flatten ``row`` into a plain Python list.

    Args:
        row: Any iterable; in this notebook it is one row of ``data``.

    Returns:
        A new list holding the items produced by iterating ``row``, in order.
    """
    return [value for value in row]

def distance(vec1, vec2):
    """Measure how far apart two vectors are using scipy's ``cosine`` distance.

    Args:
        vec1: First 1-D sequence of numbers.
        vec2: Second 1-D sequence of numbers, same length as ``vec1``.

    Returns:
        The value returned by ``scipy.spatial.distance.cosine(vec1, vec2)``.
    """
    cosine_distance = cosine(vec1, vec2)
    return cosine_distance

def closest_genres(vec, qty = 10):
    """Return the labels of the ``qty`` rows of ``data`` nearest to ``vec``.

    Every row of the module-level ``data`` frame is converted with
    ``get_genre_vector`` and compared to ``vec`` with ``distance``; the index
    labels are then returned ordered from smallest to largest distance.

    Args:
        vec: Query vector with one entry per column of ``data``.
        qty: Maximum number of labels to return (applied as a ``[:qty]`` slice).

    Returns:
        A list of index labels of ``data``, nearest first. Rows at the same
        distance keep the order they have in ``data``.

    NOTE(review): the example vector shown in this notebook has a
    ``duration_ms`` entry near 2.8e5 while most other entries are below 1, and
    the funk/rock distance printed further down is about 5e-09. That suggests
    the unscaled duration column dominates the measure -- consider
    standardising the columns before ranking; confirm against the data.
    """
    # Walk the rows directly instead of re-looking each label up with
    # ``data.loc[genre]``: a label lookup returns a DataFrame (not a single
    # row) when the index holds that label more than once, and it repeats an
    # index search for every row. Keeping (label, distance) pairs rather than
    # a dict also means a repeated label is ranked once per row.
    scored = [
        (genre, distance(vec, get_genre_vector(row)))
        for genre, row in data.iterrows()
    ]
    # list.sort is stable, so ties are resolved by row order in ``data``.
    scored.sort(key=lambda pair: pair[1])
    return [genre for genre, _ in scored[:qty]]

def closest_genre(vec):
    """Return the single label that ``closest_genres`` ranks first for ``vec``.

    Args:
        vec: Query vector, forwarded unchanged to ``closest_genres``.

    Returns:
        The first element of ``closest_genres(vec)``.

    Raises:
        IndexError: If ``closest_genres`` returns an empty list.
    """
    ranked = closest_genres(vec)
    return ranked[0]

In [47]:
# Fetch the 'funk' row from the genre-indexed frame and flatten it to a list;
# the bare name on the last line makes the notebook display the vector.
funk_vec = get_genre_vector(row=data.loc['funk'])
funk_vec

[0.31461378931999207,
 0.6369413733482361,
 279910.8125,
 0.569004476070404,
 0.10961710661649704,
 0.1753385365009308,
 -10.269994735717773,
 0.06597717851400375,
 116.3233642578125,
 0.6515148282051086,
 38.30313491821289]

In [48]:
# Spot check: distance between the 'funk' vector and the 'rock' row.
distance(vec1=funk_vec, vec2=get_genre_vector(row=data.loc['rock']))

4.82839079829489e-09

In [51]:
# Ten genres ranked nearest to the 'funk' vector (10 is the default qty).
closest_genres(funk_vec, qty=10)

['funk',
 'folclore tucumano',
 'quiet storm',
 'lancaster pa indie',
 'freestyle',
 'second line',
 'danish metal',
 'disco',
 'hong kong indie',
 'hong kong rock']

In [53]:
# Build array versions of the 'rock' and 'reggae' rows so they can be added
# element-wise, then rank the genres nearest to their sum.
rock_vec, reggae_vec = (
    np.array(get_genre_vector(data.loc[name])) for name in ('rock', 'reggae')
)
closest_genres(rock_vec + reggae_vec)

['rock',
 'reggae',
 'permanent wave',
 'charango',
 'folklore boliviano',
 'scottish rock',
 'canadian indie',
 'bristol electronic',
 'minneapolis metal',
 'french reggae']