# FMA (Small Dataset) Analysis

In [1]:
!pip install numpy pandas matplotlib seaborn scikit-learn bokeh umap-learn



In [2]:
from pathlib import Path
import pandas as pd
import numpy as np

## Fetch and Clean Data

In [3]:
# Directory holding the FMA metadata CSV files, relative to the notebook.
metadata = Path("..") / "data" / "fma" / "metadata"

In [4]:
# Read the track metadata with its three header rows, then drop the third
# header level so columns are addressed as (group, field).
tracks = pd.read_csv(metadata / 'tracks.csv', header=[0, 1, 2]).droplevel(2, axis=1)
# Relabel the first entry of each column level so the id column can be selected
# as "track_id". NOTE(review): presumably the first entry of each level is the
# auto-generated label of the unnamed id column -- confirm against the CSV.
group_labels = ['track_id', *tracks.columns.levels[0][1:]]
tracks.columns = tracks.columns.set_levels(group_labels, level=0)
field_labels = ['', *tracks.columns.levels[1][1:]]
tracks.columns = tracks.columns.set_levels(field_labels, level=1)
tracks = tracks.set_index("track_id")
in_small_subset = tracks["set", "subset"] == "small"  # Only include songs from fma_small dataset
has_genre = pd.notnull(tracks["track", "genre_top"])  # Remove tracks with unknown genre
tracks = tracks[in_small_subset & has_genre]

In [5]:
# Read the feature table with its four header rows and drop the fourth level.
features = pd.read_csv(metadata / 'features.csv', header=[0, 1, 2, 3]).droplevel(3, axis=1)
# Rebuild the column labels: the first column becomes the track id, and the
# third label of every other column is shifted down by one and stored as a string.
columns = [("track_id", "", "")]
columns.extend(
    (feature, statistic, str(int(number) - 1))
    for feature, statistic, number in features.columns[1:]
)
features.columns = pd.MultiIndex.from_tuples(columns)
features = features.set_index("track_id")

In [6]:
# Read the Echonest table using CSV rows 1-3 as headers and drop the last of them.
echonest = pd.read_csv(metadata / 'echonest.csv', header=[1, 2, 3]).droplevel(2, axis=1)
# Keep the two remaining labels of every column, naming the first column "track_id".
columns = [("track_id", "")]
columns.extend((group, name) for group, name in echonest.columns.values[1:])
echonest.columns = pd.MultiIndex.from_tuples(columns)
echonest = echonest.set_index("track_id")

In [7]:
# Assemble one table with four column levels: the genre label, every column of
# `features` under a leading "features" label, and the Echonest "audio_features"
# and "temporal_features" columns padded with two empty trailing labels.
data = pd.DataFrame(columns=pd.MultiIndex.from_tuples([("genre", "", "", "")]))
data["genre"] = tracks["track", "genre_top"]
for c in features:
    # Build the key as an explicit tuple: a parenthesised starred expression
    # such as `(*c)` is a SyntaxError on Python 3.9 and later.
    data[("features", *c)] = features[c]
for c in echonest[["audio_features", "temporal_features"]]:
    data[(*c, "", "")] = echonest[c]
print(data.columns.levels[0])
data.head(3)

Index(['genre', 'features', 'audio_features', 'temporal_features'], dtype='object')


Unnamed: 0_level_0,genre,features,features,features,features,features,features,features,features,features,...,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features
Unnamed: 0_level_1,Unnamed: 1_level_1,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,...,214,215,216,217,218,219,220,221,222,223
Unnamed: 0_level_2,Unnamed: 1_level_2,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Unnamed: 0_level_3,Unnamed: 1_level_3,0,1,2,3,4,5,6,7,8,...,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
track_id,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4
2,Hip-Hop,7.180653,5.230309,0.249321,1.34762,1.482478,0.531371,1.481593,2.691455,0.866868,...,-1.992303,6.805694,0.23307,0.19288,0.027455,0.06408,3.67696,3.61288,13.31669,262.929749
5,Hip-Hop,0.527563,-0.077654,-0.27961,0.685883,1.93757,0.880839,-0.923192,-0.927232,0.666617,...,-2.288358,11.527109,0.256821,0.23782,0.060122,0.06014,5.92649,5.86635,16.013849,356.755737
10,Pop,3.702245,-0.291193,2.196742,-0.234449,1.367364,0.998411,1.770694,1.604566,0.521217,...,-3.662988,21.508228,0.283352,0.26707,0.125704,0.08082,8.41401,8.33319,21.317064,483.403809


# Run Analysis

## Set Up Plotting Tools

In [8]:
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook
from bokeh.palettes import brewer, d3
from bokeh.models.tools import HoverTool, WheelZoomTool, PanTool, CrosshairTool, LassoSelectTool
import matplotlib.pyplot as plt
import seaborn as sns
# Configure seaborn: apply its settings, then select the 'darkgrid' style and
# the 'bright' palette. The calls mutate global plotting state, so their order
# is kept as written.
sns.set()
sns.set_style('darkgrid')
sns.set_palette('bright')
# Direct Bokeh output to the notebook.
output_notebook()
# Module-level Bokeh tool instances. The plotting functions below pass
# wheel_zoom, lasso and pan to figure(); hover and crosshair are not referenced
# in the cells visible here.
hover = HoverTool()
wheel_zoom = WheelZoomTool()
pan = PanTool()
crosshair = CrosshairTool()
lasso = LassoSelectTool()

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def pca_plot(data, feats, genres, std=2, title="PCA", colors=d3['Category10']):
    """Plot the first two principal components of the selected features, coloured by genre.

    The rows are sorted by genre, the feature columns are standardised and
    projected onto two principal components, and each genre is drawn as a
    scatter layer plus a covariance ellipse. Clicking a legend entry hides
    that genre.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both the feature columns and the genre column.
    feats : column key
        Key selecting the feature columns of ``data``.
    genres : column key
        Key selecting the genre column of ``data``.
    std : float
        Half-size of each ellipse in standard deviations along its principal axes.
    title : str
        Figure title.
    colors : dict
        Mapping from palette size to a list of colours (Bokeh palette group).

    Returns
    -------
    None
        The figure is displayed with ``show``.
    """
    data = data.sort_values(by=genres)
    labels = data[genres]
    scaled = StandardScaler().fit_transform(data[feats])
    points = PCA(n_components=2).fit_transform(scaled)

    plot = figure(tools=(wheel_zoom, lasso, pan), active_scroll=wheel_zoom, width=800, title=title)
    unique_genres = labels.unique()
    # Use the smallest palette that covers every genre; if none is large enough,
    # take the largest one and cycle through it instead of raising KeyError.
    sizes = sorted(colors)
    palette = colors[next((size for size in sizes if size >= len(unique_genres)), sizes[-1])]
    for i, genre in enumerate(unique_genres):
        genre_tracks = points[(labels == genre).values]
        color = palette[i % len(palette)]

        plot.circle(
            genre_tracks[:, 0],
            genre_tracks[:, 1],
            fill_color=color,
            fill_alpha=.1,
            line_color=None,
            size=3,
            legend_label=genre
        )
        if len(genre_tracks) < 2:
            # A covariance cannot be estimated from a single point.
            continue
        # The covariance matrix is symmetric, so eigh applies; it returns real
        # eigenvalues in ascending order (last one = major axis).
        eigvals, eigvecs = np.linalg.eigh(np.cov(genre_tracks[:, 0], genre_tracks[:, 1]))
        axis_lengths = 2 * std * np.sqrt(np.clip(eigvals, 0, None))
        major = eigvecs[:, -1]
        plot.ellipse(
            genre_tracks[:, 0].mean(),
            genre_tracks[:, 1].mean(),
            axis_lengths[-1],
            axis_lengths[0],
            # Bokeh interprets `angle` in radians by default, so the arctan2
            # result is passed through without converting to degrees.
            angle=np.arctan2(major[1], major[0]),
            color=color,
            fill_alpha=0.2,
            legend_label=genre
        )

    plot.legend.click_policy = "hide"
    show(plot)

In [10]:
# Low-level features are plotted for every track; the other two feature groups
# are plotted only for rows whose "acousticness" value is present.
pca_plot(data, "features", "genre", title="PCA (Low-Level Features)")
has_acousticness = pd.notnull(data["audio_features", "acousticness"])
pca_plot(data[has_acousticness], "audio_features", "genre", title="PCA (High-Level Features)")
pca_plot(data[has_acousticness], "temporal_features", "genre", title="PCA (Temporal Features (?))")

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
from time import time

def _prepare_features(data, feats, pca_dims):
    """Standardise the selected feature columns and, if ``pca_dims`` is truthy, reduce them with PCA."""
    prepared = StandardScaler().fit_transform(data[feats])
    if pca_dims:
        pca = PCA(n_components=pca_dims)
        prepared = pca.fit_transform(prepared)
        print(f"PCA: {pca.n_components_} Components")
    return prepared


def _palette_for(colors, count):
    """Pick the smallest palette in ``colors`` with at least ``count`` entries, else the largest."""
    sizes = sorted(colors)
    return colors[next((size for size in sizes if size >= count), sizes[-1])]


def _covariance_ellipse(xs, ys, std):
    """Return (width, height, angle in radians) of the ``std``-sigma covariance ellipse of the points."""
    # The covariance matrix is symmetric, so eigh applies; it returns real
    # eigenvalues in ascending order (last one = major axis).
    eigvals, eigvecs = np.linalg.eigh(np.cov(xs, ys))
    axis_lengths = 2 * std * np.sqrt(np.clip(eigvals, 0, None))
    major = eigvecs[:, -1]
    return axis_lengths[-1], axis_lengths[0], np.arctan2(major[1], major[0])


def _plot_genre_embedding(points, genres, title, std, colors):
    """Show the first two columns of ``points`` as per-genre scatter layers with covariance ellipses."""
    plot = figure(tools=(wheel_zoom, lasso, pan), active_scroll=wheel_zoom, width=800, title=title)
    # Sorted order gives every genre the same colour as in pca_plot, which
    # iterates over genre-sorted rows.
    unique_genres = sorted(genres.unique())
    palette = _palette_for(colors, len(unique_genres))
    for i, genre in enumerate(unique_genres):
        genre_tracks = points[(genres == genre).values]
        color = palette[i % len(palette)]
        plot.circle(
            genre_tracks[:, 0],
            genre_tracks[:, 1],
            fill_color=color,
            fill_alpha=.7,
            line_color=None,
            size=3,
            legend_label=genre
        )
        if len(genre_tracks) < 2:
            # A covariance cannot be estimated from a single point.
            continue
        width, height, angle = _covariance_ellipse(genre_tracks[:, 0], genre_tracks[:, 1], std)
        plot.ellipse(
            genre_tracks[:, 0].mean(),
            genre_tracks[:, 1].mean(),
            width,
            height,
            # Bokeh interprets `angle` in radians by default, so no conversion
            # to degrees is applied.
            angle=angle,
            color=color,
            fill_alpha=0.2,
            legend_label=genre
        )
    plot.legend.click_policy = "hide"
    show(plot)


def tsne_plot(data, feats, genres, pca_dims=0.85, tsne_dims=2, title="t-SNE", std=2, colors=d3['Category10'], **kwargs):
    """Embed the selected features with t-SNE and plot the result coloured by genre.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both the feature columns and the genre column.
    feats, genres : column keys
        Keys selecting the feature columns and the genre column of ``data``.
    pca_dims : int or float
        Passed to ``PCA(n_components=...)`` before t-SNE; a falsy value skips PCA.
    tsne_dims : int
        Number of t-SNE output dimensions; only the first two are plotted.
    title : str
        Figure title.
    std : float
        Half-size of each genre ellipse in standard deviations.
    colors : dict
        Mapping from palette size to a list of colours (Bokeh palette group).
    **kwargs
        Forwarded to ``TSNE`` (for example ``perplexity`` or ``random_state``).

    Returns
    -------
    The embedded points returned by ``TSNE.fit_transform``, in the row order of ``data``.
    """
    start = time()
    labels = data[genres]
    prepared = _prepare_features(data, feats, pca_dims)
    points = TSNE(n_components=tsne_dims, **kwargs).fit_transform(prepared)
    print(f"TSNE took {time() - start:.2f}s")

    _plot_genre_embedding(points, labels, title, std, colors)
    return points


def umap_plot(data, feats, genres, pca_dims=0., umap_dims=2, title="UMAP", std=2, colors=d3['Category10'], **kwargs):
    """Embed the selected features with UMAP and plot the result coloured by genre.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both the feature columns and the genre column.
    feats, genres : column keys
        Keys selecting the feature columns and the genre column of ``data``.
    pca_dims : int or float
        Passed to ``PCA(n_components=...)`` before UMAP; the falsy default skips PCA.
    umap_dims : int
        Number of UMAP output dimensions; only the first two are plotted.
    title : str
        Figure title.
    std : float
        Half-size of each genre ellipse in standard deviations.
    colors : dict
        Mapping from palette size to a list of colours (Bokeh palette group).
    **kwargs
        Forwarded to ``UMAP`` (for example ``n_neighbors`` or ``random_state``).

    Returns
    -------
    The embedded points returned by ``UMAP.fit_transform``, in the row order of ``data``.
    """
    start = time()
    labels = data[genres]
    prepared = _prepare_features(data, feats, pca_dims)
    points = UMAP(n_components=umap_dims, **kwargs).fit_transform(prepared)
    print(f"UMAP took {time() - start:.2f}s")

    _plot_genre_embedding(points, labels, title, std, colors)
    return points

In [12]:
# Sweep the neighbourhood size: the same value is used as UMAP's n_neighbors
# and as t-SNE's perplexity, and appears in each figure title.
neighbourhood_sizes = [5, 10, 20, 50, 100, 200]
for n in neighbourhood_sizes:
    umap_plot(data, "features", "genre", title=f"UMAP@{n}", n_neighbors=n)
    tsne_plot(data, "features", "genre", title=f"t-SNE@{n}", perplexity=n)

The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "d:\programdata\anaconda3\envs\ai\lib\site-packages\umap\rp_tree.py", line 135:[0m
[1m@numba.njit(fastmath=True, nogil=True, parallel=True)
[1mdef euclidean_random_projection_split(data, indices, rng_state):
[0m[1m^[0m[0m
[0m
  state.func_ir.loc))
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "d:\programdata\anaconda3\envs\ai\lib\site-packages\umap\rp_tree.py", line 135:[0m
[1m@numba.njit(fastmath=True, nogil=True, parallel=True)
[1mdef euclidean_random_projection_split(data, indices, rng_state):
[0m[1m^[

UMAP took 17.19s


PCA: 109 Components
TSNE took 64.57s


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "d:\programdata\anaconda3\envs\ai\lib\site-packages\umap\nndescent.py", line 47:[0m
[1m    @numba.njit(parallel=True)
[1m    def nn_descent(
[0m    [1m^[0m[0m
[0m
  state.func_ir.loc))


UMAP took 21.60s


PCA: 109 Components
TSNE took 70.82s


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "d:\programdata\anaconda3\envs\ai\lib\site-packages\umap\nndescent.py", line 47:[0m
[1m    @numba.njit(parallel=True)
[1m    def nn_descent(
[0m    [1m^[0m[0m
[0m
  state.func_ir.loc))


UMAP took 38.98s


PCA: 109 Components
TSNE took 80.60s


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "d:\programdata\anaconda3\envs\ai\lib\site-packages\umap\nndescent.py", line 47:[0m
[1m    @numba.njit(parallel=True)
[1m    def nn_descent(
[0m    [1m^[0m[0m
[0m
  state.func_ir.loc))


UMAP took 80.78s


PCA: 109 Components
TSNE took 85.38s


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "d:\programdata\anaconda3\envs\ai\lib\site-packages\umap\nndescent.py", line 47:[0m
[1m    @numba.njit(parallel=True)
[1m    def nn_descent(
[0m    [1m^[0m[0m
[0m
  state.func_ir.loc))


UMAP took 92.99s


PCA: 109 Components
TSNE took 121.30s


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "d:\programdata\anaconda3\envs\ai\lib\site-packages\umap\nndescent.py", line 47:[0m
[1m    @numba.njit(parallel=True)
[1m    def nn_descent(
[0m    [1m^[0m[0m
[0m
  state.func_ir.loc))


UMAP took 101.92s


PCA: 109 Components
TSNE took 174.69s
