In [117]:
import os
import numpy as np
import pandas as pd
import ast
from pandas.api.types import CategoricalDtype

In [118]:
def load(filepath):
    """Load a metadata CSV file into a DataFrame, parsed according to its name.

    The parsing rules are selected by looking for a keyword in the base name
    of ``filepath`` (the directory part is ignored). Keywords are tested in
    this order:

    * ``'features'`` or ``'echonest'``: first column is the index, the first
      three rows form a three-level column header.
    * ``'genres'``: plain CSV with a single header row and a default integer
      index.
    * ``'tracks'``: first column is the index, the first two rows form a
      two-level column header. After reading, several columns are converted:
      list-like strings are parsed into Python objects, date columns become
      datetimes, ``('set', 'subset')`` becomes an ordered categorical
      (small < medium < large) and a few text columns become categoricals.

    Parameters
    ----------
    filepath : str or os.PathLike
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Raises
    ------
    ValueError
        If the base name contains none of the recognised keywords. Without
        this check the function would fall through and return ``None``,
        and the mistake would only surface later as an unrelated error.
    """
    filename = os.path.basename(filepath)

    # Both file kinds share the same layout, so they share one branch.
    if 'features' in filename or 'echonest' in filename:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])

    if 'genres' in filename:
        return pd.read_csv(filepath, index_col=None)

    if 'tracks' in filename:
        tracks = pd.read_csv(filepath, index_col=0, header=[0, 1])

        # These columns hold Python literals serialised as text (e.g. "[21]");
        # ast.literal_eval turns them back into objects without executing code.
        LITERAL_COLUMNS = [('track', 'tags'), ('album', 'tags'),
                           ('artist', 'tags'), ('track', 'genres'),
                           ('track', 'genres_all')]
        for column in LITERAL_COLUMNS:
            tracks[column] = tracks[column].map(ast.literal_eval)

        DATE_COLUMNS = [('track', 'date_created'), ('track', 'date_recorded'),
                        ('album', 'date_created'), ('album', 'date_released'),
                        ('artist', 'date_created'),
                        ('artist', 'active_year_begin'),
                        ('artist', 'active_year_end')]
        for column in DATE_COLUMNS:
            tracks[column] = pd.to_datetime(tracks[column])

        # Ordered so that comparisons such as `subset <= 'medium'` are valid.
        SUBSETS = ('small', 'medium', 'large')
        tracks['set', 'subset'] = tracks['set', 'subset'].astype(
                CategoricalDtype(categories=SUBSETS, ordered=True))

        CATEGORY_COLUMNS = [('track', 'genre_top'), ('track', 'license'),
                            ('album', 'type'), ('album', 'information'),
                            ('artist', 'bio')]
        for column in CATEGORY_COLUMNS:
            tracks[column] = tracks[column].astype('category')

        return tracks

    raise ValueError(
        f"Cannot infer how to parse {filename!r}: the file name must contain "
        "one of 'features', 'echonest', 'genres' or 'tracks'.")

In [119]:
# Print every entry directly under /Users/macbookretina/ for which
# os.path.isfile is true; sub-directories are skipped.
for entry in filter(os.path.isfile, os.scandir('/Users/macbookretina/')):
    print(entry)

<DirEntry 'ngrok'>
<DirEntry '.DS_Store'>
<DirEntry 'blues.00042.wav'>
<DirEntry 'Atom.itermcolors.txt'>
<DirEntry '.CFUserTextEncoding'>
<DirEntry 'test'>
<DirEntry 'package_control'>
<DirEntry '.netrc'>
<DirEntry '.swp'>
<DirEntry '.swl'>
<DirEntry '.swk'>
<DirEntry '.psql_history'>
<DirEntry '.swj'>
<DirEntry '.swm'>
<DirEntry '.rnd'>
<DirEntry '.boto'>
<DirEntry '.npmrc'>
<DirEntry '.lesshst'>
<DirEntry 'sudoers'>
<DirEntry '.git-completion.bash'>
<DirEntry '.bash_profile.recovered'>
<DirEntry '.bash_profile.pysave'>
<DirEntry '.bash_profile.backup'>
<DirEntry '.node_repl_history'>
<DirEntry 'KiteOnboarding.py'>
<DirEntry 'pip'>
<DirEntry 'mac'>
<DirEntry '.gitignore'>
<DirEntry 'package-lock.json'>
<DirEntry 'package.json'>
<DirEntry 'logfile'>
<DirEntry '.git-prompt.sh'>
<DirEntry '.dbshell'>
<DirEntry 'test.txt'>
<DirEntry '.swo'>
<DirEntry '.swn'>
<DirEntry '.mysql_history'>
<DirEntry '.bash_profile'>
<DirEntry '.python_history'>
<DirEntry '.gitconfig'>
<DirEntry '.mongorc.js'>

In [164]:
# Read the tracks metadata. `load` recognises the 'tracks' keyword in the file
# name and applies its tracks-specific parsing and dtype conversions.
tracks = load('/Users/macbookretina/Downloads/fma_metadata_tracks.csv')

In [156]:
# Read the genres table. `load` recognises the 'genres' keyword in the file
# name and reads it as a plain CSV with a default integer index.
genres = load('/Users/macbookretina/Downloads/fma_metadata_genres.csv')

In [None]:
# genres = genres[['genre_id', 'title']]
# genres.loc[genres['top_level'].unique()].sort_values('#tracks', ascending=False)

In [169]:
# Keep only the ('set', 'subset') and ('track', 'genre_top') columns of the
# tracks table; the row index of `tracks` is carried over unchanged.
fma_full = tracks[[('set', 'subset'), ('track', 'genre_top')]]
# Bare last expression so the notebook renders the resulting frame.
fma_full

Unnamed: 0_level_0,set,track
Unnamed: 0_level_1,subset,genre_top
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2
2,small,Hip-Hop
3,medium,Hip-Hop
5,small,Hip-Hop
10,small,Pop
20,large,
...,...,...
155316,large,Rock
155317,large,Rock
155318,large,Rock
155319,large,Rock


In [166]:
# Share of each genre_top value over all rows of fma_full. normalize=True
# returns proportions instead of counts; rows with a missing genre_top are
# left out by value_counts' default dropna=True.
fma_full[('track', 'genre_top')].value_counts(normalize=True)

Rock                   0.285939
Experimental           0.213880
Electronic             0.188959
Hip-Hop                0.071616
Folk                   0.056514
Pop                    0.047018
Instrumental           0.041917
International          0.028005
Classical              0.024799
Jazz                   0.011513
Old-Time / Historic    0.011170
Spoken                 0.008529
Country                0.003911
Soul-RnB               0.003528
Blues                  0.002218
Easy Listening         0.000484
Name: (track, genre_top), dtype: float64

In [167]:
# Build a boolean mask marking the rows whose subset label is 'small', then
# use it to slice out the matching tracks (index = track id, plus genre).
small_subset = fma_full[('set', 'subset')].eq('small')
fma_small = fma_full.loc[small_subset]
fma_small

Unnamed: 0_level_0,set,track
Unnamed: 0_level_1,subset,genre_top
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2
2,small,Hip-Hop
5,small,Hip-Hop
10,small,Pop
140,small,Folk
141,small,Folk
...,...,...
154308,small,Hip-Hop
154309,small,Hip-Hop
154413,small,Pop
154414,small,Pop


In [168]:
# Share of each genre_top value within fma_small. genre_top is a categorical
# column, so categories with no rows in this slice are still listed, with a
# proportion of 0.
fma_small[('track', 'genre_top')].value_counts(normalize=True)

Rock                   0.125
Pop                    0.125
International          0.125
Instrumental           0.125
Hip-Hop                0.125
Folk                   0.125
Experimental           0.125
Electronic             0.125
Spoken                 0.000
Soul-RnB               0.000
Old-Time / Historic    0.000
Jazz                   0.000
Easy Listening         0.000
Country                0.000
Classical              0.000
Blues                  0.000
Name: (track, genre_top), dtype: float64

In [153]:
# for index, row in fma_small.iterrows():
#     track_genre_id = row[('track', 'genres')][0]
#     if track_genre_id in genres['genre_id']:
#         fma_small.at[index, ('track', 'genres')] = genres['title'][track_genre_id - 1]

In [None]:
# fma_small[('track', 'genres')].value_counts(normalize=True)