This notebook starts with the samples listed in `filenames.txt` and assumes that every sample has a `.json` file with the same name (differing only in its extension). It loads all of those `.json` files and saves them into a single large `.json` file, `metadata.json`. This is a good format for exporting, and also happens to be faster to load than `.pkl` in this case.

In [None]:
# Directory holding `filenames.txt`; the combined `metadata.json` is also written here.
data_root = 'data/drums'

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
from os.path import join
from utils import list_all_files
from multiprocessing import Pool
import json

In [None]:
def replace_extension(fn, new_extension):
    """Return `fn` with its file extension swapped for `new_extension`.

    Only the final path component is inspected, so dots in directory names
    are left untouched, and a name that has no extension gains one instead
    of being replaced wholesale.

    Args:
        fn: Path or filename, e.g. 'kick/sample.wav'.
        new_extension: Extension without the leading dot, e.g. 'json'.

    Returns:
        The path with the new extension, e.g. 'kick/sample.json'.
    """
    # Function-local import: the notebook only imports `join` from os.path.
    from os.path import splitext
    root, _ = splitext(fn)
    return root + '.' + new_extension
# Read the sample paths, one per line; the context manager closes the
# file handle as soon as the contents have been read.
with open(join(data_root, 'filenames.txt')) as f:
    files = f.read().splitlines()
# Point every entry at its .json file by swapping the extension.
files = [replace_extension(fn, 'json') for fn in files]
len(files)

In [None]:
def job(fn):
    """Load and return the parsed JSON content of the file at `fn`.

    The file is opened in binary mode so that `json.load` decodes the bytes
    itself instead of relying on the platform's default text encoding,
    which may not be able to decode non-ASCII metadata.

    Args:
        fn: Path to a JSON file.

    Returns:
        The deserialized JSON document.
    """
    with open(fn, 'rb') as f:
        return json.load(f)
p = Pool()
%time results = p.map(job, files)

In [None]:
# json.dump emits text (str), so the target must be opened in text mode;
# with 'wb' the write raises TypeError on Python 3.
with open(join(data_root, 'metadata.json'), 'w') as f:
    json.dump(results, f)

In [None]:
# Example: the 20 most frequent whitespace-separated tokens found in the
# 'description' field of all samples, compared case-insensitively.
from collections import Counter
descriptions = '\n'.join(sample['description'] for sample in results)
counter = Counter(descriptions.lower().split())
counter.most_common(20)

In [None]:
# Example: histogram of one numeric metadata field across all samples.
# Other numeric fields such as num_comments, avg_rating or bitrate can be
# substituted for 'num_downloads'.
measure = [sample['num_downloads'] for sample in results]
plt.hist(measure, bins=20)
# Use a logarithmic y axis for the bin counts.
plt.yscale('log')
plt.show()