In [1]:
import codecs
import pandas as pd

In [2]:
# Repository root, relative to the directory this notebook runs from.
root = '../..'
# Folder holding the input data files.
dataDir = '%s/data' % (root,)
# Folder holding the CSV output of the spotify2musicbrainz preprocessing step.
outDir = '%s/preprocessing/spotify2musicbrainz/out' % (root,)

In [3]:
def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` while showing a progress widget.

    A label and a progress bar are displayed in the notebook and refreshed
    on the first item and then on every ``every``-th item.

    Parameters
    ----------
    sequence : iterable
        Items to iterate over.
    every : int, optional
        Refresh interval in items. When the size is known and ``every`` is
        omitted, it defaults to 1 for up to 200 items and to ``size / 200``
        (about every 0.5%) otherwise. It must be given when the size is
        unknown.
    size : int, optional
        Total number of items; taken from ``len(sequence)`` when omitted.
    name : str
        Text shown in front of the counter.

    Yields
    ------
    The items of ``sequence``, unchanged and in order.

    Raises
    ------
    AssertionError
        If the size cannot be determined and ``every`` was not supplied.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # Work out the total, falling back to "unknown" for plain iterators.
    unknown_length = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            unknown_length = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        every = 1 if size <= 200 else int(size / 200)     # every 0.5%

    if unknown_length:
        # No total available: show a full bar and rely on the label only.
        bar = IntProgress(min=0, max=1, value=1)
        bar.bar_style = 'info'
    else:
        bar = IntProgress(min=0, max=size, value=0)
    caption = HTML()
    display(VBox(children=[caption, bar]))

    count = 0
    try:
        for count, item in enumerate(sequence, 1):
            refresh = count == 1 or count % every == 0
            if refresh and unknown_length:
                caption.value = '{name}: {index} / ?'.format(name=name, index=count)
            elif refresh:
                bar.value = count
                caption.value = u'{name}: {index} / {size}'.format(
                    name=name, index=count, size=size
                )
            yield item
    except:
        # Flag the bar as failed, then let the error propagate unchanged.
        bar.bar_style = 'danger'
        raise
    else:
        bar.bar_style = 'success'
        bar.value = count
        caption.value = "{name}: {index}".format(name=name, index=str(count or '?'))


In [4]:
def uniqueList(what):
    """Return the distinct URIs listed in ``<dataDir>/<what>_uri_seq.csv``.

    The file is read as UTF-8. Every line is stripped of surrounding
    whitespace and split on ``';'``; all resulting elements are collected
    and de-duplicated.

    Parameters
    ----------
    what : str
        Category name used to build the file name (the notebook calls this
        with 'album', 'artist' and 'track').

    Returns
    -------
    list
        The unique elements, in no particular order (they come from a set).
    """
    csv_file = '%s/%s_uri_seq.csv' % (dataDir, what)
    unique_uris = set()
    # `with` closes the file handle even if reading fails half-way; the
    # previous version left the handle open for the life of the kernel.
    with codecs.open(csv_file, 'r', 'utf-8') as handle:
        for line in log_progress(handle, every=1, name='reading file'):
            # Feed the set directly so the full list of lines is never held.
            unique_uris.update(line.strip().split(';'))
    return list(unique_uris)


In [5]:
def getWDdata(what):
    """Load ``<outDir>/<what>.csv`` into a pandas DataFrame.

    The first row of the file is used as the column header and no column
    is used as the index.

    Parameters
    ----------
    what : str
        Base name of the CSV file (without the ``.csv`` extension).

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    return pd.read_csv(
        '%s/%s.csv' % (outDir, what),
        header=0,
        index_col=False,
    )

In [6]:
def statsForCategory(what):
    """Print how many unique URIs of a category have linked identifiers.

    The unique URIs come from ``uniqueList(what)`` and the link table from
    ``getWDdata(what)``. The prefix ``'spotify:<what>:'`` is removed from
    each URI and the remainder is looked up in the table's ``spid`` column.
    A URI counts as linked when the ``mbid`` (respectively ``wdid``) value
    of its first matching row is a string, i.e. not a missing value.
    (Presumably these are MusicBrainz and Wikidata ids; this is inferred
    from the names only.)

    Parameters
    ----------
    what : str
        Category name, e.g. 'album', 'artist' or 'track'.

    Returns
    -------
    None
        Three summary lines are printed: the total and the two link counts
        with their rounded percentages. An empty URI list reports 0%
        instead of raising ``ZeroDivisionError``.
    """
    uniq_list = uniqueList(what)
    wd_data = getWDdata(what)

    # Index the table once instead of scanning the whole frame for every
    # URI. Only the first row per spid is considered, matching the earlier
    # `.values[0]` lookup.
    seen_spids = set()
    spids_with_mbid = set()
    spids_with_wdid = set()
    for spid, mbid, wdid in zip(wd_data['spid'], wd_data['mbid'], wd_data['wdid']):
        if spid in seen_spids:
            continue
        seen_spids.add(spid)
        if isinstance(mbid, str):
            spids_with_mbid.add(spid)
        if isinstance(wdid, str):
            spids_with_wdid.add(spid)

    total = float(len(uniq_list))
    with_mbid = 0.
    with_wdid = 0.

    prefix = 'spotify:%s:' % what
    for elem in log_progress(uniq_list, every=1, name='searching for links'):
        spid = elem.replace(prefix, '')
        if spid in spids_with_mbid:
            with_mbid += 1
        if spid in spids_with_wdid:
            with_wdid += 1

    def percent(count):
        # Guard against an empty URI list (total == 0).
        share = count / total * 100 if total else 0.
        return "{0:.0f}%".format(share)

    print('Total %s : %d' % (what, total))
    print('With MB id : %d (around %s)' % (with_mbid, percent(with_mbid)))
    print('With WD id : %d (around %s)' % (with_wdid, percent(with_wdid)))

In [7]:
# Print the link statistics for the 'album' category under an "Albums" heading.
print('Albums')
statsForCategory('album')

Albums


Total album : 735668
With MB id : 88 (around 0%)
With WD id : 135 (around 0%)


In [8]:
# Print the link statistics for the 'artist' category under an "Artists" heading.
print('Artists')
statsForCategory('artist')

Artists


Total artist : 296851
With MB id : 3982 (around 1%)
With WD id : 4137 (around 1%)


In [9]:
# Print the link statistics for the 'track' category under a "Tracks" heading.
print('Tracks')
statsForCategory('track')

Tracks


Total track : 2263256
With MB id : 66 (around 0%)
With WD id : 127 (around 0%)
