In [1]:
!pip install gensim emblaze



In [2]:
import numpy as np
import matplotlib.pyplot as plt
import math
from gensim.models import KeyedVectors
import pandas as pd
import emblaze
from emblaze.utils import Field, ProjectionTechnique
from collections import defaultdict, namedtuple

In [3]:
# Path to the embedding file in word2vec format; later cells derive the
# companion CSV path from `fname`, so keep the name stable.
fname = '../data/pymde_3d_18n.w2v'

# Read the vectors as a text-format word2vec file (binary=False is the default,
# spelled out here for clarity).
wv = KeyedVectors.load_word2vec_format(fname, binary=False)

In [4]:
# The mapping from embedding row index to score ID lives next to the embedding
# file, under the same name with a `.csv` suffix appended.
mapping_fname = fname + '.csv'
mapping_df = pd.read_csv(mapping_fname)

# Key the `score_id` column by the `index` column, then materialize as a dict.
score_id_series = mapping_df.set_index('index')['score_id']
score_id_by_ix = score_id_series.to_dict()

In [5]:
# Set of the values in the `score` column, used later for membership tests
# against the score IDs of the embedded points.
my_scores_df = pd.read_csv('../data/my_score_ids.csv')
my_score_ids = set(my_scores_df['score'].values)

In [6]:
# Per-score metadata table; each row carries at least a `score_id` column.
score_metadata = pd.read_csv('../data/score_metadata.csv')

# Lookup from score_id to the full row (a namedtuple produced by itertuples).
# If a score_id occurs more than once, the last row wins.
score_metadata_by_id = {
    record.score_id: record
    for record in score_metadata.itertuples()
}

# Show one entry to sanity-check the available fields.
next(iter(score_metadata_by_id.values()))

Pandas(Index=0, score_id='1000052_DTHR', avg_pp=97.87299, num_users=3)

In [7]:
# Beatmap metadata table; each row carries at least a `beatmap_id` column.
beatmaps_df = pd.read_parquet('../data/beatmaps.parquet')

# Lookup from beatmap_id to the full row (a namedtuple produced by itertuples).
# If a beatmap_id occurs more than once, the last row wins.
beatmaps_by_id = {
    record.beatmap_id: record
    for record in beatmaps_df.itertuples()
}

# Show one entry to sanity-check the available fields.
next(iter(beatmaps_by_id.values()))

Pandas(Index=0, id=1.0, beatmapset_id=40071, beatmap_id=127363, approved=1, approved_date=Timestamp('2012-01-15 21:13:09'), last_update=Timestamp('2012-01-15 21:07:48'), total_length=233, hit_length=193, version='Posthumous', artist='DJ Okawari', title='Luv Letter', creator='nold_1702', bpm=95, source='', difficultyrating=5.31721, diff_size=4, diff_overall=8, diff_approach=9, diff_drain=6, mode=0)

In [32]:
# --- Per-point fields for the emblaze embedding -----------------------------
# Every list below has one entry per row of `wv.vectors`, in row order.
positions = wv.vectors
n_points = len(positions)

# Score ID of each row, and the matching metadata row, resolved once and
# reused by the color / radius / alpha computations.
names = [score_id_by_ix[ix] for ix in range(n_points)]
point_meta = [score_metadata_by_id[name] for name in names]

# Color: average pp of the score, clamped to the range [150, 330].
pp_floor, pp_ceiling = 150, 330
colors = [max(pp_floor, min(meta.avg_pp, pp_ceiling)) for meta in point_meta]
# Alternative colorings kept for quick switching:
# colors = [int(score_id_by_ix[index].split('_')[0]) for index in range(len(wv.vectors))]
# score ids are in the format `{id}_{mods}`, and we color by whether the score has DT, HT, or neither
# colors = [int('DT' in score_id_by_ix[index]) + 2 * int('HT' in score_id_by_ix[index]) for index in range(len(wv.vectors))]
# colors = [1 if score_id_by_ix[index] in my_score_ids else 0 for index in range(len(wv.vectors))]
# colors = [beatmaps_by_id[int(score_id_by_ix[index].split('_')[0])].last_update.year for index in range(len(wv.vectors))]
# colors = [min(beatmaps_by_id[int(score_id_by_ix[index].split('_')[0])].total_length, 60*5) for index in range(len(wv.vectors))]

# Radius: grows with log(1 + num_users), raised to 1.8 and scaled by 0.05.
sizes = [
    (np.log(1 + meta.num_users) ** 1.8) * 0.05
    for meta in point_meta
]

# Alpha: points with avg_pp <= 100 are fully transparent; of the rest, scores
# in `my_score_ids` are opaque and all others are drawn at 0.2.
alphas = [
    (1 if name in my_score_ids else 0.2) if meta.avg_pp > 100 else 0
    for name, meta in zip(names, point_meta)
]
# Alternative alphas kept for quick switching:
# alphas = [1 if score_id_by_ix[index] in my_score_ids else 0.1 for index in range(len(wv.vectors))]
# alphas = [1 if 'DT' in score_id_by_ix[index] else 0 for index in range(len(wv.vectors))]

# Assemble the embedding and compute its euclidean nearest neighbors.
fields = {
    Field.POSITION: positions,
    Field.NAME: names,
    Field.COLOR: colors,
    Field.RADIUS: sizes,
    Field.ALPHA: alphas,
}
emb = emblaze.Embedding(fields, n_neighbors=20)
emb.compute_neighbors(metric='euclidean')

In [30]:
# Build the high-dimensional embedding that every projection below starts from.
# `positions`, `names`, `colors`, `sizes` and `alphas` come from an earlier cell.
# The embedding is rebuilt here so this cell always projects from the
# high-dimensional data, even if `emb` has been rebound since it was first made.
# (The vectors are not reloaded: `wv` is not used in this cell.)
emb = emblaze.Embedding({Field.POSITION: positions, Field.NAME: names, Field.COLOR: colors, Field.RADIUS: sizes, Field.ALPHA: alphas}, n_neighbors=20,)
emb.compute_neighbors(metric='euclidean')

# UMAP settings shared by all variants; only n_neighbors differs between them.
umap_params = {
    'densmap': False,
    'dens_lambda': 1,
    'min_dist': 0.7,
    'metric': 'euclidean',
}

# One projection per n_neighbors value, in this order.
umap_n_neighbors = [50, 80, 100, 200, 400, 600]

variants = emblaze.EmbeddingSet([
    emb.project(method=ProjectionTechnique.UMAP, n_neighbors=n_neighbors, **umap_params)
    for n_neighbors in umap_n_neighbors
])
# variants = emblaze.EmbeddingSet([
#     emb.project(method=ProjectionTechnique.TSNE, perplexity=22, learning_rate='auto', init='pca', early_exaggeration=1.3),
#     emb.project(method=ProjectionTechnique.TSNE, perplexity=25, learning_rate='auto', init='pca', early_exaggeration=1.3),
#     emb.project(method=ProjectionTechnique.TSNE, perplexity=30, learning_rate='auto', init='pca', early_exaggeration=1.3),
#     emb.project(method=ProjectionTechnique.TSNE, perplexity=35, learning_rate='auto', init='pca', early_exaggeration=1.3),
#     emb.project(method=ProjectionTechnique.TSNE, perplexity=40, learning_rate='auto', init='pca', early_exaggeration=1.3),
# ])

In [34]:
# Text thumbnails labelled with each point's score ID.
thumbnails = emblaze.TextThumbnails(names)

# Apply the current name / color / radius / alpha lists to every projected
# variant, so changes to those lists show up without re-running the projections.
# The loop variable is deliberately NOT called `emb`: reusing that name would
# overwrite the module-level high-dimensional embedding with the last variant.
for variant in variants.embeddings:
    variant.set_field(Field.NAME, names)
    variant.set_field(Field.COLOR, colors)
    variant.set_field(Field.RADIUS, sizes)
    variant.set_field(Field.ALPHA, alphas)

# Interactive viewer over all variants; the bare `w` on the last line renders it.
w = emblaze.Viewer(embeddings=variants, thumbnails=thumbnails)
w.colorScheme = 'viridis'
w

Viewer(colorScheme='viridis', data={'data': [{'_format': 'compressed', '_idtype': 'u2', '_length': 26736, 'ids…