In [11]:
import pandas as pd
import altair as alt
import numpy as np
from pedroai.io import read_json
from dyna_to_pyirt import load_gold_labels
from dataset import Dataset
from openTSNE import TSNE
# Lift Altair's row cap so charts can be built from the full item table
# (the table assembled in this notebook has far more rows than a small demo frame).
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [35]:
# Assemble one row per item from the fitted parameter file, joined with the
# item's source-dataset label and its accuracy computed from the response data.
dataset_labels, item_to_dataset = load_gold_labels()
params = read_json('data/multidim-sentiment/best_parameters.json')
dataset = Dataset.from_jsonlines('data/dynaboard-sentiment-pyirt.jsonlines')
item_accuracies = dataset.get_item_accuracies()

# The id maps are keyed by strings in the parameter file; convert the keys to
# ints so they can be looked up with the positional index used below.
ix_to_item_id = {int(k): v for k, v in params["item_ids"].items()}
ix_to_subject_id = {int(k): v for k, v in params["subject_ids"].items()}

n_items = len(params['diff'])
# "lambdas" is optional in the parameter file; decide once instead of per item.
has_lambdas = "lambdas" in params

rows = []
for ix in range(n_items):
    item_id = ix_to_item_id[ix]
    diff = params["diff"][ix]
    disc = params["disc"][ix]
    # Missing lambdas are recorded as NaN so the column always exists.
    lambda_ = params["lambdas"][ix] if has_lambdas else np.nan
    # Use a separate name for the label: rebinding `dataset` here would replace
    # the Dataset object loaded above with a plain string.
    item_dataset = item_to_dataset[item_id]
    rows.append(
        {
            "item_id": item_id,
            "diff": diff,
            "disc": disc,
            "lambda": lambda_,
            "dataset": item_dataset,
            "accuracy": item_accuracies[item_id].accuracy,
        }
    )
df = pd.DataFrame(rows)

In [36]:
# Display the assembled per-item table (pandas elides the middle rows).
df

Unnamed: 0,item_id,diff,disc,lambda,dataset,accuracy
0,12582,"[0.8394210934638977, 0.18465390801429749, 1.24...","[-0.24157419800758362, 0.3771539032459259, 0.3...",,yelp-review-dev,1.000000
1,r1-0096404,"[0.7920363545417786, -0.07290685921907425, -0....","[4.005786895751953, 0.9029595851898193, 2.3023...",,dynasent-r1-dev,0.428571
2,r2-0018776,"[-0.2748778164386749, -0.8539808988571167, 0.7...","[-4.723825454711914, -1.1047857999801636, 6.67...",,dynasent-r2-dev,0.285714
3,7502,"[0.8478358387947083, 6.9632134437561035, -0.76...","[-0.32669663429260254, -9.164567947387695, 1.9...",,yelp-review-dev,0.714286
4,24816,"[-0.024600956588983536, -1.1159800291061401, -...","[6.698476314544678, 0.8175703883171082, 0.0901...",,yelp-review-dev,0.857143
...,...,...,...,...,...,...
24615,44589,"[-0.22778929769992828, -0.7066130638122559, 0....","[-0.12588612735271454, 0.82444167137146, -0.64...",,amazon-review-dev,0.000000
24616,17051,"[0.10348467528820038, -0.4711759388446808, 0.4...","[7.5216827392578125, -0.21907258033752441, 0.5...",,yelp-review-dev,0.857143
24617,r1-0095170,"[0.6404703855514526, 0.229706272482872, 0.4406...","[7.257632255554199, 0.11377299576997757, -0.83...",,dynasent-r1-dev,0.857143
24618,265218,"[-0.28755494952201843, -0.7860469222068787, -0...","[0.46400704979896545, -0.7773410677909851, 0.5...",,amazon-review-dev,0.000000


In [37]:
# Per-item dataset labels, in the same row order as the table.
dataset = df["dataset"].tolist()
# Stack every item's `diff` vector into a single array, one row per item.
x_array = np.asarray(df["diff"].tolist())

In [40]:
# Check the matrix dimensions: one row per item, one column per `diff` component.
x_array.shape

(24620, 5)

In [43]:
# Fit t-SNE on the per-item `diff` vectors.
embeddings = TSNE(
    # Embedding behaviour: cosine distance between vectors, PCA starting layout.
    metric='cosine',
    initialization='pca',
    perplexity=200,
    # Fixed seed so repeated runs use the same random state.
    random_state=42,
    # Execution settings: worker count and progress logging.
    n_jobs=7,
    verbose=True,
).fit(x_array)

--------------------------------------------------------------------------------
TSNE(metric='cosine', n_jobs=7, perplexity=200, random_state=42, verbose=True)
--------------------------------------------------------------------------------
===> Finding 600 nearest neighbors using Annoy approximate search using cosine distance...


In [None]:
# Put the embedding coordinates next to each item's id and dataset label.
# The coordinate columns become tsne_0, tsne_1, ... by prefixing the default
# integer column labels; the extra columns are aligned on the row index.
tsne_df = (
    pd.DataFrame(embeddings)
    .add_prefix("tsne_")
    .assign(item_id=df["item_id"], dataset=df["dataset"])
)

# Scatter plot of the first two embedding dimensions, coloured by dataset.
chart = (
    alt.Chart(tsne_df)
    .mark_point(opacity=.5, size=8, filled=False)
    .encode(
        x=alt.X("tsne_0", title="TSNE Dimension 0"),
        y=alt.Y("tsne_1", title="TSNE Dimension 1"),
        color=alt.Color("dataset", title="Dataset"),
    )
    .properties(width=1000, height=1000)
)
chart