In [15]:
import os
import math
from collections import namedtuple
from multiprocessing import Pool
import json

import numpy as np
import matplotlib.pyplot as plt

from catboost import CatBoostRegressor

In [16]:
# Notebook-wide configuration; the cells below read these names as globals.
DIMENSION = 10  # length of every item / query vector
QUERY_COUNT = 1000  # number of queries in each of the train and test query sets
ITEM_COUNT = 10 ** 6  # number of items in the base set
# Length of every stored ground-truth top list; also passed to the search binary as --gtTop.
# NOTE(review): the remark below mentions 1000 neighbors while the value is 10000 — confirm which is intended.
GT_TOP_LEN = 10000 # maximal value since hnsw doesn't give more than 1000 neighbors
RECALL_TOP_LEN = 5  # recall is measured against the true top-5 of each query

MIN_EF_SEARCH = 1  # smallest efSearch value probed by search_recall
MAX_EF_SEARCH = 10000  # largest efSearch value probed by search_recall
HIDDEN_DIMENSIONS = list(range(4))  # how many trailing vector coordinates are dropped per experiment: 0..3
RECALL_OF_INTEREST = 0.95  # default target recall for get_evals_for_recall / search_recall
N_SEARCH_THREADS = 8  # number of search commands run in parallel
REGENERATE_DATA = False  # True: rebuild data, graphs, models and cached statistics instead of reusing files
DATASET = "synthetic"  # sub-directory of data/ used by the disorder-statistics cells


In [3]:
# Shell command template for building a graph with the RPG binary (`--mode base`).
# Fill it with str.format(featuresSize=..., features=..., graphPath=...,
# relevanceVector=..., metric=...).
build_graph_cmd_template = " ".join([
    "./RPG --mode base",
    "--baseSize 1000000",
    "--trainQueries {featuresSize}",
    "--base data/synthetic/data/{features}.bin",
    "--outputGraph {graphPath}",
    "--relevanceVector {relevanceVector}",
    "--efConstruction 1000 --M 8",
    "--metric {metric}",
])



In [4]:
# Shell command template for querying a graph with the RPG binary (`--mode query`).
# GT_TOP_LEN is baked in right here; the remaining {placeholders} (scores, inputGraph,
# efSearch, topK, searchResultFile, gtFile) are filled later with str.format.
# Note that ".txt" is appended to whatever is passed as searchResultFile.
search_cmd_template = " ".join([
    "./RPG --mode query --baseSize 1000000 --querySize 1000",
    "--query data/synthetic/data/model_scores/{scores} --inputGraph {inputGraph}",
    "--efSearch {efSearch} --topK {topK} --output data/synthetic/{searchResultFile}.txt",
    "--gtQueries 1000 --gtTop " + "{}".format(GT_TOP_LEN),
    "--groundtruth data/synthetic/data/model_scores/{gtFile}",
])

In [5]:
def normalize(a):
    """Scale every row of the 2-D array ``a`` to unit Euclidean length.

    Row norms are computed along axis 1 and kept as a column so the division
    broadcasts row-wise. A zero row yields NaNs (division by zero), exactly as
    the plain formula would.
    """
    squared = np.power(a, 2)
    row_norms = np.sqrt(squared.sum(axis=1, keepdims=True))
    return a / row_norms

def generate_or_read_data(file_name, shape):
    """Load a float32 array of the given ``shape`` from ``file_name``, creating it if absent.

    When the file exists it is read as raw float32 values and reshaped.
    Otherwise a matrix of row-normalised standard-normal samples is drawn,
    written to ``file_name`` in raw float32 form and returned.
    """
    if os.path.isfile(file_name):
        return np.fromfile(file_name, dtype="float32").reshape(shape)

    fresh = normalize(np.random.randn(*shape)).astype("float32")
    fresh.tofile(file_name)
    return fresh

In [6]:
def sort_by(results, key):
    """Reorder all lists of ``results`` in place so that ``results[key]`` is ascending.

    ``results`` maps stat names to parallel lists. The permutation that sorts
    ``results[key]`` is applied to every list, the dict itself is mutated and
    also returned. ``key`` must be "evals" or "efSearch".
    """
    assert key in ["evals", "efSearch"]
    order = np.argsort(results[key])
    for name, column in list(results.items()):
        results[name] = [column[idx] for idx in order]
    return results


assert sort_by(
    {"efSearch": [2, 0, 3, 1], "vals": [1, 2, 3, 4]},
    "efSearch",
) == {"efSearch": [0, 1, 2, 3], "vals": [2, 4, 1, 3]}

In [7]:
def get_evals_for_recall(results, recall=RECALL_OF_INTEREST):
    """Bracket the cost at which the target ``recall`` is reached.

    ``results`` is a dict of parallel lists with at least "evals" and "recall"
    and optionally "efSearch". It is sorted by "evals" in place (via sort_by).

    Returns ``((lower_evals, upper_evals), (lower_ef, upper_ef))`` where

    * the lower values come from the last point of the longest sorted prefix
      whose recall is <= ``recall`` (defaults: 0 and MIN_EF_SEARCH),
    * the upper values come from the first point of the longest sorted suffix
      whose recall is >= ``recall`` (defaults: math.inf and MAX_EF_SEARCH).

    When "efSearch" is absent the evals values double as the ef values.
    The same nested shape is returned for empty input, so callers can always
    unpack both pairs.
    """
    results = sort_by(results, "evals")
    evals = results["evals"]
    recalls = results["recall"]
    ef_search = results["efSearch"] if "efSearch" in results else evals
    assert len(evals) == len(recalls)

    lower_bound = 0
    lower_bound_ef = MIN_EF_SEARCH
    upper_bound = math.inf
    upper_bound_ef = MAX_EF_SEARCH

    if not evals:
        # Previously a flat (lower, upper) tuple was returned here, which broke
        # every caller that unpacks the documented pair of pairs.
        return (lower_bound, upper_bound), (lower_bound_ef, upper_bound_ef)

    # Walk forward while the recall stays at or below the target.
    for cost, ef, rec in zip(evals, ef_search, recalls):
        if not (rec <= recall):
            break
        lower_bound, lower_bound_ef = cost, ef

    # Walk backward while the recall stays at or above the target.
    for cost, ef, rec in zip(reversed(evals), reversed(ef_search), reversed(recalls)):
        if not (rec >= recall):
            break
        upper_bound, upper_bound_ef = cost, ef

    return (lower_bound, upper_bound), (lower_bound_ef, upper_bound_ef)

assert get_evals_for_recall({
    "evals": [0, 1, 2, 3, 4],
    "recall": [0.93, 0.94, 0.95, 0.96, 0.97]
}) == ((2, 2), (2, 2))
assert get_evals_for_recall({
    "evals": [0, 1, 2, 3, 4],
    "recall": [0.93, 0.94, 0.955, 0.96, 0.97]
}) == ((1, 2), (1, 2))
assert get_evals_for_recall({
    "evals": [0, 1, 2, 3, 4],
    "recall": [0.93, 0.96, 0.94, 0.96, 0.97]
}) == ((0, 3), (0, 3))
# Empty input keeps the nested return shape.
assert get_evals_for_recall({
    "evals": [],
    "recall": []
}) == ((0, math.inf), (MIN_EF_SEARCH, MAX_EF_SEARCH))


In [8]:
def merge_results(lhs_res, rhs_res):
    """Concatenate two result dicts key by key, ``lhs_res`` values first.

    ``rhs_res`` may be empty; otherwise both dicts must have the same key set.
    Values are copied into fresh lists, so neither input is modified.
    """
    if rhs_res:
        assert sorted(lhs_res.keys()) == sorted(rhs_res.keys())
    return {
        name: list(values) + list(rhs_res.get(name, []))
        for name, values in lhs_res.items()
    }


In [9]:
def logspace(start, stop, count, include_ends=True):
    """Return sorted, de-duplicated integers spaced evenly on a log scale.

    Parameters:
        start, stop: positive bounds of the range.
        count: number of grid points requested (duplicates produced by the
            integer conversion are merged, so fewer values may be returned).
        include_ends: when False, ``count + 2`` points are generated and the
            first and last of the de-duplicated values are dropped.

    Returns:
        1-D integer numpy array.

    Raises:
        ValueError: if ``start`` or ``stop`` is not positive (their logarithm
            is undefined).
    """
    if start <= 0 or stop <= 0:
        raise ValueError(
            "logspace bounds must be positive, got start={} stop={}".format(start, stop)
        )
    point_count = count if include_ends else count + 2
    grid = np.exp(np.linspace(np.log(start), np.log(stop), point_count))
    # Round to the nearest integer instead of truncating: exp(log(x)) may land a
    # hair below x, and truncation would then turn e.g. 1000 into 999.
    seq = np.unique(np.rint(grid).astype("int"))
    if include_ends:
        return seq
    return seq[1:-1]

In [10]:
def bench_cmd(cmd):
    """Run a shell command and scrape benchmark statistics from its stdout.

    Every output line is matched against a fixed set of prefixes; for a match
    the text after the prefix, minus a fixed number of trailing characters, is
    parsed as a float. The trailing count is 1 for most stats (the newline)
    and 3 for "time" (presumably a two-character unit plus the newline).

    Returns a ``(stats, output)`` tuple: a dict with any of the keys
    "relevance", "recall", "time", "evals" that were found, and the full
    captured stdout as one string.
    """
    # (stat name, line prefix, number of trailing characters to drop)
    stat_patterns = (
        ("relevance", "Average relevance: ", 1),
        ("recall", "Recall@5: ", 1),
        ("time", "Average query time: ", 3),
        ("evals", "Average number of model computations: ", 1),
    )
    stats = {}
    captured_lines = []
    with os.popen(cmd) as stream:
        for raw_line in stream:
            captured_lines.append(raw_line)
            for stat_name, prefix, tail_len in stat_patterns:
                if not raw_line.startswith(prefix):
                    continue
                stats[stat_name] = float(raw_line[len(prefix):-tail_len])
    return stats, "".join(captured_lines)

def run_search(graph_path, scores_file, ef_ticks, topK=5,
               result_file=None, n_threads=8, verbose=True,
               gt_file="groundtruth_test.bin"
              ):
    """Run one search command per efSearch value in parallel and collect the stats.

    Parameters:
        graph_path: graph file passed as --inputGraph.
        scores_file: score file name substituted into the --query path.
        ef_ticks: efSearch values to run; one command per value.
        topK: value passed as --topK.
        result_file: base name of the output file. When given explicitly,
            exactly one efSearch value is allowed.
        n_threads: number of worker processes.
        verbose: print the captured output of runs whose stats could not be parsed.
        gt_file: ground-truth file name substituted into the --groundtruth path.

    Returns:
        dict of parallel lists with keys "relevance", "recall", "time",
        "evals" and "efSearch". Runs with incomplete stats are left out of
        every list, so the lists always have equal length.
    """
    if result_file is None:
        # NOTE(review): search_cmd_template appends ".txt" itself, so this default
        # makes the binary write data/synthetic/result.txt.txt, and all parallel
        # runs share that one output file.
        result_file = "result.txt"
    else:
        assert len(ef_ticks) == 1

    commands = [
        search_cmd_template.format(
            scores=scores_file,
            inputGraph=graph_path,
            efSearch=ef,
            topK=topK,
            searchResultFile=result_file,
            gtFile=gt_file
        )
        for ef in ef_ticks
    ]

    # The context manager releases the worker processes once map() has returned.
    with Pool(processes=n_threads) as pool:
        results = pool.map(bench_cmd, commands)

    stat_keys = ("relevance", "recall", "time", "evals")
    output = {"relevance": [], "recall": [], "time": [], "evals": [], "efSearch": []}
    for ef, (res, cmd_out) in zip(ef_ticks, results):
        if all(key in res for key in stat_keys):
            for key in stat_keys:
                output[key].append(res[key])
            # Record efSearch only for successful runs so it stays aligned with
            # the stats; previously failed runs left this list longer than the rest.
            output["efSearch"].append(ef)
        elif verbose:
            print("missed result for {} efSearch {}.".format(graph_path, ef))
            print(cmd_out)
    return output

In [11]:
def search_recall(graph_path, scores_file,
                  iterations=2, runs_per_iteration=None,
                  recall_of_interest=RECALL_OF_INTEREST, n_threads=8,
                  gt_file="groundtruth_test.bin"
                 ):
    """Iteratively refine the efSearch range around the target recall.

    The first round probes a log-spaced grid over [MIN_EF_SEARCH, MAX_EF_SEARCH]
    including both ends. Every later round probes a log-spaced grid strictly
    inside the efSearch bracket reported by get_evals_for_recall for all results
    gathered so far. ``runs_per_iteration`` defaults to ``n_threads``.

    Returns the accumulated result dict sorted by "efSearch".
    """
    runs = n_threads if runs_per_iteration is None else runs_per_iteration

    ef_low, ef_high = MIN_EF_SEARCH, MAX_EF_SEARCH
    accumulated = {}
    for round_idx in range(iterations):
        is_first_round = (round_idx == 0)
        ticks = logspace(ef_low, ef_high, runs, include_ends=is_first_round)
        fresh = run_search(
            graph_path, scores_file,
            ef_ticks=ticks, n_threads=n_threads, gt_file=gt_file,
        )
        accumulated = merge_results(fresh, accumulated)
        # Only the efSearch bracket is needed to pick the next grid.
        _, (ef_low, ef_high) = get_evals_for_recall(
            accumulated, recall=recall_of_interest)
    return sort_by(accumulated, "efSearch")

In [12]:
def read_txt(file_name, expected_shape=None):
    """Read a whitespace-separated integer matrix from a text file.

    Parameters:
        file_name: path of the text file; one matrix row per line.
        expected_shape: optional ``(rows, columns)`` the data must match.

    Returns:
        list of rows, each a list of ints. An empty file yields ``[]``
        (treated as shape ``(0, 0)``).

    Raises:
        AssertionError: if rows differ in length or the shape check fails.
        ValueError: if a token is not an integer.
    """
    with open(file_name) as fin:
        data = [[int(token) for token in line.split()] for line in fin]

    # An empty file has no first row to measure; treat it as zero columns
    # instead of failing with an IndexError.
    row_len = len(data[0]) if data else 0
    assert all(len(row) == row_len for row in data), \
        "rows of {} differ in length".format(file_name)
    if expected_shape is not None:
        actual_shape = (len(data), row_len)
        assert tuple(expected_shape) == actual_shape, \
            "expected shape {}, got {}".format(tuple(expected_shape), actual_shape)
    return data


def calc_eval_recall_curve(approximate_top, gt_top):
    """Recall of the true top items as a function of the candidate-list prefix length.

    ``gt_top`` must have shape (QUERY_COUNT, RECALL_TOP_LEN). ``approximate_top``
    holds one candidate list per query; all lists are read up to the length of
    the first one. For each prefix length k the function counts, over all
    queries, how many of the first k candidates belong to that query's true top,
    and divides by QUERY_COUNT * RECALL_TOP_LEN.

    Returns ``{"evals": [1, ..., depth], "recall": [...]}``.
    """
    assert gt_top.shape == (QUERY_COUNT, RECALL_TOP_LEN)

    relevant_sets = [set(row) for row in gt_top]
    depth = len(approximate_top[0])
    denominator = QUERY_COUNT * RECALL_TOP_LEN

    recalls = []
    hits_so_far = 0
    for position in range(depth):
        # Hits contributed by the candidates at this position across all queries.
        hits_so_far += sum(
            1
            for query_id in range(QUERY_COUNT)
            if approximate_top[query_id][position] in relevant_sets[query_id]
        )
        recalls.append(hits_so_far / denominator)

    return {"evals": list(range(1, depth + 1)), "recall": recalls}
    

In [13]:
# test calc_eval_recall_curve

# full_test_top = np.vstack([
#     np.arange(100) for i in range(QUERY_COUNT)
# ])
# test_gt = full_test_top[:,:RECALL_TOP_LEN]
# approx_top = full_test_top.copy()
# for row in approx_top:
#     np.random.shuffle(row[:80])
# test_curve = calc_eval_recall_curve(approx_top, test_gt)
# plt.plot(test_curve["evals"], test_curve["recall"])


In [16]:
def sample_dataset(items, queries, samples_per_query=1000):
    """Build a regression dataset of (item, query) pairs labelled with their dot product.

    In each of ``samples_per_query`` rounds, QUERY_COUNT distinct item indices
    are drawn at random and paired with the queries in order, so every query
    gets one random item per round.

    Returns ``(X, y)``: ``X`` stacks the rows ``[item features, query features]``
    of all rounds, ``y`` holds the matching dot products.
    """
    assert items.shape == (ITEM_COUNT, DIMENSION)
    assert queries.shape == (QUERY_COUNT, DIMENSION)

    feature_blocks, target_blocks = [], []
    for _ in range(samples_per_query):
        picked = items[np.random.choice(ITEM_COUNT, QUERY_COUNT, replace=False)]
        feature_blocks.append(np.hstack((picked, queries)))
        target_blocks.append((picked * queries).sum(axis=1))
    return np.vstack(feature_blocks), np.hstack(target_blocks)

# test sample dataset

# X, y = sample_dataset(items, train_queries, 1)

# assert X.shape == (QUERY_COUNT, 2 * DIMENSION)
# assert y.shape == (QUERY_COUNT,)

# dot_prod = 0
# for i in range(DIMENSION):
#     dot_prod += X[0][i] * X[0][DIMENSION + i]
# assert np.allclose([dot_prod], [y[0]])

In [17]:
def make_predictions(items, queries, model, out_file):
    """Score every (item, query) pair with ``model`` and dump the scores to ``out_file``.

    For each item a feature matrix with one row per query is built (item
    features followed by query features) and passed to ``model.predict``.
    The float32 predictions are stacked with one row per item and written to
    ``out_file`` as raw binary.
    """
    assert queries.shape == (QUERY_COUNT, DIMENSION)
    assert items.shape == (ITEM_COUNT, DIMENSION)

    def score_item(item):
        # Pair the same item with every query.
        repeated_item = np.tile(item, (QUERY_COUNT, 1))
        pair_features = np.hstack((repeated_item, queries))
        return model.predict(pair_features).astype("float32")

    np.vstack([score_item(item) for item in items]).tofile(out_file)
    

In [18]:
# Rebuild every synthetic input file from scratch. Runs only when REGENERATE_DATA is True.
# NOTE(review): `items`, `train_queries` and `test_queries` are created only inside this
# branch, and no random seed is set in the visible cells, so each regeneration differs.
if REGENERATE_DATA:
    print("Generate synthetic queries and items") 
    # Unit-length Gaussian vectors, stored as raw float32.
    train_queries = normalize(np.random.randn(QUERY_COUNT, DIMENSION)).astype("float32")
    train_queries.tofile("data/synthetic/data/train_queries.bin")

    test_queries = normalize(np.random.randn(QUERY_COUNT, DIMENSION)).astype("float32")
    test_queries.tofile("data/synthetic/data/test_queries.bin")
    
    items = normalize(np.random.randn(ITEM_COUNT, DIMENSION)).astype("float32")
    items.tofile("data/synthetic/data/items.bin")

    # The message says "test", but both train and test score matrices are written below.
    print("compute ground truth test scores")
    # Exact dot products, shape (ITEM_COUNT, QUERY_COUNT); deleted right after saving.
    gt_train_scores = items.dot(train_queries.T)
    gt_train_scores.tofile("data/synthetic/data/model_scores/gt_train_scores.bin")
    del gt_train_scores

    gt_test_scores = items.dot(test_queries.T)
    gt_test_scores.tofile("data/synthetic/data/model_scores/gt_test_scores.bin")
    del gt_test_scores
    
    print("compute test scores for models with hidden dimensions") 
    # A "hidden" model scores with only the first DIMENSION - hidden_dim_count coordinates.
    for hidden_dim_count in HIDDEN_DIMENSIONS:
        hidden_model_test_scores = items[:,:DIMENSION - hidden_dim_count].dot(
            test_queries.T[:DIMENSION - hidden_dim_count]
        )
        hidden_model_test_scores.tofile(
            "data/synthetic/data/model_scores/hidden_{}_test_scores.bin".format(
                hidden_dim_count
            )
        )
        del hidden_model_test_scores

    print("Calc ground truth nearest neighbors")
    for data_part in ["train", "test"]:
        scores_path = "data/synthetic/data/model_scores/gt_{}_scores.bin".format(data_part)
        scores = np.fromfile(scores_path, dtype="float32").reshape(
            (ITEM_COUNT, QUERY_COUNT))
        # Per query: indices of the GT_TOP_LEN highest-scoring items, best first,
        # stored as int32 with shape (QUERY_COUNT, GT_TOP_LEN).
        gt = (-scores).argsort(axis=0)[:GT_TOP_LEN,:].T.astype("int32")
        gt.tofile("data/synthetic/data/model_scores/groundtruth_{}.bin".format(data_part))
        del scores
        del gt

    

In [19]:
# Result dicts keyed by experiment label; each value holds parallel "evals"/"recall" lists
# (plus more stats for search runs). They are filled by the experiment cells and read by plot_results.
hidden_search_gt_results = {}
hidden_search_rerank_results = {}
relevance_search_gt_results = {}

# One (lower_evals, upper_evals) bracket per hidden-dimension setting, from get_evals_for_recall.
hidden_search_gt_evals = []
hidden_search_rerank_evals = []

In [None]:

# For every number of hidden dimensions: build (or reuse) a graph, then measure
# (a) searching it directly with ground-truth scores and
# (b) retrieving GT_TOP_LEN candidates with the hidden-model scores and checking how
#     deep in that list the true top-RECALL_TOP_LEN items appear.
for hidden_dim_count in HIDDEN_DIMENSIONS:
    graph_path = "data/synthetic/hidden_{}_of_{}_graph.out".format(
        hidden_dim_count, DIMENSION
    )
    if not os.path.isfile(graph_path) or REGENERATE_DATA:
        build_cmd = build_graph_cmd_template.format(
            featuresSize=DIMENSION,
            graphPath=graph_path,
            relevanceVector=DIMENSION - hidden_dim_count,
            features="items",
            metric="dot_product" # should be equivalent to l2
        )
        print(build_cmd)
        # NOTE(review): the exit status of the build command is not checked.
        os.system(build_cmd)
    
    # (a) search guided by the ground-truth scores
    label = "hidden_{}_search_gt".format(hidden_dim_count)
    hidden_search_gt_results[label] = search_recall(
        graph_path, "gt_test_scores.bin",
        iterations=3, n_threads=N_SEARCH_THREADS,
        runs_per_iteration=16
    )
    (lower_evals, upper_evals), _ = get_evals_for_recall(hidden_search_gt_results[label])
    hidden_search_gt_evals.append((lower_evals, upper_evals))
    
    # (b) single search with the hidden-model scores, returning GT_TOP_LEN candidates
    label = "hidden_{}_search_rerank".format(hidden_dim_count)
    search_result_file = "search_result_hidden_{}".format(hidden_dim_count)
    run_search(
        graph_path,
        "hidden_{}_test_scores.bin".format(hidden_dim_count),
        topK=GT_TOP_LEN,
        ef_ticks=[GT_TOP_LEN],
        result_file=search_result_file,
        n_threads=1,
        verbose=False
    )
    # The parsed stats are discarded; only the candidate lists written to the result file are used.
    approximate_top = read_txt(
        "data/synthetic/{}.txt".format(search_result_file),
        (QUERY_COUNT, GT_TOP_LEN)
    )
    # NOTE(review): the ground truth does not depend on the loop variable but is reloaded every iteration.
    gt = np.fromfile(
        "data/synthetic/data/model_scores/groundtruth_test.bin",
        dtype="int32"
    ).reshape((QUERY_COUNT, GT_TOP_LEN))
    gt = gt[:,:RECALL_TOP_LEN]
    result = calc_eval_recall_curve(approximate_top, gt)
    hidden_search_rerank_results[label] = result
    (lower_evals, upper_evals), _ = get_evals_for_recall(result)
    hidden_search_rerank_evals.append((lower_evals, upper_evals))
    

In [None]:
# Build (or reuse) a graph whose item representation is the item's row of ground-truth
# scores against the QUERY_COUNT train queries, compared with the l2 metric.
relevance_graph_path = "data/synthetic/rel_{}_graph.out".format(QUERY_COUNT)
if not os.path.isfile(relevance_graph_path) or REGENERATE_DATA:
    build_cmd = build_graph_cmd_template.format(
        featuresSize=QUERY_COUNT,
        graphPath=relevance_graph_path,
        relevanceVector=QUERY_COUNT,
        features="model_scores/gt_train_scores",
        metric="l2"
    )
    print(build_cmd)
    # NOTE(review): the exit status of the build command is not checked.
    os.system(build_cmd)

In [None]:
# Search the relevance-vector graph with the ground-truth test scores,
# refining the efSearch grid over three rounds of 16 runs.
label = "relevance_search_gt"
relevance_search_gt_results[label] = search_recall(
    relevance_graph_path, "gt_test_scores.bin",
    iterations=3, n_threads=N_SEARCH_THREADS,
    runs_per_iteration=16
)

In [None]:
# Same candidate-list evaluation as for the hidden-dimension graphs, but searching the
# relevance-vector graph with the hidden-model scores.
for hidden_dim_count in HIDDEN_DIMENSIONS:
    # With zero hidden dimensions the hidden model is the full dot product, so it is skipped.
    if hidden_dim_count == 0:
        continue
    label = "relevance_search_hidden_{}".format(hidden_dim_count)
    search_result_file = "search_result_rel_hidden_{}".format(hidden_dim_count)
    run_search(
        relevance_graph_path,
        "hidden_{}_test_scores.bin".format(hidden_dim_count),
        topK=GT_TOP_LEN,
        ef_ticks=[GT_TOP_LEN],
        result_file=search_result_file,
        n_threads=1,
        verbose=False
    )
    # Candidate lists written by the search binary, one row per query.
    approximate_top = read_txt(
        "data/synthetic/{}.txt".format(search_result_file),
        (QUERY_COUNT, GT_TOP_LEN)
    )
    # NOTE(review): the ground truth does not depend on the loop variable but is reloaded every iteration.
    gt = np.fromfile(
        "data/synthetic/data/model_scores/groundtruth_test.bin",
        dtype="int32"
    ).reshape((QUERY_COUNT, GT_TOP_LEN))
    gt = gt[:,:RECALL_TOP_LEN]
    result = calc_eval_recall_curve(approximate_top, gt)
    relevance_search_gt_results[label] = result


In [None]:
# Train (or load) the CatBoost regressor that approximates the item-query dot product.
# Its predictions are later used as relevance vectors for graph construction.

model_path = "data/synthetic/catboost_model.cbm"
if REGENERATE_DATA or not os.path.isfile(model_path):
    # `items` and `train_queries` exist in the kernel only if the regeneration cell
    # ran with REGENERATE_DATA=True. Reload them from the files it writes so this
    # cell also works on a fresh kernel when only the model file is missing.
    items = np.fromfile(
        "data/synthetic/data/items.bin", dtype="float32"
    ).reshape((ITEM_COUNT, DIMENSION))
    train_queries = np.fromfile(
        "data/synthetic/data/train_queries.bin", dtype="float32"
    ).reshape((QUERY_COUNT, DIMENSION))

    X, y = sample_dataset(items, train_queries, samples_per_query=10000)
    x_val, y_val = sample_dataset(items, train_queries, samples_per_query=1000)

    model = CatBoostRegressor(iterations=1000, learning_rate=0.1)
    model.fit(X, y, eval_set=(x_val, y_val), plot=True, verbose=False)
    model.save_model(model_path)
else:
    model = CatBoostRegressor()
    model.load_model(model_path)


In [None]:
# Score every (item, query) pair with the CatBoost model for both query sets.
# NOTE(review): this runs only when REGENERATE_DATA is True; otherwise the score files
# read by later cells must already exist on disk.
if REGENERATE_DATA:
    make_predictions(items, train_queries, model, "data/synthetic/data/model_scores/catboost_scores_train.bin")
    make_predictions(items, test_queries, model, "data/synthetic/data/model_scores/catboost_scores_test.bin")

In [None]:
# Build the ground-truth top lists ranked by CatBoost scores instead of true dot products.

catboost_gt_path = "data/synthetic/data/model_scores/groundtruth_catboost.bin"
if not os.path.isfile(catboost_gt_path) or REGENERATE_DATA:
    scores_path = "data/synthetic/data/model_scores/catboost_scores_test.bin"
    scores = np.fromfile(scores_path, dtype="float32").reshape(
        (ITEM_COUNT, QUERY_COUNT))
    # Per query: indices of the GT_TOP_LEN highest-scoring items, best first,
    # stored as int32 with shape (QUERY_COUNT, GT_TOP_LEN).
    gt = (-scores).argsort(axis=0)[:GT_TOP_LEN,:].T.astype("int32")
    gt.tofile(catboost_gt_path)
    del gt
    del scores


In [None]:
# Build (or reuse) a graph on the CatBoost train-score vectors, then search it with the
# CatBoost test scores. gt_file is left at its default, so recall is measured against
# groundtruth_test.bin.
graph_path = "data/synthetic/catboost_rel_{}_graph.out".format(QUERY_COUNT)
if not os.path.isfile(graph_path) or REGENERATE_DATA:
    build_cmd = build_graph_cmd_template.format(
        featuresSize=QUERY_COUNT,
        graphPath=graph_path,
        relevanceVector=QUERY_COUNT,
        features="model_scores/catboost_scores_train",
        metric="l2"
    )
    print(build_cmd)
    # NOTE(review): the exit status of the build command is not checked.
    os.system(build_cmd)

label = "catboost_search_catboost"
relevance_search_gt_results[label] = search_recall(
    graph_path, "catboost_scores_test.bin",
    iterations=3, n_threads=N_SEARCH_THREADS,
    runs_per_iteration=16
)

In [None]:
# Same search as "catboost_search_catboost", but recall is measured against the
# CatBoost-ranked ground truth. Relies on `graph_path` set by the previous cell.
label = "catboost_rank_catboost"
relevance_search_gt_results[label] = search_recall(
    graph_path, "catboost_scores_test.bin",
    iterations=3, n_threads=N_SEARCH_THREADS,
    runs_per_iteration=16, gt_file="groundtruth_catboost.bin"
)


In [None]:
def plot_results(*results, x_lim=None, y_lim=None, x_log_scale=False, keys=None):
    """Draw recall-vs-evals curves for the given result groups on the current figure.

    Parameters:
        *results: dicts mapping a label to stats with "evals" and "recall" lists.
        x_lim, y_lim: optional axis limits.
        x_log_scale: use a logarithmic x axis.
        keys: optional collection of labels to draw; all labels are drawn when None.

    Each curve is drawn as a line plus scatter markers. Curves with 100 or more
    points get much smaller markers. The figure is shown at the end.
    """
    plt.xlabel("evals")
    plt.ylabel("recall@5")
    if x_lim is not None:
        plt.xlim(x_lim)
    if y_lim is not None:
        plt.ylim(y_lim)
    if x_log_scale:
        plt.xscale('log')

    for result_group in results:
        for label, stats in result_group.items():
            if keys is not None and label not in keys:
                continue
            evals, recalls = stats["evals"], stats["recall"]
            plt.plot(evals, recalls, label=label)
            marker_size = 0.1 if len(evals) >= 100 else 10
            plt.scatter(evals, recalls, s=marker_size)

    plt.legend()
    plt.show()



In [None]:
# Recall@5 vs. candidate-list depth for the hidden-model candidate lists.
plt.figure(figsize=(10, 10))
plot_results(
    hidden_search_rerank_results
)

In [None]:
# Recall@5 vs. evals when the hidden-dimension graphs are searched with ground-truth scores.
plt.figure(figsize=(10, 10))
plot_results(
    hidden_search_gt_results
)

In [None]:
# Compare the relevance-vector graph with the hidden-1 and hidden-2 graphs
# (direct search and candidate-list curves); the commented-out labels are excluded.
plt.figure(figsize=(10, 10))
plot_results(
    hidden_search_gt_results,
    hidden_search_rerank_results,
    relevance_search_gt_results,
    keys=[
        "relevance_search_gt",
        "hidden_1_search_rerank", "hidden_1_search_gt",
        "hidden_2_search_rerank", "hidden_2_search_gt",
#         "hidden_3_search_rerank", "hidden_3_search_gt",
#         "hidden_0_search_gt",
#         "catboost_search_catboost", "catboost_rank_catboost",
#         "relevance_search_hidden_1", "relevance_search_hidden_2"
    ],
    x_lim=(0, 10000)
)

In [None]:
# Compare the exact relevance-vector graph with the two CatBoost curves, up to 1000 evals.
plt.figure(figsize=(10, 10))
plot_results(
    hidden_search_gt_results,
    hidden_search_rerank_results,
    relevance_search_gt_results,
    keys=[
        "relevance_search_gt",
#         "hidden_0_search_gt",
        "catboost_search_catboost", "catboost_rank_catboost",
    ],
    x_lim=(0, 1000)
)

In [None]:
# Zoom into recall 0.8-1.01 for the relevance-vector graph vs. the graph with no hidden dimensions.
plt.figure(figsize=(10, 10))
plot_results(
    hidden_search_gt_results,
    hidden_search_rerank_results,
    relevance_search_gt_results,
    keys=[
        "relevance_search_gt", "hidden_0_search_gt", "hidden_0_search_rerank"
    ],
    x_lim=(0, 1000),
    y_lim=(0.8, 1.01)
)

In [8]:
import pandas as pd
from collections import OrderedDict
from time import time
from sklearn.metrics import pairwise_distances

# Quantiles reported by estimate_disorder and used as column headers by make_table (1 = maximum).
QUANTILES = [0.9, 0.99, 0.999, 1]

def estimate_disorder(items, scores, gt_top=100, queries_to_sample=100, verbose=False):
    """Estimate how item-to-item proximity ranks compare with relevance ranks.

    ``scores`` has one row per item and one column per query. A random subset of
    ``queries_to_sample`` queries is drawn; for each, the ``gt_top`` items with
    the highest scores are taken, best first. For every pair of positions
    ``closer < farther`` in that list, the rank of the ``closer`` item among all
    items ordered by distance from the ``farther`` item is divided by
    ``2 + farther + closer`` (the sum of the two 1-based positions).

    Returns the QUANTILES of all collected ratios. With ``verbose`` the average
    processing time per query is printed after each query.
    """
    _, query_count = scores.shape
    sampled = np.random.choice(query_count, queries_to_sample, replace=False)
    # One row per sampled query: item indices by descending score.
    top_items = np.argsort(-scores[:, sampled], axis=0)[:gt_top].T

    ratios = []
    started_at = time()
    for done, query_top in enumerate(top_items, start=1):
        dists = pairwise_distances(items[query_top], items)
        # Double argsort turns distances into ranks within each row.
        ranks = dists.argsort().argsort()
        ratios.extend(
            ranks[farther][query_top[closer]] / (2 + farther + closer)
            for farther in range(1, gt_top)
            for closer in range(farther)
        )
        if verbose:
            print("{} queries processed, avg time: {}".format(
                done, (time() - started_at) / done
            ))
    return np.quantile(ratios, QUANTILES)

def make_table(disoreder_stats : OrderedDict):
    """Render disorder statistics as a styled table.

    ``disoreder_stats`` maps a row label to a sequence with one value per entry
    of QUANTILES. Returns a pandas Styler with one row per label, one column per
    quantile, and values formatted to one decimal place.
    """
    for stats in disoreder_stats.values():
        assert len(stats) == len(QUANTILES)

    frame = pd.DataFrame(
        data=list(disoreder_stats.values()),
        columns=[str(q) for q in QUANTILES],
        index=list(disoreder_stats.keys()),
    )
    return frame.style.format("{:.1f}")


In [17]:
# Compute the disorder statistics once and cache them as JSON; later runs just load the cache.
disorder_stats_path = "data/{}/data/disorder_statistics.json".format(DATASET)


if not os.path.isfile(disorder_stats_path) or REGENERATE_DATA:
    # NOTE(review): train_queries and test_queries are loaded here but not used below in this cell.
    train_queries = np.fromfile(
        "data/{}/data/train_queries.bin".format(DATASET),
        dtype="float32"
    ).reshape((QUERY_COUNT, DIMENSION))
    test_queries = np.fromfile(
        "data/{}/data/test_queries.bin".format(DATASET),
        dtype="float32"
    ).reshape((QUERY_COUNT, DIMENSION))
    items = np.fromfile(
        "data/{}/data/items.bin".format(DATASET),
        dtype="float32"
    ).reshape((ITEM_COUNT, DIMENSION))

    disorder_stats = OrderedDict()
    # Relevance vectors: each item is represented by its scores against the train queries.
    item_features = np.fromfile(
        "data/{}/data/model_scores/gt_train_scores.bin".format(DATASET),
        dtype="float32"
    ).reshape(ITEM_COUNT, QUERY_COUNT)
    # Ground-truth test scores define which items count as the top for each query.
    relevance_scores = np.fromfile(
        "data/{}/data/model_scores/gt_test_scores.bin".format(DATASET),
        dtype="float32"
    ).reshape(ITEM_COUNT, QUERY_COUNT)

    disorder_stats["relevance_proximity"] = estimate_disorder(
        item_features, relevance_scores,
        queries_to_sample=100
    )
    del item_features

    # Item proximity measured on the first DIMENSION - hidden_dim_count coordinates only.
    for hidden_dim_count in HIDDEN_DIMENSIONS:
        open_dims = DIMENSION - hidden_dim_count
        disorder_stats["hidden_{}".format(hidden_dim_count)] = estimate_disorder(
            items[:,:open_dims], relevance_scores
        )
    del relevance_scores

    # Convert numpy values to plain floats so the statistics can be written as JSON.
    for k in disorder_stats:
        disorder_stats[k] = [float(num) for num in disorder_stats[k]]
    with open(disorder_stats_path, "w") as fout:
        json.dump(disorder_stats, fout, indent=4)
else:
    # The cached statistics come back as a plain dict rather than an OrderedDict.
    with open(disorder_stats_path) as fin:
        disorder_stats = json.load(fin)

make_table(disorder_stats)

Unnamed: 0,0.9,0.99,0.999,1
relevance_proximity,34.9,75.3,109.7,240.2
hidden_0,33.8,71.7,104.0,297.8
hidden_1,61.7,142.4,224.2,684.8
hidden_2,118.4,305.4,548.3,2025.4
hidden_3,209.0,565.2,1074.4,3524.5


In [10]:
# Display the disorder statistics table again (same data as the cell above).
make_table(disorder_stats)

Unnamed: 0,0.9,0.99,0.999,1
relevance_proximity,34.9,75.3,109.7,240.2
hidden_0,33.8,71.7,104.0,297.8
hidden_1,61.7,142.4,224.2,684.8
hidden_2,118.4,305.4,548.3,2025.4
hidden_3,209.0,565.2,1074.4,3524.5


In [13]:
import json

# Same path as the one defined in the disorder-statistics cell; repeated here so this cell can run on its own.
disorder_stats_path = "data/{}/data/disorder_statistics.json".format(DATASET)
