In [8]:
import os
from collections import namedtuple

import numpy as np


In [2]:
# Number of coordinates in every synthetic query and item vector.
DIMENSION = 10
# Number of vectors generated for each of the train and test query sets.
QUERY_COUNT = 1000
# Number of synthetic item vectors.
ITEM_COUNT = 10 ** 6
# How many top-scoring item ids per query are stored as ground truth.
gt_top = 100
# When True the data files are regenerated and the graphs are rebuilt even
# if a graph file already exists.
REGENERATE_DATA = True
# Counts of trailing vector coordinates that are hidden from a model.
HIDDEN_DIMENSIONS = list(range(0, 8))


In [3]:
def normalize(a):
    """Scale each row of a 2-D array to unit Euclidean length.

    Args:
        a: 2-D array-like whose rows are the vectors to rescale.

    Returns:
        Array of the same shape with every row divided by its L2 norm.
        Rows with zero norm are not special-cased: they are divided by zero.
    """
    squared = np.power(a, 2)
    # keepdims=True keeps a column of norms so it broadcasts across each row.
    row_norms = np.sqrt(np.sum(squared, axis=1, keepdims=True))
    return np.divide(a, row_norms)

def generate_or_read_data(file_name, shape):
    """Return a float32 array of `shape`, backed by the raw file `file_name`.

    If the file already exists, its contents are read as float32 and reshaped
    to `shape`. Otherwise row-normalized Gaussian data is generated, written
    to `file_name`, and returned.

    Args:
        file_name: path of the raw binary file to read or create.
        shape: target array shape; generation passes it through `normalize`,
            which reduces over axis 1.

    Returns:
        The loaded or freshly generated float32 array.
    """
    if os.path.isfile(file_name):
        return np.fromfile(file_name, dtype="float32").reshape(shape)

    fresh = normalize(np.random.randn(*shape)).astype("float32")
    fresh.tofile(file_name)
    return fresh

In [4]:
if REGENERATE_DATA:
    # Rebuild every synthetic artifact: unit-norm queries and items, the
    # item-by-query score matrices, and the ground-truth top lists.
    # Create the output tree first so the first tofile() does not fail on a
    # fresh checkout; this also creates the parent data/synthetic/data.
    os.makedirs("data/synthetic/data/model_scores", exist_ok=True)

    print("Generate synthetic queries and items")
    train_queries = normalize(np.random.randn(QUERY_COUNT, DIMENSION)).astype("float32")
    train_queries.tofile("data/synthetic/data/train_queries.bin")

    test_queries = normalize(np.random.randn(QUERY_COUNT, DIMENSION)).astype("float32")
    test_queries.tofile("data/synthetic/data/test_queries.bin")

    items = normalize(np.random.randn(ITEM_COUNT, DIMENSION)).astype("float32")
    items.tofile("data/synthetic/data/items.bin")

    print("compute ground truth test scores")
    # Full dot products; each matrix has shape (ITEM_COUNT, QUERY_COUNT).
    gt_train_scores = items.dot(train_queries.T)
    gt_train_scores.tofile("data/synthetic/data/model_scores/gt_train_scores.bin")
    del gt_train_scores  # release the large matrix before building the next

    gt_test_scores = items.dot(test_queries.T)
    gt_test_scores.tofile("data/synthetic/data/model_scores/gt_test_scores.bin")
    del gt_test_scores

    print("compute test scores for models with hidden dimensions")
    for hidden_dim_count in HIDDEN_DIMENSIONS:
        # Use an explicit positive bound: a `:-hidden_dim_count` slice is
        # `:0` (empty) when nothing is hidden, which would make the
        # hidden_0 scores all zeros instead of the full dot product.
        visible_dims = max(DIMENSION - hidden_dim_count, 0)
        hidden_model_test_scores = items[:, :visible_dims].dot(
            test_queries.T[:visible_dims]
        )
        hidden_model_test_scores.tofile(
            "data/synthetic/data/model_scores/hidden_{}_test_scores.bin".format(
                hidden_dim_count
            )
        )
        del hidden_model_test_scores

    print("Calc ground truth nearest neighbors")
    for data_part in ["train", "test"]:
        scores_path = "data/synthetic/data/model_scores/gt_{}_scores.bin".format(data_part)
        scores = np.fromfile(scores_path, dtype="float32").reshape(
            (ITEM_COUNT, QUERY_COUNT))
        # Negating makes argsort rank items from highest to lowest score;
        # keep the first gt_top item ids per query, one row per query after
        # the transpose.
        gt = (-scores).argsort(axis=0)[:gt_top,:].T.astype("int32")
        gt.tofile("data/synthetic/data/model_scores/groundtruth_{}.bin".format(data_part))
        
    

Generate synthetic queries and items
compute ground truth test scores
compute test scores for models with hidden dimensions
Calc ground truth nearest neighbors


In [5]:
# Shell command template for building one graph with the ./RPG binary.
# The {graphPath} and {relevanceVector} placeholders are left unformatted
# here and are filled in per graph with str.format by the caller.
# --baseSize follows ITEM_COUNT so it cannot drift from the generated data.
# NOTE(review): --trainQueries is passed DIMENSION, as in the original
# command; confirm this matches what the RPG CLI expects.
build_graph_cmd_template = " ".join([
    "./RPG --mode base",
    "--baseSize {}".format(ITEM_COUNT),
    "--trainQueries {}".format(DIMENSION),
    "--base data/synthetic/data/items.bin",
    "--outputGraph {graphPath}",
    "--relevanceVector {relevanceVector}",
    "--efConstruction 1000 --M 8",
    "--metric dot_product",
])

# Record type for one task. namedtuple takes the type name followed by a
# single sequence of field names; passing each field as its own positional
# argument raises TypeError.
Task = namedtuple("Task", ["label", "graph_path", "build_cmd", "eval_cmd_te"])




In [7]:
# Build one graph per hidden-dimension setting by shelling out to ./RPG.
for hidden_dim_count in HIDDEN_DIMENSIONS:
    graph_path = "data/synthetic/hidden_{}_of_{}_graph.out".format(
        hidden_dim_count, DIMENSION
    )
    # Reuse an existing graph file unless a full regeneration was requested.
    if os.path.isfile(graph_path) and not REGENERATE_DATA:
        continue
    build_cmd = build_graph_cmd_template.format(
        graphPath=graph_path,
        relevanceVector=DIMENSION - hidden_dim_count
    )
    print(build_cmd)
    exit_status = os.system(build_cmd)
    # Report a failed build instead of ignoring it; the loop still goes on
    # to the remaining graphs.
    if exit_status != 0:
        print("WARNING: build command exited with status {} for {}".format(
            exit_status, graph_path
        ))

./RPG --mode base --baseSize 1000000 --trainQueries 10 --base data/synthetic/data/items.bin --outputGraph data/synthetic/hidden_0_of_10_graph.out --relevanceVector 10 --efConstruction 1000 --M 8 --metric dot_product
./RPG --mode base --baseSize 1000000 --trainQueries 10 --base data/synthetic/data/items.bin --outputGraph data/synthetic/hidden_1_of_10_graph.out --relevanceVector 9 --efConstruction 1000 --M 8 --metric dot_product
./RPG --mode base --baseSize 1000000 --trainQueries 10 --base data/synthetic/data/items.bin --outputGraph data/synthetic/hidden_2_of_10_graph.out --relevanceVector 8 --efConstruction 1000 --M 8 --metric dot_product
./RPG --mode base --baseSize 1000000 --trainQueries 10 --base data/synthetic/data/items.bin --outputGraph data/synthetic/hidden_3_of_10_graph.out --relevanceVector 7 --efConstruction 1000 --M 8 --metric dot_product
./RPG --mode base --baseSize 1000000 --trainQueries 10 --base data/synthetic/data/items.bin --outputGraph data/synthetic/hidden_4_of_10_gra