
Commit 37de32f
fix the computation of the baseline for cosine top pairs
Cecca committed May 24, 2023 · 1 parent db48d6a
Showing 2 changed files with 56 additions and 52 deletions.
11 changes: 6 additions & 5 deletions join-experiments/TopPairsCosine.cpp
@@ -121,7 +121,7 @@ int main(int argc, char** argv) {
               << " vectors from hdf5 file, of dimension "
               << dim << std::endl;
 
-    std::vector<std::vector<uint32_t>> out_res;
+    std::vector<std::vector<float>> out_res;
 
     if (exact) {
         std::vector<std::vector<Pair>> threads_res(omp_get_max_threads());
@@ -170,10 +170,11 @@ int main(int argc, char** argv) {
         for (auto v : data) { index.insert(v); }
         index.rebuild(false, false);
         auto pairs = index.global_lsh_join(k, 0.999);
-        for (auto entry : pairs.best_indices()) {
-            std::vector<uint32_t> vpair;
-            vpair.push_back(entry.first);
+        for (auto entry : pairs.best_entries()) {
+            std::vector<float> vpair;
             vpair.push_back(entry.second);
+            vpair.push_back(entry.first.first);
+            vpair.push_back(entry.first.second);
             out_res.push_back(vpair);
         }
 
@@ -184,7 +185,7 @@ int main(int argc, char** argv) {
     /* } */
     std::stringstream key;
     key << "top-" << k << "-pairs";
-    H5Easy::dump(file, key.str(), out_res);
+    H5Easy::dump(file, key.str(), out_res, H5Easy::DumpMode::Overwrite);
 
     return 0;
 }
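
Taken together, the three hunks above change the baseline's on-disk layout: `best_entries()` evidently yields ((i, j), similarity) entries, so each row of `out_res` becomes the triple [similarity, i, j] stored as floats (hence the switch from `uint32_t` to `float`), and `DumpMode::Overwrite` lets a recomputed baseline replace a stale one. A minimal read-back sketch in Python, assuming h5py is installed and using a hypothetical file path:

# Minimal sketch: read back the baseline dumped by TopPairsCosine.cpp.
# Assumptions: 'dataset.hdf5' is a hypothetical path, and the key matches
# the k the baseline was computed with (here 10000, as run.py uses below).
import h5py

with h5py.File('dataset.hdf5', 'r') as hfp:
    rows = hfp['top-10000-pairs'][:]  # shape (k, 3): [similarity, i, j]
    for sim, i, j in rows[:5]:
        # The indices come back as floats because the whole row is float.
        print(f"pair ({int(i)}, {int(j)}) has cosine similarity {sim:.4f}")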
97 changes: 50 additions & 47 deletions join-experiments/run.py
@@ -258,42 +258,45 @@ def compute_recalls(db):
     missing_recalls = db.execute("SELECT rowid, algorithm, params, dataset, k, output_file, hdf5_group FROM main WHERE recall IS NULL AND WORKLOAD = 'global-top-k';").fetchall()
     print("There are {} missing recalls for global-top-k".format(len(missing_recalls)))
     for rowid, algorithm, params, dataset, k, output_file, hdf5_group in missing_recalls:
-        if k > 1000:
+        if k > 10000:
             continue
 
         if 'sample' in dataset:
             continue
 
         # Compute the top-1000 distances for the dataset, if they are not already there
         dist_key, nn_key = '/top-1000-dists', '/top-1000-neighbors'
-        top_pairs_key = '/top-1000-pairs'
+        top_pairs_key = '/top-10000-pairs'
 
         dataset_path = DATASETS[dataset]()
         with h5py.File(dataset_path, 'r+') as hfp:
             if top_pairs_key not in hfp:
-                if dist_key not in hfp or nn_key not in hfp:
-                    print('Computing top distances for', dataset)
-                    distances, neighbors, avg_distance = compute_distances(1000, hfp['/train'], hfp.attrs['distance'])
-                    hfp[dist_key] = distances
-                    hfp[nn_key] = neighbors
-                    hfp['/average_distance'] = avg_distance
-
-                print('Computing top 1000 pairs for', dataset)
-                distances = hfp[dist_key]
-                neighbors = hfp[nn_key]
-                topk = []
-                for i, (dists, neighs) in tqdm(enumerate(zip(distances, neighbors)), total=neighbors.shape[0]):
-                    for d, j in zip(dists, neighs):
-                        if i != j:
-                            t = (d, min(i, j), max(i, j))
-                            if len(topk) > 2000:
-                                heapq.heappushpop(topk, t)
-                            else:
-                                heapq.heappush(topk, t)
-                topk = list(set(topk)) # remove duplicates
-                topk.sort(reverse=True)
-                topk = topk[:1000]
-                hfp[top_pairs_key] = topk
+                continue
+                # if dist_key not in hfp or nn_key not in hfp:
+                #     print('Computing top distances for', dataset)
+                #     distances, neighbors, avg_distance = compute_distances(1000, hfp['/train'], hfp.attrs['distance'])
+                #     hfp[dist_key] = distances
+                #     hfp[nn_key] = neighbors
+                #     hfp['/average_distance'] = avg_distance
+
+                # print('Computing top 1000 pairs for', dataset)
+                # distances = hfp[dist_key]
+                # neighbors = hfp[nn_key]
+                # topk = []
+                # for i, (dists, neighs) in tqdm(enumerate(zip(distances, neighbors)), total=neighbors.shape[0]):
+                #     for d, j in zip(dists, neighs):
+                #         if i != j:
+                #             t = (d, min(i, j), max(i, j))
+                #             if len(topk) > 2000:
+                #                 heapq.heappushpop(topk, t)
+                #             else:
+                #                 heapq.heappush(topk, t)
+                # topk = list(set(topk)) # remove duplicates
+                # topk.sort(reverse=True)
+                # topk = topk[:1000]
+                # hfp[top_pairs_key] = topk
 
+            assert top_pairs_key in hfp
+
             if hfp[top_pairs_key].shape[1] == 3:
                 baseline_pairs = set([(min(pair[0], pair[1]), max(pair[0], pair[1]))
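
The final context line above (cut off by the diff view) starts building `baseline_pairs` as a set of orientation-normalized index pairs. A minimal sketch of the recall computation this enables, under the assumption that each baseline row is [similarity, i, j]; the function and argument names are hypothetical:

# Hypothetical sketch: recall of an algorithm's reported pairs against
# the baseline, assuming baseline rows are laid out as [similarity, i, j].
def recall_against_baseline(baseline_rows, reported, k):
    # Normalize every pair to (min, max) so orientation does not matter.
    baseline_pairs = set(
        (min(int(r[1]), int(r[2])), max(int(r[1]), int(r[2])))
        for r in baseline_rows[:k]
    )
    reported_pairs = set((min(i, j), max(i, j)) for i, j in reported[:k])
    return len(baseline_pairs & reported_pairs) / k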
@@ -1727,7 +1730,7 @@ def insert_sizes():
         # run_multiple(index_params, query_params)
 
 
-    for dataset in ['glove-200']: #, 'DeepImage']:#, 'DBLP', 'Orkut']:
+    for dataset in ['glove-200', 'DeepImage']:#, 'DBLP', 'Orkut']:
         # ----------------------------------------------------------------------
         # Xiao et al. global top-k
         # if dataset in ['AOL', 'DBLP', "Orkut", "movielens-20M"]:
@@ -1769,32 +1772,32 @@ def insert_sizes():
         query_params = [
             {'k': k, 'recall': recall, 'method': 'LSHJoinGlobal'}
             for recall in [0.8, 0.9]
-            for k in [1, 10, 100, 1000]
+            for k in [1, 10, 100, 1000, 10000]
         ]
         run_multiple(index_params, query_params)
 
         # ----------------------------------------------------------------------
         # LSB-Tree global top-k
-        # for m in [8]:
-        #     for w in [0.1]:
-        #         index_params = {
-        #             'dataset': dataset,
-        #             'workload': 'global-top-k',
-        #             'algorithm': 'LSBTree',
-        #             'params': {
-        #                 'm': m,
-        #                 'w': w,
-        #             }
-        #         }
-        #         join_params = [
-        #             {
-        #                 'k': k,
-        #                 'min_leaves': min_leaves
-        #             }
-        #             for k in [100, 1000]
-        #             for min_leaves in [0, 2, 4, 8, 16, 32]
-        #         ]
-        #         run_multiple(index_params, join_params)
+        for m in [8]:
+            for w in [0.1]:
+                index_params = {
+                    'dataset': dataset,
+                    'workload': 'global-top-k',
+                    'algorithm': 'LSBTree',
+                    'params': {
+                        'm': m,
+                        'w': w,
+                    }
+                }
+                join_params = [
+                    {
+                        'k': k,
+                        'min_leaves': min_leaves
+                    }
+                    for k in [1, 10, 100, 1000, 10000]
+                    for min_leaves in [0, 2, 4, 8]
+                ]
+                run_multiple(index_params, join_params)
 
     for dataset in ['glove-200', 'DeepImage']:
         continue
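
The sweep enabled in the hunk above probes a single LSB-Tree index configuration (m = 8, w = 0.1) with the Cartesian product of the k and min_leaves values. A quick sanity check of the grid the comprehension generates:

# The join_params comprehension expands to a 5 x 4 grid of configurations.
join_params = [
    {'k': k, 'min_leaves': min_leaves}
    for k in [1, 10, 100, 1000, 10000]
    for min_leaves in [0, 2, 4, 8]
]
assert len(join_params) == 20  # one join run per (k, min_leaves) pair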
