In [1]:
from collections import OrderedDict
from copy import copy
import json
import os
from time import time
from dataclasses import dataclass

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
# Matrix dimensions used when reshaping the raw per-item files: items x queries.
ITEM_COUNT = 1_000_000
QUERY_COUNT = 1_000
# Number of ground-truth items stored per query.
GT_TOP = 100
# Positions inside the 1001-point quantile curve that get reported,
# and the matching human-readable percentile labels (same order).
CHOSEN_PERCENTILES = np.array((900, 950, 980, 990, 995, 999, 1000))
COLUMNS = [
    "90", "95", "98", "99",
    "99.5", "99.9", "100",
]


In [3]:
def calc_l2_distances(pt, points):
    """Return the squared Euclidean distance from `pt` to each row of `points`.

    No square root is taken, so the values are squared L2 distances.
    """
    offsets = points - pt
    return np.square(offsets).sum(axis=1)

def calc_l1_distances(pt, points):
    """Return the L1 (sum of absolute differences) distance from `pt` to each row of `points`."""
    offsets = np.subtract(points, pt)
    return np.absolute(offsets).sum(axis=1)

def get_sum_ord_calcer(sum_ord):
    """Build a distance function based on an order statistic of coordinate sums.

    The returned callable takes ``(pt, points)``, adds ``pt`` to every row of
    ``points`` and, for each row, returns the value that would sit at index
    ``sum_ord`` if that row were sorted in ascending order.
    """
    def calc_sum_ord_distances(pt, points):
        coordinate_sums = points + pt
        # Partial sort: only position `sum_ord` of each row is guaranteed to be final.
        partitioned = np.partition(coordinate_sums, sum_ord, axis=-1)
        return partitioned[:, sum_ord]

    return calc_sum_ord_distances

@dataclass
class HybridCalcer:
    """Squared-L2 distance capped by a rank-derived distance looked up in a table.

    Attributes:
        D: integer multiplier applied to the smallest per-coordinate rank sum
            before it is used as an index into `rank_to_dist`.
        rank_to_dist: 1-D lookup table; indices past its end are clamped to
            the last entry.
    """
    D: int
    rank_to_dist: np.ndarray

    def __call__(self, pt, points, pt_rank, pts_ranks):
        """Return, per row of `points`, min(squared L2 to `pt`, table-based cap)."""
        # Smallest coordinate-wise rank sum for each row, truncated to an integer.
        lookup_idx = (pt_rank + pts_ranks).min(axis=1)
        lookup_idx = lookup_idx.astype("int64")
        lookup_idx *= self.D

        # Clamp to the last valid table position before the lookup.
        last_idx = self.rank_to_dist.size - 1
        lookup_idx = np.minimum(lookup_idx, last_idx)
        caps = self.rank_to_dist[lookup_idx]

        squared_l2 = np.square(points - pt).sum(axis=1)
        return np.minimum(squared_l2, caps)

# Hand-checked example for the order-statistic metric.
pt = np.array([1, 2, 3])
pts = np.array([[4, 0, 1], [7, 6, 5], [5, 5, 0]])
# Row-wise sums pts + pt are [5, 2, 4], [8, 8, 8] and [6, 7, 3].
assert (get_sum_ord_calcer(1)(pt, pts) == np.array([4, 8, 6])).all()
assert (get_sum_ord_calcer(0)(pt, pts) == np.array([2, 8, 3])).all()



In [4]:
def estimate_disorder_curve(item_features, item_ranks, metric_calcer, close_items,
                            sample_count=100, seed=None):
    """Estimate the quantile curve of "disorder multipliers" for a metric.

    For each of `sample_count` sampled queries, a `first_item` is drawn from
    that query's row of `close_items`, and all items are ranked by their
    metric distance to `first_item`. For every (query, second_item) entry of
    `close_items` the multiplier is

        item_item_rank(first_item, second_item)
        / (item_ranks[first_item, query] + item_ranks[second_item, query])

    Args:
        item_features: 2-D array (items x queries) passed to `metric_calcer`.
        item_ranks: array with the same shape as `item_features`.
        metric_calcer: callable ``(pt, points) -> distances``; a
            `HybridCalcer` instance is additionally given the rank rows.
        close_items: 2-D integer array (queries x top) of item indices.
        sample_count: number of (query, first_item) pairs to sample.
        seed: optional seed for a private `np.random.RandomState`; when None
            the global `np.random` state is used, as before.

    Returns:
        1-D array with the 1001 quantiles (levels ``linspace(0, 1, 1001)``)
        of all collected multipliers.
    """
    # Shapes are validated against each other rather than against module
    # constants, so the function also works on small hand-made inputs.
    assert item_features.ndim == 2
    assert item_ranks.shape == item_features.shape
    query_count = item_features.shape[1]
    assert close_items.ndim == 2 and close_items.shape[0] == query_count
    gt_top = close_items.shape[1]

    rng = np.random if seed is None else np.random.RandomState(seed)

    # These do not depend on the sampled item, so compute them once.
    second_items = close_items.flatten()
    queries = np.repeat(np.arange(query_count), gt_top)
    second_items_ranks = item_ranks[second_items, queries]

    base_queries = rng.choice(query_count, sample_count)
    disorder_multipliers = []
    for query in base_queries:
        first_item = rng.choice(close_items[query])
        if isinstance(metric_calcer, HybridCalcer):
            distances = metric_calcer(
                item_features[first_item], item_features,
                item_ranks[first_item], item_ranks
            )
        else:
            distances = metric_calcer(item_features[first_item], item_features)

        # Rank of every item by distance: the inverse of the argsort
        # permutation (equivalent to argsort(argsort(...)) with one sort less).
        order = np.argsort(distances)
        item_item_ranks = np.empty_like(order)
        item_item_ranks[order] = np.arange(order.size)

        first_item_ranks = item_ranks[first_item, queries]
        disorder_multipliers.append(
            item_item_ranks[second_items] / (first_item_ranks + second_items_ranks)
        )
    disorder_multipliers = np.array(disorder_multipliers).flatten()
    return np.quantile(disorder_multipliers, np.linspace(0, 1, 1001))

In [5]:
def load_data_and_estimate_disorder(metric_type, feature_type, dataset, dataset_part,
                                    sample_count=100):
    """Load a cached disorder curve, or compute and cache it, then report it.

    The curve is cached as raw float32 at
    ``{dataset}/data/disorder_curves/{metric}_{feature}_{part}_s{sample_count}.bin``.
    On a cache miss the item features, item ranks and ground truth are read
    from ``{dataset}/data`` and passed to `estimate_disorder_curve`.

    Args:
        metric_type: "l2", "l1", "ord_<k>" or "hybrid_<D>".
        feature_type: "scores", "ranks" or "log_ranks".
        dataset: "video" or "collections"; used as the root directory.
        dataset_part: "train" or "test"; selects the ranks and ground truth.
        sample_count: forwarded to `estimate_disorder_curve`.

    Returns:
        ``(label, values)`` where label is "<metric>_<feature>_<part>" and
        values are the curve entries at `CHOSEN_PERCENTILES`.

    Raises:
        AssertionError: on an unknown argument value or a missing input file.
    """
    assert dataset in ["video", "collections"]
    assert dataset_part in ["train", "test"]
    assert feature_type in ["scores", "ranks", "log_ranks"]

    label = "_".join([metric_type, feature_type, dataset_part])
    curve_name = label + "_s{}".format(sample_count)
    result_path = "{}/data/disorder_curves/{}.bin".format(dataset, curve_name)
    if os.path.isfile(result_path):
        curve = np.fromfile(result_path, dtype="float32")
    else:
        if metric_type == "l2":
            metric_calcer = calc_l2_distances
        elif metric_type == "l1":
            metric_calcer = calc_l1_distances
        elif metric_type.startswith("ord_"):
            order = int(metric_type[len("ord_"):])
            metric_calcer = get_sum_ord_calcer(order)
        elif metric_type.startswith("hybrid_"):
            D = int(metric_type[len("hybrid_"):])
            rank_to_dist_path = "{}/data/rankToDist.bin".format(dataset)
            metric_calcer = HybridCalcer(D, np.fromfile(rank_to_dist_path, dtype="float32"))
        else:
            # Explicit raise (not `assert False`) so the check survives `python -O`.
            raise AssertionError("wrong metric type")

        if feature_type == "scores":
            item_features_path = "{}/data/model_scores/scores_train.bin".format(dataset)
        else:
            item_features_path = "{}/data/item_train_{}.bin".format(dataset, feature_type)

        item_ranks_path = "{}/data/item_{}_ranks.bin".format(dataset, dataset_part)

        if dataset_part == "test":
            ground_truth_path = "{}/data/model_scores/groundtruth.bin".format(dataset)
        else:
            ground_truth_path = "{}/data/model_scores/groundtruth_train.bin".format(dataset)

        assert os.path.isfile(item_features_path)
        assert os.path.isfile(item_ranks_path)
        assert os.path.isfile(ground_truth_path)

        item_features = np.fromfile(item_features_path, dtype="float32").reshape(ITEM_COUNT, QUERY_COUNT)
        item_ranks = np.fromfile(item_ranks_path, dtype="float32").reshape(ITEM_COUNT, QUERY_COUNT)
        close_items = np.fromfile(ground_truth_path, dtype="int32").reshape(QUERY_COUNT, GT_TOP)

        # Cast before use so a fresh run reports exactly the same (float32)
        # values as a later run that reads the cache.
        curve = estimate_disorder_curve(item_features, item_ranks, metric_calcer, close_items,
                                        sample_count=sample_count).astype("float32")
        # Make sure the cache directory exists; otherwise the expensive
        # computation above would be lost on a write error.
        os.makedirs(os.path.dirname(result_path), exist_ok=True)
        curve.tofile(result_path)

    print("{:>35}: ".format(curve_name) +
          " ".join("{:>10.2f}".format(curve[p]) for p in CHOSEN_PERCENTILES))
    return (
        label,
        [curve[p] for p in CHOSEN_PERCENTILES]
    )


In [9]:
# Experiment settings shared by every run below.
sample_count = 100
dataset = "collections"

# (metric_type, feature_type) combinations, in reporting order.
calc_list = [
    ("l2", "log_ranks"), ("l2", "scores"), ("l2", "ranks"),
    ("ord_0", "ranks"), ("ord_1", "ranks"), ("ord_2", "ranks"), ("ord_5", "ranks"),
    ("hybrid_5", "scores"), ("hybrid_10", "scores"), ("hybrid_20", "scores"),
    ("hybrid_50", "scores"), ("hybrid_100", "scores"),
]

# Results are appended one by one so that an interrupted run still leaves
# the curves finished so far in `results`.
results = []
for metric_type, feature_type in calc_list:
    for data_part in ("train", "test"):
        results.append(
            load_data_and_estimate_disorder(
                metric_type, feature_type, dataset, data_part, sample_count=sample_count
            )
        )


            l2_log_ranks_train_s100:      45.09      95.74     245.25     463.43     825.97    2760.91  249999.75
             l2_log_ranks_test_s100:      45.57     104.97     292.05     585.73    1110.10    4310.34  249998.75
               l2_scores_train_s100:      26.99      69.64     206.99     417.89     760.54    2438.41  302314.00
                l2_scores_test_s100:      34.22      92.65     279.54     553.21     982.17    3003.89  296528.34
                l2_ranks_train_s100:      17.88      50.55     161.50     337.64     636.16    2131.60  159480.00
                 l2_ranks_test_s100:      35.31      98.77     284.67     539.96     940.27    2768.65  248004.25
             ord_0_ranks_train_s100:      10.60      20.03      32.65      40.27      45.57      72.29      81.82
              ord_0_ranks_test_s100:      16.82      39.83     111.24     227.88     442.95    1624.05  249908.50
             ord_1_ranks_train_s100:       8.74      17.36      42.99      92.62     192

In [10]:
# Split the (label, stats) pairs into row labels and row values.
labels = [label for label, _ in results]
result_data = [stats for _, stats in results]
result_table = pd.DataFrame(result_data, index=labels, columns=COLUMNS)
# Render with one decimal place.
result_table.style.format("{:.1f}")

Unnamed: 0,90,95,98,99,99.5,99.9,100
l2_log_ranks_train,45.1,95.7,245.2,463.4,826.0,2760.9,249999.8
l2_log_ranks_test,45.6,105.0,292.1,585.7,1110.1,4310.3,249998.8
l2_scores_train,27.0,69.6,207.0,417.9,760.5,2438.4,302314.0
l2_scores_test,34.2,92.6,279.5,553.2,982.2,3003.9,296528.3
l2_ranks_train,17.9,50.6,161.5,337.6,636.2,2131.6,159480.0
l2_ranks_test,35.3,98.8,284.7,540.0,940.3,2768.6,248004.2
ord_0_ranks_train,10.6,20.0,32.6,40.3,45.6,72.3,81.8
ord_0_ranks_test,16.8,39.8,111.2,227.9,443.0,1624.1,249908.5
ord_1_ranks_train,8.7,17.4,43.0,92.6,192.6,855.8,76002.8
ord_1_ranks_test,10.6,27.6,88.3,197.2,400.5,1516.3,249893.2


In [None]:
# Deliberately raises to halt a top-to-bottom run at this point.
# NOTE(review): presumably this keeps the cells below from executing — confirm.
raise StopIteration

## Old code

In [None]:
dataset = "collections"


def _load_item_matrix(path):
    """Read a raw float32 file and reshape it to (ITEM_COUNT, QUERY_COUNT)."""
    # The lowercase `item_count` / `query_count` names are not defined
    # anywhere in this notebook; the configuration constants are used instead.
    return np.fromfile(path, dtype="float32").reshape(ITEM_COUNT, QUERY_COUNT)


ifeats = _load_item_matrix("{}/data/model_scores/scores_train.bin".format(dataset))
train_ranks = _load_item_matrix("{}/data/item_train_ranks.bin".format(dataset))
train_log_ranks = _load_item_matrix("{}/data/item_train_log_ranks.bin".format(dataset))
test_ranks = _load_item_matrix("{}/data/item_test_ranks.bin".format(dataset))



In [None]:
# (label, feature matrix, distance function) triples.
# L2 and L1 are evaluated on scores, ranks and log-ranks; the order-statistic
# metrics are evaluated on the train ranks only.
CALC_LIST = [
    ("{}_{}".format(metric_name, feats_name), feats, calcer)
    for metric_name, calcer in (("l2", calc_l2_distances), ("l1", calc_l1_distances))
    for feats_name, feats in (
        ("score", ifeats),
        ("rank", train_ranks),
        ("log_rank", train_log_ranks),
    )
] + [
    ("ord_{}".format(order), train_ranks, get_sum_ord_calcer(order))
    for order in (0, 1, 2, 5, 10)
]

# For a quick run, CALC_LIST can be cut down to just the "l2_score" and
# "l2_log_rank" entries.
base_item_cout = 1000

In [None]:
# Compute every curve once and store them all in a single JSON file;
# the computation is skipped when that file already exists.
backup_file = "{}/data/disorder_curves_{}_samples.json".format(dataset, base_item_cout)
if not os.path.isfile(backup_file):
    results = OrderedDict()
    for label, feats, dist_calcer in CALC_LIST:
        for data_part, ranks in [("_train", train_ranks), ("_test", test_ranks)]:
            # NOTE(review): the estimate_disorder_curve defined earlier in this
            # notebook requires `close_items` as its fourth argument and has no
            # `base_item_cout` keyword, so this call looks stale — confirm
            # before re-running.
            curve = estimate_disorder_curve(
                feats, ranks, dist_calcer, base_item_cout=base_item_cout)
            # Convert to a plain list so the curve can be written as JSON.
            results[label + data_part] = list(curve)
    with open(backup_file, "w") as fout:
        json.dump(results, fout, indent=4, sort_keys=True)

In [None]:
# Display whatever is currently bound to `results`.
results

In [None]:
# Read the stored curves back and rebuild `results` as label -> array,
# ordered by label.
with open(backup_file) as fin:
    stored_curves = json.load(fin)
keys = sorted(stored_curves)
results = OrderedDict(
    [(key, np.array(stored_curves[key])) for key in keys]
)

In [None]:
# Plot the upper tail of every curve (positions 900..1000) on a log y-axis.
plot_range = np.arange(900, 1001)

fig, ax = plt.subplots()
ax.set_yscale("log")
for label, curve in results.items():
    ax.plot(plot_range, curve[plot_range], label=label)
ax.legend()
plt.show()

In [None]:
# Positions in the 1001-point curve to tabulate.
percentiles = np.array([900, 950, 980, 990, 995, 999, 1000])

# Column headers: "metric" followed by the quantile level of each position.
columns = ["metric"]
columns.extend("{:.3}".format(q) for q in percentiles * 0.001)

result_table = []
for label, curve in results.items():
    result_table.append([label, *curve[percentiles]])
pd.DataFrame(result_table, columns=columns)

In [None]:
# Deliberately raises to halt a top-to-bottom run at this point.
# NOTE(review): presumably this keeps the cells below from executing — confirm.
raise StopIteration

In [None]:
# test data: three rows, one column, holding the values 1, 2, 3

iranks = np.arange(1, 4).reshape(-1, 1)
# Independent copy, so the two arrays do not share memory.
ifeats = iranks.copy()


In [None]:
# NOTE(review): the estimate_disorder_curve defined earlier in this notebook
# requires `close_items` as its fourth argument and has no `base_item_cout`
# keyword, so this call looks stale — confirm before re-running.
curve = estimate_disorder_curve(ifeats, iranks, calc_l2_distances, base_item_cout=1)

In [None]:
# Display whatever is currently bound to `curve`.
curve