In [1]:
from collections import OrderedDict
from copy import copy
import json
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
def calc_l2_distances(pt, points):
    """Return the squared Euclidean distance from ``pt`` to each row of ``points``.

    No square root is taken: the squared value sorts the rows in the same
    order as the true L2 distance would.

    Args:
        pt: 1-D feature vector.
        points: 2-D array with one feature vector per row.

    Returns:
        1-D array with one squared distance per row of ``points``.
    """
    delta = points - pt
    squared = delta * delta
    return squared.sum(axis=1)

def calc_l1_distances(pt, points):
    """Return the L1 (Manhattan) distance from ``pt`` to each row of ``points``.

    Args:
        pt: 1-D feature vector.
        points: 2-D array (or pandas object) with one feature vector per row.

    Returns:
        One distance per row of ``points``.
    """
    # ``numpy.ndarray`` has no ``.abs()`` method, so calling it on the
    # difference raised AttributeError for plain arrays. ``np.abs`` works for
    # ndarrays and pandas objects alike.
    return np.abs(points - pt).sum(axis=1)

def get_sum_ord_calcer(sum_ord):
    """Build a distance function based on an order statistic of ``points + pt``.

    Args:
        sum_ord: zero-based index of the order statistic to pick
            (0 selects the smallest element-wise sum in each row).

    Returns:
        A callable ``(pt, points) -> 1-D array`` that, for every row of
        ``points``, adds ``pt`` element-wise and returns the ``sum_ord``-th
        smallest of the resulting sums.
    """
    def calc_sum_ord_distances(pt, points):
        elementwise_sums = points + pt
        # np.partition puts the k-th smallest value of each row at column k
        # without fully sorting the row.
        partitioned = np.partition(elementwise_sums, sum_ord)
        return partitioned[:, sum_ord]

    return calc_sum_ord_distances

# Inline sanity check for get_sum_ord_calcer.
# Row-wise sums pts + pt are [5, 2, 4], [8, 8, 8] and [6, 7, 3].
pt = np.array([1, 2, 3])
pts = np.array([[4, 0, 1],
                [7, 6, 5],
                [5, 5, 0]])
# second-smallest sum of every row
assert (get_sum_ord_calcer(1)(pt, pts) == np.array([4, 8, 6])).all()
# smallest sum of every row
assert (get_sum_ord_calcer(0)(pt, pts) == np.array([2, 8, 3])).all()



In [3]:
def estimate_disorder_curve(item_features, item_ranks, metric_calcer,
                            item_item_rank_lim=None, base_item_cout=1000,
                            seed=None):
    """Estimate the quantile curve of sampled "disorder multipliers".

    For each randomly drawn base item, all items are ordered by their
    distance to it. Random (second item, query) pairs are then sampled and
    the ratio

        item-item rank of the second item
        / (rank of base item for the query + rank of second item for the query)

    is recorded. The function returns the 0, 0.1, ..., 100 percent quantiles
    of all recorded ratios.

    Args:
        item_features: per-item feature rows; ``item_features[i]`` is passed
            to ``metric_calcer`` as the reference point.
        item_ranks: 2-D array of shape ``(item_count, query_count)`` holding
            the rank of every item for every query.
        metric_calcer: callable ``(pt, points) -> 1-D array`` returning one
            distance per row of ``points``.
        item_item_rank_lim: second items are sampled only among this many
            nearest neighbours; defaults to ``item_count - 1``.
        base_item_cout: number of base items to draw (with replacement).
        seed: optional seed for reproducible sampling. ``None`` (default)
            draws from NumPy's global random state, as before.

    Returns:
        1-D array with 1001 quantiles of the sampled ratios.

    Notes:
        * Position 0 of the distance ordering is skipped on the assumption
          that it is the base item itself. That does not hold when feature
          rows are duplicated or for the sum-ord distance (see the disabled
          assertion below).
        * A pair whose two ranks sum to zero produces inf/nan in the ratios.
    """
    item_count, query_count = item_ranks.shape
    sample_count = item_count
    if item_item_rank_lim is None:
        item_item_rank_lim = item_count - 1

    # RandomState(seed) replays the same stream as np.random.seed(seed)
    # followed by the global functions, so seeded and legacy runs agree.
    rng = np.random if seed is None else np.random.RandomState(seed)

    base_items = rng.choice(item_count, base_item_cout)
    disorder_multipliers = []

    for base_item in base_items:
        distances = metric_calcer(item_features[base_item], item_features)
        item_order = np.argsort(distances)
#         assert item_order[0] == base_item # fails if duplicates, fails for sum ord distance
        # Inverse permutation: item_item_ranks[i] is the position of item i
        # in the distance ordering.
        item_item_ranks = np.argsort(item_order)
        close_items = item_order[1: 1 + item_item_rank_lim]

        second_items = rng.choice(close_items, sample_count)
        qweries = rng.choice(query_count, sample_count)
        base_item_ranks = item_ranks[base_item, qweries]
        second_item_ranks = item_ranks[second_items, qweries]
        disorder_multipliers.append(
            item_item_ranks[second_items] / (base_item_ranks + second_item_ranks)
        )

    disorder_multipliers = np.array(disorder_multipliers).flatten()
    return np.quantile(disorder_multipliers, np.linspace(0, 1, 1001))

In [4]:
dataset = "collections"
item_count = 10 ** 6
query_count = 1000


def _load_item_query_matrix(path_template):
    """Read a flat float32 dump and reshape it to (item_count, query_count).

    ``path_template`` contains one ``{}`` placeholder for the dataset name.
    """
    flat_values = np.fromfile(path_template.format(dataset), dtype="float32")
    return flat_values.reshape(item_count, query_count)


# Model scores, used as item features.
ifeats = _load_item_query_matrix("{}/data/model_scores/scores_train.bin")
# Item ranks per query (train), their log variant, and the test ranks.
train_ranks = _load_item_query_matrix("{}/data/item_train_ranks.bin")
train_log_ranks = _load_item_query_matrix("{}/data/item_train_log_ranks.bin")
test_ranks = _load_item_query_matrix("{}/data/item_test_ranks.bin")



In [13]:
# Every experiment is a (label, item features, distance function) triple.
# Order: L2 on each feature set, L1 on each feature set, then sum-ord on ranks.
CALC_LIST = (
    [
        (label, feats, calc_l2_distances)
        for label, feats in [
            ("l2_score", ifeats),
            ("l2_rank", train_ranks),
            ("l2_log_rank", train_log_ranks),
        ]
    ]
    + [
        (label, feats, calc_l1_distances)
        for label, feats in [
            ("l1_score", ifeats),
            ("l1_rank", train_ranks),
            ("l1_log_rank", train_log_ranks),
        ]
    ]
    + [
        (label, train_ranks, get_sum_ord_calcer(sum_ord))
        for label, sum_ord in [
            ("ord_0", 0),
            ("ord_1", 1),
            ("ord_2", 2),
            ("ord_5", 5),
            ("ord_10", 10),
        ]
    ]
)

# For a quick run, cut CALC_LIST down to the "l2_score" and "l2_log_rank" entries.

# Number of base items sampled per curve (spelling kept: it matches the
# keyword argument of estimate_disorder_curve).
base_item_cout = 1000

In [None]:
# Curves are cached on disk; they are only recomputed when the cache file is absent.
backup_file = "{}/data/disorder_curves_{}_samples.json".format(dataset, base_item_cout)
if not os.path.isfile(backup_file):
    # Each metric is evaluated against both the train and the test ranks.
    rank_sets = [("_train", train_ranks), ("_test", test_ranks)]
    results = OrderedDict()
    for label, feats, dist_calcer in CALC_LIST:
        for data_part, ranks in rank_sets:
            curve = estimate_disorder_curve(
                feats,
                ranks,
                dist_calcer,
                base_item_cout=base_item_cout,
            )
            key = label + data_part
            results[key] = list(curve)
    with open(backup_file, "w") as fout:
        json.dump(results, fout, indent=4, sort_keys=True)

In [1]:
# Peek at the in-memory curves. `results` is bound only by the cell that
# computes the curves (when no cache file exists) or by the cell that loads
# the cache, so running this cell first raises the NameError recorded below.
results

NameError: name 'results' is not defined

In [None]:
# Always read the curves back from the cache, so that a freshly computed run
# and a cached run end up with the same structure: label -> numpy array,
# ordered by label.
with open(backup_file) as fin:
    results = json.load(fin)
keys = sorted(results)
results = OrderedDict([(key, np.array(results[key])) for key in keys])

In [None]:
# Plot only the upper tail of each curve (quantile indices 900..1000) on a log y-axis.
plot_range = np.arange(900, 1001)

fig, ax = plt.subplots()
ax.set_yscale("log")
for label, curve in results.items():
    ax.plot(plot_range, curve[plot_range], label=label)
ax.legend()
plt.show()

In [None]:
# Quantile indices into the 1001-point curves (900 -> 0.9, 1000 -> 1.0).
percentiles = np.array([900, 950, 980, 990, 995, 999, 1000])

# One column per quantile, labelled with its fraction.
columns = ["metric"]
columns.extend("{:.3}".format(q) for q in percentiles * 0.001)

# One row per metric: its label followed by the curve values at those quantiles.
result_table = [
    [label] + list(curve[percentiles])
    for label, curve in results.items()
]
pd.DataFrame(result_table, columns=columns)

In [None]:
# NOTE(review): unconditional raise — presumably a deliberate stop so that
# "Run All" does not continue into the toy-data cells below; confirm intent.
raise StopIteration

In [5]:
# Toy fixture: three items, one query, item features equal to item ranks.
# NOTE: this rebinds `ifeats`, which the data-loading cell also assigns.

iranks = np.arange(1, 4).reshape(-1, 1)
ifeats = np.arange(1, 4).reshape(-1, 1)


In [33]:
# Smoke-run the estimator with a single base item on whatever `ifeats` /
# `iranks` are currently bound (the 3x1 toy arrays if the preceding cell ran).
curve = estimate_disorder_curve(ifeats, iranks, calc_l2_distances, base_item_cout=1)

In [34]:
# Display the quantile curve.
# NOTE(review): by my reading, the 3x1 toy arrays can only produce ratios
# between roughly 0.2 and 0.5, so the stored output below (values from ~3e-05
# up to ~94) looks stale — re-run to confirm.
curve

array([2.68003109e-05, 2.74422998e-02, 5.56524824e-02, ...,
       9.16645442e+00, 1.16852408e+01, 9.40127289e+01])