In [28]:
import numpy as np
import os


In [29]:
def calc_ranks_limited_memory(dataset, user_count=1000, item_count=1000000, rank_block_size=1000):
    """Rank every item per user by descending score without loading the full matrix.

    Reads ``{dataset}/data/model_scores/scores_train.bin`` as a row-major
    float32 matrix of shape ``(item_count, user_count)``. For each user
    (column) all ``item_count`` items are ranked by descending score: rank 1
    is the highest score and ties go to the item with the smaller index.
    This is the same ranking rule as ``calc_ranks`` (descending, 1-based).

    Two float32 files with the same layout as the input are written:

    * ``{dataset}/data/item_train_ranks.bin``     -- the ranks
    * ``{dataset}/data/item_train_log_ranks.bin`` -- natural log of the ranks

    Args:
        dataset: Directory prefix used to build the input and output paths.
        user_count: Number of columns in the score matrix.
        item_count: Number of rows in the score matrix.
        rank_block_size: Memory budget expressed in item rows: about
            ``user_count * rank_block_size`` scores are held in memory at a
            time (at least one full user column). Larger values mean fewer
            passes over the score file.

    Raises:
        ValueError: If a size argument is invalid or the score file does not
            contain exactly ``user_count * item_count`` float32 values.
    """
    if user_count < 0 or item_count < 0 or rank_block_size <= 0:
        raise ValueError(
            "user_count and item_count must be >= 0 and rank_block_size > 0"
        )

    score_path = "{}/data/model_scores/scores_train.bin".format(dataset)
    ranks_path = "{}/data/item_train_ranks.bin".format(dataset)
    log_ranks_path = "{}/data/item_train_log_ranks.bin".format(dataset)

    if user_count == 0 or item_count == 0:
        # An empty file cannot be memory-mapped; just emit empty outputs.
        for path in (ranks_path, log_ranks_path):
            with open(path, "wb"):
                pass
        return

    expected_bytes = user_count * item_count * np.dtype("float32").itemsize
    actual_bytes = os.path.getsize(score_path)
    if actual_bytes != expected_bytes:
        raise ValueError(
            "{} holds {} bytes, expected {} for a {}x{} float32 matrix".format(
                score_path, actual_bytes, expected_bytes, item_count, user_count
            )
        )

    shape = (item_count, user_count)
    scores = np.memmap(score_path, dtype="float32", mode="r", shape=shape)
    ranks_out = np.memmap(ranks_path, dtype="float32", mode="w+", shape=shape)
    log_ranks_out = np.memmap(log_ranks_path, dtype="float32", mode="w+", shape=shape)

    # A rank is only meaningful across *all* items of a user, so the matrix is
    # walked in groups of whole columns rather than groups of rows.
    users_per_block = max(1, (user_count * rank_block_size) // item_count)
    # Rank assigned to the item found at each sorted position (1-based).
    positions = np.arange(1, item_count + 1, dtype="float32").reshape(-1, 1)

    try:
        for start in range(0, user_count, users_per_block):
            stop = min(start + users_per_block, user_count)
            block = np.array(scores[:, start:stop])  # in-memory copy of the columns
            np.negative(block, out=block)  # ascending sort of -score == descending score
            order = block.argsort(axis=0, kind="stable")
            block_ranks = np.empty(order.shape, dtype="float32")
            # Invert the sort permutation: item order[i, j] gets rank i + 1.
            np.put_along_axis(block_ranks, order, positions, axis=0)
            ranks_out[:, start:stop] = block_ranks
            log_ranks_out[:, start:stop] = np.log(block_ranks)
    finally:
        ranks_out.flush()
        log_ranks_out.flush()
        # Drop the maps so the underlying files are closed promptly.
        del scores, ranks_out, log_ranks_out

            
#     if os.path.isfile("tmp_ranks.bin"):
#         os.remove("tmp_ranks.bin")

#     for i in range(user_count):
#         ranks_i = (1 + np.argsort(scores[:,i])).astype("int32")
#         with open("tmp_ranks.bin", "ab") as fout:
#             ranks_i.tofile(fout)

#     del scores # free memory

#     ranks = np.fromfile("tmp_ranks.bin", dtype="int32")
#     ranks = ranks.reshape((user_count, -1)).T
#     ranks.astype("float32").tofile(ranks_path)
#     np.log(ranks).astype("float32").tofile(log_ranks_path)

#     os.remove("tmp_ranks.bin")


In [30]:
def calc_ranks(dataset, user_count=1000, item_count=1000000):
    """Rank every item per user by descending score, fully in memory.

    Reads ``{dataset}/data/model_scores/scores_train.bin`` as a row-major
    float32 matrix of shape ``(item_count, user_count)``. For each user
    (column) the items are ranked by descending score: rank 1 is the highest
    score and ties go to the item with the smaller index (stable sort), so
    the result is deterministic.

    Two float32 files with the same layout as the input are written:

    * ``{dataset}/data/item_train_ranks.bin``     -- the ranks
    * ``{dataset}/data/item_train_log_ranks.bin`` -- natural log of the ranks

    The whole score matrix plus an integer index array of the same shape are
    held in memory at once.

    Args:
        dataset: Directory prefix used to build the input and output paths.
        user_count: Number of columns in the score matrix.
        item_count: Number of rows in the score matrix.

    Raises:
        AssertionError: If the file does not hold exactly
            ``user_count * item_count`` float32 values.
    """
    score_path = "{}/data/model_scores/scores_train.bin".format(dataset)
    ranks_path = "{}/data/item_train_ranks.bin".format(dataset)
    log_ranks_path = "{}/data/item_train_log_ranks.bin".format(dataset)

    with open(score_path, "rb") as score_file:
        scores = np.fromfile(score_file, dtype="float32")
    assert scores.shape[0] == user_count * item_count, (
        "expected {} scores in {}, found {}".format(
            user_count * item_count, score_path, scores.shape[0]
        )
    )
    scores = scores.reshape((item_count, user_count))

    # Negate in place (no extra copy): ascending order of -score is
    # descending order of score.
    np.negative(scores, out=scores)
    # A stable sort makes tie-breaking deterministic (earlier item wins).
    order = scores.argsort(axis=0, kind="stable")
    del scores  # free the score matrix before allocating the ranks

    # Invert the sort permutation in O(n): item order[i, j] gets rank i + 1.
    ranks = np.empty(order.shape, dtype="float32")
    positions = np.arange(1, item_count + 1, dtype="float32").reshape(-1, 1)
    np.put_along_axis(ranks, order, positions, axis=0)

    ranks.tofile(ranks_path)
    np.log(ranks).tofile(log_ranks_path)


In [34]:
# test
# Smoke test for calc_ranks on a tiny 4-item x 3-user score matrix.
# Rows are items, columns are users; each column is ranked independently by
# descending score, and tied scores are expected to be ranked in item order.
test_scores = np.array([
    [1, 2, 6],
    [4, 3, 5],
    [4, 1, 5],
    [3, 2, 6]
]).astype("float32")
# A fresh checkout may not contain the fixture directory; create it so the
# cell survives Restart & Run All.
os.makedirs("test/data/model_scores", exist_ok=True)
with open("test/data/model_scores/scores_train.bin", "wb") as fout:
    test_scores.tofile(fout)

expected_ranks = np.array([
    [4, 2, 1],
    [1, 1, 3],
    [2, 4, 4],
    [3, 3, 2]
]).astype("float32")

calc_ranks("test", 3, 4)

ranks = np.fromfile("test/data/item_train_ranks.bin", dtype="float32")
ranks = ranks.reshape((4, 3))
assert np.all(ranks == expected_ranks), ranks

# The log-rank file must hold the natural log of the same ranks.
log_ranks = np.fromfile("test/data/item_train_log_ranks.bin", dtype="float32")
log_ranks = log_ranks.reshape((4, 3))
assert np.allclose(log_ranks, np.log(expected_ranks)), log_ranks



In [35]:
# Rank the "collections" dataset with the default matrix size
# (1000 users x 1,000,000 items); writes collections/data/item_train_ranks.bin
# and collections/data/item_train_log_ranks.bin.
calc_ranks("collections")

In [36]:
# Rank the "video" dataset with the default matrix size
# (1000 users x 1,000,000 items); writes video/data/item_train_ranks.bin
# and video/data/item_train_log_ranks.bin.
calc_ranks("video")