In [1]:
import dask_distance 
import dask.array as da
import dask.dataframe as dd
import dask_ml
import time
import numpy as np
import sparse
import pandas as pd


from dask.distributed import Client
client = Client(memory_limit='12GB')

In [2]:
tmp = np.zeros(
    (25, 16)
)
tmp[2, 2] += 10.
print("Array created")
s = sparse.COO.from_numpy(tmp)
print("Array sparsified")
tmp2 = da.from_array(s, chunks=(5000, 5000))
tmp2.compute()
print(tmp2)
print("array_daskified")

Array created
Array sparsified
dask.array<array, shape=(25, 16), dtype=float64, chunksize=(25, 16), chunktype=sparse.COO>
array_daskified


In [3]:
import sparse
x = da.random.random((25076, 160901), chunks=(5000, 5000))
x[x < 0.99983] = 0
s = x.map_blocks(sparse.COO)
s

Unnamed: 0,Array,Chunk
Shape,"(25076, 160901)","(5000, 5000)"
Count,792 Tasks,198 Chunks
Type,float64,sparse.COO
"Array Chunk Shape (25076, 160901) (5000, 5000) Count 792 Tasks 198 Chunks Type float64 sparse.COO",160901  25076,

Unnamed: 0,Array,Chunk
Shape,"(25076, 160901)","(5000, 5000)"
Count,792 Tasks,198 Chunks
Type,float64,sparse.COO



start = time.time()
n = da.matmul(s, s.T)
n.compute()
print(time.time() - start)

import numpy as np
from scipy.spatial.distance import cosine
x = np.array([
    [0, -1, 2, 0, 0],
    [-1, 0, 3, 0, 0],
    [0, 1, 0, -2, 0]
]).astype(float)

cosine(x[0], x[1])

dot_mat = x @ x.T

from math import sqrt
1 - dot_mat[0][1] / (sqrt(dot_mat[0][0]) * sqrt(dot_mat[1][1]))

In [4]:
seed = 25
ddf = dd.read_csv("data/interactions_train.csv")
train, val = dask_ml.model_selection.train_test_split(
    ddf, 
    test_size=0.1, 
    train_size=0.9,
    shuffle=True,
    random_state=seed)
print(len(train), len(val))

628841 70060


In [5]:
codes, uniques = pd.factorize(ddf.user_id.compute())
print(len(codes))
print(len(uniques))

698901
25076


In [72]:
import numpy as np

class SimilarityScorer:
    def __init__(self, interactions):
        start = time.time()
        print("Generating indices")
        user_codes, self.user_idx_to_id = pd.factorize(ddf.user_id.compute())
        recipe_codes, self.recipe_idx_to_id = pd.factorize(ddf.recipe_id.compute())
        self.user_id_to_idx = {user: idx for idx, user in enumerate(self.user_idx_to_id)}
        self.recipe_id_to_idx = {recipe: idx for idx, recipe in enumerate(self.recipe_idx_to_id)}
        
        s = sparse.COO(
            [user_codes, recipe_codes],
            ddf.rating.compute(),
            shape=(len(self.user_idx_to_id), len(self.recipe_idx_to_id)),
            fill_value=0
        )
        self.sparse_mat = da.from_array(s, chunks=(5000, 5000))
        print("Generating dot products", time.time() - start)

        self.dot_product_similarities = self.sparse_mat @ self.sparse_mat.T
        self.dot_product_similarities.compute()
        print("Done with similarities!", time.time() - start)

    def predict_topk_for_user(self, user_id, k):
        user_idx = scorer.user_id_to_idx[user_id]
        similarities = scorer.dot_product_similarities[user_idx]
        similarities_norm = similarities / similarities.sum()
        recs_raw = similarities_norm.reshape(1, -1) @ scorer.sparse_mat
        recs = recs_raw.map_blocks(lambda x: x.todense(), dtype=np.ndarray)
        rec_values = recs.topk(k).compute()
        rec_idxs = recs.argtopk(k).compute()
        recs_ids = [scorer.recipe_idx_to_id[idx] for idx in rec_idxs]
        return recs_ids, rec_values
    
    def predict_pair(self, user_id, recipe_id):
        user_idx = scorer.user_id_to_idx[user_id]
        recipe_idx = scorer.recipe_id_to_idx[recipe_id]
        similarities = scorer.dot_product_similarities[user_idx]
        similarities_norm = similarities / similarities.sum()
        predicted_score = similarities_norm @ self.sparse_mat[:, recipe_idx]
        return predicted_score.compute()

scorer = SimilarityScorer(ddf)

Generating indices
Generating dot products 1.030318021774292
Done with similarities! 4.943005800247192


In [73]:
user_id = 2002254807
scorer.predict_topk_for_user(user_id, 10)

([Int64Index([27520, 27208, 129615, 26849, 89204, 69173, 32204, 77397, 39087,
              28148],
             dtype='int64')],
 array([[3.44314559, 0.78852285, 0.76567481, 0.70828905, 0.70297556,
         0.6392136 , 0.61477152, 0.55313496, 0.54941552, 0.52656748]]))

In [74]:
user_id = 2002254807
recipe_id = 27520
scorer.predict_pair(user_id, recipe_id)

3.4431455897980854