In [4]:
import dask
import dask.array as da
import dask.dataframe as dd
import sparse
import dask_ml
import time

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

from dask.distributed import Client
# Create the dask.distributed client used by the dask computations in this notebook.
# NOTE(review): no scheduler address is passed, so this presumably spins up a
# local cluster and forwards memory_limit to it — confirm against the dask docs.
# Re-running this cell without closing the previous client starts another one.
client = Client(memory_limit='6GB')

Perhaps you already have a cluster running?
Hosting the HTTP server on port 59634 instead


In [5]:
# Seed kept for any stochastic step; it is not consumed in this cell.
seed = 25

# Lazily load the training and validation interaction tables as dask DataFrames.
train = dd.read_csv("train_baselines.csv")
val = dd.read_csv("val_baselines.csv")

# len() forces a pass over each CSV; prints "<train rows> <val rows>".
print(*(len(frame) for frame in (train, val)))

628841 70060


In [6]:
class SimilarityScorer:
    """User-user similarity scorer built from a sparse user x recipe matrix.

    The constructor turns the ``user_id`` / ``recipe_id`` /
    ``dual_bayesian_avg_delta`` columns of ``interactions`` into a sparse
    matrix (rows = users, columns = recipes), computes the user-user dot
    products, and divides every row of that similarity matrix by its row sum.

    Attributes set by ``__init__``:
        user_idx_to_id / recipe_idx_to_id: unique ids, positioned by matrix index.
        user_id_to_idx / recipe_id_to_idx: reverse lookups (id -> matrix index).
        sparse_mat: in-memory ``sparse.COO`` user x recipe matrix.
        similarities: dense, row-normalized user x user similarity matrix.
    """

    def __init__(self, interactions):
        """Build the index maps, the sparse matrix and the similarity matrix.

        Args:
            interactions: dask (or pandas) DataFrame exposing the columns
                ``user_id``, ``recipe_id`` and ``dual_bayesian_avg_delta``.
        """
        start = time.time()
        print("Generating indices")
        # Materialize the needed columns once instead of once per column, so
        # the source is read a single time and the columns stay row-aligned.
        frame = interactions[["user_id", "recipe_id", "dual_bayesian_avg_delta"]]
        if hasattr(frame, "compute"):
            frame = frame.compute()

        user_codes, self.user_idx_to_id = pd.factorize(frame["user_id"])
        recipe_codes, self.recipe_idx_to_id = pd.factorize(frame["recipe_id"])

        self.user_id_to_idx = {user: idx for idx, user in enumerate(self.user_idx_to_id)}
        self.recipe_id_to_idx = {recipe: idx for idx, recipe in enumerate(self.recipe_idx_to_id)}

        print("Creating sparse matrix", time.time() - start)
        s = sparse.COO(
            [user_codes, recipe_codes],
            frame["dual_bayesian_avg_delta"].to_numpy(),
            shape=(len(self.user_idx_to_id), len(self.recipe_idx_to_id)),
            fill_value=0
        )
        # Chunked dask view, used only to parallelize the big matrix product.
        chunked = da.from_array(s, chunks=(5000, 5000))
        print("Generating dot products", time.time() - start)

        dot_product_similarities = (chunked @ chunked.T).compute()
        print("Raw similarities computed", time.time() - start)

        dense_similarities = dot_product_similarities.todense()
        # The tiny epsilon avoids a division by exactly zero for empty rows.
        # NOTE(review): if the deltas can be negative, a row sum may cancel to
        # ~0 and blow the weights up — confirm this normalization is intended.
        sims_summed = dense_similarities.sum(axis=1) + 1e-20
        self.similarities = dense_similarities / sims_summed.reshape(-1, 1)
        # Keep the in-memory COO matrix for prediction; no need to recompute
        # it from the dask wrapper.
        self.sparse_mat = s
        print("Similarities normalized!", time.time() - start)

    def predict_topk_for_user(self, user_id, k):
        """Return the ``k`` highest-scoring recipes for ``user_id``.

        Args:
            user_id: id of a user seen at construction time.
            k: number of recommendations; clamped to [0, number of recipes].

        Returns:
            ``(recipe_ids, scores)`` where ``recipe_ids`` is a list of recipe
            ids and ``scores`` the matching NumPy array, both ordered from
            highest to lowest score (ties keep the lower recipe index first).

        Raises:
            KeyError: if ``user_id`` was not seen at construction time.
        """
        user_idx = self.user_id_to_idx[user_id]
        recs = self.similarities[user_idx] @ self.sparse_mat
        # Depending on the sparse version the product may still be sparse;
        # densify explicitly because sparse arrays refuse implicit conversion.
        if hasattr(recs, "todense"):
            recs = recs.todense()
        recs = np.asarray(recs).ravel()

        k = max(0, min(int(k), recs.size))
        # Stable sort on the negated scores gives a deterministic descending order.
        rec_idxs = np.argsort(-recs, kind="stable")[:k]
        rec_values = recs[rec_idxs]
        recs_ids = [self.recipe_idx_to_id[idx] for idx in rec_idxs]
        return recs_ids, rec_values

    def predict_pair(self, user_id, recipe_id):
        """Score a single (user, recipe) pair.

        Returns 0 when either id was not seen at construction time; otherwise
        the dot product of the user's similarity row with the recipe's column
        of the user x recipe matrix.
        """
        if recipe_id not in self.recipe_id_to_idx or user_id not in self.user_id_to_idx:
            return 0
        user_idx = self.user_id_to_idx[user_id]
        recipe_idx = self.recipe_id_to_idx[recipe_id]
        predicted_score = self.similarities[user_idx] @ self.sparse_mat[:, recipe_idx]
        return predicted_score

# Fit the scorer on the training interactions; the constructor prints its own
# progress/timing messages.
scorer = SimilarityScorer(interactions=train)

Generating indices
Creating sparse matrix 1.2017412185668945
Generating dot products 3.1728110313415527
Raw similarities computed 10.153905153274536
Similarities normalized! 25.031099319458008


In [13]:
# Pull the validation set into pandas, then score every (user, recipe) pair.
val_pd = val.compute()
start = time.time()
# Prediction = the row's dual_bayesian_avg plus the scorer's output for that
# pair (the scorer returns 0 for ids it has not seen).
val_pd["similarity_rating"] = [
    baseline + scorer.predict_pair(user_id, recipe_id)
    for user_id, recipe_id, baseline in zip(
        val_pd["user_id"], val_pd["recipe_id"], val_pd["dual_bayesian_avg"]
    )
]
print(time.time() - start)

1048.6248400211334


NameError: name 'mean_squared_error' is not defined

In [14]:
from sklearn.metrics import mean_squared_error

# Clamp predictions into [0, 5] before scoring.
# NOTE(review): presumably 0-5 is the valid rating range — confirm.
val_pd["similarity_rating_clipped"] = val_pd["similarity_rating"].clip(lower=0, upper=5)

# Validation MSE of the clipped predictions against the true ratings.
mean_squared_error(
    y_true=val_pd["rating"],
    y_pred=val_pd["similarity_rating_clipped"],
)

0.848890510543224

In [15]:
# Persist the validation frame, including the new prediction columns
# (the DataFrame index is written as the first CSV column).
val_pd.to_csv(path_or_buf='val_user_similarity.csv')