In [1]:
import dask
import dask.array as da
import dask.dataframe as dd
import sparse
import dask_ml
import time

import numpy as np
import pandas as pd

In [2]:
seed = 25
ddf = dd.read_csv("data/interactions_train.csv")

# 90/10 shuffled split; the fixed seed keeps the partitioning reproducible.
split_options = dict(test_size=0.1, train_size=0.9, shuffle=True, random_state=seed)
train, val = dask_ml.model_selection.train_test_split(ddf, **split_options)
print(len(train), len(val))

628841 70060


In [3]:
# Global mean and standard deviation of the training ratings.
# Both reductions go through a single dask.compute call so the shared part of
# the graph (CSV read + train/val split) is evaluated once instead of twice.
rating_avg, rating_std = dask.compute(train.rating.mean(), train.rating.std())
print(rating_avg, rating_std)

4.573866843923981 0.95953785596336


In [4]:
# Baseline 1: every validation row is predicted as the global training mean.
val["prediction"] = rating_avg
actual_ratings = val.rating.to_dask_array()
predicted_ratings = val.prediction.to_dask_array()
dask_ml.metrics.mean_squared_error(actual_ratings, predicted_ratings)

0.9108435745491105

In [5]:
# Baseline 2: Predict using avg rating of each user
user_avgs = train.groupby("user_id").rating.mean().compute()

# Vectorised lookup of each validation user's training mean. Users that never
# appear in the training split map to NaN and fall back to the global mean.
# The meta declares float64 values: the predictions are averages, not integers.
val["prediction"] = val.user_id.map(
    user_avgs,
    meta=("user_id", "float64"),
).fillna(rating_avg)
print(val.head())
dask_ml.metrics.mean_squared_error(val.rating.to_dask_array(), val.prediction.to_dask_array())

        user_id  recipe_id        date  rating      u       i  prediction
403988   518302      14471  2009-01-25     4.0  15755   78580    3.750000
217140   226316      35132  2007-05-16     5.0  13499  115502    4.962963
104255   134011     135101  2005-09-18     4.0    135   85305    4.792899
189103   140008      88828  2007-01-24     4.0   1557   19016    4.166667
334179   528468      15301  2008-06-11     5.0   1564  151452    4.243590


0.8617040612395044

In [6]:
# Baseline 3: Predict using avg rating of each user, bayesian style
# Per-user mean and count come from the same groupby, so compute them together.
user_avgs, user_counts = dask.compute(
    train.groupby("user_id").rating.mean(),
    train.groupby("user_id").rating.count(),
)
k = 6  # prior strength: number of pseudo-ratings placed at the global mean

# Shrink each user's mean towards the global mean. This is evaluated eagerly in
# pandas so the lazy dask column below does not depend on whatever value the
# global `k` holds at the moment the graph is finally computed.
shrunk_user_avgs = (rating_avg * k + user_avgs * user_counts) / (user_counts + k)

# Users unseen in training map to NaN and fall back to the global mean.
val["personal_rating"] = val.user_id.map(
    shrunk_user_avgs,
    meta=("user_id", "float64"),
).fillna(rating_avg)
err = dask_ml.metrics.mean_squared_error(val.rating.to_dask_array(), val.personal_rating.to_dask_array())

# DataFrame.append was removed in pandas 2.0; build the frame from records.
bayesian_df = pd.DataFrame([{"k": k, "err": err}])
print(bayesian_df)

        err    k
0  0.825969  6.0


In [7]:
# Shrunken per-user mean (same formula as Baseline 3), evaluated eagerly in
# pandas so the lazy column is pinned to the current `k` rather than to the
# value `k` has when the graph is eventually computed.
shrunk_user_avgs = (rating_avg * k + user_avgs * user_counts) / (user_counts + k)
train["personal_rating"] = train.user_id.map(
    shrunk_user_avgs,
    meta=("user_id", "float64"),
).fillna(rating_avg)

# Residual of each rating relative to the user's shrunken mean.
train["person_normalized_rating"] = train.rating - train.personal_rating

class SimilarityScorer:
    """User-user similarity scorer built from a dask interactions frame.

    The constructor reads the ``user_id``, ``recipe_id`` and
    ``person_normalized_rating`` columns of ``interactions``, builds a sparse
    user x recipe matrix from them, and derives a row-normalised user x user
    similarity matrix from the matrix's dot products with itself.

    Attributes set by ``__init__``:
        user_idx_to_id / recipe_idx_to_id: unique ids in first-seen order.
        user_id_to_idx / recipe_id_to_idx: inverse lookups (id -> row/column).
        sparse_mat: dask array wrapping the sparse user x recipe matrix.
        dot_product_similarities: dask array, ``sparse_mat @ sparse_mat.T``.
        similarities_normalized: computed (in-memory) dense array of the dot
            products divided by their row sums.
    """

    def __init__(self, interactions, chunks=(5000, 5000)):
        """Build the index maps, the sparse rating matrix and the similarities.

        Args:
            interactions: dask DataFrame with ``user_id``, ``recipe_id`` and
                ``person_normalized_rating`` columns.
            chunks: chunk shape used when wrapping the sparse matrix in a dask
                array. Defaults to the previously hard-coded ``(5000, 5000)``.
        """
        start = time.time()
        print("Generating indices")
        # Materialise the three needed columns in a single pass: the source
        # graph is evaluated once and the columns are guaranteed row-aligned.
        frame = interactions[
            ["user_id", "recipe_id", "person_normalized_rating"]
        ].compute()
        user_codes, self.user_idx_to_id = pd.factorize(frame["user_id"])
        recipe_codes, self.recipe_idx_to_id = pd.factorize(frame["recipe_id"])
        self.user_id_to_idx = {user: idx for idx, user in enumerate(self.user_idx_to_id)}
        self.recipe_id_to_idx = {recipe: idx for idx, recipe in enumerate(self.recipe_idx_to_id)}

        print("Creating sparse matrix", time.time() - start)
        ratings = sparse.COO(
            [user_codes, recipe_codes],
            frame["person_normalized_rating"].to_numpy(),
            shape=(len(self.user_idx_to_id), len(self.recipe_idx_to_id)),
            fill_value=0,
        )

        self.sparse_mat = da.from_array(ratings, chunks=chunks)
        print("Generating dot products", time.time() - start)

        # persist() evaluates the product and keeps the result, so the later
        # steps reuse it. A bare .compute() whose result is discarded would
        # force every later step to redo the same work.
        self.dot_product_similarities = (self.sparse_mat @ self.sparse_mat.T).persist()
        print("Raw similarities computed", time.time() - start)

        # The tiny epsilon keeps the division defined for rows that sum to 0.
        sims_eps = self.dot_product_similarities.sum(axis=1) + 1e-20
        normalized = (self.dot_product_similarities / sims_eps.reshape(-1, 1)).persist()
        print("Similarities normalized!", time.time() - start)

        # Densify block by block and pull the result into memory. The declared
        # dtype is the array's element dtype (np.ndarray is not a valid dtype).
        self.similarities_normalized = normalized.map_blocks(
            lambda block: block.todense(), dtype=normalized.dtype
        ).compute()
        print("Similarities finished!", time.time() - start)

    def predict_topk_for_user(self, user_id, k):
        """Return the ``k`` highest-scoring recipes for ``user_id``.

        Args:
            user_id: id present in ``user_id_to_idx``; unknown ids raise
                ``KeyError``.
            k: number of recommendations requested.

        Returns:
            ``(recs_ids, rec_values)``: recipe ids looked up from the top-k
            indices, and the matching scores. The scores are computed from a
            ``(1, n_recipes)`` row, so both results presumably carry a leading
            length-1 axis -- TODO confirm against callers.
        """
        user_idx = self.user_id_to_idx[user_id]
        similarities_norm = self.similarities_normalized[user_idx]
        recs_raw = similarities_norm.reshape(1, -1) @ self.sparse_mat
        recs = recs_raw.map_blocks(lambda block: block.todense(), dtype=recs_raw.dtype)
        # Evaluate values and indices together so the shared graph runs once.
        rec_values, rec_idxs = dask.compute(recs.topk(k), recs.argtopk(k))
        recs_ids = [self.recipe_idx_to_id[idx] for idx in rec_idxs]
        return recs_ids, rec_values

    def predict_pair(self, user_id, recipe_id):
        """Score one (user, recipe) pair.

        Returns 0 when either id was not seen at construction time; otherwise
        the dot product of the user's normalised similarity row with the
        recipe's column of the rating matrix.
        """
        if recipe_id not in self.recipe_id_to_idx or user_id not in self.user_id_to_idx:
            return 0
        user_idx = self.user_id_to_idx[user_id]
        recipe_idx = self.recipe_id_to_idx[recipe_id]
        similarities_norm = self.similarities_normalized[user_idx]
        predicted_score = similarities_norm @ self.sparse_mat[:, recipe_idx]
        return predicted_score.compute()

# Build the scorer from the training split. The constructor does all of the
# similarity work up front and prints elapsed-time checkpoints as it goes.
scorer = SimilarityScorer(train)

Generating indices
Creating sparse matrix 10.074769973754883
Generating dot products 16.371269941329956
Raw similarities computed 19.204688787460327
Similarities normalized! 23.657227039337158
Similarities finished! 35.76487874984741


In [10]:
# Time the construction of one user-row x recipe-column product.
start = time.time()
user_row = scorer.similarities_normalized[1]
recipe_column = scorer.sparse_mat[:, 2]
all_predictions = user_row @ recipe_column
print(time.time() - start)

0.008305788040161133


In [12]:
# Score every validation row one at a time, reporting progress every 100 rows.
# len(val) evaluates the whole dask graph, so take it once up front instead of
# on every progress print.
n_val = len(val)
start = time.time()
i = 0  # stays 0 if the validation split is empty
for i, (_, row) in enumerate(val.iterrows(), start=1):
    score = scorer.predict_pair(row.user_id, row.recipe_id)
    if i % 100 == 0:
        print(i / n_val, time.time() - start, score)

KeyboardInterrupt: 

In [None]:
def get_prediction(row, scorer):
    """Predict a rating for one interaction row.

    Users absent from ``user_avgs`` get the global mean ``rating_avg``.
    Otherwise the prediction is the user's mean shrunk towards the global mean
    with prior weight ``k``, plus the scorer's pairwise similarity score.
    """
    uid = row.user_id
    if uid in user_avgs:
        n_ratings = user_counts[uid]
        personal_avg = (rating_avg * k + user_avgs[uid] * n_ratings) / (n_ratings + k)
        return personal_avg + scorer.predict_pair(uid, row.recipe_id)
    return rating_avg
    
# Row-wise prediction over the validation split. The explicit meta tells dask
# the result is an unnamed float64 series, so it does not have to infer the
# schema by running get_prediction (and the scorer) on made-up rows.
preds = val.apply(
    lambda row: get_prediction(row, scorer),
    axis=1,
    meta=(None, "float64"),
)
dask_ml.metrics.mean_squared_error(val.rating.to_dask_array(), preds.to_dask_array())

In [8]:
# Baseline 4: Predict using avg rating of each recipe
recipe_avgs = train.groupby("recipe_id").rating.mean().compute()

# Vectorised lookup of each validation recipe's training mean. Recipes that
# never appear in the training split map to NaN and fall back to the global
# mean. The meta declares float64 values: predictions are averages, not ints.
val["prediction"] = val.recipe_id.map(
    recipe_avgs,
    meta=("recipe_id", "float64"),
).fillna(rating_avg)
print(val.head())
dask_ml.metrics.mean_squared_error(val.rating.to_dask_array(), val.prediction.to_dask_array())

        user_id  recipe_id        date  rating      u       i  prediction
403988   518302      14471  2009-01-25     4.0  15755   78580    4.600000
217140   226316      35132  2007-05-16     5.0  13499  115502    4.541667
104255   134011     135101  2005-09-18     4.0    135   85305    5.000000
189103   140008      88828  2007-01-24     4.0   1557   19016    4.573867
334179   528468      15301  2008-06-11     5.0   1564  151452    4.800000


1.0679499550909195

In [10]:
# Baseline 5: Predict using avg rating of each recipe, bayesian style
# Per-recipe mean and count come from the same groupby; compute them together.
recipe_avgs, recipe_counts = dask.compute(
    train.groupby("recipe_id").rating.mean(),
    train.groupby("recipe_id").rating.count(),
)

# NOTE: the sweep rebinds the notebook-level `k`; after this cell it holds the
# last value tried (40), which any later cell reading `k` will pick up.
sweep_records = []
for k in range(0, 50, 10):
    # Shrink each recipe's mean towards the global mean with prior weight k.
    shrunk_recipe_avgs = (rating_avg * k + recipe_avgs * recipe_counts) / (recipe_counts + k)
    # Recipes unseen in training map to NaN and fall back to the global mean.
    val["prediction"] = val.recipe_id.map(
        shrunk_recipe_avgs,
        meta=("recipe_id", "float64"),
    ).fillna(rating_avg)
    err = dask_ml.metrics.mean_squared_error(val.rating.to_dask_array(), val.prediction.to_dask_array())
    sweep_records.append({"k": k, "err": err})

# DataFrame.append was removed in pandas 2.0; build the frame once from records.
bayesian_df2 = pd.DataFrame(sweep_records)
print(bayesian_df2)

        err     k
0  1.067950   0.0
1  0.904832  10.0
2  0.902622  20.0
3  0.902763  30.0
4  0.903214  40.0


In [91]:
# Map each user / recipe id to a dense index in first-seen order.
user_id_to_idx = {}
recipe_id_to_idx = {}
# Iterating a dask DataFrame directly yields column names, not rows, so the
# attribute access below needs itertuples() to receive actual row records.
for interaction in train.itertuples():
    user_id = interaction.user_id
    recipe_id = interaction.recipe_id
    rating_id = interaction.rating  # read but not used by this cell
    # setdefault assigns the next free index only on an id's first occurrence.
    user_id_to_idx.setdefault(user_id, len(user_id_to_idx))
    recipe_id_to_idx.setdefault(recipe_id, len(recipe_id_to_idx))

0.9136329751702902

In [89]:
!pip install dask-distance
# https://dask-distance.readthedocs.io/en/latest/dask_distance.html





In [16]:
def avg_by_user(df):
    """Return the per-user mean and standard deviation of ``rating``.

    Args:
        df: dask DataFrame with ``user_id`` and ``rating`` columns.

    Returns:
        Tuple ``(means, stds)`` of computed results, each keyed by ``user_id``.
    """
    dfg = df.groupby("user_id")
    # One dask.compute call evaluates the shared groupby graph a single time
    # instead of once per statistic.
    means, stds = dask.compute(dfg.rating.mean(), dfg.rating.std())
    return means, stds

# Per-user rating statistics over the full (unsplit) interactions frame.
means, stds = avg_by_user(ddf)
for per_user_stat in (means, stds):
    print(per_user_stat)

user_id
1533          4.747826
1535          4.476117
1634          3.875000
1676          4.583333
1773          4.500000
                ...   
2002204415    4.500000
2002214643    5.000000
2002227190    3.666667
2002254807    3.000000
2002312797    4.500000
Name: rating, Length: 25076, dtype: float64
user_id
1533          0.723581
1535          0.775672
1634          1.524621
1676          0.974308
1773          0.707107
                ...   
2002204415    0.707107
2002214643    0.000000
2002227190    0.577350
2002254807    2.738613
2002312797    0.707107
Name: rating, Length: 25076, dtype: float64
