In [None]:
import dask
import dask.array as da
import dask.dataframe as dd
import sparse
import dask_ml
import time

import numpy as np
import pandas as pd

from dask.distributed import Client
client = Client(memory_limit='6GB')

In [2]:
seed = 25
ddf = dd.read_csv("data/interactions_train.csv")
train, val = dask_ml.model_selection.train_test_split(
    ddf, 
    test_size=0.1, 
    train_size=0.9,
    shuffle=True,
    random_state=seed)
print(len(train), len(val))

628841 70060


In [3]:
rating_avg = train.rating.mean().compute()
rating_std = train.rating.std().compute()
print(rating_avg, rating_std)

4.573866843923981 0.95953785596336


In [5]:
# Baseline 1: Predict global avg rating
val["global_avg"] = rating_avg
dask_ml.metrics.mean_squared_error(val.rating.to_dask_array(), val.global_avg.to_dask_array())

0.9108435745491105

In [19]:
# Baseline 2: Predict using avg rating of each user
user_avgs = train.groupby("user_id").rating.mean().compute()
val["user_avg"] = val.user_id.apply(
    lambda x: user_avgs[x] if x in user_avgs else rating_avg, 
    meta=('user_avg', 'float32')
)
dask_ml.metrics.mean_squared_error(val.rating.to_dask_array(), val.user_avg.to_dask_array())

0.8617040612395044

In [24]:
# Baseline 3: Predict using avg rating of each user, bayesian style
import pandas as pd
bayesian_df = pd.DataFrame()
user_avgs = train.groupby("user_id").rating.mean().compute()
user_counts = train.groupby("user_id").rating.count().compute()
k = 6

train["user_bayesian_avg"] = train.user_id.apply(
    lambda x: (rating_avg * k + user_avgs[x] * user_counts[x]) / (user_counts[x] + k) if x in user_avgs else rating_avg, 
    meta=('user_bayesian_avg', 'float32')
)
val["user_bayesian_avg"] = val.user_id.apply(
    lambda x: (rating_avg * k + user_avgs[x] * user_counts[x]) / (user_counts[x] + k) if x in user_avgs else rating_avg, 
    meta=('user_bayesian_avg', 'float32')
)

train["user_bayesian_avg_delta"] = train.rating - train.user_bayesian_avg
val["user_bayesian_avg_delta"] = val.rating - val.user_bayesian_avg

dask_ml.metrics.mean_squared_error(val.rating.to_dask_array(), val.user_bayesian_avg.to_dask_array())

0.8259685786006747

In [25]:
dask_ml.metrics.mean_squared_error(val.rating.to_dask_array(), val.user_bayesian_avg.to_dask_array())

0.8259685786006747

In [8]:
# Baseline 4: Predict using avg rating of each recipe
recipe_avgs = train.groupby("recipe_id").rating.mean().compute()
val["recipe_avg"] = val.recipe_id.apply(
    lambda x: recipe_avgs[x] if x in recipe_avgs else rating_avg, 
    meta=('recipe_avg', 'float32')
)
dask_ml.metrics.mean_squared_error(val.rating.to_dask_array(), val.recipe_avg.to_dask_array())

1.0679499550909195

In [None]:
# Baseline 5: Predict using avg rating of each recipe, bayesian style
bayesian_df2 = pd.DataFrame()
recipe_avgs = train.groupby("recipe_id").rating.mean().compute()
recipe_counts = train.groupby("recipe_id").rating.count().compute()
k = 20
val["recipe_bayesian_avg"] = val.recipe_id.apply(
    lambda x: (rating_avg * k + recipe_avgs[x] * recipe_counts[x]) / (recipe_counts[x] + k) if x in recipe_avgs else rating_avg, 
    meta=('recipe_bayesian_avg', 'float32')
)
dask_ml.metrics.mean_squared_error(val.rating.to_dask_array(), val.recipe_bayesian_avg.to_dask_array())

In [26]:
# Baseline 6: Predict using both avg of each recipe and each user
recipe_delta_avgs = train.groupby("recipe_id").user_bayesian_avg_delta.mean().compute()
recipe_delta_counts = train.groupby("recipe_id").user_bayesian_avg_delta.count().compute()
k = 20

def dual_avg(row, k):
    user_bayesian_avg = row.user_bayesian_avg 
    if row.recipe_id not in recipe_delta_counts:
        return user_bayesian_avg
    else:
        return user_bayesian_avg #+ (recipe_delta_avgs[row.recipe_id] / (recipe_delta_counts[row.recipe_id] + k))
    
for k in [0, 3, 4, 5, 6, 7, 8, 10, 20, 50]:
    val["dual_bayesian_avg"] = val.apply(
        lambda row: dual_avg(row, k), 
        axis=1,
        meta=('dual_bayesian_avg', 'float32')
    )
    err = dask_ml.metrics.mean_squared_error(val.rating.to_dask_array(), val.dual_bayesian_avg.to_dask_array())
    print(k, err)

KeyboardInterrupt: 

In [7]:
train["personal_rating"] = train.user_id.apply(
    lambda x: (rating_avg * k + user_avgs[x] * user_counts[x]) / (user_counts[x] + k) if x in user_avgs else rating_avg, 
    meta=('personal_rating', 'float32')
)
train["person_normalized_rating"] = train.rating - train.personal_rating

class SimilarityScorer:
    def __init__(self, interactions):
        start = time.time()
        print("Generating indices")
        user_codes, self.user_idx_to_id = pd.factorize(interactions.user_id.compute())
        recipe_codes, self.recipe_idx_to_id = pd.factorize(interactions.recipe_id.compute())
        self.user_id_to_idx = {user: idx for idx, user in enumerate(self.user_idx_to_id)}
        self.recipe_id_to_idx = {recipe: idx for idx, recipe in enumerate(self.recipe_idx_to_id)}
        
        print("Creating sparse matrix", time.time() - start)
        s = sparse.COO(
            [user_codes, recipe_codes],
            interactions.person_normalized_rating.compute(),
            shape=(len(self.user_idx_to_id), len(self.recipe_idx_to_id)),
            fill_value=0
        )
        self.sparse_mat = da.from_array(s, chunks=(5000, 5000))
        print("Generating dot products", time.time() - start)

        dot_product_similarities = (self.sparse_mat @ self.sparse_mat.T).compute()
        print("Raw similarities computed", time.time() - start)
        
        dense_similarities = dot_product_similarities.todense()
        sims_summed = dense_similarities.sum(axis=1) + 1e-20
        self.similarities = dense_similarities / sims_summed.reshape(-1, 1)
        self.sparse_mat = self.sparse_mat.compute()
        print("Similarities normalized!", time.time() - start)

    def predict_topk_for_user(self, user_id, k):
        user_idx = self.user_id_to_idx[user_id]
        similarities_norm = self.similarities_normalized[user_idx]
        recs = self.similarities[user_idx] @ self.sparse_mat
        rec_values = recs.topk(k)
        rec_idxs = recs.argtopk(k)
        recs_ids = [self.recipe_idx_to_id[idx] for idx in rec_idxs]
        return recs_ids, rec_values
    
    def predict_pair(self, user_id, recipe_id):
        if recipe_id not in self.recipe_id_to_idx or user_id not in self.user_id_to_idx:
            return 0
        user_idx = self.user_id_to_idx[user_id]
        recipe_idx = self.recipe_id_to_idx[recipe_id]
        predicted_score = self.similarities[user_idx] @ self.sparse_mat[:, recipe_idx]
        return predicted_score

scorer = SimilarityScorer(train)

Generating indices
Creating sparse matrix 10.480547189712524
Generating dot products 16.988327980041504
Raw similarities computed 23.964248180389404
Similarities normalized! 35.80215930938721


In [19]:
preds = []
ratings = []
i = 0
start = time.time()
for _, row in val.iterrows():
    i += 1
    pred = row.personal_rating + scorer.predict_pair(row.user_id, row.recipe_id)
    preds.append(pred)
    ratings.append(row.rating)
print(time.time() - start)

802.774472951889


In [29]:
from sklearn.metrics import mean_squared_error
personal_ratings = val.personal_rating.compute()
mean_squared_error(ratings, np.array(preds).clip(1, 5)) #)

0.8259685786006747

In [16]:
val["prediction"] = dd.from_array(np.array(preds))
dask_ml.metrics.mean_squared_error(val.rating.to_dask_array(), val.prediction.to_dask_array())

ValueError: Not all divisions are known, can't align partitions. Please use `set_index` to set the index.

In [None]:
start = time.time()
all_predictions = scorer.similarities_normalized[1] @ scorer.sparse_mat[:, 2]


In [None]:
start = time.time()
i = 0
for _, row in val.iterrows():
    i += 1
    score = scorer.predict_pair(row.user_id, row.recipe_id)
    if i % 100 == 0:
        print(i / len(val), time.time() - start, score)

In [91]:
user_id_to_idx = {}
recipe_id_to_idx = {}
for interaction in train:
    user_id = interaction.user_id
    recipe_id = interaction.recipe_id
    rating_id = interaction.rating

0.9136329751702902

In [89]:
!pip install dask-distance
# https://dask-distance.readthedocs.io/en/latest/dask_distance.html





In [16]:
def avg_by_user(df):
    dfg = df.groupby("user_id")
    return dfg.rating.mean().compute(), dfg.rating.std().compute()

means, stds = avg_by_user(ddf)
print(means)
print(stds)

user_id
1533          4.747826
1535          4.476117
1634          3.875000
1676          4.583333
1773          4.500000
                ...   
2002204415    4.500000
2002214643    5.000000
2002227190    3.666667
2002254807    3.000000
2002312797    4.500000
Name: rating, Length: 25076, dtype: float64
user_id
1533          0.723581
1535          0.775672
1634          1.524621
1676          0.974308
1773          0.707107
                ...   
2002204415    0.707107
2002214643    0.000000
2002227190    0.577350
2002254807    2.738613
2002312797    0.707107
Name: rating, Length: 25076, dtype: float64
