In [3]:
import dask
import dask.array as da
import dask.dataframe as dd
import sparse
import dask_ml
import time

import numpy as np
import pandas as pd

from dask.distributed import Client
client = Client(memory_limit='6GB')

In [4]:
seed = 25
ddf = dd.read_csv("data/interactions_train.csv")
train, val = dask_ml.model_selection.train_test_split(
    ddf, 
    test_size=0.1, 
    train_size=0.9,
    shuffle=True,
    random_state=seed)
print(len(train), len(val))

628841 70060


In [5]:
rating_avg = train.rating.mean().compute()
rating_std = train.rating.std().compute()
print(rating_avg, rating_std)

4.573866843923981 0.95953785596336


In [6]:
# Baseline 1: Predict global avg rating
val["global_avg"] = rating_avg
dask_ml.metrics.mean_squared_error(val.rating.to_dask_array(), val.global_avg.to_dask_array())

0.9108435745491105

In [7]:
# Baseline 2: Predict using avg rating of each user
user_avgs = train.groupby("user_id").rating.mean().compute()
val["user_avg"] = val.user_id.apply(
    lambda x: user_avgs[x] if x in user_avgs else rating_avg, 
    meta=('user_avg', 'float32')
)
dask_ml.metrics.mean_squared_error(val.rating.to_dask_array(), val.user_avg.to_dask_array())

0.8617040612395044

In [15]:
# Baseline 3: Predict using avg rating of each user, bayesian style
import pandas as pd
bayesian_df = pd.DataFrame()
user_avgs = train.groupby("user_id").rating.mean().compute()
user_counts = train.groupby("user_id").rating.count().compute()
k = 6

train["user_bayesian_avg"] = train.user_id.apply(
    lambda x: (rating_avg * k + user_avgs[x] * user_counts[x]) / (user_counts[x] + k) if x in user_avgs else rating_avg, 
    meta=('user_bayesian_avg', 'float32')
)
val["user_bayesian_avg"] = val.user_id.apply(
    lambda x: (rating_avg * k + user_avgs[x] * user_counts[x]) / (user_counts[x] + k) if x in user_avgs else rating_avg, 
    meta=('user_bayesian_avg', 'float32')
)

train["user_bayesian_avg_delta"] = train.rating - train.user_bayesian_avg
val["user_bayesian_avg_delta"] = val.rating - val.user_bayesian_avg

dask_ml.metrics.mean_squared_error(val.rating.to_dask_array(), val.user_bayesian_avg.to_dask_array())

0.8259685786006747

In [18]:
# Doing final baselines in pandas, getting very odd inconsistency in dask, 
# running the function twice will return different results each time...

from sklearn.metrics import mean_squared_error
val_pd = val.compute()
train_pd = train.compute()
mean_squared_error(val_pd.rating, val_pd.user_bayesian_avg)

0.8259685786006747



In [28]:
# Baseline 4: Predict using avg rating of each recipe
recipe_avgs = train_pd.groupby("recipe_id").rating.mean()#.compute()
val_pd["recipe_avg"] = val_pd.recipe_id.apply(
    lambda x: recipe_avgs[x] if x in recipe_avgs else rating_avg, 
    #meta=('recipe_avg', 'float32')
)
mean_squared_error(val_pd.rating, val_pd.recipe_avg)

1.0679499550909195

In [29]:
# Baseline 5: Predict using avg rating of each recipe, bayesian style
bayesian_df2 = pd.DataFrame()
recipe_avgs = train_pd.groupby("recipe_id").rating.mean()#.compute()
recipe_counts = train_pd.groupby("recipe_id").rating.count()#.compute()
k = 20
val_pd["recipe_bayesian_avg"] = val_pd.recipe_id.apply(
    lambda x: (rating_avg * k + recipe_avgs[x] * recipe_counts[x]) / (recipe_counts[x] + k) if x in recipe_avgs else rating_avg, 
    #meta=('recipe_bayesian_avg', 'float32')
)
mean_squared_error(val_pd.rating, val_pd.recipe_bayesian_avg)

0.9026219371353884

In [30]:
# Baseline 6: Predict using both avg of each recipe and each user

recipe_delta_avgs = train_pd.groupby("recipe_id").user_bayesian_avg_delta.mean()#.compute()
recipe_delta_counts = train_pd.groupby("recipe_id").user_bayesian_avg_delta.count()#.compute()

def dual_avg(row, k):
    user_bayesian_avg = row.user_bayesian_avg 
    if row.recipe_id not in recipe_delta_counts:
        return user_bayesian_avg
    else:
        return user_bayesian_avg + (recipe_delta_avgs[row.recipe_id] / (recipe_delta_counts[row.recipe_id] + k))
    
k = 7

train_pd["dual_bayesian_avg"] = train_pd.apply(
    lambda row: dual_avg(row, k), 
    axis=1,
    #meta=('dual_bayesian_avg', 'float32')
)
val_pd["dual_bayesian_avg"] = val_pd.apply(
    lambda row: dual_avg(row, k), 
    axis=1,
    #meta=('dual_bayesian_avg', 'float32')
)

train_pd["dual_bayesian_avg_delta"] = train_pd.rating - train_pd.dual_bayesian_avg
val_pd["dual_bayesian_avg_delta"] = val_pd.rating - val_pd.dual_bayesian_avg
mean_squared_error(val_pd.rating, val_pd.dual_bayesian_avg)

0.8232466157842618

In [31]:
train_pd.to_csv('train_baselines.csv')
val_pd.to_csv('val_baselines.csv')