In [58]:
import dask
import dask.array as da
import dask.dataframe as dd
import sparse
import dask_ml

seed = 25

In [59]:
ddf = dd.read_csv("data/interactions_train.csv")
train, val = dask_ml.model_selection.train_test_split(
    ddf, 
    test_size=0.1, 
    train_size=0.9,
    shuffle=True,
    random_state=seed)
print(len(train), len(val))

628841 70060


In [60]:
rating_avg = train.rating.mean().compute()
rating_std = train.rating.std().compute()
print(rating_avg, rating_std)

4.573866843923981 0.95953785596336


In [62]:
# Baseline 1: Predict global avg rating
val["prediction"] = rating_avg
print(samples.head())
dask_ml.metrics.mean_squared_error(val.rating.to_dask_array(), val.prediction.to_dask_array())

   rating  prediction
0     4.0    4.573867
1     4.0    4.573867
2     0.0    4.573867
3     2.0    4.573867
4     3.0    4.573867


0.9108435745491105

In [80]:
# Baseline 2: Predict using avg rating of each user
user_avgs = train.groupby("user_id").rating.mean().compute()
val["prediction"] = val.user_id.apply(
    lambda x: user_avgs[x] if x in user_avgs else rating_avg, 
    meta=('user_id', 'int64')
)
print(val.head())
dask_ml.metrics.mean_squared_error(val.rating.to_dask_array(), val.prediction.to_dask_array())

        user_id  recipe_id        date  rating      u       i  prediction
403988   518302      14471  2009-01-25     4.0  15755   78580    3.750000
217140   226316      35132  2007-05-16     5.0  13499  115502    4.962963
104255   134011     135101  2005-09-18     4.0    135   85305    4.792899
189103   140008      88828  2007-01-24     4.0   1557   19016    4.166667
334179   528468      15301  2008-06-11     5.0   1564  151452    4.243590


0.8617040612395044

In [81]:
# Baseline 3: Predict using avg rating of each recipe
recipe_avgs = train.groupby("recipe_id").rating.mean().compute()
val["prediction"] = val.recipe_id.apply(
    lambda x: recipe_avgs[x] if x in recipe_avgs else rating_avg, 
    meta=('recipe_id', 'int64')
)
print(val.head())
dask_ml.metrics.mean_squared_error(val.rating.to_dask_array(), val.prediction.to_dask_array())

        user_id  recipe_id        date  rating      u       i  prediction
403988   518302      14471  2009-01-25     4.0  15755   78580    4.600000
217140   226316      35132  2007-05-16     5.0  13499  115502    4.541667
104255   134011     135101  2005-09-18     4.0    135   85305    5.000000
189103   140008      88828  2007-01-24     4.0   1557   19016    4.573867
334179   528468      15301  2008-06-11     5.0   1564  151452    4.800000


1.0679499550909195

In [87]:
# Baseline 4: Predict using avg rating of each user, bayesian style
import pandas as pd
bayesian_df = pd.DataFrame()
user_avgs = train.groupby("user_id").rating.mean().compute()
user_countss = train.groupby("user_id").rating.count().compute()
for k in range(20):
    val["prediction"] = val.user_id.apply(
        lambda x: (rating_avg * k + user_avgs[x] * user_counts[x]) / (user_counts[x] + k) if x in user_avgs else rating_avg, 
        meta=('user_id', 'int64')
    )
    err = dask_ml.metrics.mean_squared_error(val.rating.to_dask_array(), val.prediction.to_dask_array())
    bayesian_df = bayesian_df.append({"k": k, "err": err}, ignore_index=True)
print(bayesian_df)

         err     k
0   0.861704   0.0
1   0.836084   1.0
2   0.829757   2.0
3   0.827329   3.0
4   0.826321   4.0
5   0.825972   5.0
6   0.825969   6.0
7   0.826162   7.0
8   0.826473   8.0
9   0.826854   9.0
10  0.827279  10.0
11  0.827729  11.0
12  0.828194  12.0
13  0.828666  13.0
14  0.829140  14.0
15  0.829612  15.0
16  0.830081  16.0
17  0.830544  17.0
18  0.831000  18.0
19  0.831449  19.0


In [16]:
def avg_by_user(df):
    dfg = df.groupby("user_id")
    return dfg.rating.mean().compute(), dfg.rating.std().compute()

means, stds = avg_by_user(ddf)
print(means)
print(stds)

user_id
1533          4.747826
1535          4.476117
1634          3.875000
1676          4.583333
1773          4.500000
                ...   
2002204415    4.500000
2002214643    5.000000
2002227190    3.666667
2002254807    3.000000
2002312797    4.500000
Name: rating, Length: 25076, dtype: float64
user_id
1533          0.723581
1535          0.775672
1634          1.524621
1676          0.974308
1773          0.707107
                ...   
2002204415    0.707107
2002214643    0.000000
2002227190    0.577350
2002254807    2.738613
2002312797    0.707107
Name: rating, Length: 25076, dtype: float64
