In [1]:
from helpers_2 import *

In [2]:
# Load datasets
df_articles, df_clicks = load_dataset()

In [3]:
dataframe = df_clicks.merge(df_articles, left_on='click_article_id', right_on='article_id')

In [4]:
dataframe = dataframe[['user_id', 'article_id', 'category_id']]
dataframe

Unnamed: 0,user_id,article_id,category_id
0,93863,96210,209
1,294036,96210,209
2,77136,96210,209
3,28126,96210,209
4,237725,96210,209
...,...,...,...
2988176,273209,164541,288
2988177,273209,164523,288
2988178,273209,164520,288
2988179,273209,164533,288


In [5]:
series = dataframe.groupby(['user_id', 'category_id']).size()
user_rating_matrix = series.to_frame()
user_rating_matrix = user_rating_matrix.reset_index()
user_rating_matrix.rename(columns = {0:'rate'}, inplace = True)

In [6]:
user_rating_matrix["rate"].value_counts()

rate
1      1378686
2       293327
3        97043
4        43989
5        23658
        ...   
84           1
103          1
156          1
113          1
138          1
Name: count, Length: 127, dtype: int64

In [7]:
reader = Reader(rating_scale=(1,10))
_x = user_rating_matrix.loc[user_rating_matrix.rate > 1]
data = Dataset.load_from_df(_x[['user_id', 'category_id', 'rate']], reader)

print('We have selects', len(_x), 'interactions.')

We have selects 503616 interactions.


In [8]:
trainset, testset = train_test_split(data, test_size=0.25)
print('Test set lenght :', len(testset))
print('Train set lenght :', len(_x) - len(testset))

Test set lenght : 125904
Train set lenght : 377712


In [9]:
from surprise import SVD, accuracy
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x171ca0be0>

In [10]:
predictions = algo.test(testset)
print('Number of predictions in Test set :', len(predictions))

Number of predictions in Test set : 125904


[Prediction(uid=17653, iid=418, r_ui=4.0, est=10, details={'was_impossible': False}),
 Prediction(uid=160768, iid=375, r_ui=3.0, est=10, details={'was_impossible': False}),
 Prediction(uid=972, iid=331, r_ui=7.0, est=10, details={'was_impossible': False}),
 Prediction(uid=177883, iid=323, r_ui=3.0, est=10, details={'was_impossible': False}),
 Prediction(uid=62259, iid=281, r_ui=2.0, est=10, details={'was_impossible': False}),
 Prediction(uid=41305, iid=412, r_ui=2.0, est=10, details={'was_impossible': False}),
 Prediction(uid=9785, iid=339, r_ui=2.0, est=10, details={'was_impossible': False}),
 Prediction(uid=29373, iid=375, r_ui=2.0, est=10, details={'was_impossible': False}),
 Prediction(uid=85170, iid=7, r_ui=46.0, est=10, details={'was_impossible': False}),
 Prediction(uid=16396, iid=281, r_ui=6.0, est=10, details={'was_impossible': False}),
 Prediction(uid=14964, iid=281, r_ui=2.0, est=10, details={'was_impossible': False}),
 Prediction(uid=46404, iid=174, r_ui=2.0, est=10, detail

In [14]:
accuracy.rmse(predictions)

RMSE: 7.5103


7.510318662463389

In [15]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [16]:
top_n = get_top_n(predictions, n=10)
top_n

defaultdict(list,
            {17653: [(418, 10), (209, 10)],
             160768: [(375, 10)],
             972: [(331, 10)],
             177883: [(323, 10)],
             62259: [(281, 10)],
             41305: [(412, 10)],
             9785: [(339, 10),
              (6, 10),
              (327, 10),
              (389, 10),
              (174, 10),
              (281, 10),
              (437, 10)],
             29373: [(375, 10)],
             85170: [(7, 10), (348, 10), (297, 10)],
             16396: [(281, 10), (354, 10)],
             14964: [(281, 10)],
             46404: [(174, 10), (252, 10)],
             37688: [(399, 10)],
             47389: [(375, 10), (6, 10)],
             133552: [(389, 10)],
             54744: [(331, 10),
              (250, 10),
              (353, 10),
              (348, 10),
              (136, 10),
              (354, 10),
              (48, 10),
              (375, 10),
              (118, 10)],
             34139: [(209, 10), (297, 10)],
 