In [328]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from math import *
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

from surprise import Dataset, Reader, SVD
from surprise.model_selection import GridSearchCV, cross_validate, train_test_split

# Import des modules contenant les fonctions utilitaires
import src.helpers as helpers

In [27]:
# Réglage des graphiques

plt.style.use('seaborn-whitegrid')

plt.rc('font', size=14)
plt.rc('axes', titlesize=20)
plt.rc('axes', labelsize=18)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
plt.rc('legend', fontsize=14)

dims_fig = (10,6)

In [28]:
data_path = 'data/articles/'
clicks_path = data_path + 'clicks/'

In [35]:
articles_df = pd.read_csv(data_path + 'articles_metadata.csv')
articles_df.head()

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count
0,0,0,1513144419000,0,168
1,1,1,1405341936000,0,189
2,2,1,1408667706000,0,250
3,3,1,1408468313000,0,230
4,4,1,1407071171000,0,162


In [146]:
articles_pickle = pd.read_pickle(data_path + 'articles_embeddings.pickle')
articles_pickle = pd.DataFrame(articles_pickle, columns=["embedding_" + str(i) for i in range(articles_pickle.shape[1])])
articles_pickle.head()

Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_240,embedding_241,embedding_242,embedding_243,embedding_244,embedding_245,embedding_246,embedding_247,embedding_248,embedding_249
0,-0.161183,-0.957233,-0.137944,0.050855,0.830055,0.901365,-0.335148,-0.559561,-0.500603,0.165183,...,0.321248,0.313999,0.636412,0.169179,0.540524,-0.813182,0.28687,-0.231686,0.597416,0.409623
1,-0.523216,-0.974058,0.738608,0.155234,0.626294,0.485297,-0.715657,-0.897996,-0.359747,0.398246,...,-0.487843,0.823124,0.412688,-0.338654,0.320787,0.588643,-0.594137,0.182828,0.39709,-0.834364
2,-0.619619,-0.97296,-0.20736,-0.128861,0.044748,-0.387535,-0.730477,-0.066126,-0.754899,-0.242004,...,0.454756,0.473184,0.377866,-0.863887,-0.383365,0.137721,-0.810877,-0.44758,0.805932,-0.285284
3,-0.740843,-0.975749,0.391698,0.641738,-0.268645,0.191745,-0.825593,-0.710591,-0.040099,-0.110514,...,0.271535,0.03604,0.480029,-0.763173,0.022627,0.565165,-0.910286,-0.537838,0.243541,-0.885329
4,-0.279052,-0.972315,0.685374,0.113056,0.238315,0.271913,-0.568816,0.341194,-0.600554,-0.125644,...,0.238286,0.809268,0.427521,-0.615932,-0.503697,0.61445,-0.91776,-0.424061,0.185484,-0.580292


In [135]:
clicks_df = helpers.get_all_clicks_files(clicks_path)
clicks_df = clicks_df[['user_id', 'session_id', 'session_size', 'click_article_id']]
clicks_df.head()

Unnamed: 0,user_id,session_id,session_size,click_article_id
0,0,1506825423271737,2,157541
1,0,1506825423271737,2,68866
2,1,1506825426267738,2,235840
3,1,1506825426267738,2,96663
4,2,1506825435299739,2,119592


In [212]:
clicks_df[clicks_df['user_id'] == 0]

Unnamed: 0,user_id,session_id,session_size,click_article_id
0,0,1506825423271737,2,157541
1,0,1506825423271737,2,68866
7412,0,1507340000920377,2,96755
7413,0,1507340000920377,2,313996
4881,0,1507385847186982,2,160158
4882,0,1507385847186982,2,233470
1811,0,1507514030209212,2,87224
1812,0,1507514030209212,2,87205


### Content-Based Recommender model

In [206]:
def contentBasedRecommendArticle(articles, clicks, user_id, n=5):

    articles_temp = articles

    articles_read = clicks_df[clicks_df['user_id'] == user_id]['click_article_id'].tolist()

    if len(articles_read) == 0:
        return "L'utilisateur n'a lu aucun article"

    articles_read_embedding = articles_temp.loc[articles_read]
    articles_temp = articles_temp.drop(articles_read)

    matrix = cosine_similarity(articles_read_embedding, articles_temp)

    rec = []

    for i in range(n):
        coord_x = floor(np.argmax(matrix)/matrix.shape[1])
        coord_y = np.argmax(matrix)%matrix.shape[1]

        rec.append(coord_y)

        matrix[coord_x][coord_y] = 0

    return rec

In [209]:
test = contentBasedRecommendArticle(articles_pickle, clicks_df, 95)
print(test)

[284843, 96076, 118494, 96725, 96428]


### Collaborative Filtering Recommender model

In [325]:
def calculRatingByClick(clicks):

    count_user_article_size = (clicks.groupby(['user_id', "click_article_id"]).agg(user_article_size=("session_size", "sum")))
    count_user_total_size = (clicks.groupby(['user_id']).agg(user_total_size=("session_size", "sum")))

    ratings = count_user_article_size.join(count_user_total_size, on="user_id")

    ratings['rating'] = ratings['user_article_size'] / ratings['user_total_size']

    ratings = ratings.reset_index().drop(['user_article_size', 'user_total_size'], axis = 1).rename({'click_article_id': 'article_id'}, axis = 1)

    return ratings

In [327]:
ratings = calculRatingByClick(clicks_df)

ratings.head()

Unnamed: 0,user_id,article_id,rating
0,0,68866,0.125
1,0,87205,0.125
2,0,87224,0.125
3,0,96755,0.125
4,0,157541,0.125


In [374]:
reader = Reader(rating_scale=(0, 1))

data = Dataset.load_from_df(ratings, reader=reader)

param_grid = {'n_factors': [20, 50, 100], 'n_epochs': [10, 20, 50],
              'lr_all': [0.002, 0.005, 0.01], 'reg_all': [0.02, 0.04, 0.1]}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

In [352]:
# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

0.1443827065437471
{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.6}


In [353]:
model_SVD = gs.best_estimator['rmse']

model_SVD.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2206e137640>

In [373]:
model_SVD.predict(0, 0, verbose=True)

user: 0          item: 0          r_ui = None   est = 0.10   {'was_impossible': False}


Prediction(uid=0, iid=0, r_ui=None, est=0.10348339041735524, details={'was_impossible': False})