In [98]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from math import *
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.decomposition import PCA

from surprise import Dataset, Reader, SVD
from surprise.model_selection import GridSearchCV, cross_validate, train_test_split

# Import des modules contenant les fonctions utilitaires
import src.helpers as helpers

In [99]:
# Réglage des graphiques

plt.style.use('seaborn-whitegrid')

plt.rc('font', size=14)
plt.rc('axes', titlesize=20)
plt.rc('axes', labelsize=18)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
plt.rc('legend', fontsize=14)

dims_fig = (10,6)

In [100]:
data_path = 'data/articles/'
clicks_path = data_path + 'clicks/'

In [101]:
articles_df = pd.read_csv(data_path + 'articles_metadata.csv')
articles_df.head()

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count
0,0,0,1513144419000,0,168
1,1,1,1405341936000,0,189
2,2,1,1408667706000,0,250
3,3,1,1408468313000,0,230
4,4,1,1407071171000,0,162


In [102]:
articles_emb = pd.read_pickle(data_path + 'articles_embeddings.pickle')
articles_emb = pd.DataFrame(articles_emb, columns=["embedding_" + str(i) for i in range(articles_emb.shape[1])])
articles_emb.head()

Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_240,embedding_241,embedding_242,embedding_243,embedding_244,embedding_245,embedding_246,embedding_247,embedding_248,embedding_249
0,-0.161183,-0.957233,-0.137944,0.050855,0.830055,0.901365,-0.335148,-0.559561,-0.500603,0.165183,...,0.321248,0.313999,0.636412,0.169179,0.540524,-0.813182,0.28687,-0.231686,0.597416,0.409623
1,-0.523216,-0.974058,0.738608,0.155234,0.626294,0.485297,-0.715657,-0.897996,-0.359747,0.398246,...,-0.487843,0.823124,0.412688,-0.338654,0.320787,0.588643,-0.594137,0.182828,0.39709,-0.834364
2,-0.619619,-0.97296,-0.20736,-0.128861,0.044748,-0.387535,-0.730477,-0.066126,-0.754899,-0.242004,...,0.454756,0.473184,0.377866,-0.863887,-0.383365,0.137721,-0.810877,-0.44758,0.805932,-0.285284
3,-0.740843,-0.975749,0.391698,0.641738,-0.268645,0.191745,-0.825593,-0.710591,-0.040099,-0.110514,...,0.271535,0.03604,0.480029,-0.763173,0.022627,0.565165,-0.910286,-0.537838,0.243541,-0.885329
4,-0.279052,-0.972315,0.685374,0.113056,0.238315,0.271913,-0.568816,0.341194,-0.600554,-0.125644,...,0.238286,0.809268,0.427521,-0.615932,-0.503697,0.61445,-0.91776,-0.424061,0.185484,-0.580292


In [104]:
pca = PCA(n_components=70)
pca.fit(articles_emb)
articles_emb_trans_pca = pca.transform(articles_emb)

In [172]:
articles_emb_trans_pca = pd.DataFrame(articles_emb_trans_pca, columns=["embedding_" + str(i) for i in range(articles_emb_trans_pca.shape[1])])
articles_emb_trans_pca.head()

Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_60,embedding_61,embedding_62,embedding_63,embedding_64,embedding_65,embedding_66,embedding_67,embedding_68,embedding_69
0,-2.176781,-1.316921,-1.029046,0.901909,-1.809563,2.064688,1.221909,0.024435,0.927229,0.669858,...,0.230554,-0.09987,0.049278,0.057237,-0.319571,0.526854,0.112947,-0.418981,-0.00851,0.081836
1,-1.735176,0.489911,3.268563,0.087855,1.473047,0.932732,-1.841619,0.88181,-0.207199,-0.816811,...,-0.102414,0.083213,-0.034634,0.419699,0.30088,-0.111372,-0.143599,0.20389,-0.174365,0.118127
2,-0.912688,-2.089329,1.865883,-1.202522,2.530584,0.521997,-0.224347,-1.479931,-0.191858,-1.356816,...,0.104943,0.103123,0.192473,0.219311,0.141823,-0.13779,0.145357,-0.141263,0.063291,0.2863
3,1.096567,0.212979,4.183517,-0.649569,-0.130862,-1.126548,-1.063988,0.662898,0.348148,-1.463892,...,0.179844,-0.029139,-0.371382,-0.105178,0.365695,0.297339,0.351053,-0.047948,-0.054928,-0.289982
4,0.193784,-0.263938,1.896587,-1.834347,1.270349,1.723305,-0.328998,-0.283795,-0.65979,-1.223755,...,0.448706,-0.348682,0.223293,-0.081996,0.406449,-0.122632,0.380346,0.115006,0.140057,-0.126945


In [106]:
sum(pca.explained_variance_ratio_)

0.9773031020087131

In [108]:
clicks_df = helpers.get_all_clicks_files(clicks_path)
clicks_df = clicks_df[['user_id', 'session_id', 'session_size', 'click_article_id']]
clicks_df.head()

Unnamed: 0,user_id,session_id,session_size,click_article_id
0,0,1506825423271737,2,157541
1,0,1506825423271737,2,68866
2,1,1506825426267738,2,235840
3,1,1506825426267738,2,96663
4,2,1506825435299739,2,119592


In [189]:
# Sauvegarder pour le stockage Azure Blob
data_azure = 'data_azure'

list_dir = os.listdir('data/')

if data_azure not in list_dir:
    os.mkdir('data/' + data_azure)

articles_emb_trans_pca.to_pickle('data/' + data_azure + '/articles_embeddings.pickle')
clicks_df.to_pickle('data/' + data_azure + '/all_clicks.pickle')

### Content-Based Recommender model

In [148]:
def contentBasedRecommendArticle(articles, clicks, user_id, n=5):

    articles_read = clicks[clicks['user_id'] == user_id]['click_article_id'].tolist()

    if len(articles_read) == 0:
        return "L'utilisateur n'a lu aucun article"

    articles_read_embedding = articles.loc[articles_read]

    articles = articles.drop(articles_read)

    matrix = cosine_similarity(articles_read_embedding, articles)

    rec = []

    for i in range(n):
        coord_x = floor(np.argmax(matrix)/matrix.shape[1])
        coord_y = np.argmax(matrix)%matrix.shape[1]

        rec.append(int(coord_y))

        matrix[coord_x][coord_y] = 0
    
    rec.sort()

    return rec

In [170]:
a = [202204, 215599, 284792, 341081, 341683]
str(a)

'[202204, 215599, 284792, 341081, 341683]'

In [153]:
test = contentBasedRecommendArticle(articles_pickle, clicks_df, 50)
print(test)

[202204, 215599, 284792, 341081, 341683]


In [171]:
test = contentBasedRecommendArticle(trans_pca, clicks_df, 5)
print(test)

[62616, 62619, 156992, 224309, 284541]


### Collaborative Filtering Recommender model

In [114]:
def calculRatingByClick(clicks):

    count_user_article_size = (clicks.groupby(['user_id', "click_article_id"]).agg(user_article_size=("session_size", "sum")))
    count_user_total_size = (clicks.groupby(['user_id']).agg(user_total_size=("session_size", "sum")))

    ratings = count_user_article_size.join(count_user_total_size, on="user_id")

    ratings['rating'] = ratings['user_article_size'] / ratings['user_total_size']

    ratings = ratings.reset_index().drop(['user_article_size', 'user_total_size'], axis = 1).rename({'click_article_id': 'article_id'}, axis = 1)

    return ratings

In [115]:
ratings = calculRatingByClick(clicks_df)

ratings.head()

Unnamed: 0,user_id,article_id,rating
0,0,68866,0.125
1,0,87205,0.125
2,0,87224,0.125
3,0,96755,0.125
4,0,157541,0.125


In [116]:
reader = Reader(rating_scale=(0, 1))

data = Dataset.load_from_df(ratings.sample(frac=0.1, random_state=42), reader=reader)

param_grid = {'n_factors': [20, 50, 100], 'n_epochs': [10, 20, 50],
              'lr_all': [0.002, 0.005, 0.01], 'reg_all': [0.02, 0.04, 0.1]}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

In [117]:
# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

0.11414360806995917
{'n_factors': 20, 'n_epochs': 50, 'lr_all': 0.01, 'reg_all': 0.1}


In [118]:
model_SVD = gs.best_estimator['rmse']

model_SVD.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2b0c8291d90>

In [119]:
model_SVD.predict(0, 0, verbose=True)

user: 0          item: 0          r_ui = None   est = 0.11   {'was_impossible': False}


Prediction(uid=0, iid=0, r_ui=None, est=0.10975047856881667, details={'was_impossible': False})