In [461]:
# (Re)load the autoreload extension and reload imported modules (such as
# src.helpers) before each cell executes, so edits made to them are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures directly below the cells.
%matplotlib inline

from math import *
from heapq import nlargest
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pickle

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

from surprise import Dataset, Reader, SVD, accuracy, dump
from surprise.model_selection import GridSearchCV, cross_validate, train_test_split

# Import des modules contenant les fonctions utilitaires
import src.helpers as helpers

In [99]:
# Plot settings

# The 'seaborn-whitegrid' style sheet was renamed 'seaborn-v0_8-whitegrid' in
# matplotlib 3.6 and the old name was removed afterwards. Use whichever name
# this installation ships so the cell works on both old and new versions.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break

# Default font sizes for text, axes titles/labels, tick labels and legends.
plt.rc('font', size=14)
plt.rc('axes', titlesize=20)
plt.rc('axes', labelsize=18)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
plt.rc('legend', fontsize=14)

# Default (width, height) tuple intended for figures created in this notebook.
dims_fig = (10,6)

In [100]:
# Root folder of the articles dataset and its sub-folder holding the click logs.
# Both keep their trailing slash because file names are appended by plain
# string concatenation further down.
data_path = 'data/articles/'
clicks_path = f'{data_path}clicks/'

In [101]:
# Article metadata table (category, creation timestamp, publisher, word count).
articles_df = pd.read_csv(f'{data_path}articles_metadata.csv')
articles_df.head()

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count
0,0,0,1513144419000,0,168
1,1,1,1405341936000,0,189
2,2,1,1408667706000,0,250
3,3,1,1408468313000,0,230
4,4,1,1407071171000,0,162


In [280]:
# Load the pickled embedding matrix, then wrap it in a DataFrame whose columns
# are named embedding_0 ... embedding_{d-1} (row position = article id,
# presumably — confirm against the dataset description).
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted source.
articles_emb = pd.read_pickle(f'{data_path}articles_embeddings.pickle')
articles_emb = pd.DataFrame(
    articles_emb,
    columns=[f'embedding_{col}' for col in range(articles_emb.shape[1])],
)
articles_emb.head()

Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_240,embedding_241,embedding_242,embedding_243,embedding_244,embedding_245,embedding_246,embedding_247,embedding_248,embedding_249
0,-0.161183,-0.957233,-0.137944,0.050855,0.830055,0.901365,-0.335148,-0.559561,-0.500603,0.165183,...,0.321248,0.313999,0.636412,0.169179,0.540524,-0.813182,0.28687,-0.231686,0.597416,0.409623
1,-0.523216,-0.974058,0.738608,0.155234,0.626294,0.485297,-0.715657,-0.897996,-0.359747,0.398246,...,-0.487843,0.823124,0.412688,-0.338654,0.320787,0.588643,-0.594137,0.182828,0.39709,-0.834364
2,-0.619619,-0.97296,-0.20736,-0.128861,0.044748,-0.387535,-0.730477,-0.066126,-0.754899,-0.242004,...,0.454756,0.473184,0.377866,-0.863887,-0.383365,0.137721,-0.810877,-0.44758,0.805932,-0.285284
3,-0.740843,-0.975749,0.391698,0.641738,-0.268645,0.191745,-0.825593,-0.710591,-0.040099,-0.110514,...,0.271535,0.03604,0.480029,-0.763173,0.022627,0.565165,-0.910286,-0.537838,0.243541,-0.885329
4,-0.279052,-0.972315,0.685374,0.113056,0.238315,0.271913,-0.568816,0.341194,-0.600554,-0.125644,...,0.238286,0.809268,0.427521,-0.615932,-0.503697,0.61445,-0.91776,-0.424061,0.185484,-0.580292


In [192]:
# Reduce the article embeddings to 70 principal components.
# random_state is fixed because, depending on the data shape and the
# scikit-learn version, PCA may pick the randomized SVD solver; without a seed
# the reduced embeddings (which are saved and reused below) could differ from
# one run to the next.
pca = PCA(n_components=70, random_state=42)
# fit_transform fits the model and projects the data in a single pass.
articles_emb_trans_pca = pca.fit_transform(articles_emb)

In [193]:
# Wrap the PCA output in a DataFrame with columns embedding_0 ... embedding_{k-1}.
articles_emb_trans_pca = pd.DataFrame(
    articles_emb_trans_pca,
    columns=[f'embedding_{col}' for col in range(articles_emb_trans_pca.shape[1])],
)
articles_emb_trans_pca.head()

Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_60,embedding_61,embedding_62,embedding_63,embedding_64,embedding_65,embedding_66,embedding_67,embedding_68,embedding_69
0,-2.176779,-1.316917,-1.029053,0.901911,-1.809555,2.064694,1.221906,0.024451,0.927234,0.669827,...,0.230588,-0.100345,0.048613,0.057928,-0.319927,0.528034,0.11511,-0.420151,0.001385,0.069089
1,-1.735179,0.489897,3.26856,0.087855,1.473051,0.932728,-1.841621,0.881801,-0.207196,-0.816811,...,-0.102564,0.084034,-0.03563,0.419089,0.300881,-0.112764,-0.145098,0.197519,-0.180062,0.071035
2,-0.91269,-2.089337,1.865876,-1.202525,2.530583,0.521989,-0.224339,-1.479935,-0.191847,-1.356812,...,0.104639,0.102848,0.192354,0.219671,0.141724,-0.136361,0.145179,-0.143148,0.065917,0.267713
3,1.096562,0.212963,4.183521,-0.649562,-0.130864,-1.12655,-1.063989,0.662894,0.348151,-1.463897,...,0.180369,-0.028704,-0.371825,-0.105326,0.366992,0.291578,0.348997,-0.040329,-0.053127,-0.297753
4,0.193782,-0.263946,1.896589,-1.83435,1.27035,1.723298,-0.328998,-0.283803,-0.659784,-1.223749,...,0.448453,-0.348766,0.222626,-0.08164,0.405957,-0.121807,0.379097,0.104796,0.152662,-0.140578


In [194]:
# Share of the total variance retained by the fitted PCA components
# (sum of the per-component explained-variance ratios).
sum(pca.explained_variance_ratio_)

0.9773061622691812

In [108]:
# Load the click logs through the project helper (presumably it concatenates
# every file found under clicks_path — see src.helpers) and keep only the
# columns used by the recommenders.
clicks_df = helpers.get_all_clicks_files(clicks_path).loc[
    :, ['user_id', 'session_id', 'session_size', 'click_article_id']
]
clicks_df.head()

Unnamed: 0,user_id,session_id,session_size,click_article_id
0,0,1506825423271737,2,157541
1,0,1506825423271737,2,68866
2,1,1506825426267738,2,235840
3,1,1506825426267738,2,96663
4,2,1506825435299739,2,119592


In [195]:
# Save the artefacts destined for Azure Blob storage
data_azure = 'data_azure'

# makedirs(exist_ok=True) creates the folder (and any missing parent) only when
# needed, replacing the manual "list the directory, then mkdir" check.
azure_dir = os.path.join('data', data_azure)
os.makedirs(azure_dir, exist_ok=True)

# Persist the PCA-reduced embeddings and the trimmed click log.
articles_emb_trans_pca.to_pickle(os.path.join(azure_dir, 'articles_embeddings.pickle'))
clicks_df.to_pickle(os.path.join(azure_dir, 'all_clicks.pickle'))

### Content-Based Recommender model

In [366]:
def contentBasedRecommendArticle(articles, clicks, user_id, n=5):
    """Recommend the unread articles most similar to what a user already read.

    Each unread article is scored by its highest cosine similarity to any of
    the articles clicked by the user; the ``n`` best-scored articles are
    returned, best first.

    Parameters
    ----------
    articles : pandas.DataFrame
        Article embeddings, one row per article, indexed by article id.
    clicks : pandas.DataFrame
        Click log with at least the columns ``user_id`` and
        ``click_article_id``.
    user_id : int
        Identifier of the user to recommend for.
    n : int, default 5
        Maximum number of recommendations.

    Returns
    -------
    list of int or str
        Distinct article ids ordered by decreasing similarity (fewer than
        ``n`` when not enough unread articles exist), or the message
        "L'utilisateur n'a lu aucun article" when the user has no click.
    """
    # De-duplicate: an article clicked several times adds no information here.
    articles_read = clicks.loc[clicks['user_id'] == user_id, 'click_article_id'].unique().tolist()

    if len(articles_read) == 0:
        return "L'utilisateur n'a lu aucun article"

    articles_read_embedding = articles.loc[articles_read]

    # Candidates are every article the user has not read yet.
    candidates = articles.drop(articles_read)

    if n <= 0 or candidates.empty:
        return []

    # Rows: read articles, columns: candidate articles.
    matrix = cosine_similarity(articles_read_embedding, candidates)

    # Collapse to one score per candidate. Picking the global argmax and
    # resetting a single cell (as done previously) could return the same
    # candidate again through another read article, and resetting to 0 allowed
    # re-picks whenever the remaining similarities were negative.
    best_scores = matrix.max(axis=0)

    # Stable sort keeps the original article order among equal scores.
    top_positions = np.argsort(-best_scores, kind='stable')[:min(n, len(best_scores))]

    return [int(candidates.index[pos]) for pos in top_positions]

In [284]:
# Content-based recommendations for user 5 using the full embeddings.
test = contentBasedRecommendArticle(articles=articles_emb, clicks=clicks_df, user_id=5)
print(test)

[62627, 62630, 157015, 224354, 284603]


In [285]:
# Same request with the PCA-reduced embeddings, to compare with the result above.
test = contentBasedRecommendArticle(articles=articles_emb_trans_pca, clicks=clicks_df, user_id=5)
print(test)

[62627, 62630, 157015, 224354, 284603]


### Collaborative Filtering Recommender model

In [199]:
def calculRatingByClick(clicks):
    """Derive an implicit rating for every (user, article) pair from the clicks.

    The rating is the sum of ``session_size`` over the clicks of a user on an
    article, divided by the sum of ``session_size`` over all clicks of that
    user.

    Parameters
    ----------
    clicks : pandas.DataFrame
        Click log with the columns ``user_id``, ``click_article_id`` and
        ``session_size``.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``article_id`` and ``rating``, one row per
        (user, article) pair, sorted by user then article.
    """
    # Summed session sizes per (user, article) pair, keys kept as columns.
    per_pair = clicks.groupby(['user_id', 'click_article_id'], as_index=False)['session_size'].sum()

    # Summed session sizes per user, looked up for each pair row.
    per_user = clicks.groupby('user_id')['session_size'].sum()
    user_totals = per_pair['user_id'].map(per_user)

    return pd.DataFrame({
        'user_id': per_pair['user_id'],
        'article_id': per_pair['click_article_id'],
        'rating': per_pair['session_size'] / user_totals,
    })

In [200]:
# Implicit ratings for every (user, article) pair found in the click log.
ratings = calculRatingByClick(clicks=clicks_df)
ratings.head()

Unnamed: 0,user_id,article_id,rating
0,0,68866,0.125
1,0,87205,0.125
2,0,87224,0.125
3,0,96755,0.125
4,0,157541,0.125


In [116]:
# The ratings computed above are fractions, hence the 0-1 scale.
reader = Reader(rating_scale=(0, 1))

# Tune on a reproducible 10 % sample of the ratings to keep the search affordable.
data = Dataset.load_from_df(ratings.sample(frac=0.1, random_state=42), reader=reader)

# Hyper-parameter values explored for the SVD model.
param_grid = dict(
    n_factors=[20, 50, 100],
    n_epochs=[10, 20, 50],
    lr_all=[0.002, 0.005, 0.01],
    reg_all=[0.02, 0.04, 0.1],
)

# 3-fold cross-validated grid search scored on RMSE and MAE.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

In [117]:
# Best RMSE score, then the combination of parameters that produced it
# (one value per line).
print(gs.best_score['rmse'], gs.best_params['rmse'], sep='\n')

0.11414360806995917
{'n_factors': 20, 'n_epochs': 50, 'lr_all': 0.01, 'reg_all': 0.1}


In [459]:
# Build a training set from ALL ratings (no hold-out).
data = Dataset.load_from_df(df=ratings, reader=reader)
trainset = data.build_full_trainset()

# Take the estimator configured with the best-RMSE parameters and train it;
# fit() returns the estimator itself, so it can be bound in the same statement.
model_SVD = gs.best_estimator['rmse'].fit(trainset)
model_SVD

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2b0c8291d90>

In [462]:
# Save the model
data_model = 'model'
model_filename = "model_svd.pickle"

# makedirs(exist_ok=True) creates the folder (and any missing parent) only when
# needed, replacing the manual "list the directory, then mkdir" check.
model_dir = os.path.join('data', data_model)
os.makedirs(model_dir, exist_ok=True)

# Serialize the fitted SVD model with Surprise's dump helper.
dump.dump(os.path.join(model_dir, model_filename), algo=model_SVD)

In [463]:
# Sanity check: estimated rating of article 158536 for user 5.
model_SVD.predict(uid=5, iid=158536, verbose=True)

user: 5          item: 158536     r_ui = None   est = 0.03   {'was_impossible': False}


Prediction(uid=5, iid=158536, r_ui=None, est=0.029011999356316202, details={'was_impossible': False})

In [464]:
def collaborativeFilteringRecommendArticle(articles, clicks, user_id, n=5, model=None):
    """Recommend the unread articles with the highest predicted rating.

    Parameters
    ----------
    articles : pandas.DataFrame
        Table indexed by article id; only its index is used.
    clicks : pandas.DataFrame
        Click log with at least the columns ``user_id`` and
        ``click_article_id``.
    user_id : int
        Identifier of the user to recommend for.
    n : int, default 5
        Number of recommendations.
    model : object, optional
        Fitted estimator exposing ``predict(uid, iid)`` and returning an object
        with ``iid`` and ``est`` attributes. Defaults to the notebook-level
        ``model_SVD`` so existing calls keep working.

    Returns
    -------
    list
        Up to ``n`` article ids ordered by decreasing estimated rating.
    """
    if model is None:
        # Backward-compatible default: the model fitted earlier in the notebook.
        model = model_SVD

    # A set gives constant-time membership tests; the previous list-based
    # `in` / `remove` filtering rescanned the whole article list per click.
    articles_read = set(clicks.loc[clicks['user_id'] == user_id, 'click_article_id'].tolist())

    results = {}
    for article_id in articles.index:
        if article_id in articles_read:
            continue
        pred = model.predict(user_id, article_id)
        results[pred.iid] = pred.est

    return nlargest(n, results, key=results.get)

In [465]:
# Top-10 collaborative-filtering recommendations for user 5.
results = collaborativeFilteringRecommendArticle(
    articles=articles_emb, clicks=clicks_df, user_id=5, n=10
)
results

[289003, 67185, 50644, 39894, 36162, 352979, 74450, 214753, 74501, 336254]

In [471]:
# Top-10 collaborative-filtering recommendations for user 850.
results = collaborativeFilteringRecommendArticle(
    articles=articles_emb, clicks=clicks_df, user_id=850, n=10
)
results

[74501, 283009, 277107, 50644, 352979, 39894, 36162, 214753, 336254, 186070]