In [1]:
import numpy as np
import pandas as pd
import pickle
#import pickle5 as pickle
from sklearn.metrics.pairwise import cosine_similarity
from surprise import accuracy, Dataset, SVD
import os
from surprise.model_selection import cross_validate
#print(pd.__version__)

On cherche à proposer 5 articles à un lecteur. Pour cela, on va s'intéresser à 2 approches différentes.

# Content-based filtering

La première méthode consistera, pour un article donné, à chercher les 5 articles qui lui ressemblent le plus.

In [2]:
# Load the pre-computed article embedding matrix from disk.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('articles_embeddings.pickle', 'rb') as embeddings_file:
    articles_embeddings = pickle.load(embeddings_file)

In [3]:
articles_embeddings

array([[-0.16118301, -0.95723313, -0.13794445, ..., -0.231686  ,
         0.5974159 ,  0.40962312],
       [-0.52321565, -0.974058  ,  0.73860806, ...,  0.18282819,
         0.39708954, -0.83436364],
       [-0.61961854, -0.9729604 , -0.20736018, ..., -0.44758022,
         0.8059317 , -0.28528407],
       ...,
       [-0.25139043, -0.9762427 ,  0.58609664, ..., -0.14372464,
         0.06809307, -0.7050104 ],
       [ 0.22434181, -0.92328775, -0.38174152, ...,  0.6871319 ,
        -0.5315117 ,  0.01072566],
       [-0.25713393, -0.9946313 ,  0.9837918 , ...,  0.98387307,
        -0.8381829 , -0.1792827 ]], dtype=float32)

On va chercher les articles les plus proches de l'article donné par un calcul de similarité sur notre embedding.

In [4]:
def content_based(article_id, n=5):
    """Return the indices of the articles most similar to ``article_id``.

    Similarity is the cosine similarity between the row of the module-level
    ``articles_embeddings`` matrix selected by ``article_id`` and every row of
    that matrix.

    Parameters
    ----------
    article_id : int
        Row index of the reference article in ``articles_embeddings``.
    n : int, default 5
        Number of neighbours to return.

    Returns
    -------
    numpy.ndarray
        Integer column vector of shape ``(k, 1)`` with ``k <= n``, ordered by
        increasing similarity (the best match is the last row). The reference
        article itself is never part of the result.

    Raises
    ------
    IndexError
        If ``article_id`` is outside the embedding matrix.
    """
    # Resolve negative indices to their positive row number so the reference
    # article can be filtered out by value below.
    self_index = range(len(articles_embeddings))[article_id]
    query = articles_embeddings[self_index].reshape(1, -1)
    similarity = cosine_similarity(articles_embeddings, query)
    ranking = similarity.argsort(axis=0)
    # Remove the reference article explicitly instead of assuming it sorts
    # last: with duplicated embeddings (ties at similarity 1.0) its position is
    # not guaranteed, and slicing [-6:-1] could drop a real neighbour instead.
    ranking = ranking[ranking[:, 0] != self_index]
    if n <= 0:
        # ranking[-0:] would return everything, so handle this case explicitly.
        return ranking[:0]
    return ranking[-n:]

In [5]:
content_based(200)

array([[ 155],
       [1437],
       [1368],
       [ 158],
       [ 249]], dtype=int64)

On peut envisager d'améliorer l'algorithme en faisant en sorte de ne pas sélectionner d'article déjà lu par le lecteur.

# Collaborative filtering

La deuxième méthode consistera, pour un utilisateur donné, à sélectionner des articles aimés par des utilisateurs proches de lui.

In [6]:
# Display every column but at most 6 rows of any DataFrame shown below.
pd.set_option('display.max_columns', None, 'display.max_rows', 6)
# Load the article metadata table and display it.
df_articles = pd.read_csv('articles_metadata.csv', low_memory=False)
df_articles

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count
0,0,0,1513144419000,0,168
1,1,1,1405341936000,0,189
2,2,1,1408667706000,0,250
...,...,...,...,...,...
364044,364044,460,1457974279000,0,177
364045,364045,460,1515964737000,0,126
364046,364046,460,1505811330000,0,479


In [7]:
# Load the sample click log; the hourly click files are appended to this frame
# further down in the notebook.
df_click = pd.read_csv('clicks_sample.csv', low_memory=False)
df_click

Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,1506825423271737,1506825423000,2,157541,1506826828020,4,3,20,1,20,2
1,0,1506825423271737,1506825423000,2,68866,1506826858020,4,3,20,1,20,2
2,1,1506825426267738,1506825426000,2,235840,1506827017951,4,1,17,1,16,2
...,...,...,...,...,...,...,...,...,...,...,...,...
1880,706,1506828979881443,1506828979000,3,108854,1506829027334,4,3,2,1,25,1
1881,706,1506828979881443,1506828979000,3,96663,1506829095732,4,3,2,1,25,1
1882,706,1506828979881443,1506828979000,3,68866,1506829125732,4,3,2,1,25,1


On va chercher à calculer pour chaque couple (utilisateur ; article) un score d'appréciation. Ici, on va simplement compter le nombre de clics effectués par l'utilisateur sur l'article.

In [8]:
def getListOfFiles(dirName):
    """Recursively list every file located under ``dirName``.

    Parameters
    ----------
    dirName : str
        Directory to walk.

    Returns
    -------
    list of str
        Paths (``dirName`` joined with the relative location) of all files
        found. Entries of each directory are visited in sorted name order and
        the content of a sub-directory is inserted at the position of that
        sub-directory, so the result is the same on every run and platform.
    """
    allFiles = []
    # os.listdir returns entries in arbitrary order; sort them so that callers
    # relying on positions (e.g. dropping the first file) behave reproducibly.
    for entry in sorted(os.listdir(dirName)):
        fullPath = os.path.join(dirName, entry)
        if os.path.isdir(fullPath):
            # extend() appends in place instead of rebuilding the list.
            allFiles.extend(getListOfFiles(fullPath))
        else:
            allFiles.append(fullPath)

    return allFiles

In [9]:
# Collect every file found under the 'clicks' directory.
csv_dir = 'clicks'
csv_list = getListOfFiles(csv_dir)
# Remove (and display) the first entry; the recorded output shows it was
# 'clicks\\clicks_hour_000.csv'.
# NOTE(review): presumably that hour is already covered by clicks_sample.csv
# loaded earlier -- confirm. This step depends on the order in which
# getListOfFiles returns the paths.
csv_list.pop(0)

'clicks\\clicks_hour_000.csv'

In [10]:
# Total number of click rows: the frame already loaded plus every hourly file.
taille = len(df_click) + sum(
    len(pd.read_csv(csv_path, low_memory=False)) for csv_path in csv_list
)

In [11]:
taille

2988181

In [12]:
# Read every hourly file first and concatenate once: calling pd.concat inside
# the loop copies the whole accumulated frame again on each iteration.
df_click = pd.concat(
    [df_click] + [pd.read_csv(csv_path, low_memory=False) for csv_path in csv_list]
)

In [13]:
# The stacked files each brought their own row labels; renumber rows 0..n-1
# (old labels are discarded) and display the result.
df_click = df_click.reset_index(drop=True)
df_click

Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,1506825423271737,1506825423000,2,157541,1506826828020,4,3,20,1,20,2
1,0,1506825423271737,1506825423000,2,68866,1506826858020,4,3,20,1,20,2
2,1,1506825426267738,1506825426000,2,235840,1506827017951,4,1,17,1,16,2
...,...,...,...,...,...,...,...,...,...,...,...,...
2988178,322896,1508211376302329,1508211376000,2,157507,1508211702520,4,1,17,1,25,2
2988179,123718,1508211379189330,1508211379000,2,234481,1508211513583,4,3,2,1,25,2
2988180,123718,1508211379189330,1508211379000,2,233578,1508211543583,4,3,2,1,25,2


In [14]:
# Independent deep copy of the full click log. df_click2 is only displayed in
# the next cell and is not used elsewhere in the visible part of the notebook.
df_click2 = df_click.copy(deep=True)

In [15]:
df_click2

Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,1506825423271737,1506825423000,2,157541,1506826828020,4,3,20,1,20,2
1,0,1506825423271737,1506825423000,2,68866,1506826858020,4,3,20,1,20,2
2,1,1506825426267738,1506825426000,2,235840,1506827017951,4,1,17,1,16,2
...,...,...,...,...,...,...,...,...,...,...,...,...
2988178,322896,1508211376302329,1508211376000,2,157507,1508211702520,4,1,17,1,25,2
2988179,123718,1508211379189330,1508211379000,2,234481,1508211513583,4,3,2,1,25,2
2988180,123718,1508211379189330,1508211379000,2,233578,1508211543583,4,3,2,1,25,2


In [16]:
# Keep only the user and article columns of each click row.
tmp = df_click[['user_id','click_article_id']]

In [17]:
tmp

Unnamed: 0,user_id,click_article_id
0,0,157541
1,0,68866
2,1,235840
...,...,...
2988178,322896,157507
2988179,123718,234481
2988180,123718,233578


In [18]:
# Number of click rows per (user, article) pair. size() counts the rows of each
# group directly; value_counts() on a frame whose every column is a grouping key
# has no remaining column to count and is a roundabout way to get the same
# numbers.
tmp2 = tmp.groupby(['user_id','click_article_id']).size()

In [19]:
tmp2

user_id  click_article_id
0        68866               1
         87205               1
         87224               1
                            ..
322895   289197              1
322896   30760               1
         157507              1
Length: 2950710, dtype: int64

In [20]:
tmp2.unique()

array([ 1,  2,  5,  4,  3,  6,  7,  8, 10, 13,  9, 17, 12, 30, 31, 33, 16],
      dtype=int64)

In [21]:
# Flatten the counts Series into a table with columns
# user_id, click_article_id and count, then display it.
tmp3 = tmp2.to_frame('count').reset_index()
tmp3

Unnamed: 0,user_id,click_article_id,count
0,0,68866,1
1,0,87205,1
2,0,87224,1
...,...,...,...
2950707,322895,289197,1
2950708,322896,30760,1
2950709,322896,157507,1


In [22]:
tmp3['count'].unique()

array([ 1,  2,  5,  4,  3,  6,  7,  8, 10, 13,  9, 17, 12, 30, 31, 33, 16],
      dtype=int64)

In [23]:
# Persist the interaction counts. The row index is written as an unnamed first
# column, which is why the reload cell below drops 'Unnamed: 0'.
tmp3.to_csv('data_user.csv') 

In [24]:
# Reload the saved interactions and discard the index column that to_csv wrote
# as 'Unnamed: 0', then display the table.
dataTotal = (
    pd.read_csv('data_user.csv', low_memory=False)
    .drop(['Unnamed: 0'], axis=1)
)
dataTotal

Unnamed: 0,user_id,click_article_id,count
0,0,68866,1
1,0,87205,1
2,0,87224,1
...,...,...,...
2950707,322895,289197,1
2950708,322896,30760,1
2950709,322896,157507,1


In [25]:
dataTotal['count'].unique()

array([ 1,  2,  5,  4,  3,  6,  7,  8, 10, 13,  9, 17, 12, 30, 31, 33, 16],
      dtype=int64)

On va maintenant entraîner un modèle qui va nous permettre de prédire le score d'un article pour un utilisateur, en s'appuyant sur les autres utilisateurs ayant lu l'article.

In [26]:
from surprise import Reader

# Click counts are used as ratings; the declared scale has to cover every
# observed count (the largest value displayed above is 33).
reader = Reader(rating_scale=(1, 35))
data = Dataset.load_from_df(dataTotal, reader)
algo = SVD()

# Estimate the prediction error with 3-fold cross-validation.
cv_results = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

# cross_validate only ever fits on the training part of a fold. Refit on the
# complete dataset so that the model used for predictions and saved to disk
# below has seen every interaction.
trainset = data.build_full_trainset()
algo.fit(trainset)

# Keep the cross-validation summary as the cell output.
cv_results

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.1377  0.1345  0.1310  0.1344  0.0027  
MAE (testset)     0.0287  0.0284  0.0286  0.0286  0.0001  
Fit time          31.56   32.26   31.48   31.77   0.35    
Test time         11.61   12.50   11.55   11.88   0.43    


{'test_rmse': array([0.13772107, 0.13448587, 0.13102081]),
 'test_mae': array([0.02872184, 0.02844053, 0.0286002 ]),
 'fit_time': (31.564550638198853, 32.26036882400513, 31.48144769668579),
 'test_time': (11.608172178268433, 12.497164487838745, 11.549534559249878)}

In [27]:
#testset = trainset.build_anti_testset()

In [28]:
algo.predict(0, 52, verbose=False)

Prediction(uid=0, iid=52, r_ui=None, est=1.0093733671899021, details={'was_impossible': False})

In [29]:
# Every distinct article id present in the interaction table; read as a global
# by user_list_article.
list_article = dataTotal['click_article_id'].unique()

In [30]:
def user_list_article(user_id):
    """Return the articles that ``user_id`` has no recorded interaction with.

    Reads the module-level ``dataTotal`` (interaction table) and
    ``list_article`` (all known article ids).

    Parameters
    ----------
    user_id : int
        Identifier looked up in the ``user_id`` column of ``dataTotal``.

    Returns
    -------
    list
        Elements of ``list_article``, in their original order, whose id does
        not appear in the user's ``click_article_id`` values. For a user with
        no rows, every article is returned.
    """
    # A set gives constant-time membership tests; testing against a NumPy array
    # scans the whole array for every candidate article.
    seen = set(dataTotal.loc[dataTotal['user_id'] == user_id, 'click_article_id'])
    return [article for article in list_article if article not in seen]

In [31]:
def pred_user(user_id):
    """Predict a score for every article the user has not interacted with.

    The candidate articles come from ``user_list_article(user_id)`` and each
    one is scored with the module-level ``algo.predict``.

    Returns
    -------
    list
        One prediction object per candidate article, in candidate order.
    """
    candidates = user_list_article(user_id)
    return [
        algo.predict(user_id, candidate, verbose=False)
        for candidate in candidates
    ]

In [32]:
from collections import defaultdict
def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's ``n`` best estimates.

    Parameters
    ----------
    predictions : iterable
        Items that unpack into ``(uid, iid, true_r, est, details)``.
    n : int, default 10
        Maximum number of items kept per user.

    Returns
    -------
    collections.defaultdict
        Maps each user id to a list of ``(iid, est)`` pairs sorted by
        decreasing ``est``; ties keep their input order.
    """
    per_user = defaultdict(list)
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        per_user[uid].append((iid, est))

    # Rank every user's candidates from highest to lowest estimate and truncate.
    for uid in list(per_user):
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:n]

    return per_user

In [33]:
def recommandation(user_id,nb):
    """Return the ids of the ``nb`` best-scored unseen articles for a user.

    Scores come from ``pred_user(user_id)`` and are ranked with ``get_top_n``;
    only the article ids are returned, best first.
    """
    ranked = get_top_n(pred_user(user_id), nb)
    return [article_id for article_id, _score in ranked[user_id]]

In [34]:
recommandation(12000,5)

[68851, 237071, 74254, 105941, 96173]

In [None]:
[68851, 105941, 237071, 363925, 73431]

In [115]:
#import pickle5 as pickle
# Serialise the model. The context manager guarantees the file is flushed and
# closed, even if pickling raises; a bare open() left the handle dangling.
with open('ColabFilter.sav', 'wb') as model_file:
    pickle.dump(algo, model_file)

In [72]:
# Build a request object aimed at the HttpTest endpoint.
# NOTE(review): `func` is not imported or defined anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel. Presumably it is
# meant to be `azure.functions` -- confirm and import it, or delete the cell.
req = func.HttpRequest(
            method='GET',
            body={ "name": "very good" },
            url='https://recommandation.azurewebsites.net/api/HttpTest',
            params={'name': 'very good'})

In [179]:
import requests
# POST a JSON payload {'name': '200'} to the deployed HttpTest endpoint; the raw
# reply is inspected in the next cell.
response = requests.post('https://recommandation.azurewebsites.net/api/HttpTest',
                         json={'name': '200'})

In [180]:
response.content

b'1437 376 1184 158 249'

In [120]:
# NOTE(review): `manifold` is imported here but not used in the visible cells.
from sklearn import manifold, decomposition
# A float n_components between 0 and 1 asks PCA to keep as many components as
# needed to explain that fraction of the variance (here 90%); the shape shown
# in the next cell reports 40 retained components.
pca = decomposition.PCA(n_components=0.9)
feat_pca= pca.fit_transform(articles_embeddings)

In [121]:
feat_pca.shape

(364047, 40)

In [122]:
# Save the reduced embeddings. The context manager closes the file reliably
# instead of leaving the handle returned by a bare open() unclosed.
with open('Test.sav', 'wb') as features_file:
    pickle.dump(feat_pca, features_file)

In [173]:
# Query the /recom route of a server on the local machine (127.0.0.1:5000),
# sending '300' as the raw request body.
response = requests.post('http://127.0.0.1:5000/recom',
                         data='300')

In [174]:
response.text

b'1264 1631 988 1116 1529'

In [182]:
# Same query against the deployed /recom route, sending '200' as the raw
# request body.
response = requests.post('https://recoapi.azurewebsites.net/recom',
                         data='200')

In [184]:
response.text

'1437 376 1184 158 249'