In [1]:
import pandas as pd
import json 
from tqdm import tqdm
import random
import numpy as np

In [2]:
import warnings
warnings.filterwarnings("ignore")

# Re-ranking recommendations

For each sub-profile, 50 recommendations are computed. The final recommendation list contains 10 recommendations.

## Data import

In [3]:
# Baseline ranking for split 0, read from the CentroidVector_1 report folder.
# The preview in the next cell shows the columns: user_id, item_id, score.
results_cv = pd.read_csv('../../baseline/report_baseline_10k_cv_complete/CentroidVector_1/rs_rank_split0.csv')

In [4]:
results_cv

Unnamed: 0,user_id,item_id,score
0,504290,41248,0.957390
1,504290,94453,0.955447
2,504290,15928,0.909462
3,504290,47925,0.902574
4,504290,64914,0.880959
...,...,...,...
445710,375980,82798,0.121524
445711,375980,70432,0.102459
445712,375980,46801,0.078437
445713,375980,2892,0.027459


In [9]:
# Build the list of distinct user ids found in the baseline recommendations
users_list = results_cv['user_id'].unique().tolist()

In [10]:
users_list = [int(i) for i in users_list]

In [11]:
len(users_list)

10000

In [12]:
# Hold-out test partition for split 0 (same report folder as the baseline ranking).
test_set = pd.read_csv('../../baseline/report_baseline_10k_cv_complete/HoldOutPartitioning_test_split0.csv')

In [13]:
test_set

Unnamed: 0,user_id,item_id,score
0,504290,32215,1.0
1,504290,74453,0.0
2,504290,58342,0.0
3,504290,127836,0.0
4,504290,110175,0.0
...,...,...,...
516778,375980,54368,0.0
516779,375980,24735,0.0
516780,375980,17565,0.0
516781,375980,988,0.0


In [14]:
# News table with the thematic-clustering columns; the first CSV column is used as the index.
news = pd.read_csv('../../thematic_clustering/lda_128_large/news_thematic_clustering_large_final.csv', index_col=0)
# Shift every cluster label by +1.
# NOTE(review): presumably so that HDBSCAN's -1 "noise" label becomes category 0 — confirm.
# This line is not idempotent: re-running the cell without re-reading the CSV would shift again
# (here the read happens in the same cell, so running the whole cell is safe).
news['cluster_hdbscan'] = news['cluster_hdbscan']+1

In [24]:
#Get initial interactions of the train set
behaviors = pd.read_csv('../../baseline/report_baseline_10k_cv_complete/HoldOutPartitioning_train_split0.csv')

In [26]:
# Attach to every training interaction the category (shifted cluster label) of its news item.
# `merge` defaults to an inner join, so interactions whose item_id is absent from `news` are dropped.
train = behaviors.merge(news.rename(columns={'NewsID':'item_id','cluster_hdbscan':'category'})[['item_id','category']], on='item_id')

In [27]:
train

Unnamed: 0,user_id,item_id,score,category
0,504290,106909,0.0,11
1,421234,106909,0.0,11
2,421234,106909,0.0,11
3,421234,106909,0.0,11
4,52970,106909,0.0,11
...,...,...,...,...
1535263,401192,34953,1.0,2
1535264,401192,117749,1.0,2
1535265,401192,6885,1.0,4
1535266,596721,92984,1.0,0


In [28]:
categories_distribution_subprofiles = pd.read_csv('../categories_distribution_subprofiles_10k.csv', index_col=0)

In [54]:
categories_distribution_subprofiles.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
504290,0.371146,0.0,0.0,0.02651,0.073689,0.0,0.121311,0.362576,0.0,0.018257,0.0,0.0,0.02651
219624,0.338185,0.024156,0.0,0.022688,0.048312,0.0,0.151739,0.286625,0.0,0.058824,0.048312,0.021158,0.0
10420,0.338911,0.0,0.0,0.0,0.217548,0.021182,0.037526,0.042364,0.021182,0.185595,0.081045,0.054647,0.0


In [30]:
list_categories = categories_distribution_subprofiles.columns.tolist()

In [31]:
list_categories

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']

# Controlled and personalized re-ranking

In [33]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [34]:
from scipy.stats import entropy

## Compute initial entropy

In [35]:
def normalized_entropy(distribution):
    """Return the Shannon entropy of ``distribution`` rescaled to the range [0, 1].

    The base-2 entropy is divided by ``log2(n)``, the maximum entropy reachable
    with ``n`` categories, so a uniform distribution scores 1 and a distribution
    concentrated on a single category scores 0.

    Parameters
    ----------
    distribution : sequence of float
        Category weights. ``scipy.stats.entropy`` normalises them to sum to 1.

    Returns
    -------
    float
        Normalised entropy; ``0.0`` when there are fewer than two categories.
    """
    n_categories = len(distribution)
    if n_categories < 2:
        # log2(1) == 0 would make the ratio 0/0; with a single (or no) category
        # there is no uncertainty to measure, so report zero entropy.
        return 0.0
    return entropy(distribution, base=2)/np.log2(n_categories)

In [36]:
u_values = pd.DataFrame(index=users_list)

In [37]:
# Fill u_values['entropy'] with the normalised entropy of each user's category distribution.
for u in tqdm(users_list):
    # row of category proportions for this user, as a plain list
    distrib_user = categories_distribution_subprofiles.loc[u].values.tolist()
    entropy_value = normalized_entropy(distrib_user)
    # label-based write: the row for `u` already exists because u_values is indexed by users_list
    u_values.loc[u,'entropy'] = entropy_value

  0%|          | 0/10000 [00:00<?, ?it/s]

100%|██████████| 10000/10000 [00:01<00:00, 9906.98it/s]


## Homogenization

In [38]:
def div(x, a, b):
    """Map a value ``x`` to ``(1 - b) * x**a + b``.

    In this notebook ``x`` is a user's normalised entropy and the result is used
    as that user's target entropy. With ``b = 0``, ``a = 1`` leaves ``x``
    unchanged and ``a = 0`` always yields 1.
    """
    powered = x**a
    scale = 1-b
    return scale*powered+b

In [39]:
def homogeneization(distrib, param=0.5):
    """Blend ``distrib`` with the uniform distribution over the same categories.

    Each weight ``p`` becomes ``(1 - param) * p + param / n`` where ``n`` is the
    number of categories: ``param = 0`` returns the input values unchanged and
    ``param = 1`` returns the uniform distribution.

    Returns a new list; the input is not modified.
    """
    n = len(distrib)
    blended = []
    for p in distrib:
        blended.append(((1-param)*p)+(param/n))
    return blended

In [40]:
train

Unnamed: 0,user_id,item_id,score,category
0,504290,106909,0.0,11
1,421234,106909,0.0,11
2,421234,106909,0.0,11
3,421234,106909,0.0,11
4,52970,106909,0.0,11
...,...,...,...,...
1535263,401192,34953,1.0,2
1535264,401192,117749,1.0,2
1535265,401192,6885,1.0,4
1535266,596721,92984,1.0,0


In [42]:
results_cv.head()

Unnamed: 0,user_id,item_id,score
0,504290,41248,0.95739
1,504290,94453,0.955447
2,504290,15928,0.909462
3,504290,47925,0.902574
4,504290,64914,0.880959


In [44]:
def get_results_categories(initial_results):
    """Annotate a recommendation table with each item's category and ``proba``.

    ``initial_results`` must have an ``item_id`` column. It is joined (inner
    merge) with the notebook-level ``news`` frame on the news id, so rows whose
    item is unknown to ``news`` are dropped. In the returned frame the id column
    is named ``NewsID``, ``cluster_hdbscan`` is renamed ``category`` and every
    ``proba`` equal to 0 is replaced by 1. The input frame is left untouched.
    """
    # NOTE(review): `news` is read from the notebook's global scope, not passed in.
    news_columns = news[['NewsID','cluster_hdbscan','proba']]
    renamed = initial_results.rename(columns={'item_id':'NewsID'})
    merged = renamed.merge(news_columns, on='NewsID')
    merged = merged.rename(columns={'cluster_hdbscan':'category'})
    merged['proba'] = merged['proba'].replace(0, 1)
    return merged

In [65]:
results_cv_categories = get_results_categories(results_cv)

In [66]:
# Restore the original `item_id` column name and keep only the columns used by the
# re-ranking functions (this drops `proba`).
results_cv_categories = results_cv_categories.rename(columns={'NewsID':'item_id'})
results_cv_categories = results_cv_categories[['user_id','item_id','score','category']]

In [67]:
results_cv_categories.head()

Unnamed: 0,user_id,item_id,score,category
0,504290,41248,0.95739,6
1,607886,41248,0.152114,6
2,372542,41248,0.976418,6
3,292183,41248,0.271897,6
4,158723,41248,0.879657,6


In [48]:
test_u = results_cv_categories[results_cv_categories['user_id']==504290]

In [51]:
test_u['category'].unique().tolist()

[6, 7, 0, 9, 4, 10, 11, 5]

In [190]:
def rerank_perso_lambda_base(df, entropy_df, list_users, proportions_df, a, b, news, behaviors, k=10, save = False, seed=None):
    """Re-rank recommendations so each user's category mix approaches a personalised target entropy.

    For every user:

    1. the target entropy is ``div(user_entropy, a, b)``;
    2. the user's category proportions are blended with the uniform distribution
       (``homogeneization``) for lambda in 0, 0.05, ..., 1 and the lambda whose
       normalised entropy is closest to the target is kept (ties go to the
       smallest lambda);
    3. category ``c`` receives ``round(proportion_c * k)`` slots, filled with the
       user's best-scored recommendations of that category, or with random
       unseen news of that category (score 0) when the user was recommended none.

    Because slot counts are rounded per category, a user's list can end up
    slightly shorter or longer than ``k``.

    Parameters
    ----------
    df : DataFrame
        Recommendations with columns ``user_id``, ``item_id``, ``score``, ``category``.
    entropy_df : DataFrame
        Indexed by user id, with an ``entropy`` column (the users' current entropy).
    list_users : iterable
        User ids to process.
    proportions_df : DataFrame
        Indexed by user id, one column per category.
        NOTE(review): column position ``c`` is assumed to correspond to category ``c`` — confirm.
    a, b : float
        Parameters forwarded to ``div``.
    news : DataFrame
        News table with ``NewsID`` and ``cluster_hdbscan`` columns.
    behaviors : DataFrame
        Past interactions (``user_id``, ``item_id``); these items are never used as random fill.
    k : int
        Target list length.
    save : bool
        When true, write both results under ``re_ranking_results/ADF/``. The file
        names encode ``a`` and ``k`` only, not ``b``.
    seed : int, optional
        Seed for the random fill. ``None`` keeps using the global ``random`` state.

    Returns
    -------
    (DataFrame, DataFrame)
        The re-ranked recommendations and a copy of ``entropy_df`` extended with
        ``target_entropy`` and ``optimal_lambda`` columns.
    """
    rng = random.Random(seed) if seed is not None else random
    results_entropy = entropy_df.copy()
    output_columns = ['user_id','item_id','score','category']
    # candidate lambda values, identical for every user
    lambda_grid = [round(l, 2) for l in np.arange(0, 1.05, 0.05)]
    # per-category frames are collected here and concatenated once at the end
    frames = []
    for u in tqdm(list_users):
        # items the user already interacted with
        accessed_news = behaviors[behaviors['user_id']==u]['item_id'].tolist()
        # recommendations made to the user
        data = df[df['user_id']==u]
        # categories present in those recommendations
        list_recommended_categories = data['category'].unique().tolist()
        # the user's interest distribution over categories
        proportions_user = proportions_df.loc[u].tolist()
        # entropy of the user's consumption, read from the frame that was passed in
        # (not from a notebook-level global)
        entropy_user = entropy_df.loc[u, 'entropy']
        # target entropy for this user, driven by a and b
        target_entropy = div(entropy_user, a, b)
        results_entropy.loc[u,'target_entropy'] = target_entropy
        # absolute gap between target entropy and the entropy reached with each lambda
        errors_by_lambda = {}
        for lam in lambda_grid:
            candidate_distrib = homogeneization(proportions_user, param=lam)
            errors_by_lambda[lam] = abs(target_entropy - normalized_entropy(candidate_distrib))
        # lambda giving the closest entropy (first minimum, i.e. the smallest lambda on ties)
        optimal_lambda = min(errors_by_lambda, key=errors_by_lambda.get)
        results_entropy.loc[u,'optimal_lambda'] = optimal_lambda
        # constrained distribution obtained with the optimal lambda
        new_proportion = homogeneization(proportions_user, param = optimal_lambda)
        # news the user has not accessed; computed lazily, at most once per user
        unseen_news = None
        for c in range(len(new_proportion)):
            nb_recos = int(round(new_proportion[c]*k, 0))
            if nb_recos <= 0:
                continue
            if c in list_recommended_categories:
                data_sub = data[data['category']==c].sort_values(by='score', ascending=False)[:nb_recos]
                data_sub = data_sub.assign(category=c)
            else:
                if unseen_news is None:
                    unseen_news = news[~news['NewsID'].isin(accessed_news)]
                cat_news = unseen_news[unseen_news['cluster_hdbscan']==c]['NewsID'].tolist()
                # never ask for more items than the category holds (random.sample would raise)
                selected_news = rng.sample(cat_news, k=min(nb_recos, len(cat_news)))
                if not selected_news:
                    continue
                data_sub = pd.DataFrame({'user_id':u, 'item_id':selected_news, 'score':0, 'category':c}, columns=output_columns)
            frames.append(data_sub)
    if frames:
        final_results = pd.concat(frames, ignore_index=True)
    else:
        final_results = pd.DataFrame(columns=output_columns)
    if save:
        path_rank = 're_ranking_results/ADF/re_ranking/div_a_'+str(a).replace('.','')+'_k'+str(k)+'.csv'
        path_entropy = 're_ranking_results/ADF/entropy/entropy_a_'+str(a).replace('.','')+'_k'+str(k)+'.csv'
        final_results.to_csv(path_rank, index=False)
        results_entropy.to_csv(path_entropy)
    return final_results, results_entropy

In [189]:
# Run the personalised re-ranking for a = 0, 0.1, ..., 1 with b = 0 and k = 20, writing
# every result to disk (save=True). Each call processes the full users_list.
# NOTE(review): this cell's execution count (In [189]) is lower than that of the cell
# defining rerank_perso_lambda_base (In [190]); on a fresh kernel the definition must run first.
re_ranking_a_0_20_base, entropy_0_20_base = rerank_perso_lambda_base(results_cv_categories, u_values, users_list, categories_distribution_subprofiles, 0, 0, news, train, k=20, save=True)
re_ranking_a_01_20_base, entropy_01_20_base = rerank_perso_lambda_base(results_cv_categories, u_values, users_list, categories_distribution_subprofiles, 0.1, 0, news, train, k=20, save=True)
re_ranking_a_02_20_base, entropy_02_20_base = rerank_perso_lambda_base(results_cv_categories, u_values, users_list, categories_distribution_subprofiles, 0.2, 0, news, train, k=20, save=True)
re_ranking_a_03_20_base, entropy_03_20_base = rerank_perso_lambda_base(results_cv_categories, u_values, users_list, categories_distribution_subprofiles, 0.3, 0, news, train, k=20, save=True)
re_ranking_a_04_20_base, entropy_04_20_base = rerank_perso_lambda_base(results_cv_categories, u_values, users_list, categories_distribution_subprofiles, 0.4, 0, news, train, k=20, save=True)
re_ranking_a_05_20_base, entropy_05_20_base = rerank_perso_lambda_base(results_cv_categories, u_values, users_list, categories_distribution_subprofiles, 0.5, 0, news, train, k=20, save=True)
re_ranking_a_06_20_base, entropy_06_20_base = rerank_perso_lambda_base(results_cv_categories, u_values, users_list, categories_distribution_subprofiles, 0.6, 0, news, train, k=20, save=True)
re_ranking_a_07_20_base, entropy_07_20_base = rerank_perso_lambda_base(results_cv_categories, u_values, users_list, categories_distribution_subprofiles, 0.7, 0, news, train, k=20, save=True)
re_ranking_a_08_20_base, entropy_08_20_base = rerank_perso_lambda_base(results_cv_categories, u_values, users_list, categories_distribution_subprofiles, 0.8, 0, news, train, k=20, save=True)
re_ranking_a_09_20_base, entropy_09_20_base = rerank_perso_lambda_base(results_cv_categories, u_values, users_list, categories_distribution_subprofiles, 0.9, 0, news, train, k=20, save=True)
re_ranking_a_1_20_base, entropy_1_20_base = rerank_perso_lambda_base(results_cv_categories, u_values, users_list, categories_distribution_subprofiles, 1, 0, news, train, k=20, save=True)

# Non-personalized diversity

In [300]:
def rerank_no_pers(df, entropy_df, list_users, proportions_df, news, behaviors, target_entropy, k=10, save = False, seed=None):
    """Re-rank recommendations so every user's category mix approaches the same target entropy.

    Unlike the personalised variant, ``target_entropy`` is a single value shared
    by all users. For every user:

    1. the user's category proportions are blended with the uniform distribution
       (``homogeneization``) for lambda in 0, 0.05, ..., 1 and the lambda whose
       normalised entropy is closest to ``target_entropy`` is kept (ties go to
       the smallest lambda);
    2. category ``c`` receives ``round(proportion_c * k)`` slots, filled with the
       user's best-scored recommendations of that category, or with random
       unseen news of that category (score 0) when the user was recommended none.

    Because slot counts are rounded per category, a user's list can end up
    slightly shorter or longer than ``k``.

    Parameters
    ----------
    df : DataFrame
        Recommendations with columns ``user_id``, ``item_id``, ``score``, ``category``.
    entropy_df : DataFrame
        Indexed by user id; copied and extended with the per-user results.
    list_users : iterable
        User ids to process.
    proportions_df : DataFrame
        Indexed by user id, one column per category.
        NOTE(review): column position ``c`` is assumed to correspond to category ``c`` — confirm.
    news : DataFrame
        News table with ``NewsID`` and ``cluster_hdbscan`` columns.
    behaviors : DataFrame
        Past interactions (``user_id``, ``item_id``); these items are never used as random fill.
    target_entropy : float
        Normalised entropy every user's distribution should approach.
    k : int
        Target list length.
    save : bool
        When true, write both results under ``re_ranking_results/no_pers/``.
    seed : int, optional
        Seed for the random fill. ``None`` keeps using the global ``random`` state.

    Returns
    -------
    (DataFrame, DataFrame)
        The re-ranked recommendations and a copy of ``entropy_df`` extended with
        ``target_entropy`` and ``optimal_lambda`` columns.
    """
    rng = random.Random(seed) if seed is not None else random
    results_entropy = entropy_df.copy()
    output_columns = ['user_id','item_id','score','category']
    # candidate lambda values, identical for every user
    lambda_grid = [round(l, 2) for l in np.arange(0, 1.05, 0.05)]
    # per-category frames are collected here and concatenated once at the end
    frames = []
    for u in tqdm(list_users):
        # items the user already interacted with
        accessed_news = behaviors[behaviors['user_id']==u]['item_id'].tolist()
        # recommendations made to the user
        data = df[df['user_id']==u]
        # categories present in those recommendations
        list_recommended_categories = data['category'].unique().tolist()
        # the user's interest distribution over categories
        proportions_user = proportions_df.loc[u].tolist()
        # the shared target is recorded for every user
        results_entropy.loc[u,'target_entropy'] = target_entropy
        # absolute gap between target entropy and the entropy reached with each lambda
        errors_by_lambda = {}
        for lam in lambda_grid:
            candidate_distrib = homogeneization(proportions_user, param=lam)
            errors_by_lambda[lam] = abs(target_entropy - normalized_entropy(candidate_distrib))
        # lambda giving the closest entropy (first minimum, i.e. the smallest lambda on ties)
        optimal_lambda = min(errors_by_lambda, key=errors_by_lambda.get)
        results_entropy.loc[u,'optimal_lambda'] = optimal_lambda
        # constrained distribution obtained with the optimal lambda
        new_proportion = homogeneization(proportions_user, param = optimal_lambda)
        # news the user has not accessed; computed lazily, at most once per user
        unseen_news = None
        for c in range(len(new_proportion)):
            nb_recos = int(round(new_proportion[c]*k, 0))
            if nb_recos <= 0:
                continue
            if c in list_recommended_categories:
                data_sub = data[data['category']==c].sort_values(by='score', ascending=False)[:nb_recos]
                data_sub = data_sub.assign(category=c)
            else:
                if unseen_news is None:
                    unseen_news = news[~news['NewsID'].isin(accessed_news)]
                cat_news = unseen_news[unseen_news['cluster_hdbscan']==c]['NewsID'].tolist()
                # never ask for more items than the category holds (random.sample would raise)
                selected_news = rng.sample(cat_news, k=min(nb_recos, len(cat_news)))
                if not selected_news:
                    continue
                data_sub = pd.DataFrame({'user_id':u, 'item_id':selected_news, 'score':0, 'category':c}, columns=output_columns)
            frames.append(data_sub)
    if frames:
        final_results = pd.concat(frames, ignore_index=True)
    else:
        final_results = pd.DataFrame(columns=output_columns)
    if save:
        path_rank = 're_ranking_results/no_pers/re_ranking/div_no_pers_'+str(target_entropy)+'_k'+str(k)+'.csv'
        path_entropy = 're_ranking_results/no_pers/entropy/entropy_no_pers_'+str(target_entropy)+'_k'+str(k)+'.csv'
        final_results.to_csv(path_rank, index=False)
        results_entropy.to_csv(path_entropy)
    return final_results, results_entropy

In [184]:
# re_ranking_a_06_20_global, entropy_06_20_global = rerank_no_pers(results_cv_categories, u_values, users_list, categories_distribution_subprofiles,news, train, 0.6 , k=20, save=True)
# re_ranking_a_07_20_global, entropy_07_20_global = rerank_no_pers(results_cv_categories, u_values, users_list, categories_distribution_subprofiles,news, train, 0.7, k=20, save=True)
# re_ranking_a_08_20_global, entropy_08_20_global = rerank_no_pers(results_cv_categories, u_values, users_list, categories_distribution_subprofiles,news, train, 0.8, k=20, save=True)
# re_ranking_a_09_20_global, entropy_09_20_global = rerank_no_pers(results_cv_categories, u_values, users_list, categories_distribution_subprofiles,news, train, 0.9, k=20, save=True)
# re_ranking_a_1_20_global, entropy_1_20_global = rerank_no_pers(results_cv_categories, u_values, users_list, categories_distribution_subprofiles, news, train, 1, k=20, save=True)

# Greedy re-ranking

In [121]:
from sklearn.metrics.pairwise import cosine_similarity
import math
import ast
import numpy as np

In [122]:
def embeddings_to_df(e):
    """Expand a collection of equal-length vectors into a DataFrame, one row per vector.

    ``e`` must expose ``.tolist()`` (e.g. a pandas Series whose values are
    lists). The result has a default integer index and one column per vector
    component; the index of ``e`` is not carried over.
    """
    rows = e.tolist()
    return pd.DataFrame(rows)

In [123]:
def triangular_matrix(m):
    """Return a copy of DataFrame ``m`` keeping only the cells strictly above the diagonal.

    The diagonal and everything below it are replaced by NaN; ``m`` itself is
    not modified.
    """
    # boolean mask that is True strictly above the main diagonal (k=1 excludes the diagonal)
    upper_mask = np.triu(np.ones(m.shape), k=1).astype(bool)
    return m.where(upper_mask)

In [124]:
def ILD(m):
    """Intra-list diversity: mean of the values strictly above the diagonal of ``m``.

    ``m`` is a DataFrame of pairwise dissimilarities whose rows and columns refer
    to the same items in the same order, so the strict upper triangle holds each
    unordered pair once. NaN cells are ignored.

    Returns ``0.0`` when there is no pair to average (for example a 1x1 matrix)
    instead of dividing zero by zero.
    """
    values = m.to_numpy(dtype=float)
    # True strictly above the main diagonal: one cell per unordered pair of items
    upper_mask = np.triu(np.ones(values.shape, dtype=bool), k=1)
    pair_values = values[upper_mask]
    pair_values = pair_values[~np.isnan(pair_values)]
    if pair_values.size == 0:
        return 0.0
    return pair_values.sum()/pair_values.size

In [127]:
# Load the LDA-encoded news contents.
embeddings_lda_128 = pd.read_json('../../baseline/news_codified_lda_128/contents.json')
# The 'lda_128#0' field is parsed from its string form into a Python literal
# (presumably a list of floats, one per LDA topic — confirm against the file).
embeddings_lda_128['lda_128#0'] = embeddings_lda_128['lda_128#0'].apply(ast.literal_eval)
# One row per news item, one column per vector component.
news_embeddings_lda = embeddings_to_df(embeddings_lda_128['lda_128#0'])
# Index the rows by content id so that similarities can be looked up by item id.
news_embeddings_lda.index = embeddings_lda_128['content_id']

In [128]:
# Pairwise cosine similarity between all news embeddings: a dense (n_news x n_news) matrix,
# so memory grows quadratically with the number of news items.
similarity_matrix = pd.DataFrame(cosine_similarity(news_embeddings_lda))
# Label both axes with the content ids, in the same order as the embedding rows.
similarity_matrix.index = news_embeddings_lda.index.tolist()
similarity_matrix.columns = news_embeddings_lda.index.tolist()

In [129]:
# Dissimilarity between news items, defined as 1 - cosine similarity (same labels on both axes).
diversity_matrix = 1 - similarity_matrix

In [399]:
def greedy_reranking(recos, users_list, diversity_matrix, alpha, k=10, save=False):
    """Greedily re-rank each user's recommendations, trading relevance against list diversity.

    For every user, items are added one at a time. At each step every remaining
    candidate is scored with ``(1 - alpha) * relevance + alpha * ILD`` where
    ``relevance`` is the candidate's original score and ``ILD`` is the intra-list
    diversity of the already selected items plus the candidate (0 for the very
    first pick). The best-scoring candidate is appended, until ``k`` items are
    selected or no candidate is left.

    Parameters
    ----------
    recos : DataFrame
        Candidate recommendations with columns ``user_id``, ``item_id``, ``score``.
        If an item appears several times for a user, its first row is used.
    users_list : iterable
        User ids to process.
    diversity_matrix : DataFrame
        Pairwise dissimilarities, indexed and labelled by item id on both axes.
        Every recommended item must be present, otherwise a ``KeyError`` is raised.
    alpha : float
        Weight of diversity (0 = pure relevance, 1 = pure diversity).
    k : int
        Maximum list length per user.
    save : bool
        When true, write the result under ``re_ranking_results/greedy/``.

    Returns
    -------
    DataFrame
        Columns ``user_id``, ``item_id``, ``score``; rows of each user are in
        selection order.
    """
    output_columns = ['user_id','item_id','score']
    frames = []
    for u in tqdm(users_list):
        recos_user = recos[recos['user_id']==u].drop_duplicates(subset='item_id')
        candidates = recos_user['item_id'].tolist()
        # relevance lookup built once per user instead of filtering the frame per candidate
        relevance = dict(zip(candidates, recos_user['score'].tolist()))
        # restrict the matrix to this user's candidates, same order on both axes
        diversity_matrix_user = diversity_matrix.loc[candidates, candidates]

        selected_items = []
        remaining_items = list(candidates)

        while len(selected_items) < k and remaining_items:
            best_item = None
            best_score = -np.inf
            for candidate in remaining_items:
                if selected_items:
                    trial_list = selected_items + [candidate]
                    # rows and columns share the same item order, so the strict upper
                    # triangle read by ILD holds every unordered pair exactly once
                    div_score = ILD(diversity_matrix_user.loc[trial_list, trial_list])
                else:
                    div_score = 0
                score = ((1-alpha)*relevance[candidate])+(alpha*div_score)
                if score > best_score:
                    best_score = score
                    best_item = candidate

            if best_item is None:
                # no candidate produced a comparable score (e.g. all NaN): stop for this user
                break
            selected_items.append(best_item)
            remaining_items.remove(best_item)

        # keep the selected rows, ordered by selection rank
        recos_user_final = recos_user.set_index('item_id').reindex(selected_items).reset_index()
        frames.append(recos_user_final[output_columns])
    if frames:
        final_results = pd.concat(frames)
    else:
        final_results = pd.DataFrame(columns=output_columns)
    final_results['item_id'] = final_results['item_id'].astype(int)
    if save:
        path = 're_ranking_results/greedy/greedy_lambda_'+str(alpha).replace('.','')+'.csv'
        final_results.to_csv(path, index=False)
    return final_results

In [292]:
# greedy_0 = greedy_reranking(results_cv, users_list, diversity_matrix, alpha=0, k=20, save=True)
# greedy_01 = greedy_reranking(results_cv, users_list, diversity_matrix, alpha=0.1, k=20, save=True)
# greedy_02 = greedy_reranking(results_cv, users_list, diversity_matrix, alpha=0.2, k=20, save=True)
# greedy_03 = greedy_reranking(results_cv, users_list, diversity_matrix, alpha=0.3, k=20, save=True)
# greedy_04 = greedy_reranking(results_cv, users_list, diversity_matrix, alpha=0.4, k=20, save=True)
# greedy_05 = greedy_reranking(results_cv, users_list, diversity_matrix, alpha=0.5, k=20, save=True)
# greedy_06 = greedy_reranking(results_cv, users_list, diversity_matrix, alpha=0.6, k=20, save=True)
# greedy_07 = greedy_reranking(results_cv, users_list, diversity_matrix, alpha=0.7, k=20, save=True)
# greedy_08 = greedy_reranking(results_cv, users_list, diversity_matrix, alpha=0.8, k=20, save=True)
# greedy_09 = greedy_reranking(results_cv, users_list, diversity_matrix, alpha=0.9, k=20, save=True)
# greedy_1 = greedy_reranking(results_cv, users_list[:5], diversity_matrix, alpha=1, k=20, save=False)

100%|██████████| 5/5 [00:23<00:00,  4.63s/it]


# Comparison of greedy and smoothed distributions

In [400]:
from gensim.matutils import hellinger

In [None]:
def calibration_hellinger_target(lambda_values, list_users, users_interest_df):
    """Hellinger distance between each user's interest distribution and its smoothed target.

    For every user in ``list_users`` the interest distribution (row of
    ``users_interest_df``) is smoothed with that user's ``optimal_lambda`` (read
    from ``lambda_values``) through ``homogeneization``; the distance between the
    original and the smoothed distribution is stored.

    Returns a dict mapping user id to that distance.
    """
    distances = {}
    for u in tqdm(list_users):
        interest_user = users_interest_df.loc[u].values.tolist()
        # per-user lambda selected during re-ranking
        user_row = lambda_values.loc[u]
        target_distribution = homogeneization(interest_user, param = user_row['optimal_lambda'])
        distances[u] = hellinger(interest_user, target_distribution).sum()
    return distances

In [None]:
        # NOTE(review): stand-alone copy of the lambda grid search that also appears inside
        # rerank_perso_lambda_base / rerank_no_pers. The cell is indented as if it were inside
        # a function and uses names it does not define (proportions_user, target_entropy,
        # dict_lambda_user), so as written it cannot run on its own — likely a scratch cell.
        for l in np.arange(0, 1.05, 0.05):
            # build the distribution obtained with this value of lambda
            new_distrib = homogeneization(proportions_user, param=round(l, 2))
            # normalised entropy of that distribution
            new_entropy = normalized_entropy(new_distrib)
            # gap between the target entropy and the entropy of the new distribution
            delta = target_entropy - new_entropy
            # store the absolute gap under the (rounded) lambda value
            dict_key = {round(l,2):abs(delta)}
            dict_lambda_user.update(dict_key)

In [None]:
def compare_distrib(recos, users_list, users_interest_df, categories_list=None):
    """Hellinger distance between each user's interests and the smoothed category mix of their recommendations.

    For every user the category distribution of their rows in ``recos`` is
    computed, a smoothing strength lambda is chosen on the grid 0, 0.1, ..., 1,
    the distribution is smoothed with ``homogeneization`` and its Hellinger
    distance to the user's interest distribution is stored.

    Parameters
    ----------
    recos : DataFrame
        Recommendations with ``user_id`` and ``category`` columns.
    users_list : iterable
        User ids to process.
    users_interest_df : DataFrame
        Indexed by user id, one column per category.
    categories_list : sequence, optional
        Categories, in the same order as the columns of ``users_interest_df``.
        Defaults to those column labels. Labels are matched to ``recos['category']``
        after normalising both to text, so ``'6'``, ``6`` and ``6.0`` are the same category.

    Returns
    -------
    dict
        user id -> distance; ``nan`` for users without any recommendation.

    NOTE(review): the lambda search minimises the distance between the
    recommendation distribution and its own smoothed version, which is 0 at
    lambda = 0, so lambda = 0 is always selected. The original call had a stray
    empty argument (``hellinger(distrib_cat_recos, , new_distrib)``); confirm
    which distribution was meant to be compared before relying on this step.
    """
    def _category_key(value):
        # normalise a category label so that 6, 6.0 and '6' all map to '6'
        try:
            as_float = float(value)
        except (TypeError, ValueError):
            return str(value)
        return str(int(as_float)) if as_float.is_integer() else str(value)

    if categories_list is None:
        categories_list = users_interest_df.columns.tolist()
    category_keys = [_category_key(c) for c in categories_list]

    dict_ch_smooth = {}
    for u in users_list:
        recos_user = recos[recos['user_id']==u].reset_index(drop=True)
        interest_user = users_interest_df.loc[u].values.tolist()
        n_recos = len(recos_user)
        if n_recos == 0:
            # no recommendation means no distribution to compare; keep the key, mark it missing
            dict_ch_smooth[u] = np.nan
            continue
        # share of the user's recommendations falling in each category
        counts = recos_user['category'].map(_category_key).value_counts()
        distrib_cat_recos = [counts.get(key, 0)/n_recos for key in category_keys]
        best_ch = 10
        optimal_lambda = 0
        for l in np.arange(0, 1.1, 0.1):
            new_distrib = homogeneization(distrib_cat_recos, param=round(l,1))
            c_h = hellinger(distrib_cat_recos, new_distrib)
            if c_h < best_ch:
                best_ch = c_h
                optimal_lambda = l
        smoothed_distrib_user = homogeneization(distrib_cat_recos, param=round(optimal_lambda, 1))
        dict_ch_smooth[u] = float(hellinger(interest_user, smoothed_distrib_user))
    return dict_ch_smooth
