In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook  import tqdm, trange
from tqdm import tqdm
from scipy import stats
import matplotlib as mpl
import ast
from sklearn import cluster

# Import data

In [2]:
news = pd.read_csv('../data/news_thematic_clustering_large_final.csv', index_col=0)

In [3]:
news.head(2)

Unnamed: 0,NewsID,Category,SubCategory,Title,Abstract,URL,Title_entities,Abstract_entities,Source,Text,Embedding,Date,Category_id,cluster_hdbscan,proba,tokens
0,93187,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId...",nytimes.com,zolote ukraine lt ivan molchanets peek parapet...,"[5.403303475759458e-06, 0.23899737000465393, 5...",2019-10-25,0,3,1.0,"['zolote', 'ukraine', 'lt', 'ivan', 'molchanet..."
1,40259,news,newsworld,Chile: Three die in supermarket fire amid prot...,Three people have died in a supermarket fire a...,https://assets.msn.com/labs/mind/AAJ43pw.html,"[{""Label"": ""Chile"", ""Type"": ""G"", ""WikidataId"":...","[{""Label"": ""Santiago"", ""Type"": ""G"", ""WikidataI...",cnn.com,people die supermarket fire angry protest chil...,"[3.7435991544043645e-05, 3.7435991544043645e-0...",2019-10-20,0,6,0.954876,"['people', 'die', 'supermarket', 'fire', 'angr..."


In [5]:
behaviors =  pd.read_csv('../data/ratings_10k.csv')

In [6]:
behaviors.head()

Unnamed: 0,UserID,NewsID,Score,Time
0,504290,106909,0,2019-11-09 00:00:44
1,504290,101469,0,2019-11-09 00:00:44
2,504290,95605,0,2019-11-09 00:00:44
3,504290,96061,0,2019-11-09 00:00:44
4,504290,130031,0,2019-11-09 00:00:44


In [7]:
behaviors = behaviors.merge(news[['NewsID','SubCategory','Title','cluster_hdbscan','proba']], on='NewsID')

In [8]:
behaviors.head()

Unnamed: 0,UserID,NewsID,Score,Time,SubCategory,Title,cluster_hdbscan,proba
0,504290,106909,0,2019-11-09 00:00:44,elections-2020-us,"As Bloomberg's New York Prospered, Inequality ...",10,0.875347
1,219624,106909,0,2019-11-09 03:42:46,elections-2020-us,"As Bloomberg's New York Prospered, Inequality ...",10,0.875347
2,10420,106909,0,2019-11-09 04:38:36,elections-2020-us,"As Bloomberg's New York Prospered, Inequality ...",10,0.875347
3,511303,106909,0,2019-11-09 05:32:00,elections-2020-us,"As Bloomberg's New York Prospered, Inequality ...",10,0.875347
4,184564,106909,0,2019-11-09 05:54:33,elections-2020-us,"As Bloomberg's New York Prospered, Inequality ...",10,0.875347


# User profiling
Compute likelihood of the interest of user u in each category

In [11]:
#Add +1 on cluster_hdbscan to not have -1 for outliers
behaviors['cluster_hdbscan'] = behaviors['cluster_hdbscan']+1

In [12]:
#News that are identified as outliers are not set apart, but as a category as a whole (general category)
behaviors['proba'] = behaviors['proba'].replace(0, 1)

In [13]:
categories = behaviors['cluster_hdbscan'].unique().tolist()
categories.sort()

In [14]:
list_users = behaviors['UserID'].unique().tolist()

In [15]:
categories_distribution = pd.DataFrame(columns=categories, index=list_users)

In [16]:
def likelihood_categories(behaviors_df, list_categories, list_users):
    categories_distribution = pd.DataFrame(columns=list_categories, index=list_users)
    for u in tqdm(list_users):
        user_df = behaviors_df[(behaviors_df['UserID']==u) & (behaviors_df['Score']==1)]
        denominator = sum(user_df['Score']*user_df['proba'])
        for c in list_categories:
            user_df_cat = user_df[user_df['cluster_hdbscan']==c]
            numerator = sum(user_df_cat['Score']*user_df_cat['proba'])
            categories_distribution.loc[u,c] = numerator/denominator
    return categories_distribution

In [17]:
categories_distribution = likelihood_categories(behaviors, categories, list_users)

100%|██████████| 10000/10000 [04:13<00:00, 39.51it/s]


In [18]:
categories_distribution.index = list_users

In [19]:
categories_distribution

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
504290,0.371146,0.0,0.0,0.02651,0.073689,0.0,0.121311,0.362576,0.0,0.018257,0.0,0.0,0.02651
219624,0.338185,0.024156,0.0,0.022688,0.048312,0.0,0.151739,0.286625,0.0,0.058824,0.048312,0.021158,0.0
10420,0.338911,0.0,0.0,0.0,0.217548,0.021182,0.037526,0.042364,0.021182,0.185595,0.081045,0.054647,0.0
511303,0.152604,0.0,0.0,0.021801,0.078364,0.037012,0.057583,0.238153,0.0,0.354836,0.020975,0.038672,0.0
184564,0.276437,0.0,0.0,0.024599,0.07994,0.0,0.089455,0.15472,0.027644,0.273652,0.027644,0.045911,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
292183,0.282792,0.0,0.0,0.0,0.0,0.0,0.054277,0.087007,0.0,0.39324,0.0,0.182684,0.0
646273,0.476536,0.0,0.0,0.0,0.029783,0.0,0.11069,0.174696,0.029783,0.128191,0.0,0.050321,0.0
324828,0.17491,0.0,0.0,0.0,0.260469,0.047388,0.012758,0.048996,0.0,0.34971,0.0,0.105769,0.0
108536,0.378734,0.0,0.022237,0.0,0.313166,0.0,0.075075,0.094684,0.0,0.055484,0.030366,0.030254,0.0


In [20]:
categories_distribution.to_csv('categories_distribution_subprofiles_10k.csv')