# Exploratory Data Analysis and Data Preparation

## Import libraries

In [None]:
import os
import glob
import pickle

import numpy as np
import pandas as pd

import plotly as plt
import plotly.express as px

# void

## Load the data  
Load the interactions file clicks_agg.csv, which is a concatenation of the hourly click files, into 'clicks'.  
Load the articles metadata file into 'metadata'.  
Load the article embeddings into 'embs'.

In [None]:
# Root folder that holds every data file read or written by this notebook.
data_path = './data'
# Sub-folder 'clicks' under the data root (only referenced by the commented-out
# aggregation code in the back-up section).
clicks_dir = os.path.join(data_path, 'clicks')

In [None]:
# Build the paths of the three input files, all located directly under data_path:
# articles metadata, aggregated clicks and pickled article embeddings.
metadata_file, clicks_file, articles_file = (
    os.path.join(data_path, file_name)
    for file_name in (
        'articles_metadata.csv',
        'clicks_agg.csv',
        'articles_embeddings.pickle',
    )
)

In [None]:
# Tabular inputs: user/article interactions and article metadata.
clicks = pd.read_csv(clicks_file)
metadata = pd.read_csv(metadata_file)

# Article embeddings are stored as a pickle.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load an embeddings file that comes from a trusted source.
with open(articles_file, 'rb') as f:
    embs = pickle.load(f)

## Look at the data

In [None]:
# Display the column names of the articles metadata table.
metadata.columns

In [None]:
# Display the first rows of the articles metadata table.
metadata.head()

In [None]:
# Display summary statistics of the articles metadata table.
metadata.describe()

In [None]:
# Display the column names of the interactions table.
clicks.columns

In [None]:
# Display the first rows of the interactions table.
clicks.head()

In [None]:
# Display summary statistics of the interactions table.
clicks.describe()

In [None]:
# Cardinality of the main identifiers found in the interactions.
for id_column in ('user_id', 'session_id', 'click_article_id'):
    print(f"Number of unique {id_column}: {clicks[id_column].nunique()}")

# The article count is taken from the number of entries in the embeddings
# (presumably one entry per article_id — TODO confirm against the metadata).
print(f"Number of unique article_id: {len(embs)}")

In [None]:
# Count, for each value of one identifier, the number of distinct values of another.
# The histogram calls are kept commented out as optional visual checks.
clicks_by_user = clicks.groupby('user_id')

# Number of sessions per user
sessions_per_user = clicks_by_user['session_id'].nunique()
# px.histogram(sessions_per_user, range_x=[0,50], title='Number of sessions per user').show()

# Number of articles per session
articles_per_session = clicks.groupby('session_id')['click_article_id'].nunique()
# px.histogram(articles_per_session, range_x=[0,15], title='Number of articles per session').show()

# Number of users per article
users_per_article = clicks.groupby('click_article_id')['user_id'].nunique()
# px.histogram(users_per_article, range_x=[0,50], title='Number of users per article', nbins=50000).show()

# Number of articles per user
articles_per_user = clicks_by_user['click_article_id'].nunique()
# px.histogram(articles_per_user, range_x=[0,100], title='Number of articles per user', nbins=50000).show()

# Print max / min / mean of each distribution, one line per distribution.
for label, counts in (
    ("Sessions per user", sessions_per_user),
    ("Articles per session", articles_per_session),
    ("User per article", users_per_article),
    ("Article per user", articles_per_user),
):
    print(f"{label} - max:{counts.max()} - min:{counts.min()} - mean:{counts.mean()}")

## Add some helpers data  
In the interactions dataframe named 'clicks':
- create columns to identify the last click of a given session  
- join the column words_count of 'metadata' to 'clicks'

In [None]:
# Create user_session_id and its changes.
# Order the interactions by user, then session, then click time, so that consecutive
# rows can be compared to detect user / session boundaries.
clicks = clicks.sort_values(['user_id', 'session_id', 'click_timestamp'])
clicks = clicks.reset_index(drop=True)

# Composite identifier "<user_id>_<session_id>".
clicks['user_session_id'] = clicks['user_id'].astype(str) + '_' + clicks['session_id'].astype(str)

# True on the first row of each new user / session. diff() is NaN on the very first
# row and NaN != 0 evaluates to True, so that row is flagged as well.
clicks['user_id_change'] = clicks['user_id'].diff() != 0
clicks['session_id_change'] = clicks['session_id'].diff() != 0
# The composite id changes as soon as EITHER of its components changes, hence "|".
# (With "&", a new session of the same user would not be flagged.)
clicks['user_session_id_change'] = clicks['user_id_change'] | clicks['session_id_change']

# Attach words_count to each click by matching click_article_id with article_id.
# An explicit merge is used because DataFrame.join(on=...) matches against the INDEX
# of the other frame, which is only correct when article_id equals the row position.
# A left merge keeps every click, in the same order, and yields a fresh RangeIndex.
clicks = clicks.merge(
    metadata[['article_id', 'words_count']],
    left_on='click_article_id',
    right_on='article_id',
    how='left',
)

## Clean the data  
Some articles have 0 words. We remove the sessions containing such articles from the data, and we check that the impact of this operation on the amount of available data is limited.

In [None]:
# Clean the data: remove every session in which at least one clicked article
# has words_count == 0, and report the effect on the number of distinct ids.

# Cardinalities before the clean-up.
user_id_before, session_id_before, click_articles_id_before = (
    clicks[id_column].nunique()
    for id_column in ('user_id', 'session_id', 'click_article_id')
)

# A session contains a zero-word article exactly when its minimum words_count is 0.
min_words_count_per_session = clicks.groupby('session_id')['words_count'].min()
sessions_to_drop = min_words_count_per_session[min_words_count_per_session.eq(0)].index

# Drop all the rows of those sessions (the remaining rows keep their index labels).
indexes_to_drop = clicks.index[clicks['session_id'].isin(sessions_to_drop)]
clicks = clicks.drop(index=indexes_to_drop)

# Cardinalities before vs. after the clean-up.
print(f"Number of unique user_id: before {user_id_before} => after {clicks['user_id'].nunique()}")
print(f"Number of unique session_id: before {session_id_before} => after {clicks['session_id'].nunique()}")
print(f"Number of unique click_article_id: before {click_articles_id_before} => after {clicks['click_article_id'].nunique()}")

## Compute an implicit rating of each article seen by a user (each interaction)

The proxy we use for the rating is the time spent on each article normalized by the number of words in the article  

1) It is computed as:  
> time spent on the current article = timestamp of the click on the next article - timestamp of the click on the current article  
> normalized time spent on the current article =  time spent on the current article / number of words of the article
  
2) This computation can be done for all articles within a session but the last one because it has no next article.  
In that case the value is set to np.nan.  

3) Due to the very long tail of the resulting distribution, we take its log. The log distribution still has a narrow peak, and we further adjust the rating by compressing both sides of the peak around it.

4) Finally the ratings are normalized on a scale of 0 to 5.

In [None]:
# 1) Compute a proxy for the ratings

# Time spent on an article = timestamp of the next click - timestamp of this click.
# The last row has no next click: it is paired with itself, which gives a delta of 0.
# The division by 1000 presumably converts milliseconds to seconds — TODO confirm
# the unit of click_timestamp.
starts = clicks['click_timestamp'].to_numpy()
stops = np.append(starts[1:], starts[-1:])
clicks['article_deltatime'] = (stops - starts) / 1000.

# A click is the last one of its session when the NEXT row starts a new session,
# i.e. session_id_change shifted up by one row. The final row is always a last click.
# Series.append was removed in pandas 2.0, so the shift is done with NumPy.
last_session_click = np.append(clicks['session_id_change'].to_numpy()[1:], True)
clicks['last_session_click'] = last_session_click

In [None]:
# 2) The time spent on the last click of a session cannot be computed
#    (there is no next click in the session): blank it out with NaN.
idx = clicks['last_session_click'].eq(True)
clicks['article_deltatime'] = clicks['article_deltatime'].mask(idx)

# Score = time spent on the article normalized by its number of words.
clicks['score'] = clicks['article_deltatime'].div(clicks['words_count'])

In [None]:
# 3) Take the base-10 log of the score; the +1 offset maps a score of 0 to a rating of 0.
clicks['rating'] = np.log10(clicks['score'].add(1))

In [None]:
# 3, continued) Reshape the distribution around the threshold th.
# NaN ratings match neither mask below and therefore stay NaN.
th = 0.05

# Ratings above th are divided by (rating/th)**0.8.
is_above_th = clicks['rating'] > th
ratings_above_th = clicks.loc[is_above_th, 'rating']
clicks.loc[is_above_th, 'rating'] = ratings_above_th / (ratings_above_th / th) ** 0.8

# Ratings at or below th are multiplied by (rating/th)**0.8.
is_below_th = clicks['rating'] <= th
ratings_below_th = clicks.loc[is_below_th, 'rating']
clicks.loc[is_below_th, 'rating'] = ratings_below_th * (ratings_below_th / th) ** 0.8

# 4) Rescale so that the largest rating equals 5.
clicks['rating'] = clicks['rating'].div(clicks['rating'].max()).mul(5)

# Display the resulting distribution of ratings.
px.histogram(clicks['rating'])

## Save the data

In [None]:
# Save the full interactions table, including every helper column and the rating.
# The path is built from data_path, like the input paths, instead of hard-coding './data'.
clicks.to_csv(os.path.join(data_path, 'clicks_enhanced.csv'), index=False)

In [None]:
# Save a light version of the interactions: (userID, itemID, rating) only.
# NOTE(review): this rebinds 'clicks' to the reduced table, so earlier cells must be
# re-run before this one can be executed again.
# NOTE(review): the last click of each session has a NaN rating and is written as an
# empty value — confirm that the consumer of this file handles missing ratings.
clicks = clicks[['user_id', 'click_article_id', 'rating']].rename(
    columns={'user_id': 'userID', 'click_article_id': 'itemID'}
)
# The path is built from data_path, like the input paths, instead of hard-coding './data'.
clicks.to_csv(os.path.join(data_path, 'clicks_light.csv'), index=False)

# Back up

In [None]:
# Time spent on an article
# px.histogram(clicks['article_timespan'], title = 'Time spent on an article')

In [None]:
# file_paths = glob.glob(os.path.join(clicks_dir,'*'))
# clicks_agg_file = os.path.join(data_path, 'clicks_agg.csv')

# with open(clicks_agg_file, mode='w') as agg_f:
#     with open(file_paths[0]) as f:
#         agg_f.write(f.read())
#     for file_path in file_paths[1:]:
#         with open(file_path) as f:
#             agg_f.writelines(f.readlines()[1:])