In [None]:
import glob
import pandas as pd
from google.colab import files
from google.colab import drive
import numpy as np
from tqdm import tqdm
import re
import random
import math
import warnings
warnings.filterwarnings('ignore')

# Load in data

In [None]:
# Mount Google Drive (Colab only) so the CSV files under /content/drive can be read.
drive.mount('/content/drive')

In [None]:
# Page-view data; later cells use its URL, clientid_hashed and delta_time columns
# (delta_time is renamed to time_on_page).
df_clean_page_data = pd.read_csv("/content/drive/My Drive/Knab/Data/CleanData/time_delta.csv")
# Event data; later cells use its clientid_hashed, visitid, URL and eventlabel columns.
df_clean_event_data = pd.read_csv("/content/drive/My Drive/Knab/Data/CleanData/clean_event_data.csv")

In [None]:
# Article metadata; only URL and READING_TIME are needed for the read / not-read rules.
df_clean_article_data = pd.read_csv("/content/drive/My Drive/Knab/Data/CleanData/clean_article_data.csv")
# Take an explicit copy: READING_TIME is overwritten on this frame in a later cell, and
# writing into a bare column selection of df_clean_article_data raises
# SettingWithCopyWarning (currently hidden by warnings.filterwarnings('ignore')).
df_article_time = df_clean_article_data[['URL', 'READING_TIME']].copy()

In [None]:
# Find the events that signal an article was (nearly) read: scroll depth 75% / 100%
# or the explicit 'read end article' event.
df_event_subset = df_clean_event_data[['clientid_hashed', 'visitid', 'URL', 'eventlabel']]
# .copy() makes the filtered result an independent frame, so adding a column below does
# not write into a slice of df_clean_event_data. Duplicates per (client, visit, URL) are
# resolved in a later cell by keeping the highest confidence.
df_event_subset = df_event_subset[df_event_subset['eventlabel'].isin(['75%','100%', 'read end article'])].copy()
# Initialise as float: the confidences written next (0.5 / 0.8 / 0.9) are floats, and
# assigning them into an int column is an incompatible-dtype setitem that recent pandas
# versions deprecate.
df_event_subset['Confidence_level'] = 0.0

In [None]:
# Assign a confidence to each read-signalling event label; a stronger signal gets a
# higher confidence.
for event_label, event_confidence in {
    'read end article': 0.9,
    '100%': 0.8,
    '75%': 0.5,
}.items():
    df_event_subset.loc[df_event_subset['eventlabel'] == event_label, 'Confidence_level'] = event_confidence

In [None]:
#df_seen_recommendation = df_event_subset[['clientid_hashed', 'visitid', 'URL', 'hitnumber']]
#df_seen_recommendation.to_csv('seen_recommendation.csv', index=False)

In [None]:
# Per (client, visit, URL) keep only the event with the highest confidence, then put the
# surviving rows back in their original index order.
df_event_subset = (
    df_event_subset
    .sort_values(by='Confidence_level', ascending=False)
    .drop_duplicates(subset=['clientid_hashed', 'visitid', 'URL'], keep='first')
    .sort_index()
)

# Find read articles per unique clientid

In [None]:
# Thresholds expressed as a fraction (not a percentage) of an article's READING_TIME.
minread = 0.5 #page views shorter than this fraction of the reading time stay unread (ReadYN = 0)
maxread = 1.25 #page views longer than this fraction are marked read with confidence 1

In [None]:
def transform_ms(input): #Transform all the 'x minuten' to actual milliseconds
  """Convert a reading-time label such as '5 minuten' to milliseconds.

  Every character except digits and '.' is stripped from ``input``; what remains is
  read as a number of minutes.

  Args:
    input: text containing the number of minutes. (The name shadows the builtin but is
      kept so existing keyword callers keep working.)

  Returns:
    int: the reading time in milliseconds.

  Raises:
    ValueError: if no parsable number is left after stripping.
    TypeError: if ``input`` is not a string.
  """
  minutes_text = re.sub("[^0-9.]", "", input)
  # Parse with float() instead of int(): the pattern deliberately keeps '.', so labels
  # like '2.5 minuten' or '5 min.' would make int() raise ValueError.
  return int(round(float(minutes_text) * 60000))

In [None]:
# Convert the scraped 'x minuten' labels to milliseconds. assign() returns a new frame
# instead of writing into df_article_time, which was created as a column selection of
# df_clean_article_data (in-place assignment there raises SettingWithCopyWarning).
# One row per URL is kept so the left join below cannot multiply page-view rows.
df_article_time = (
    df_article_time
    .assign(READING_TIME=df_article_time['READING_TIME'].apply(transform_ms))
    .drop_duplicates(subset='URL')
)
df_clean_page_data = df_clean_page_data.rename(columns={'delta_time': 'time_on_page'})
df_clean_page_data['ReadYN'] = 0  # default: not read until one of the conditions marks it
# Left join keeps every page view; URLs absent from the article data get NaN READING_TIME.
df_clean_page_data = pd.merge(left=df_clean_page_data, right=df_article_time, on='URL', how='left')
client_list = df_clean_page_data['clientid_hashed'].unique()

In [None]:
# Attach the strongest read-event (eventlabel + Confidence_level) to each page view.
# No `on=` is given, so pandas joins on every column name the two frames share --
# presumably clientid_hashed, visitid and URL; TODO confirm against the page-data schema.
df_clean_page_data = pd.merge(df_clean_page_data, df_event_subset, how='left')

In [None]:
# All masks are built up front from columns the assignments below never modify
# (time_on_page, READING_TIME, eventlabel), so they can be named once and reused.
time_spent = df_clean_page_data['time_on_page']
shortest_read = (df_clean_page_data["READING_TIME"]) * minread
longest_read = (df_clean_page_data["READING_TIME"]) * maxread

# time_on_page == -1 looks like a sentinel for "duration unknown" -- TODO confirm.
duration_unknown = time_spent == -1
left_too_early = (time_spent < shortest_read) & (time_spent != -1)
stayed_very_long = time_spent > longest_read
within_read_window = (time_spent >= shortest_read) & (time_spent <= longest_read)
has_read_event = df_clean_page_data['eventlabel'].notnull()

#Condition 1: left before the minimum reading time -> not read
df_clean_page_data.loc[left_too_early, "ReadYN"] = 0

#Condition 2: stayed longer than the maximum reading time -> read, with full confidence
df_clean_page_data.loc[stayed_very_long, "ReadYN"] = 1
df_clean_page_data.loc[stayed_very_long, "Confidence_level"] = 1

#Condition 3: a read event exists and the duration is unknown or inside the window -> read
df_clean_page_data.loc[has_read_event & (duration_unknown | within_read_window), "ReadYN"] = 1

In [None]:
# Show how many page views ended up at each confidence level; rows without a
# confidence (NaN) are left out by value_counts' default settings.
df_clean_page_data['Confidence_level'].value_counts()

# Construct the Read, Clicked and Missing pairs matrices

In [None]:
# Only rows with a READING_TIME are kept: the reading time is scraped per article, so a
# non-null value restricts both frames to article page views.
has_reading_time = df_clean_page_data['READING_TIME'].notnull()

# Read pairs keep their confidence; visitid is dropped under the assumption of one session.
df_read = df_clean_page_data.loc[
    (df_clean_page_data['ReadYN'] == 1) & has_reading_time,
    ['URL', 'clientid_hashed', 'Confidence_level'],
]

# Clicked means opened but not read. assign() adds the flag on a new frame instead of
# writing a column into a filtered slice (which raises SettingWithCopyWarning).
df_clicked = df_clean_page_data.loc[
    (df_clean_page_data['ReadYN'] == 0) & has_reading_time,
    ['URL', 'clientid_hashed'],
].assign(clicked=1)

In [None]:
# Split the client list into ten parts, because the full dataframe can't fit in memory
# split = int(len(client_list)/10)
# splits = np.arange(1,10)*split
# client_list_part_1 = client_list[:splits[0]]
# client_list_part_2 = client_list[splits[0]:splits[1]]
# client_list_part_3 = client_list[splits[1]:splits[2]]
# client_list_part_4 = client_list[splits[2]:splits[3]]
# client_list_part_5 = client_list[splits[3]:splits[4]]
# client_list_part_6 = client_list[splits[4]:splits[5]]
# client_list_part_7 = client_list[splits[5]:splits[6]]
# client_list_part_8 = client_list[splits[6]:splits[7]]
# client_list_part_9 = client_list[splits[7]:splits[8]]
# client_list_part_10 = client_list[splits[8]:]

In [None]:
# Write the read and clicked pairs to Drive (without the index column).
for pairs_frame, csv_path in (
    (df_read, '/content/drive/My Drive/Knab/Data/CleanData/ALS_inputs/read_pairs.csv'),
    (df_clicked, '/content/drive/My Drive/Knab/Data/CleanData/ALS_inputs/clicked_pairs.csv'),
):
    pairs_frame.to_csv(csv_path, index=False)

## Sparsity

In [None]:
# Number of read / clicked rows per client.
# rename_axis + reset_index(name=...) names both columns explicitly. The previous
# rename({'index': ..., 'clientid_hashed': ...}) relied on pre-2.0 pandas naming: from
# pandas 2.0 on, value_counts() names the counts column 'count' and the index after the
# original column, so the client-id column was the one renamed to '*_count' and
# astype(int) was then applied to the hashed ids.
df_read_count = (
    df_read['clientid_hashed']
    .value_counts()
    .rename_axis('clientid_hashed')
    .reset_index(name='read_count')
)
df_read_count['read_count'] = df_read_count['read_count'].astype(int)

df_clicked_count = (
    df_clicked['clientid_hashed']
    .value_counts()
    .rename_axis('clientid_hashed')
    .reset_index(name='clicked_count')
)
df_clicked_count['clicked_count'] = df_clicked_count['clicked_count'].astype(int)

In [None]:
# Restrict to clients with at least five read / clicked rows.
df_read_5 = df_read_count.loc[df_read_count['read_count'].ge(5)]
df_clicked_5 = df_clicked_count.loc[df_clicked_count['clicked_count'].ge(5)]

In [None]:
# NOTE: despite their names, sparsity_read / sparsity_clicked hold the *filled fraction*
# (density) of the client x article matrix; the sparsity itself is reported as 1 - value.
matrix_cells = len(df_clean_page_data['clientid_hashed'].unique()) * len(df_clean_article_data['URL'].unique())

# A matrix cell is one (client, article) pair, so count each pair once. Counting raw rows
# overstates the filled fraction whenever a client opened the same article more than once.
sparsity_read = len(df_read.drop_duplicates(subset=['clientid_hashed', 'URL'])) / matrix_cells
sparsity_clicked = len(df_clicked.drop_duplicates(subset=['clientid_hashed', 'URL'])) / matrix_cells

In [None]:
# Filled fraction of the matrices restricted to clients with at least five rows.
# As for the full matrices, each (client, article) pair is counted once; summing the
# per-client row counts would count repeated views of the same article several times.
article_count = len(df_clean_article_data['URL'].unique())

read_5_pairs = df_read[df_read['clientid_hashed'].isin(df_read_5['clientid_hashed'])].drop_duplicates(subset=['clientid_hashed', 'URL'])
read_5_cells = len(df_read_5['clientid_hashed'].unique()) * article_count
# Guard the empty case: with no qualifying clients the fraction is undefined, not an error.
sparsity_read_5 = len(read_5_pairs) / read_5_cells if read_5_cells else float('nan')

clicked_5_pairs = df_clicked[df_clicked['clientid_hashed'].isin(df_clicked_5['clientid_hashed'])].drop_duplicates(subset=['clientid_hashed', 'URL'])
clicked_5_cells = len(df_clicked_5['clientid_hashed'].unique()) * article_count
sparsity_clicked_5 = len(clicked_5_pairs) / clicked_5_cells if clicked_5_cells else float('nan')

In [None]:
# The sparsity_* variables hold the filled fraction of each matrix, so the sparsity
# percentage reported here is (1 - fraction) * 100.
print(f"Sparsity of read matrix is {(1-sparsity_read)*100}%")
print(f"Sparsity of clicked matrix is {(1-sparsity_clicked)*100}%")

# Same figures, restricted to clients with at least five read / clicked rows.
print(f"Sparsity of read matrix of +5 readers is {(1-sparsity_read_5)*100}%")
print(f"Sparsity of clicked matrix of +5 readers is {(1-sparsity_clicked_5)*100}%")