In [0]:
import glob
import pandas as pd
from google.colab import files
from google.colab import drive
import numpy as np
from tqdm import tqdm
import re
import random
import math
import warnings
warnings.filterwarnings('ignore')

#Load in data

In [83]:
# Mount Google Drive (Colab only) so the CSV files under /content/drive can be read and written.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Load the cleaned page-view data (its `delta_time` column is renamed to `time_on_page` further down)
# and the cleaned event data (provides `eventlabel` per client / visit / URL).
df_clean_page_data = pd.read_csv("/content/drive/My Drive/Knab/Data/CleanData/time_delta.csv")
df_clean_event_data = pd.read_csv("/content/drive/My Drive/Knab/Data/CleanData/clean_event_data.csv")

In [0]:
# Load the article metadata and keep only the columns needed to judge reading time.
df_clean_article_data = pd.read_csv("/content/drive/My Drive/Knab/Data/CleanData/clean_article_data.csv")
# Take an explicit copy: READING_TIME is overwritten on df_article_time later on, and assigning
# into a column slice of df_clean_article_data is a chained assignment (the warning is only
# hidden because warnings are silenced globally in this notebook).
df_article_time = df_clean_article_data[['URL', 'READING_TIME']].copy()

In [0]:
# Keep only the events that are used as reading signals, with the identifying columns,
# and start every row at confidence 0 (the real levels are filled in afterwards).
reading_labels = ['75%', '100%', 'read end article']
is_reading_event = df_clean_event_data['eventlabel'].isin(reading_labels)
df_event_subset = (
    df_clean_event_data
    .loc[is_reading_event, ['clientid_hashed', 'visitid', 'URL', 'eventlabel']]
    .assign(Confidence_level=0)
)

In [0]:
# Confidence assigned to each reading-signal event label
confidence_by_label = {
    'read end article': 0.9,
    '100%': 0.8,
    '75%': 0.5,
}
for label, confidence in confidence_by_label.items():
    df_event_subset.loc[df_event_subset['eventlabel'] == label, 'Confidence_level'] = confidence

In [0]:
#df_seen_recommendation = df_event_subset[['clientid_hashed', 'visitid', 'URL', 'hitnumber']]
#df_seen_recommendation.to_csv('seen_recommendation.csv', index=False)

In [0]:
# Per (client, visit, URL) keep only the event with the highest confidence,
# then restore the original row order.
events_by_confidence = df_event_subset.sort_values('Confidence_level', ascending=False)
strongest_events = events_by_confidence.drop_duplicates(
    subset=['clientid_hashed', 'visitid', 'URL'],
    keep='first',
)
df_event_subset = strongest_events.sort_index()

#Find read articles per unique clientid

In [0]:
# Thresholds are multipliers of an article's estimated reading time (not percentages):
# they are multiplied with READING_TIME and compared against time_on_page.
minread = 0.5 #lower bound: 0.5 = 50% of the article reading time
maxread = 1.25 #upper bound: 1.25 = 125% of the article reading time

In [0]:
def transform_ms(input): #Transform all the 'x minuten' to actual milliseconds
  """Convert a reading-time label such as '5 minuten' to milliseconds.

  Everything except digits and '.' is stripped, and the remainder is read as a
  number of minutes. Fractional values ('2.5 minuten') and a trailing dot
  ('5 min.') are accepted; previously these raised because the cleaned text
  was passed straight to int().

  Parameters:
    input: str holding the number of minutes plus arbitrary surrounding text.

  Returns:
    int: the duration in milliseconds (rounded to the nearest millisecond).

  Raises:
    ValueError: if the text contains no parseable number.
    TypeError: if input is not a string (e.g. a missing value).
  """
  minutes = float(re.sub("[^0-9.]", "", input)) #keep digits and the decimal point only
  return int(round(minutes * 60000)) #60000 ms per minute

In [0]:
# Convert the required reading time to milliseconds (MS transformation)
reading_time_ms = df_article_time['READING_TIME'].apply(transform_ms)
df_article_time = df_article_time.assign(READING_TIME=reading_time_ms)

# Rename the duration column, start every page view as "not read", and attach the
# required reading time per URL (left join: page views without article data are kept).
df_clean_page_data = (
    df_clean_page_data
    .rename(columns={'delta_time': 'time_on_page'})
    .assign(ReadYN=0)
    .merge(df_article_time, on='URL', how='left')
)

# All distinct clients seen in the page data
client_list = df_clean_page_data['clientid_hashed'].unique()

In [0]:
# Attach eventlabel / Confidence_level from the deduplicated reading events to the page views.
# No `on=` is given, so pandas joins on every column name the two frames have in common.
# NOTE(review): presumably that is clientid_hashed / visitid / URL -- confirm against the
# columns of time_delta.csv; spelling the keys out would make this robust to schema changes.
df_clean_page_data = df_clean_page_data.merge(df_event_subset, how='left')

In [0]:
# Shared building blocks for the three conditions
time_on_page = df_clean_page_data['time_on_page']
lower_bound = df_clean_page_data["READING_TIME"] * minread
upper_bound = df_clean_page_data["READING_TIME"] * maxread
has_read_event = df_clean_page_data['eventlabel'].notnull()
# -1 is treated as a special value by the conditions below
# (NOTE(review): looks like a "duration unknown" sentinel -- confirm)
is_minus_one = time_on_page == -1

#Condition 1: shorter than the lower bound (and not the -1 value) -> not read
too_short = (time_on_page < lower_bound) & ~is_minus_one
df_clean_page_data.loc[too_short, "ReadYN"] = 0

#Condition 2: longer than the upper bound -> read, with confidence 1
beyond_upper = time_on_page > upper_bound
df_clean_page_data.loc[beyond_upper, "ReadYN"] = 1
df_clean_page_data.loc[beyond_upper, "Confidence_level"] = 1

#Condition 3: a reading event exists and the time is either -1 or within [lower, upper] -> read
within_bounds = (time_on_page >= lower_bound) & (time_on_page <= upper_bound)
df_clean_page_data.loc[has_read_event & (is_minus_one | within_bounds), "ReadYN"] = 1

In [0]:
# Distribution of the confidence levels after the conditions were applied
# (value_counts leaves out rows where Confidence_level is missing).
df_clean_page_data['Confidence_level'].value_counts()

0.9    710883
0.5    154122
0.8     94747
1.0     80893
Name: Confidence_level, dtype: int64

#Construct the Read, Clicked and Missing pairs matrices

In [0]:
# Only rows with a (scraped) reading time are articles, so restrict both lists to those.
# "Clicked" means the page was opened but not classified as read.
has_reading_time = df_clean_page_data['READING_TIME'].notnull()
classified_read = df_clean_page_data['ReadYN'] == 1
classified_unread = df_clean_page_data['ReadYN'] == 0

# visitid is dropped as well, under the assumption of 1 session
df_read = df_clean_page_data.loc[
    classified_read & has_reading_time,
    ['URL', 'clientid_hashed', 'Confidence_level'],
]
df_clicked = df_clean_page_data.loc[
    classified_unread & has_reading_time,
    ['URL', 'clientid_hashed'],
].assign(clicked=1)

In [0]:
# Split the client list into ten parts; the full dataframe can't fit in memory
# split = int(len(client_list)/10)
# splits = np.arange(1,10)*split
# client_list_part_1 = client_list[:splits[0]]
# client_list_part_2 = client_list[splits[0]:splits[1]]
# client_list_part_3 = client_list[splits[1]:splits[2]]
# client_list_part_4 = client_list[splits[2]:splits[3]]
# client_list_part_5 = client_list[splits[3]:splits[4]]
# client_list_part_6 = client_list[splits[4]:splits[5]]
# client_list_part_7 = client_list[splits[5]:splits[6]]
# client_list_part_8 = client_list[splits[6]:splits[7]]
# client_list_part_9 = client_list[splits[7]:splits[8]]
# client_list_part_10 = client_list[splits[8]:]

In [0]:
# Save the long-format read and clicked (URL, client) lists into the ALS_inputs folder.
df_read.to_csv('/content/drive/My Drive/Knab/Data/CleanData/ALS_inputs/read_pairs.csv', index=False)
df_clicked.to_csv('/content/drive/My Drive/Knab/Data/CleanData/ALS_inputs/clicked_pairs.csv', index=False)

In [0]:
# Number of read / clicked rows per client, most active clients first.
# rename_axis + reset_index(name=...) names both columns explicitly. The previous
# rename({'index': ..., 'clientid_hashed': ...}) relied on the column names that
# value_counts().reset_index() produced before pandas 2.0; on pandas >= 2.0 it labels the
# id column as the count column and the int cast then fails.
df_read_count = (
    df_read['clientid_hashed']
    .value_counts()
    .rename_axis('clientid_hashed')
    .reset_index(name='read_count')
)
df_read_count['read_count'] = df_read_count['read_count'].astype(int)

df_clicked_count = (
    df_clicked['clientid_hashed']
    .value_counts()
    .rename_axis('clientid_hashed')
    .reset_index(name='clicked_count')
)
df_clicked_count['clicked_count'] = df_clicked_count['clicked_count'].astype(int)

In [0]:
# Clients with at least five rows in the read list / in the clicked list
df_read_5 = df_read_count.loc[df_read_count['read_count'].ge(5)]
df_clicked_5 = df_clicked_count.loc[df_clicked_count['clicked_count'].ge(5)]

In [0]:
# Size of the full client x article grid
n_clients = len(df_clean_page_data['clientid_hashed'].unique())
n_articles = len(df_clean_article_data['URL'].unique())
n_cells = n_clients * n_articles

# Filled fraction of the grid (the printed sparsity is 1 minus these values)
sparsity_read = len(df_read) / n_cells
sparsity_clicked = len(df_clicked) / n_cells

In [0]:
# Same ratio, restricted to the clients with at least five reads / clicks
article_total = len(df_clean_article_data['URL'].unique())

readers_5_total = len(df_read_5['clientid_hashed'].unique())
reads_5_total = df_read_5['read_count'].values.sum()
sparsity_read_5 = reads_5_total / (readers_5_total * article_total)

clickers_5_total = len(df_clicked_5['clientid_hashed'].unique())
clicks_5_total = df_clicked_5['clicked_count'].values.sum()
sparsity_clicked_5 = clicks_5_total / (clickers_5_total * article_total)

In [0]:
# The sparsity_* variables hold the filled fraction of each matrix,
# so the sparsity percentage is reported as (1 - value) * 100.
print(f"Sparsity of read matrix is {(1-sparsity_read)*100}%")
print(f"Sparsity of clicked matrix is {(1-sparsity_clicked)*100}%")

# Same figures for the clients with at least five reads / clicks
print(f"Sparsity of read matrix of +5 readers is {(1-sparsity_read_5)*100}%")
print(f"Sparsity of clicked matrix of +5 readers is {(1-sparsity_clicked_5)*100}%")

Sparsity of read matrix is 99.9722350169788%
Sparsity of clicked matrix is 99.84226861998131%
Sparsity of read matrix of +5 readers is 99.15063737764626%
Sparsity of clicked matrix of +5 readers is 99.04505435165403%


#The code below has to be run on a machine with more than 16 GB of RAM

In [0]:
#Initiate the matrices
# Rows are clients, columns are article URLs. The matrices are filled with positional
# numpy indexing instead of one boolean scan over the whole index per read row.
article_urls = pd.Index(df_article_time['URL'])
client_index = pd.Index(client_list)
if not article_urls.is_unique:
  # Positional lookup needs unambiguous column labels; duplicated URLs would also
  # break the sum check at the end of this cell.
  raise ValueError('df_article_time contains duplicate URLs; deduplicate before building the matrices')

#Input for Bias estimator
# Built once, outside any loop: it does not depend on the matrix construction, and it
# must exist even when df_read is empty.
df_read_export = df_clean_page_data[(df_clean_page_data['ReadYN'] == 1) & (df_clean_page_data['READING_TIME'].notnull())][['URL', 'clientid_hashed', 'visitid']]

# One entry per (client, article). keep='last' reproduces the earlier row-by-row fill,
# where a later row overwrote the confidence of an earlier one.
# NOTE(review): taking the maximum confidence per pair may be the better rule -- confirm.
read_pairs = df_read.drop_duplicates(subset=['URL', 'clientid_hashed'], keep='last')
row_pos = client_index.get_indexer(read_pairs['clientid_hashed'])
col_pos = article_urls.get_indexer(read_pairs['URL'])
in_matrix = (row_pos >= 0) & (col_pos >= 0) #get_indexer returns -1 for labels that are not in the matrix

read_flags = np.zeros((len(client_index), len(article_urls)), dtype=np.int8)
read_weights = np.zeros(read_flags.shape, dtype=np.float64)
read_flags[row_pos[in_matrix], col_pos[in_matrix]] = 1
read_weights[row_pos[in_matrix], col_pos[in_matrix]] = read_pairs['Confidence_level'].to_numpy()[in_matrix]

df_read_matrix = pd.DataFrame(read_flags, index=client_index, columns=article_urls)
df_weightRead_matrix = pd.DataFrame(read_weights, index=client_index, columns=article_urls)

#Check if sum of all 1s is equal to unique number of rows in read list and save to file
if int(read_flags.sum()) == len(read_pairs):
  df_read_matrix.to_csv('/content/drive/My Drive/Knab/Data/CleanData/ALS_inputs/ReadMatrix.csv', index = True)
  df_weightRead_matrix.to_csv('/content/drive/My Drive/Knab/Data/CleanData/ALS_inputs/WeightReadMatrix.csv', index=True)
  df_read_export.to_csv('/content/drive/My Drive/Knab/Data/CleanData/ALS_inputs/ReadYN.csv', index = True)
else:
  print('Error in calculation!!')

In [0]:
#Initiate the matrices
# Rows are clients, columns are article URLs. The matrix is filled with positional
# numpy indexing instead of one boolean scan over the whole index per clicked row.
clicked_columns = pd.Index(df_article_time['URL'])
clicked_rows = pd.Index(client_list)
if not clicked_columns.is_unique:
  # Positional lookup needs unambiguous column labels; duplicated URLs would also
  # break the sum check at the end of this cell.
  raise ValueError('df_article_time contains duplicate URLs; deduplicate before building the matrices')

# One entry per (client, article) pair in the clicked list
clicked_pairs = df_clicked.drop_duplicates(subset=['URL', 'clientid_hashed'])
clicked_row_pos = clicked_rows.get_indexer(clicked_pairs['clientid_hashed'])
clicked_col_pos = clicked_columns.get_indexer(clicked_pairs['URL'])
clicked_in_matrix = (clicked_row_pos >= 0) & (clicked_col_pos >= 0) #get_indexer returns -1 for labels that are not in the matrix

clicked_flags = np.zeros((len(clicked_rows), len(clicked_columns)), dtype=np.int8)
clicked_flags[clicked_row_pos[clicked_in_matrix], clicked_col_pos[clicked_in_matrix]] = 1
df_clicked_matrix = pd.DataFrame(clicked_flags, index=clicked_rows, columns=clicked_columns)

#Check if sum of all 1s is equal to unique number of rows in clicked list and save to file
if int(clicked_flags.sum()) == len(clicked_pairs):
  df_clicked_matrix.to_csv('/content/drive/My Drive/Knab/Data/CleanData/ALS_inputs/ClickedMatrix.csv', index = True)
else:
  print('Error in calculation!!')

In [0]:
#Client article pairs without read or click
# A pair can appear in both matrices (read in one page view, only clicked in another).
# A plain 1 - read - clicked would then give -1, so the combined interaction flag is
# capped at 1 first, which keeps the result a 0/1 indicator.
df_interacted = (df_read_matrix + df_clicked_matrix).clip(upper=1)
df_missing = 1 - df_interacted
df_missing.to_csv('/content/drive/My Drive/Knab/Data/CleanData/ALS_inputs/MissingMatrix.csv', index = True)