In [None]:
import glob
import pandas as pd
from google.colab import files
from google.colab import drive
import numpy as np
from tqdm import tqdm
import re
import math
import warnings
warnings.filterwarnings('ignore')

# Load in data

In [None]:
# Mount Google Drive at /content/drive, requesting a forced remount.
drive.mount(
    '/content/drive',
    force_remount=True,
)

In [None]:
# List the contents of the cleaned-data folder on the mounted Drive.
!ls "/content/drive/My Drive/Knab/Data/CleanData/"

In [None]:
# Read the three cleaned input tables from the mounted Drive folder.
input_recommendation = pd.read_csv(
    "/content/drive/My Drive/Knab/Data/CleanData/data_recommendations.csv"
)
input_clean_page_data = pd.read_csv(
    "/content/drive/My Drive/Knab/Data/CleanData/clean_page_data.csv"
)
# Event data.
input_seen_recommendation = pd.read_csv(
    "/content/drive/My Drive/Knab/Data/CleanData/seen_recommendation.csv"
)

In [None]:
# Drop the hit number from the event data and flag every remaining row
# with seen_recommendations = 1.
df_seen_recommendation = (
    input_seen_recommendation
    .drop(columns=['hitnumber'])
    .assign(seen_recommendations=1)
)

In [None]:
def string_to_array(string):
  """Parse the string form of a recommendation list back into a list.

  The old recommendations got converted to strings when loading the data.
  Square brackets and single quotes are stripped, and the remainder is split
  on any run of whitespace. Entries are therefore recovered whether they are
  separated by Windows line breaks, Unix line breaks or plain spaces; the
  previous version only recognised Windows line breaks.

  Args:
    string: Text such as "['url_a' 'url_b' 'url_c']".

  Returns:
    A list with one string per entry; an empty list when there are none.
  """
  cleaned = string.replace('[', '').replace(']', '').replace("'", "")

  # str.split() without a separator splits on any whitespace and never
  # yields empty tokens.
  return cleaned.split()

In [None]:
# Parse each stored string into a list. Values that are already lists are
# passed through unchanged, so re-running this cell no longer fails with
# AttributeError (string_to_array calls str.replace, which lists do not have).
input_recommendation['old_recommendations'] = input_recommendation['old_recommendations'].apply(
    lambda value: value if isinstance(value, list) else string_to_array(value)
)

In [None]:
# Spread the parsed recommendation lists over the columns rec_1..rec_3
# (presumably every row holds three recommendations - TODO confirm).
columns_rec = pd.DataFrame(
    input_recommendation['old_recommendations'].tolist(),
    columns=['rec_1', 'rec_2', 'rec_3'],
)
# Put the new columns next to the original ones and drop the list column.
df_recommended_articles = (
    pd.concat([input_recommendation, columns_rec], axis=1)
    .drop(columns=['old_recommendations'])
)

In [None]:
# Attach the recommendation columns to every page hit (left join per hit).
df_data = pd.merge(
    left=input_clean_page_data,
    right=df_recommended_articles[
        ['URL', 'clientid_hashed', 'visitid', 'hitnumber', 'ArticleYN', 'rec_1', 'rec_2', 'rec_3']
    ],
    on=['URL', 'clientid_hashed', 'visitid', 'hitnumber'],
    how='left',
)
# Attach the seen-recommendation flag per URL/client/visit; every missing
# value in the result (in any column) becomes 0.
df_data = pd.merge(
    left=df_data,
    right=df_seen_recommendation,
    on=['URL', 'clientid_hashed', 'visitid'],
    how='left',
).fillna(0)
# Keep one row per hit.
df_data = df_data.drop_duplicates(
    subset=['URL', 'clientid_hashed', 'visitid', 'hitnumber', 'visitstarttime']
)

In [None]:
# Client id and URL under "_right" names; used below as the "next row"
# side of the click check.
df_data_shifted = df_data[['clientid_hashed', 'URL']].add_suffix('_right')

In [None]:
# Pair every row with the row that follows it: the left side loses its last
# row, the right side loses its first row, and both are re-indexed from 0 so
# they line up position by position.
df_data_shift_left = df_data.iloc[:-1].reset_index(drop=True)
df_data_shift_right = df_data_shifted.iloc[1:].reset_index(drop=True)
df_click_check = pd.concat([df_data_shift_left, df_data_shift_right], axis=1)

# 1 when the following row belongs to the same client, otherwise 0.
df_click_check['same_client'] = np.where(
    df_click_check['clientid_hashed'] == df_click_check['clientid_hashed_right'], 1, 0
)

# Keep only rows where the next row is the same client, and the current row
# has BiebYN, ArticleYN and seen_recommendations all equal to 1.
df_click_check = df_click_check[
    (df_click_check['same_client'] == 1)
    & (df_click_check['BiebYN'] == 1)
    & (df_click_check['ArticleYN'] == 1)
    & (df_click_check['seen_recommendations'] == 1)
]

In [None]:
# Remove helper/device columns and start with "no recommendation clicked" (0).
df_click_check = (
    df_click_check
    .drop(columns=['clientid_hashed_right', 'channelgrouping', 'browser', 'devicecategory'])
    .assign(clicked_recommendation=0)
)
# Record which recommendation slot equals the next visited URL. Slots are
# processed in order, so a later slot overwrites an earlier one when the
# same URL sits in more than one slot.
for rec_col in ('rec_1', 'rec_2', 'rec_3'):
    df_click_check.loc[
        df_click_check['URL_right'] == df_click_check[rec_col],
        'clicked_recommendation',
    ] = rec_col
# One row per hit, keeping the last occurrence.
df_click_check = df_click_check.drop_duplicates(
    subset=['URL', 'clientid_hashed', 'visitid', 'hitnumber', 'visitstarttime'],
    keep='last',
)

In [None]:
# Bring the clicked-slot label back onto the full hit table (left join per
# hit), de-duplicate per hit, and turn every remaining missing value into 0.
df_data = pd.merge(
    left=df_data,
    right=df_click_check[
        ['URL', 'clientid_hashed', 'visitid', 'hitnumber', 'clicked_recommendation']
    ],
    on=['URL', 'clientid_hashed', 'visitid', 'hitnumber'],
    how='left',
)
df_data = (
    df_data
    .drop_duplicates(subset=['URL', 'clientid_hashed', 'visitid', 'hitnumber', 'visitstarttime'])
    .fillna(0)
)

In [None]:
# clicked_url is 0 unless a slot was clicked, in which case it is the URL
# stored in that slot's column.
df_data['clicked_url'] = 0
for rec_col in ('rec_1', 'rec_2', 'rec_3'):
    df_data.loc[df_data['clicked_recommendation'] == rec_col, 'clicked_url'] = df_data[rec_col]

In [None]:
# Number of recommendation clicks per clicked URL, leaving out rows whose
# clicked URL equals the URL of the page itself.
count_list_recommended = df_data.loc[
    (df_data['clicked_recommendation'] != 0) & (df_data['URL'] != df_data['clicked_url']),
    'clicked_url',
].value_counts()

In [None]:
# Persist the enriched hit table without writing the index as a column.
df_data.to_csv(
    "/content/drive/My Drive/Knab/Data/CleanData/clean_page_data_recommendations.csv",
    index=False,
)

In [None]:
# Read the enriched hit table back from Drive.
df_clean_page_data_recom = pd.read_csv(
    "/content/drive/My Drive/Knab/Data/CleanData/clean_page_data_recommendations.csv"
)

In [None]:
# Read the time-delta table from Drive.
df_timedelta = pd.read_csv(
    "/content/drive/My Drive/Knab/Data/CleanData/time_delta.csv"
)

In [None]:
# Add delta_time to every hit (left join on URL, client, visit and hit number).
df_clean_page_data_recom_timedelta = pd.merge(
    left=df_clean_page_data_recom,
    right=df_timedelta[['URL', 'clientid_hashed', 'visitid', 'hitnumber', 'delta_time']],
    on=['URL', 'clientid_hashed', 'visitid', 'hitnumber'],
    how='left',
)

In [None]:
# Persist the hit table with delta_time, without writing the index as a column.
df_clean_page_data_recom_timedelta.to_csv(
    "/content/drive/My Drive/Knab/Data/CleanData/clean_page_data_recom_timedelta.csv",
    index=False,
)

# Checking 'biased' articles

In [None]:
# Read the hit table with delta_time back from Drive.
df_clean_page_data_recom_timedelta = pd.read_csv(
    "/content/drive/My Drive/Knab/Data/CleanData/clean_page_data_recom_timedelta.csv"
)

In [None]:
# Working name for the bias analysis. This is a plain assignment, so both
# names refer to the same DataFrame object (no copy is made).
df_work = df_clean_page_data_recom_timedelta

In [None]:
# Only take data points which are article clicks (ArticleYN equal to 1.0).
df_work = df_work.loc[df_work['ArticleYN'].eq(1.0)]

In [None]:
# Number of recommendation clicks per clicked URL, as columns URL / count_rec.
# Rows whose clicked_url is the string "0" (no click) are left out.
# The columns are named explicitly with rename_axis/reset_index(name=...)
# rather than by renaming the defaults, because the default column names of
# value_counts().reset_index() changed in pandas 2.0.
df_click_recom = (
    df_work.loc[df_work['clicked_url'] != "0", 'clicked_url']
    .value_counts()
    .rename_axis('URL')
    .reset_index(name='count_rec')
)

In [None]:
# Number of rows per article URL, as columns URL / count.
# The columns are named explicitly with rename_axis/reset_index(name=...)
# rather than by renaming the defaults, because the default column names of
# value_counts().reset_index() changed in pandas 2.0.
df_click_bieb = (
    df_work['URL']
    .value_counts()
    .rename_axis('URL')
    .reset_index(name='count')
)

In [None]:
# How often each URL appears in each recommendation slot, as columns
# URL / rec_N. The count column keeps the slot name (rec_1, rec_2, rec_3)
# because the merge cell that builds df_bias_counts sums exactly those columns.
# The previous rename mapping ('URL' -> 'count_recN') never matched a column,
# and the default names of value_counts().reset_index() changed in pandas 2.0,
# so both output columns are now named explicitly.
df_rec1 = df_work['rec_1'].value_counts().rename_axis('URL').reset_index(name='rec_1')
df_rec2 = df_work['rec_2'].value_counts().rename_axis('URL').reset_index(name='rec_2')
df_rec3 = df_work['rec_3'].value_counts().rename_axis('URL').reset_index(name='rec_3')

In [None]:
# Merge all article clicks (df_click_bieb), the number of clicks on a
# recommendation (df_click_recom) and the number of times recommended per
# slot (df_rec1, df_rec2, df_rec3). No key is given, so each merge joins on
# the column names the two frames share; URLs missing from a table get 0.
df_bias_counts = (
    df_click_bieb
    .merge(df_click_recom, how="left")
    .merge(df_rec1, how="left")
    .merge(df_rec2, how="left")
    .merge(df_rec3, how="left")
    .fillna(0)
)
# Total number of times the URL was recommended, over all three slots.
df_bias_counts['number_rec'] = df_bias_counts[['rec_1', 'rec_2', 'rec_3']].sum(axis=1)
df_bias_counts = df_bias_counts.drop(columns=['rec_1', 'rec_2', 'rec_3'])

In [None]:
# Probability of being clicked given being recommended:
# count_rec / number_rec, set to 0 for URLs that were never recommended.
df_bias_counts['prob_click_given_rec'] = (
    (df_bias_counts['count_rec'] / df_bias_counts['number_rec'])
    .where(df_bias_counts['number_rec'] > 0, 0)
)

In [None]:
# Probability of being clicked given not being recommended:
# (count - count_rec) divided by (sum of count over all URLs - number_rec).
df_bias_counts['prob_click_given_not_rec'] = (
    df_bias_counts['count']
    .sub(df_bias_counts['count_rec'])
    .div(df_bias_counts['count'].to_numpy().sum() - df_bias_counts['number_rec'])
)

In [None]:
# Bias = difference between the two probabilities, scaled by 100.
df_bias_counts['article_bias'] = (
    df_bias_counts['prob_click_given_rec']
    .sub(df_bias_counts['prob_click_given_not_rec'])
    .mul(100)
)

In [None]:
# Unweighted mean bias over the URLs that were recommended at least once
# (number_rec > 0).
avg_bias = np.mean(
    df_bias_counts.loc[df_bias_counts['number_rec'] > 0, 'article_bias'].to_numpy()
)

In [None]:
# Weighted mean bias over the recommended URLs (number_rec > 0). Each URL is
# weighted by its share of the total click count of those URLs.
recommended_rows = df_bias_counts.loc[df_bias_counts['number_rec'] > 0, ['article_bias', 'count']]
recommended_clicks = recommended_rows['count'].values
weighted_avg_bias = np.average(
    recommended_rows['article_bias'].values,
    weights=recommended_clicks / recommended_clicks.sum(),
)

In [None]:
# Z score of the weighted average bias: the weighted average divided by the
# standard deviation (np.std default, i.e. ddof=0) of article_bias over the
# recommended URLs (number_rec > 0).
z_score = weighted_avg_bias / np.std(
    df_bias_counts.loc[df_bias_counts['number_rec'] > 0, 'article_bias'].values
)

In [None]:
# Report the three summary statistics, one per line.
print(
    f"Average bias: {avg_bias}",
    f"Weighted average bias: {weighted_avg_bias}",
    f"Z score: {z_score}",
    sep="\n",
)

In [None]:
# Show the per-URL bias table as the cell output.
df_bias_counts