In [None]:
import glob
import pandas as pd
from google.colab import files
from google.colab import drive
import numpy as np
from tqdm import tqdm
from collections import Counter
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import datetime as dt
import seaborn as sns
#Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*' and
#later releases removed the old names, so use the new name whenever it is installed.
_deep_style = 'seaborn-v0_8-deep' if 'seaborn-v0_8-deep' in plt.style.available else 'seaborn-deep'
#plt.style.use returns None; `style` is only kept so the name stays defined.
style = plt.style.use(_deep_style)

In [None]:
#Mount Google Drive into the Colab runtime so the CSV files read below are reachable
drive.mount('/content/drive')
#List the CleanData folder as a quick check of which files are available
!ls "/content/drive/My Drive/Knab/Data/CleanData"

# Load in data

In [None]:
#Read the cleaned page data from Drive
df_page = pd.read_csv(
    "/content/drive/My Drive/Knab/Data/CleanData/clean_page_data.csv"
)

In [None]:
#Read the cleaned article data: one row per article URL, including its tag
df_urlstext = pd.read_csv(
    "/content/drive/My Drive/Knab/Data/CleanData/clean_article_data.csv"
)

In [None]:
#Keep only the article metadata columns; leaving the other columns out saves memory
df_urls = df_urlstext[
    ['URL', 'TAG', 'TITLE', 'DATE', 'READING_TIME']
]

In [None]:
#Parse DATE back into datetimes (the CSV round-trip does not preserve the dtype).
#.assign builds a new frame instead of writing into the column slice of
#df_urlstext, which avoids pandas' SettingWithCopyWarning.
df_urls = df_urls.assign(DATE=pd.to_datetime(df_urls['DATE']))


In [None]:
#Order the page rows by visit start time, earliest first
df_page = df_page.sort_values('visitstarttime')

# Calculate the popularity of articles

In [None]:
#Flag every page row with 1 when its URL is a known article URL, else 0
df_page = df_page.assign(
    ArticleYN=df_page['URL'].isin(df_urls['URL']).astype(int)
)

In [None]:
#Keep only the rows that are article pages (removes the big chunk of other pages).
#.copy() makes the selection an independent frame, so overwriting a column on the
#next line does not trigger SettingWithCopyWarning.
df_only_articles = df_page.loc[df_page['ArticleYN'] == 1].copy()
#Parse the visit start time into datetimes so it can be compared with a cut-off date
df_only_articles['visitstarttime'] = pd.to_datetime(df_only_articles['visitstarttime'])

In [None]:
#Write the article-only page rows to a CSV in the current working directory
df_only_articles.to_csv(path_or_buf='clean_page_only_articles.csv', index=False)

In [None]:
#Only for truncated period
def popularity_period(months, dataframe, column_name_data, reference_date=None):
  """Keep only the rows that fall inside the last `months` months.

  A month is approximated as 30 days. The window is open-ended towards the
  future: every row on or after the cut-off date is kept.

  Args:
    months: Length of the look-back window, in 30-day months.
    dataframe: DataFrame holding a datetime column to filter on.
    column_name_data: Name of that datetime column.
    reference_date: End of the look-back window. Defaults to
      2019-01-01 00:00 (the value that used to be hard-coded); pass
      dt.datetime.now() to truncate relative to the current moment.

  Returns:
    The rows of `dataframe` whose `column_name_data` value is on or after
    `reference_date - 30*months days`.
  """
  if reference_date is None:
    reference_date = dt.datetime(2019, 1, 1, 0, 0)

  date_truncate = reference_date - dt.timedelta(days=30*months)
  return dataframe[dataframe[column_name_data] >= date_truncate]

In [None]:
#### HYPERPARAMETER MONTHS!!! ####
#Look-back window: only article visits from the last `months` months are kept
df_truncated = popularity_period(
    months=3,
    dataframe=df_only_articles,
    column_name_data='visitstarttime',
)

In [None]:
#Attach the article title to every visit row
merged_truncated = df_truncated.merge(df_urls[['URL', 'TITLE']], on='URL', how='left')
#NOTE(review): dropna() removes rows with a missing value in ANY column, not only
#TITLE - confirm that is intended, since it lowers the counts below.
df_merged_truncated = merged_truncated.dropna()
#Number of visit rows per article
df_popcount = (
    df_merged_truncated
    .groupby(['TITLE','URL'])
    .size()
    .reset_index(name='COUNT')
)

In [None]:
#Sigmoid function
def log_function (A,K,B,v,Q,C,M,x):
  """Evaluate a generalised logistic (sigmoid) curve at x.

  Y = A + (K - A) / (C + Q * exp(-B * (x - M))) ** (1 / v)

  Args:
    A: Constant offset added to the curve (the lower asymptote when C = 1).
    K: Sets the upper level of the curve (the upper asymptote when C = 1).
    B: Growth rate; larger values give a steeper curve.
    v: Shape parameter; the denominator is raised to the power 1/v.
    Q: Multiplier of the exponential term. It used to be accepted but
      ignored; with Q=1 the result is identical to the old behaviour.
    C: Constant added to the exponential term in the denominator.
    M: Horizontal shift of the curve.
    x: Scalar or numpy array of positions at which to evaluate the curve.

  Returns:
    The curve value(s), with the same shape as x.
  """
  Y = A + (K-A)/((C+Q*np.exp(-B*(x-M)))**(1/v))
  return Y

In [None]:
#Calculate midpoint with percentile on the right hand side

def find_midpoint(midpoint_percentile, dataframe, columnname):
  """Return the value of `columnname` at the given top-percentile position.

  The column is sorted ascending and the value at position
  round(n * (1 - midpoint_percentile)) is returned, so roughly the top
  `midpoint_percentile` fraction of rows is at or above the result.

  Args:
    midpoint_percentile: Fraction of rows, counted from the top
      (e.g. 0.01 for the top 1%).
    dataframe: DataFrame containing `columnname`.
    columnname: Name of the column to take the value from.

  Returns:
    The column value found at the computed position.

  Raises:
    IndexError: If `dataframe` has no rows.
  """
  length = len(dataframe)
  if length == 0:
    raise IndexError('find_midpoint needs a dataframe with at least one row')

  ordered = dataframe[columnname].sort_values(ascending=True) #Lowest on top
  index = round(length*(1-midpoint_percentile)) #Position on which the percentile is crossed
  #For small frames or tiny percentiles the rounded position can equal `length`
  #(e.g. 10 rows and 0.01 -> 10); clamp it so the largest value is returned
  #instead of raising IndexError.
  index = min(max(index, 0), length-1)

  return ordered.iloc[index]

In [None]:
#Hyperparameters of the weighting curve
max_weight = 1 #Upper asymptote (original note: weight will be 1+max_weight)
midpoint = find_midpoint(0.01,df_popcount,'COUNT') #Click count on which the curve is centred (top 1%)
growth_rate = 30/df_popcount['COUNT'].max() #Steepness, scaled by the highest click count

In [None]:
#Plot different scenarios
percentile = [0.25,0.1,0.05,0.01,0.005]
growth_rate_top = [10,25,50,75,100]
max_weight_list = [0.1,0.2,0.5,0.8,1]

count_max = df_popcount['COUNT'].max()
#Explicit array conversion: dividing a plain Python list only works by accident
#of numpy's reflected operators.
growth_rate_list = np.asarray(growth_rate_top)/count_max

x = np.arange(0,count_max,0.025)

#Baseline settings shared by the scenario families below; computed once instead
#of re-sorting the frame for every single curve.
base_growth_rate = growth_rate_list[1]                            #25/max(count)
base_midpoint = find_midpoint(percentile[3],df_popcount,'COUNT')  #0.01 percentile

#Different midpoint percentiles with as growth rate 25/max(count)
y_per_1, y_per_2, y_per_3, y_per_4, y_per_5 = [
    log_function(A=0.0,K=max_weight,B=base_growth_rate,v=1,Q=1,C=1,
                 M=find_midpoint(p,df_popcount,'COUNT'), x = x)
    for p in percentile
]

#Different growth rates with as percentile 0.01
y_gr_1, y_gr_2, y_gr_3, y_gr_4, y_gr_5 = [
    log_function(A=0.0,K=max_weight,B=rate,v=1,Q=1,C=1,M=base_midpoint, x = x)
    for rate in growth_rate_list
]

#Different upper asymptotes with as percentile 0.01
y_up_1, y_up_2, y_up_3, y_up_4, y_up_5 = [
    log_function(A=0.0,K=cap,B=base_growth_rate,v=1,Q=1,C=1,M=base_midpoint, x = x)
    for cap in max_weight_list
]

In [None]:
sns.set_style("white")

#One (curve, line style) pair per upper asymptote, listed in legend order
asymptote_curves = [
    (y_up_1, dict(linestyle=':', color='blue')),
    (y_up_2, dict(linestyle='--', color='olive')),
    (y_up_3, dict(color='coral')),
    (y_up_4, dict(marker='.', color='m')),
    (y_up_5, dict(linestyle='-.', color='brown')),
]
for curve, line_style in asymptote_curves:
  plt.plot(x, curve, linewidth=2, ms=10, markevery=40000, **line_style)

plt.legend(['0.1', '0.2', '0.5', '0.8', '1.0'], loc='upper left', prop={'size': 18})
plt.grid()
plt.xlabel('Number of article-clicks', fontsize = 22)
plt.ylabel('Weight', fontsize = 22)
plt.tick_params(labelsize = 18)
plt.show()

In [None]:
sns.set_style("white")

#Enlarge the default figure to 12 x 9 by editing the rcParams size list in place
fig_size = plt.rcParams['figure.figsize']
fig_size[0], fig_size[1] = 12, 9

#One (curve, line style) pair per midpoint percentile, listed in legend order
percentile_curves = [
    (y_per_1, dict(linestyle=':', color='blue')),
    (y_per_2, dict(linestyle='--', color='olive')),
    (y_per_3, dict(color='coral')),
    (y_per_4, dict(marker='.', color='m')),
    (y_per_5, dict(linestyle='-.', color='brown')),
]
for curve, line_style in percentile_curves:
  plt.plot(x, curve, linewidth=2, ms=10, markevery=40000, **line_style)

plt.legend(
    ['0.25 percentile', '0.1 percentile', '0.05 percentile', '0.01 percentile', '0.005 percentile'],
    loc='lower right',
    prop={'size': 20},
)
plt.xlabel('Number of article-clicks', fontsize = 22)
plt.ylabel('Weight', fontsize = 22)
plt.grid()
plt.tick_params(labelsize = 18)
plt.show()

In [None]:
#Display the growth-rate numerators (each one is divided by max(count) to get a growth rate)
growth_rate_top

In [None]:
sns.set_style("white")

#Enlarge the default figure to 12 x 9 by editing the rcParams size list in place
fig_size = plt.rcParams['figure.figsize']
fig_size[0] = 12
fig_size[1] = 9

#One (curve, line style) pair per growth rate, listed in legend order
growth_rate_curves = [
    (y_gr_1, dict(linestyle=':', color='blue')),
    (y_gr_2, dict(linestyle='--', color='olive')),
    (y_gr_3, dict(color='coral')),
    (y_gr_4, dict(marker='.', color='m')),
    (y_gr_5, dict(linestyle='-.', color='brown')),
]
for curve, line_style in growth_rate_curves:
  plt.plot(x, curve, linewidth=2, ms=10, markevery=40000, **line_style)

plt.legend(['10/max(count)', '25/max(count)', '50/max(count)', '75/max(count)', '100/max(count)'], loc='lower right', prop={'size': 20})
plt.xlabel('Number of article-clicks', fontsize = 22)
#The y-axis label was missing here, unlike the other scenario plots
plt.ylabel('Weight', fontsize = 22)
plt.grid()
plt.tick_params(labelsize = 18)
plt.show()

In [None]:
#Make visualization of the function
sns.set_style("white")

#Enlarge the default figure to 12 x 9 by editing the rcParams size list in place
fig_size = plt.rcParams['figure.figsize']
fig_size[0] = 12
fig_size[1] = 9

x = np.arange(0,max(df_popcount['COUNT'].values),0.025)
#Use the max_weight hyperparameter as upper asymptote instead of a hard-coded 1,
#so the plotted curve follows the configured value.
y = log_function(A=0.0,K=max_weight,B=growth_rate,v=1,Q=1,C=1,M=midpoint, x = x)

plt.plot(x, y)
plt.xlabel('Number of article-clicks', fontsize = 22)
plt.ylabel('Weight', fontsize = 22)
plt.grid()
plt.tick_params(labelsize = 18)
#Render explicitly, like the other plot cells; this also hides the Line2D repr
plt.show()

In [None]:
#Run the weighting function over the per-article click counts.
#K takes the max_weight hyperparameter (it was hard-coded to 1 before), so
#changing max_weight now actually changes the resulting weights.
df_popcount['popularity_weight'] = log_function(A=0.0,K=max_weight,B=growth_rate,v=1,Q=1,C=1,M=midpoint, x = df_popcount['COUNT'].values)

In [None]:
#Visual check: show the articles ordered from most to least clicked
df_popcount.sort_values(by='COUNT', ascending=False)

In [None]:
#Write the popularity table to the CleanData folder on Drive
df_popcount.to_csv(
    path_or_buf='/content/drive/My Drive/Knab/Data/CleanData/Popularity_score.csv',
    index=False,
)