In [None]:
import glob
import pandas as pd
from google.colab import files
from google.colab import drive
import numpy as np
from tqdm import tqdm
from collections import Counter
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

# Import and transformations

In [None]:
# Mount Google Drive at /content/drive; every data path below lives under this mount point.
drive.mount('/content/drive')

In [None]:
# List the contents of the Events export folder on the mounted Drive.
!ls "/content/drive/My Drive/Knab/Data/Knab_Analytics/DataEvents/Events"

In [None]:
# List the contents of the Articles folder on the mounted Drive.
!ls "/content/drive/My Drive/Knab/Data/Articles/"

In [None]:
#Load in all the PAGE files
# Sorting the glob result makes the concatenation order reproducible;
# glob.glob itself returns matches in arbitrary, OS-dependent order.
page_files = sorted(glob.glob("/content/drive/My Drive/Knab/Data/Knab_Analytics/DataEvents/Pages/*.csv"))
if not page_files:
  # Fail with a clear message instead of pd.concat's generic "No objects to concatenate".
  raise FileNotFoundError("No page CSV files found; is Google Drive mounted?")
page_dfs = [pd.read_csv(p, sep=",") for p in page_files]
# ignore_index=False keeps each file's own row labels, so index values repeat across files.
page_data = pd.concat(page_dfs,ignore_index=False)
# Plain Python attribute used as a label for the frame.
# NOTE(review): pandas does not carry ad-hoc attributes over to derived frames - confirm it is still needed.
page_data.name = 'Page_Data'

In [None]:
#Load in all the EVENT files
# Sorting the glob result makes the concatenation order reproducible;
# glob.glob itself returns matches in arbitrary, OS-dependent order.
event_files = sorted(glob.glob("/content/drive/My Drive/Knab/Data/Knab_Analytics/DataEvents/Events/*.csv"))
if not event_files:
  # Fail with a clear message instead of pd.concat's generic "No objects to concatenate".
  raise FileNotFoundError("No event CSV files found; is Google Drive mounted?")
event_dfs = [pd.read_csv(p, sep=",") for p in event_files]
# ignore_index=False keeps each file's own row labels, so index values repeat across files.
event_data = pd.concat(event_dfs,ignore_index=False)
# Label the frame as event data (it was previously mislabelled 'Page_Data' by copy-paste).
event_data.name = 'Event_Data'

In [None]:
#Load in all urls with tags
# Reads the scraped article sheet; later cells use its URL, TAG, TITLE, DATE and READING_TIME columns.
df_urlstext = pd.read_excel("/content/drive/My Drive/Knab/Data/Articles/Scraped/URLTEXT.xlsx")

In [None]:
#Split dfs
# Keep only the article metadata columns. .copy() makes df_urls an independent
# frame, so the column assignments in later cells neither raise
# SettingWithCopyWarning nor write into a temporary slice of df_urlstext.
df_urls = df_urlstext[['URL','TAG','TITLE','DATE','READING_TIME']].copy()

In [None]:
def remove_nan(all_data):
  """Return a new frame without null-labelled columns and without incomplete rows.

  Columns whose label is null are removed first; afterwards every row that
  still contains at least one missing value is dropped.
  """
  has_label = all_data.columns.notnull()
  return all_data.loc[:, has_label].dropna()

In [None]:
# Clean both frames: remove_nan drops null-labelled columns and then every row
# that has a missing value in any remaining column.
page_data = remove_nan(page_data)
event_data = remove_nan(event_data)

In [None]:
# Preview the first rows of the cleaned event data.
event_data.head()

In [None]:
#Change data types
# astype/assign return new frames instead of assigning column by column, so
# nothing is written into a slice of another DataFrame (df_urls is a column
# selection of df_urlstext) and the cell can be re-run safely.

#Integers and strings of the page-view data
page_data = page_data.astype({
  'visitid': 'int64',
  'hitnumber': 'int64',
  'time': 'int64',
  'BiebYN': 'int64',
  'clientid_hashed': 'str',
  'pagepath': 'str',
  'channelgrouping': 'str',
  'browser': 'str',
  'devicecategory': 'str',
})

#Strings of the article table
# NOTE(review): in older pandas versions astype('str') turns missing values into
# the literal string 'nan' - confirm that is acceptable for TAG/TITLE/READING_TIME.
df_urls = df_urls.astype({
  'URL': 'str',
  'TAG': 'str',
  'TITLE': 'str',
  'READING_TIME': 'str',
})

#Datetime
page_data = page_data.assign(visitstarttime=pd.to_datetime(page_data['visitstarttime']))
df_urls = df_urls.assign(DATE=pd.to_datetime(df_urls['DATE']))

In [None]:
#Sort chronologically
# mergesort is stable: rows sharing a visitstarttime keep their relative order,
# so the result is reproducible (the default quicksort may reorder ties).
page_data = page_data.sort_values(by=['visitstarttime'], kind='mergesort')

In [None]:
#Drop index column and reindex
# errors='ignore' keeps the cell re-runnable: a second run no longer raises
# KeyError once 'Unnamed: 0' has already been removed.
page_data = page_data.drop(columns=['Unnamed: 0'], errors='ignore')
# Replace the repeated per-file row labels with a fresh 0..n-1 index.
page_data = page_data.reset_index(drop=True)

# Calculate the number of 1-click users

In [None]:
# Keep only the page views flagged as Bieb pages (BiebYN equal to 1).
onlybiebs = page_data[page_data['BiebYN'].eq(1)]

In [None]:
#Group on clientid and count their pages on bieb
# onlybiebs holds only BiebYN == 1 rows, so value_counts yields a single entry
# per clientid: the number of Bieb page views recorded for that client.
visit_biebYN = onlybiebs.groupby(['clientid_hashed'])['BiebYN'].value_counts()

#List of counts
# One integer per client, in the order of the grouped result.
counts = list(visit_biebYN)

In [None]:
#Calculate histogram of how often a clientid has visited bieb
# Explicit integer edges give one unit-wide bin per count value 1..max(counts).
# With bins=max(counts)-1 the two largest counts share the final bin
# (np.histogram closes the last bin on both sides) and max(counts) == 1 raises
# ValueError because the bin count would be 0.
np.histogram(counts, bins=np.arange(1, max(counts) + 2))

In [None]:
np.set_printoptions(suppress=True) #Suppresses scientific notation
# Share of clients per number of Bieb page views: entry k is the fraction of
# clients with exactly k+1 views. Explicit integer edges keep every count value
# in its own bin, and the histogram is computed once instead of twice.
hist_counts = np.histogram(counts, bins=np.arange(1, max(counts) + 2))[0]
hist_counts / hist_counts.sum()

# Calculate the share of articles with few views

In [None]:
# Inspect the article metadata table.
df_urls

In [None]:
#Add zero column to count
df_urls['COUNT'] = 0
# .copy() makes countlist independent of df_urls, so filling in the counts later
# writes to a real frame instead of a slice (no SettingWithCopyWarning, and the
# writes are not lost under pandas copy-on-write).
countlist = df_urls[['URL', 'COUNT','TAG']].copy()

In [None]:
#Find every occurence when a visited page is one of the articles
# isin does a hashed membership test, replacing the per-row linear scan of the
# URL array (rows x articles comparisons) with one vectorised pass. The selected
# paths keep their original row order.
is_article = onlybiebs['pagepath'].isin(df_urls['URL'])
counter = onlybiebs.loc[is_article, 'pagepath'].tolist()

#Sum up all duplicates
# Maps each visited article URL to the number of times it was viewed.
count_dict = dict(Counter(counter))

In [None]:
#Replace counts in countlist: From dict to dataframe
# Look up every URL in count_dict in one vectorised step; URLs that were never
# visited are missing from the dict and get 0. This replaces the chained
# assignment countlist['COUNT'][i] = ..., which raises SettingWithCopyWarning,
# does nothing under pandas copy-on-write, and assumes a 0..n-1 index.
# assign() returns a new frame, so no slice of df_urls is written to.
countlist = countlist.assign(
  COUNT=countlist['URL'].map(count_dict).fillna(0).astype('int64')
)
  

In [None]:
#Sort urls by most visited
# Writes the URL/COUNT/TAG table, highest view count first, to counts.csv in the
# working directory; countlist itself stays unsorted.
countlist.sort_values(['COUNT'], ascending=False).to_csv('counts.csv', index=False)

In [None]:
#Most read article
# Look the row up by its maximum COUNT instead of a hard-coded row label, so the
# answer stays correct when the data or the article list changes.
countlist.loc[countlist['COUNT'].idxmax(), 'URL']

In [None]:
# Fraction of articles that were viewed fewer than 10 times.
less_than_ten = countlist[countlist['COUNT'].lt(10)]
len(less_than_ten) / len(countlist)

In [None]:
#Histogram of how often articles are read
plt.hist(countlist['COUNT'].values, 1500) #1500 equal-width bins over the full range of counts
plt.xlim(0,5000) #Only show values lower than ...
# Label the figure so it can be read on its own.
plt.xlabel('Views per article')
plt.ylabel('Number of articles')
plt.title('Distribution of article view counts')
#Actually show the plot
plt.show()

In [None]:
#Add the counts to the URL list
# assign() aligns countlist['COUNT'] on the row index and returns a new frame,
# so no column is written into a slice of another DataFrame.
df_urls = df_urls.assign(COUNT=countlist['COUNT'])

# Find interesting info

In [None]:
# Test of a faster way to select all rows of onlybiebs whose page path occurs in the URL Excel sheet.
df_test = onlybiebs.loc[onlybiebs['pagepath'].isin(df_urls['URL'])]

In [None]:
# Inspect the Bieb page views that matched an article URL.
df_test

In [None]:
# Flag every page view whose path is one of the known article URLs (1 = article, 0 = other).
page_data = page_data.assign(
  ArticleYN=page_data['pagepath'].isin(df_urls['URL']).astype(int)
)

In [None]:
#Percentage of views in the bieb that are actually articles
# Series.sum() is vectorised; the builtin sum() iterates over every element in Python.
# NOTE(review): the numerator counts article views on all pages, not only Bieb
# pages - this equals the stated percentage only if every article is a Bieb page.
page_data['ArticleYN'].sum()/page_data['BiebYN'].sum()*100

In [None]:
#Percentage of views on knab website that are actually articles
# Series.sum() is vectorised; the builtin sum() iterates over every element in Python.
page_data['ArticleYN'].sum()/len(page_data)*100

In [None]:
#Percentage of bieb on knab website
# Series.sum() is vectorised; the builtin sum() iterates over every element in Python.
page_data['BiebYN'].sum()/len(page_data)*100

In [None]:
# Share (in percent) of article readers who viewed exactly one article page.
article_readers = page_data.loc[page_data['ArticleYN'].eq(1)]
# One entry per client: the number of article page views of that client.
articles_read_by_client = list(
  article_readers.groupby('clientid_hashed')['ArticleYN'].count()
)
articles_read_by_client.count(1) / len(articles_read_by_client) * 100

In [None]:
#Number of one time readers
# Counts the clients whose article view count is exactly 1.
articles_read_by_client.count(1)

In [None]:
#Number of article readers
# The list holds one entry per client with at least one article view.
len(articles_read_by_client)

In [None]:
#Number of article clicks
# Series.sum() is vectorised; the builtin sum() iterates over every element in Python.
page_data['ArticleYN'].sum()

In [None]:
#Number of clicks in the Bieb
# Series.sum() is vectorised; the builtin sum() iterates over every element in Python.
page_data['BiebYN'].sum()

In [None]:
#Number of total clicks
# Every row of page_data is one page view.
len(page_data)

In [None]:
#Actually useful datapoints
# Article clicks made by clients with more than one article view: every
# one-time reader contributes exactly one click, so subtracting their number
# removes their clicks. Series.sum() replaces the slow builtin sum().
page_data['ArticleYN'].sum()-articles_read_by_client.count(1)

In [None]:
#Average amount of clicks except with 1clickers
# (article clicks of multi-article readers) / (number of multi-article readers).
# Series.sum() replaces the slow builtin sum().
(page_data['ArticleYN'].sum()-articles_read_by_client.count(1))/(len(articles_read_by_client)-articles_read_by_client.count(1))

In [None]:
#Rename the columns - Easier merging
# Only 'pagepath' actually changes name; it becomes 'URL' so it matches the key
# column of df_urls. Rebinding the result instead of using inplace=True avoids
# mutating a frame that was produced by filtering page_data.
article_readers = article_readers.rename(columns={'pagepath': 'URL'})

In [None]:
#Merge and drop count list
# The merge joins on the column names both frames share.
# NOTE(review): duplicate URLs in df_urls would multiply rows here - confirm the URL column is unique.
article_readers = article_readers.merge(df_urls, how='left')
# drop() returns a new frame; the result has to be assigned, otherwise COUNT
# stays in article_readers.
article_readers = article_readers.drop(columns=['COUNT'])
article_readers

In [None]:
# Number of article views per TAG. Selecting 'clientid_hashed' before
# aggregating gives the same single-column 'count' table without counting every
# other column first.
article_readers.groupby('TAG')['clientid_hashed'].agg(['count'])

# Pipeline: export data to find old recommendations
We need to export all the data, merging the article URL table (`df_urls`) into the page-view data (`page_data`).

In [None]:
# page_data stores the visited path in 'pagepath' while df_urls stores it in
# 'URL'. As far as the columns used in this notebook show, the two frames share
# no column name, and a merge without keys raises MergeError in that case, so
# the join keys are given explicitly. Non-article page views get NaN metadata.
# NOTE(review): duplicate URLs in df_urls would multiply rows - confirm the URL column is unique.
page_data = page_data.merge(df_urls, how='left', left_on='pagepath', right_on='URL')
# COUNT is only a helper column of the article analysis; keep it out of the export.
export_data = page_data.drop(columns=['COUNT'])

In [None]:
#Export to CSV file
# Writes merged_data.csv to the current working directory, without the row index.
export_data.to_csv("merged_data.csv", index=False)
#Open tab on left and download csv file