In [3]:
import pandas as pd
from d2i_tools2 import getMeta
import warnings
# Suppress warnings from this point on so they do not clutter cell output.
warnings.simplefilter("ignore")
# NOTE(review): with default arguments this call appears to install the same
# blanket "ignore" filter as the line above, so one of the two is probably
# redundant -- confirm and drop one. (Original note: taken from LDSA, kept to
# compare with the call above.)
warnings.filterwarnings('ignore')

# Read the logged dataset metadata into a DataFrame.
df = pd.read_csv('mac_/datasets/meta_log.csv')

# Shape of the metadata log as (rows, columns).
df.shape

(1768, 7)

## [01] Function to get daily log of total page views / downloads for a specific dataset

In [4]:
# function returns daily log of total page views and downloads
# for the specified dataset between the specified start and end dates

def getMetaOneLog(metalogdf, id, start, end):
    """Return the daily log of total page views and downloads for one dataset.

    Parameters
    ----------
    metalogdf : pandas.DataFrame
        Metadata log. Must contain the columns 'id', 'pv_total',
        'download_count' and 'log_time'; 'log_time' must hold strings whose
        first 10 characters are a 'YYYY-MM-DD' date.
    id : object
        Value matched against the 'id' column. (The name shadows the builtin
        ``id``; it is kept so existing keyword callers continue to work.)
    start, end : str
        Inclusive bounds of the date range, used as labels to slice the date
        index (e.g. '2021-11-21').

    Returns
    -------
    pandas.DataFrame
        Indexed by 'date', with columns 'total_pageviews' and
        'total_downloads'. Empty when no row matches the dataset and range.

    Side effects
    ------------
    Prints a one-line status message.
    """
    cols = ['id', 'pv_total', 'download_count', 'log_time']
    # Select the requested dataset first so only its rows are parsed and sorted.
    log = metalogdf.loc[metalogdf['id'] == id, cols].copy()
    # Keep only the calendar date; the time-of-day part of the stamp is dropped.
    log['log_time'] = pd.to_datetime(log['log_time'].str[0:10], format="%Y-%m-%d")
    daily = (
        log.drop(columns=['id'])
        .rename(columns={'pv_total': 'total_pageviews',
                         'download_count': 'total_downloads',
                         'log_time': 'date'})
        .set_index('date')
        # Label slicing on a date index is only reliable when the index is
        # ordered; a stable sort keeps same-day rows in their logged order.
        .sort_index(kind='stable')
        .loc[start:end]
    )
    print(f'Returning daily log of page views and downloads for dataset {id} ...')
    return daily

# Example: daily totals for one dataset over a five-day window.
getMetaOneLog(df, id='b2ak-trbp', start='2021-11-21', end='2021-11-25')

Returning daily log of page views and downloads for dataset b2ak-trbp ...


Unnamed: 0_level_0,total_pageviews,total_downloads
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-11-21,73859,8834
2021-11-22,73870,8834
2021-11-23,73894,8836
2021-11-24,73910,8836
2021-11-25,73925,8837


## [02] Function to get the increase in each dataset's page views / downloads between two dates

In [7]:
def getMetaTrends(metalogdf, start, end):
    """Return each dataset's gain in page views and downloads between two dates.

    For every dataset the logged totals on ``start`` are subtracted from the
    logged totals on ``end``. When a dataset has several rows on the same day,
    the first logged row of that day is used.

    Parameters
    ----------
    metalogdf : pandas.DataFrame
        Metadata log. Must contain the columns 'id', 'pv_total',
        'download_count' and 'log_time'; 'log_time' must hold strings whose
        first 10 characters are a 'YYYY-MM-DD' date.
    start, end : str
        Dates (e.g. '2021-11-19') whose logged totals are compared.

    Returns
    -------
    pandas.DataFrame
        Columns 'id', 'name', 'pageviews', 'downloads', one row per dataset.
        Datasets without a log row on both dates, or without a matching 'id'
        in the frame returned by ``getMeta()``, are omitted.
    """
    cols = ['id', 'pv_total', 'download_count', 'log_time']
    log = metalogdf[cols].copy()
    # Keep only the calendar date; the time-of-day part of the stamp is dropped.
    log['log_time'] = pd.to_datetime(log['log_time'].str[0:10], format="%Y-%m-%d")
    log = (
        log.rename(columns={'pv_total': 'pv', 'download_count': 'dl', 'log_time': 'date'})
        .set_index('date')
        # Label slicing on a date index is only reliable when the index is
        # ordered; a stable sort keeps same-day rows in their logged order.
        .sort_index(kind='stable')
    )

    def _totals_on(day):
        # First logged row per dataset on `day`, indexed by dataset id.
        # drop_duplicates (rather than groupby().first()) keeps the row as
        # logged, including any missing values.
        on_day = log.loc[day:day]
        return on_day.drop_duplicates(subset='id').set_index('id')[['pv', 'dl']]

    at_start = _totals_on(start)
    at_end = _totals_on(end)
    # Only datasets observed on both dates have a well-defined difference.
    common = at_start.index.intersection(at_end.index)
    gains = at_end.loc[common] - at_start.loc[common]
    trends = (
        gains.rename(columns={'pv': 'pageviews', 'dl': 'downloads'})
        .rename_axis('id')
        .reset_index()
    )
    # Attach dataset names; the inner merge keeps the row order of getMeta().
    names = getMeta()[['id', 'name']]
    return names.merge(trends, on='id')

## [03] Between the specified dates, show top n datasets

In [11]:
# Gains in page views / downloads per dataset over the chosen window.
metatrendsdf = getMetaTrends(df, start='2021-11-19', end='2021-11-23')
# Number of top datasets to show in the rankings below.
n = 3



### Show top n datasets by page views

In [12]:
# Rank by page-view gain, largest first, and keep the first n rows.
metatrendsdf.sort_values(by='pageviews', ascending=False).head(n)

Unnamed: 0,id,name,pageviews,downloads
5,i8px-csib,median house prices - by type and sale year,126,6
4,7q9g-yyvg,pay stay zones linked to street segments,89,8
3,vh2v-4nfs,on-street parking bay sensors,74,1192


### Show top n datasets by downloads

In [13]:
# Rank by download gain, largest first, and keep the first n rows.
metatrendsdf.sort_values(by='downloads', ascending=False).head(n)

Unnamed: 0,id,name,pageviews,downloads
11,ntht-5rk7,on-street car park bay restrictions,45,1308
3,vh2v-4nfs,on-street parking bay sensors,74,1192
28,d6mv-s43h,pedestrian counting system - past hour (counts...,7,97
