# Wikimedia data

- [Wikimedia Downloads: Analytics Datasets](https://dumps.wikimedia.org/other/analytics/)
Info about Pageviews, mediacounts and unique devices:
- [Pageviews since May 2015](https://dumps.wikimedia.org/other/pageviews/):
```
https://dumps.wikimedia.org/other/pageviews/[YEAR]/[YEAR]-[2-DIGIT-MONTH]/pageviews-[YYYYMMDD]-[HHMMSS].gz
```

- [Siteviews interactive analysis](https://tools.wmflabs.org/siteviews/?platform=all-access&source=pageviews&agent=all-agents&start=2015-07&end=2017-09&sites=all-projects)

## Running this notebook:

Dependencies:
- Bokeh
- Pandas
- NumPy
- Matplotlib

Enable widgetsnbextension: 
```
$ jupyter nbextension enable --py --sys-prefix widgetsnbextension
```


In [None]:
# dataframe-related imports
import wikimedia_scraper as ws
from datetime import datetime
import pandas as pd
import numpy as np
from datetime import timedelta

In [None]:
# plotting-related imports
import matplotlib.pylab as plt
%matplotlib inline

from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure

from bokeh.models import DatetimeTickFormatter, NumeralTickFormatter, BasicTickFormatter
from bokeh.models.tickers import FixedTicker

## Get data source

In [None]:
# Fetch the traffic records with the new Wikipedia scraper and load them into a dataframe

start_date = datetime(year=2016, month=11, day=1)
end_date   = datetime(year=2017, month=10, day=30)

ws.output_notebook()

# The generator is fully consumed into a list before the frame is built
traffic_generator = ws.get_traffic_generator(start_date, end_date, projects=('en',))
df = pd.DataFrame(data=list(traffic_generator))

df.head()

In [None]:
# Index the frame by timestamp, drop the now-redundant 'date' column
# and keep only the rows whose project is 'en'

df = (
    df.set_index(pd.DatetimeIndex(df['date']))
      .drop('date', axis=1)
      .loc[lambda frame: frame['project'] == 'en']
)

df.head()

In [None]:
# z-score (not really meaningful for this study)
#df["col_zscore"] = (df['hits'] - df['hits'].mean())/df['hits'].std(ddof=0)

# Filtering between dates example (not used for now)
#mask = (df.index >= '2017-05-22 15:00:00') & (df.index <= '2017-05-23 5:00:00')

#filtered_df = df.loc[mask]

## Yearly hits data normalization & plotting

In [None]:
# hits are cast to float up front: with the original dtype the mean came out as INF (number too big?)
df['hits'] = df['hits'].astype(float)

# each row's share of the (integer-truncated) total, then its 168-row (24*7) rolling mean
df['normalized_hits'] = df['hits'].div(df['hits'].astype(int).sum())
df['rolling'] = df['normalized_hits'].rolling(min_periods=3, window=7 * 24).mean()

df.head()

In [None]:
# plotting yearly data: normalized hits and their rolling mean, once with Bokeh
# and once with matplotlib

# using BOKEH
# Figure with a datetime x axis; it is only built here, the show() call below is commented out
year_plot = figure(title="wikipedia visits per hour", x_axis_type="datetime")

# plain (non-scientific) numbers on the y axis
year_plot.yaxis.formatter = BasicTickFormatter(use_scientific=False)

# tick label format for each time scale of the x axis
# NOTE(review): the 'years' scale uses '%a %H', which looks like a weekday/hour
# pattern rather than a year pattern — confirm this is intended
year_plot.xaxis.formatter = DatetimeTickFormatter(
        hours  = [ '%R'    ],
        days   = [ '%d %b' ],
        months = [ '%b'    ],
        years  = [ '%a %H' ],
)

# blue line: normalized hits; red line: rolling mean
# (despite its name, year_zscore_data holds the normalized-hits glyph, not a z-score)
year_zscore_data = year_plot.line(df.index, df['normalized_hits'], color="#2222aa", line_width=1)
year_rolling_data = year_plot.line(df.index, df['rolling'], color="red", line_width=1)

#output_notebook()
#show(year_plot, notebook_handle=True)
#push_notebook()

# using matplotlib
# the same two series; labels are set but no legend is requested in this cell

plt.plot(df['normalized_hits'], color='blue', label='hits')
plt.plot(df['rolling'], color='red',label='Original')

## The "_Average week_" calculation

In [None]:
# Build the "average week": for every weekday, the mean hits of each hour of
# the day, laid out on an arbitrary Monday-to-Sunday week.

# Monday-first, matching the integer weekday of the index (0 = Monday)
days = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')

# Add new columns based on date index.
# DatetimeIndex.weekday_name was removed in pandas 1.0, so the English day name
# is looked up from the integer weekday instead; this works on old and new
# pandas versions alike and does not depend on the locale.
df['weekday'] = [days[day_number] for day_number in df.index.weekday]
df['hour']    = df.index.hour

# 1970-01-05 is a Monday, so the synthetic week starts on Monday at 00:00
startdate = datetime(1970,1,5)

# one 24-row frame per weekday, concatenated once at the end
daily_frames = []

for day in days:
    dfx1 = pd.DataFrame(index=pd.date_range(start=startdate, periods=24, freq='H'), columns=['hits'])
    # mean hits per hour of the day for this weekday (expects all 24 hours to be present)
    hitmeans = df.loc[df['weekday']==day].groupby('hour')['hits'].mean()
    dfx1['hits'] = np.array(hitmeans)
    startdate += timedelta(days=1)
    daily_frames.append(dfx1)

df3 = pd.concat(daily_frames)

# share of the (integer-truncated) weekly total, and its 24-row rolling mean
df3['normalized_hits'] = df3['hits'].astype(float)/df3['hits'].astype(int).sum()
df3['rolling'] = df3['normalized_hits'].rolling(window=24, min_periods=3).mean()

In [None]:
# plotting avg week: normalized hits of the average week and their rolling mean,
# once with Bokeh and once with matplotlib

# Figure with a datetime x axis; it is only built here, the show() call below is commented out
avg_week_plot = figure(title="wikipedia average week (normalized)", x_axis_type="datetime")

# NOTE: this y-axis formatter is replaced by the NumeralTickFormatter assigned further down
avg_week_plot.yaxis.formatter = BasicTickFormatter(use_scientific=False)

# tick label format for each time scale of the x axis
avg_week_plot.xaxis.formatter = DatetimeTickFormatter(
        hours  = [ '%R'    ],
        days   = [ '%a'    ],
        months = [ '%a %H' ],
)

# numeric format '0.000a' for the first y axis
avg_week_plot.yaxis[0].formatter = NumeralTickFormatter(format='0.000a')

# blue line: normalized hits; red line: rolling mean
avg_week_data = avg_week_plot.line(df3.index, df3['normalized_hits'], color="#2222aa", line_width=1)
avg_week_rolling_data = avg_week_plot.line(df3.index, df3['rolling'], color="red", line_width=1)

#output_notebook()
#show(avg_week_plot, notebook_handle=True)
#push_notebook()

# matplotlib version of the same two series; labels are set but no legend is requested in this cell
plt.plot(df3['normalized_hits'], color='blue', label='hits')
plt.plot(df3['rolling'], color='red',label='Original')

## Construct a synthetic year

In [None]:
# The average week (df3) starts on a Monday at 00:00, so the synthetic year must
# start on a Monday at 00:00 too; otherwise every tiled value lands on the wrong
# weekday/hour of the calendar index. Use the most recent Monday at midnight.
now = datetime.now()
last_monday = (now - timedelta(days=now.weekday())).replace(hour=0, minute=0, second=0, microsecond=0)

# 52 whole weeks of hourly rows, filled by repeating the average week 52 times
dfy = pd.DataFrame(index=pd.date_range(start=last_monday, periods=52*7*24, freq='H'), columns=['normalized_hits'])
dfy['normalized_hits'] = list(df3['normalized_hits']) * 52

In [None]:
# plotting yearly data: normalized hits of the synthetic year, with Bokeh and with matplotlib
# Figure with a datetime x axis; it is only built here, the show() call below is commented out
synthetic_year_plot = figure(title="wikipedia visits per hour", x_axis_type="datetime")

# plain (non-scientific) numbers on the y axis
synthetic_year_plot.yaxis.formatter = BasicTickFormatter(use_scientific=False)

# tick label format for each time scale of the x axis
# NOTE(review): the 'years' scale uses '%a %H', which looks like a weekday/hour
# pattern rather than a year pattern — confirm this is intended
synthetic_year_plot.xaxis.formatter = DatetimeTickFormatter(
        hours  = [ '%R'    ],
        days   = [ '%d %b' ],
        months = [ '%b'    ],
        years  = [ '%a %H' ],
)

# (despite its name, year_zscore_data holds the normalized-hits glyph, not a z-score)
year_zscore_data = synthetic_year_plot.line(dfy.index, dfy['normalized_hits'], color="#2222aa", line_width=1)
#year_rolling_data = year_plot.line(df.index, dfy['rolling'], color="red", line_width=1)

#output_notebook()
#show(synthetic_year_plot, notebook_handle=True)
#push_notebook()

# matplotlib version of the same series
plt.plot(dfy['normalized_hits'], color='blue', label='hits')

## Growth function

### No growth

In [None]:
# Constant growth factor of 1 for every row, i.e. no growth at all
dfy['growth_factor'] = [1] * len(dfy)
dfy.head()

In [None]:
# Placeholder until the value is requested from the user:
# the integer-truncated sum of the average week's hits
scale_factor = df3.loc[:, 'hits'].astype(int).sum()

In [None]:
# Absolute hits: normalized weekly shape, times the growth factor, times the scale
dfy['hits'] = dfy['normalized_hits'].mul(dfy['growth_factor']).mul(scale_factor)

### Linear growth (10% over len(dfy))

In [None]:
# Linear ramp of the growth factor from 1.00 (first row) to 1.10 (last row).
# np.linspace yields the same evenly spaced values as seeding both endpoints and
# interpolating with method='linear' (which ignores the index), but avoids the
# chained assignment dfy['growth_factor'][i] = ...: that form triggers
# SettingWithCopyWarning and has no effect under pandas copy-on-write.
dfy['growth_factor'] = np.linspace(1, 1.10, len(dfy))
plt.plot(dfy['growth_factor'])


In [None]:
# Absolute hits: normalized weekly shape, times the growth factor, times the scale
dfy['hits'] = dfy['normalized_hits'].mul(dfy['growth_factor']).mul(scale_factor)

In [None]:
# plotting yearly data: absolute hits of the synthetic year, with Bokeh and with matplotlib
# Figure with a datetime x axis; it is only built here, the show() call below is commented out
synthetic_year_plot = figure(title="wikipedia visits per hour", x_axis_type="datetime")

# plain (non-scientific) numbers on the y axis
synthetic_year_plot.yaxis.formatter = BasicTickFormatter(use_scientific=False)

# tick label format for each time scale of the x axis
# NOTE(review): the 'years' scale uses '%a %H', which looks like a weekday/hour
# pattern rather than a year pattern — confirm this is intended
synthetic_year_plot.xaxis.formatter = DatetimeTickFormatter(
        hours  = [ '%R'    ],
        days   = [ '%d %b' ],
        months = [ '%b'    ],
        years  = [ '%a %H' ],
)

# (despite its name, year_zscore_data holds the hits glyph, not a z-score)
year_zscore_data = synthetic_year_plot.line(dfy.index, dfy['hits'], color="#2222aa", line_width=1)
#year_rolling_data = year_plot.line(df.index, dfy['rolling'], color="red", line_width=1)

#output_notebook()
#show(synthetic_year_plot, notebook_handle=True)
#push_notebook()

# matplotlib version of the same series
plt.plot(dfy['hits'], color='#2222aa', label='hits')

### Exponential growth (100%, i.e. doubling, over len(dfy))

In [None]:
# Normalized position within the synthetic year: 0 at the first row, 1 at the last.
# np.linspace yields the same evenly spaced values as seeding both endpoints and
# interpolating with method='linear' (which ignores the index), but avoids the
# chained assignment dfy['x'][i] = ...: that form triggers SettingWithCopyWarning
# and has no effect under pandas copy-on-write.
dfy['x'] = np.linspace(0, 1, len(dfy))

# discard the previous growth factor column
dfy=dfy.drop(['growth_factor'], axis=1)

In [None]:
from math import log

# growth_factor = exp(x * ln 2): equals 1 where x is 0 and 2 where x is 1
dfy['growth_factor'] = np.exp(dfy['x'].mul(log(2)))

In [None]:
# Visual check of the growth factor curve
plt.plot(dfy.loc[:, 'growth_factor'])

In [None]:
# Absolute hits: normalized weekly shape, times the growth factor, times the scale
dfy['hits'] = dfy['normalized_hits'].mul(dfy['growth_factor']).mul(scale_factor)

In [None]:
# plotting yearly data: absolute hits of the synthetic year, with Bokeh and with matplotlib
# Figure with a datetime x axis; it is only built here, the show() call below is commented out
synthetic_year_plot = figure(title="wikipedia visits per hour", x_axis_type="datetime")

# plain (non-scientific) numbers on the y axis
synthetic_year_plot.yaxis.formatter = BasicTickFormatter(use_scientific=False)

# tick label format for each time scale of the x axis
# NOTE(review): the 'years' scale uses '%a %H', which looks like a weekday/hour
# pattern rather than a year pattern — confirm this is intended
synthetic_year_plot.xaxis.formatter = DatetimeTickFormatter(
        hours  = [ '%R'    ],
        days   = [ '%d %b' ],
        months = [ '%b'    ],
        years  = [ '%a %H' ],
)

# (despite its name, year_zscore_data holds the hits glyph, not a z-score)
year_zscore_data = synthetic_year_plot.line(dfy.index, dfy['hits'], color="#2222aa", line_width=1)
#year_rolling_data = year_plot.line(df.index, dfy['rolling'], color="red", line_width=1)

#output_notebook()
#show(synthetic_year_plot, notebook_handle=True)
#push_notebook()

# matplotlib version of the same series
plt.plot(dfy['hits'])

In [None]:
_million_reqs = 1000000

# Free tier, monthly freebies
_free_reqs = _million_reqs
_free_compute_GB_s = 400000

# Free-tier allowance still unused in the current month (mutated by get_hourly_cost)
_remaining_free_hits = _free_reqs
_remaining_free_GB_s = _free_compute_GB_s

# Prices
_cost_per_million_reqs = .20
_cost_per_GB_s = 0.00001667

# Assumed footprint of a single request
_mem_per_req_MB = 128
_time_per_req_ms = 500


def reset_free_tier_counters():
    """Restore both free-tier counters to their full monthly allowance."""
    global _remaining_free_hits, _remaining_free_GB_s
    _remaining_free_hits = _free_reqs
    _remaining_free_GB_s = _free_compute_GB_s


def __get_free_tier_discount(resources, remaining_resources):
    """Consume ``resources`` from a free allowance.

    Returns a ``(billable, remaining_allowance)`` tuple: what is left to pay
    for once the allowance has been applied, and what is left of the allowance.
    """
    return (0, remaining_resources-resources) if (remaining_resources > resources) \
        else (resources-remaining_resources, 0)


def get_hourly_cost(date, reqs):
    """Return the cost of serving ``reqs`` requests during the hour ``date``.

    The module-level free-tier counters are consumed as a side effect, and they
    are refilled whenever ``date`` is the first hour of a month (day 1, hour 0).
    Calls are therefore expected in chronological order, one per hour.

    Args:
        date: object exposing ``day`` and ``hour`` (e.g. a datetime).
        reqs: number of requests served during that hour.

    Returns:
        Request charge plus compute charge, after the free-tier discounts.
    """
    global _remaining_free_hits, _remaining_free_GB_s

    if date.day == 1 and date.hour == 0:
        reset_free_tier_counters()

    billable_reqs, _remaining_free_hits = __get_free_tier_discount(reqs, _remaining_free_hits)

    # Compute time is derived from ALL requests, not only the billable ones:
    # a request covered by the free request tier still uses compute time,
    # which has to be counted against the separate free GB-s allowance.
    compute_GB_s = reqs * _mem_per_req_MB/1024 * _time_per_req_ms/1000
    billable_GB_s, _remaining_free_GB_s = __get_free_tier_discount(compute_GB_s, _remaining_free_GB_s)

    return (billable_reqs / _million_reqs) * _cost_per_million_reqs + billable_GB_s * _cost_per_GB_s

In [None]:
# Start from a full free-tier allowance so that re-running this cell gives the
# same result (the counters are module-level state mutated by get_hourly_cost).
reset_free_tier_counters()

dfy['date'] = dfy.index

# get_hourly_cost has side effects, so it is called exactly once per row, in
# index order; a row-wise DataFrame.apply may invoke the function twice on the
# first row in older pandas versions, which would double-count that hour.
dfy['lambda_cost'] = [
    get_hourly_cost(date=timestamp, reqs=hits)
    for timestamp, hits in zip(dfy.index, dfy['hits'])
]

In [None]:
# Show every row from 2017-11-30 onwards
dfy.loc['2017-11-30':]