# Wikimedia data

- [Wikimedia Downloads: Analytics Datasets](https://dumps.wikimedia.org/other/analytics/)
Info about Pageviews, mediacounts and unique devices:
- [Pageviews since May 2015](https://dumps.wikimedia.org/other/pageviews/):
```
https://dumps.wikimedia.org/other/pageviews/[YEAR]/[YEAR]-[2-DIGIT-MONTH]/pageviews-YYYYMMDD-HHMMSS.gz
```

- [Siteviews interactive analysis](https://tools.wmflabs.org/siteviews/?platform=all-access&source=pageviews&agent=all-agents&start=2015-07&end=2017-09&sites=all-projects)

## Running this notebook:

Dependencies:
- Bokeh
- Pandas

Enable widgetsnbextension: 
```
$ jupyter nbextension enable --py --sys-prefix widgetsnbextension
```


In [None]:
# Fetch traffic records through the project's Wikimedia scraper module and
# collect them into a pandas DataFrame.
from datetime import datetime

import pandas as pd
import wikimedia_scraper as ws

# Date range handed to the scraper.
start_date = datetime(2016, 10, 31)
end_date = datetime(2017, 5, 30)

ws.output_notebook()

# Materialise the generator into a list before building the DataFrame.
traffic_generator = ws.get_traffic_generator(start_date, end_date)
records = list(traffic_generator)
df = pd.DataFrame(records)

In [None]:
# Index the frame by timestamp: build a DatetimeIndex from the 'date' column,
# install it as the index, then drop the now-redundant column.
date_index = pd.DatetimeIndex(df['date'])
df = df.set_index(date_index).drop(['date'], axis=1)

# Keep only the rows whose project code is 'en'.
is_en_project = df['project'] == 'en'
df = df.loc[is_en_project]

# Last expression of the cell: the notebook displays the summary statistics.
df.describe()

In [None]:
# Cast the counts to float first: without the cast the original author saw an
# INF value when computing the mean (suspected to be an overflow on big numbers).
hits = df['hits'].astype(float)
df['hits'] = hits

# z-score of each row, using the population standard deviation (ddof=0).
df["col_zscore"] = (hits - hits.mean()) / hits.std(ddof=0)

# Rolling mean of the z-score over a 24*7-row window; a value is produced as
# soon as 3 observations are available.
# NOTE(review): the window equals one week only if rows are hourly - confirm.
zscore_window = df['col_zscore'].rolling(window=24 * 7, min_periods=3)
df["rolling"] = zscore_window.mean()

# Last expression of the cell: the notebook displays the first rows.
df.head()

In [None]:
import numpy as np
from datetime import timedelta


# Build df3, the "average week": the mean of 'hits' for every (weekday, hour)
# slot, laid out on a reference week of 7 * 24 hourly timestamps.

days=('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')

# Add new columns based on the date index.
# DatetimeIndex.weekday numbers the days from Monday=0, the same order as
# `days`. Mapping the numbers ourselves avoids `weekday_name`, which was
# removed in pandas 1.0.
df['weekday'] = [days[number] for number in df.index.weekday]
df['hour'] = df.index.hour

# The reference week starts on Monday 1970-01-05; `startdate` advances one day
# per loop iteration.
startdate = datetime(1970,1,5)
hours_of_day = list(range(24))

# Collect one frame per weekday and concatenate once after the loop.
day_frames = []

for day in days:
    # 24 hourly timestamps for this day, built with timedelta arithmetic so no
    # frequency alias is needed (the 'H' alias is deprecated in recent pandas).
    slots = pd.DatetimeIndex([startdate + timedelta(hours=hour) for hour in hours_of_day])
    hitmeans = df.loc[df['weekday']==day].groupby('hour')['hits'].mean()
    # Reindex to all 24 hours: an hour with no observations becomes NaN
    # instead of causing a length mismatch on assignment.
    dfx1 = pd.DataFrame({'hits': hitmeans.reindex(hours_of_day).values}, index=slots)
    day_frames.append(dfx1)
    startdate += timedelta(days=1)

df3 = pd.concat(day_frames)

In [None]:
# Show only the rows whose timestamp lies inside the window (both ends inclusive).
window_start = '2017-05-22 15:00:00'
window_end = '2017-05-23 5:00:00'
mask = (df.index >= window_start) & (df.index <= window_end)
df.loc[mask]

# Now the plots


In [None]:
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure

from bokeh.models import DatetimeTickFormatter, NumeralTickFormatter, BasicTickFormatter
from bokeh.models.tickers import FixedTicker

In [None]:
# Line chart over the full date range: the z-score of the hits (blue) and the
# rolling mean of that z-score (red).
year_plot = figure(x_axis_type="datetime", title="wikipedia visits per hour")

# Tick-label formats for the datetime x axis, one per tick resolution.
year_plot.xaxis.formatter = DatetimeTickFormatter(
    hours=["%R"], days=["%d %b"], months=["%b"], years=["%a %H"],
)

# Plain (non-scientific) numbers on the y axis.
year_plot.yaxis.formatter = BasicTickFormatter(use_scientific=False)

year_zscore_data = year_plot.line(
    df.index, df['col_zscore'], line_width=1, color="#2222aa"
)
year_rolling_data = year_plot.line(
    df.index, df['rolling'], line_width=1, color="red"
)

# Render inline in the notebook.
output_notebook()
show(year_plot, notebook_handle=True)
push_notebook()


In [None]:
# Line chart of the average-week profile stored in df3 (column 'hits' against
# its datetime index).
avg_week_plot = figure(title="wikipedia average week", x_axis_type="datetime")

# Tick-label formats for the datetime x axis, one per tick resolution.
avg_week_plot.xaxis.formatter=DatetimeTickFormatter(
        hours=["%R"],
        days=["%a"],
        months=["%a - %H"],
        years=["%a %H"],
    )

# The y-axis formatter is set once. The cell used to assign a
# BasicTickFormatter first, but this assignment overwrote it, so only the
# NumeralTickFormatter ever took effect.
avg_week_plot.yaxis[0].formatter = NumeralTickFormatter(format='0.0a')

avg_week_data = avg_week_plot.line(df3.index, df3['hits'], color="#2222aa", line_width=1)

# Render inline in the notebook.
output_notebook()
show(avg_week_plot, notebook_handle=True)
push_notebook()
