In [1]:
import pandas as pd
import numpy as np
import pylab
%matplotlib inline

import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

# Load data

In [2]:
# Load the per-wiki statistics CSV.
df = pd.read_csv("../data/20180220-wikia_stats_users_birthdate.csv")
# Parse the creation dates; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
df['datetime.birthDate'] = pd.to_datetime(
    df['datetime.birthDate'],
    infer_datetime_format=True,
    errors='coerce',
)
# Index the rows by creation date but keep the column as well (drop=False):
# later cells resample on the index and also read the column directly.
df = df.set_index('datetime.birthDate', drop=False)
df.head()

Unnamed: 0_level_0,url,domain,hub,id,lang,language,name,stats.activeUsers,stats.admins,stats.articles,...,stats.nonarticles,users_1,users_5,users_10,users_20,users_50,users_100,bots,birthDate,datetime.birthDate
datetime.birthDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-12-14 14:01:00,http://0ad.wikia.com/,0ad.wikia.com,Games,194794.0,en,en,0 A.D. Wiki,1.0,3.0,101.0,...,824.0,38,20,14,9,7,5,8,"14:01, December 14, 2010",2010-12-14 14:01:00
2016-09-13 02:10:00,http://0hourmysticknights.wikia.com/,0hourmysticknights.wikia.com,Games,1459872.0,en,en,0 Hour: Mystic Knights Wikia,0.0,1.0,22.0,...,139.0,7,6,3,1,1,1,4,"02:10, September 13, 2016",2016-09-13 02:10:00
2013-02-04 10:34:00,http://0-xxii.wikia.com/,0-xxii.wikia.com,Games,685186.0,en,en,0-XXII Wiki,0.0,1.0,34.0,...,262.0,5,3,3,2,2,1,3,"10:34, February 4, 2013",2013-02-04 10:34:00
2011-08-14 10:18:00,http://00fanon.wikia.com/,00fanon.wikia.com,TV,350933.0,en,en,00 Fanon Wiki,1.0,1.0,93.0,...,257.0,5,2,1,1,1,1,3,"10:18, August 14, 2011",2011-08-14 10:18:00
2013-01-24 16:04:00,http://0002oifos.wikia.com/,0002oifos.wikia.com,Games,678685.0,en,en,0002oifos Wiki,0.0,1.0,6.0,...,263.0,3,1,1,1,1,1,2,"16:04, January 24, 2013",2013-01-24 16:04:00


# Number of wikis over the years

In [3]:
# Number of wikis created per calendar year: non-null `id` values in each
# year-end bin of the creation-date index.
byYear = df['id'].resample('y').count()
byYear

datetime.birthDate
1998-12-31      224
1999-12-31        7
2000-12-31        0
2001-12-31        2
2002-12-31        0
2003-12-31        0
2004-12-31       31
2005-12-31      516
2006-12-31     1123
2007-12-31     1111
2008-12-31     3295
2009-12-31    16410
2010-12-31    32576
2011-12-31    27429
2012-12-31    31491
2013-12-31    32641
2014-12-31    31962
2015-12-31    36450
2016-12-31    35694
2017-12-31    56855
2018-12-31    17529
Name: id, dtype: int64

The data show an anomaly: Wikia was created in 2004, yet some wikis have earlier birthdates. A deeper analysis of some of these anomalous wikis shows that fake birthdates were introduced by a maintenance script (see [the earliest edit of this wiki, as an example](http://blasterman.wikia.com/wiki/Blasterman_Wiki?dir=prev&action=history)).

Additionally, wikis created in 2018 are very young (no more than two months old), so they will also be removed.


In [4]:
# Keep only wikis created from 2004 through 2017 (inclusive).
# A boolean mask on the index year is used rather than the label slice
# df['2004':'2017']: the index is not sorted by date, and recent pandas
# versions raise KeyError for partial-string slices on a non-monotonic
# DatetimeIndex whose bounds are not exact index entries.
# Rows whose date could not be parsed (NaT) fail both comparisons and are
# therefore excluded. Row order is preserved.
birthYear = df.index.year
dfClean = df[(birthYear >= 2004) & (birthYear <= 2017)].copy()

# Wikis created per year (non-null `id` values per year-end bin).
byYear = dfClean.resample('y').count()['id']

# Same count restricted to wikis with stats.activeUsers >= 1 and users_1 > 0.
isActive = (dfClean['stats.activeUsers'] >= 1) & (dfClean['users_1'] > 0)
activeByYear = dfClean[isActive].resample('y').count()['id']
activeByYear

datetime.birthDate
2004-12-31       18
2005-12-31      364
2006-12-31      735
2007-12-31      658
2008-12-31     1519
2009-12-31     2910
2010-12-31     3705
2011-12-31     3688
2012-12-31     4147
2013-12-31     7201
2014-12-31     8675
2015-12-31    15187
2016-12-31    10498
2017-12-31    21031
Freq: A-DEC, Name: id, dtype: int64

In [5]:
# Grouped bars: all wikis vs. active wikis, per year of creation.
traceTotal = go.Bar(x=byYear.index.year, y=byYear.values, name="Total wikis")
traceActive = go.Bar(x=activeByYear.index.year, y=activeByYear.values, name="Active wikis")
layout = go.Layout(
    legend=dict(x=0.1, y=0.85),
    xaxis=dict(
        title='Year of creation',
        # One tick per year. `tickmode='linear'` with `dtick=1` replaces the
        # legacy `autotick=False` attribute, which current plotly versions
        # reject as an invalid axis property.
        tickmode='linear',
        dtick=1,
        tickangle=30
    ),
    yaxis=dict(title='Number of wikis')
)
iplot(go.Figure(data=[traceTotal, traceActive], layout=layout), filename='byYear')

## Wiki age

Instead of showing the number of wikis over the years, we will focus on the age of the active wikis in order to visualize the population pyramid.

In [6]:
def computeAge(birthDate, referenceDate=pd.Timestamp(2018, 2, 20)):
    """Return the number of completed calendar years between two dates.

    Parameters
    ----------
    birthDate : pd.Timestamp
        Creation date of the wiki.
    referenceDate : pd.Timestamp, optional
        Date at which the age is measured. Defaults to 2018-02-20, the date
        that was previously hard-coded in this function.

    Returns
    -------
    int
        Completed years; an anniversary counts from its calendar day on
        (time of day is ignored).

    Raises
    ------
    ValueError
        If ``birthDate`` is missing (NaT/NaN).
    """
    if pd.isnull(birthDate):
        raise ValueError("birthDate must be a valid timestamp, got a missing value")
    age = referenceDate.year - birthDate.year
    # Compare calendar (month, day) instead of dividing elapsed days by 365:
    # the division ignores leap days and over-counts by one year for dates
    # just before an anniversary (e.g. 2010-02-21 -> 2018-02-20 is 7, not 8).
    if (referenceDate.month, referenceDate.day) < (birthDate.month, birthDate.day):
        age -= 1
    return age
# Add an `age` column by applying computeAge to every creation timestamp.
dfClean = dfClean.assign(age=dfClean['datetime.birthDate'].apply(computeAge))

In [7]:
# Active wikis: stats.activeUsers >= 1 AND users_1 > 0.
activeWikis = dfClean.loc[
    dfClean['stats.activeUsers'].ge(1) & dfClean['users_1'].gt(0)
]
# Inactive wikis: stats.activeUsers < 1 OR users_1 == 0.
# NOTE(review): this is not the exact complement of the active filter; a row
# with a missing stats.activeUsers value and users_1 > 0 lands in neither
# group, since every comparison against NaN is False. Confirm this is intended.
inactiveWikis = dfClean.loc[
    dfClean['stats.activeUsers'].lt(1) | dfClean['users_1'].eq(0)
]

In [8]:
# Number of wikis at each age (non-null `url` values per age group).
activeByAge = activeWikis.groupby('age')['url'].count()
inactiveByAge = inactiveWikis.groupby('age')['url'].count()

In [10]:
def _ageAreaTrace(countsByAge, label):
    """Build a thin-lined, filled line trace from a per-age count Series."""
    return go.Scatter(
        x=countsByAge.index.values,
        y=countsByAge.values,
        mode='lines',
        name=label,
        line={'width': 0.5},
        fill='tonexty',
    )


trace0 = _ageAreaTrace(activeByAge, "Active wikis")
# trace1 is built here but is not part of the figure drawn below.
trace1 = _ageAreaTrace(inactiveByAge, "Inactive wikis")

layout = go.Layout(
    yaxis={'title': 'Number of active wikis'},
    xaxis={
        # The plot area takes the left half of the figure width.
        'domain': [0, 0.5],
        # Explicit ticks at ages 0..19.
        'tickmode': 'array',
        'tickvals': list(range(20)),
        'title': "Age (in years)",
    },
    legend={'x': 0.4},
)

fig = go.Figure(data=[trace0], layout=layout)
iplot(fig, filename='stacked-area-plot')