In [1]:
import wmfdata as wmf

You can find the source for `wmfdata` at https://github.com/neilpquinn/wmfdata


In [2]:
# Per-wiki query: returns the name of the database it is executed in (`wiki`)
# together with the `ss_good_articles` counter from that database's
# `site_stats` table (`articles`). The table name is unqualified, so the
# result depends on which wiki database the query is run against; later cells
# pass it to `wmf.mariadb.multirun` with a list of wikis.
# NOTE(review): ss_good_articles is presumably MediaWiki's content-page
# counter -- confirm against the site_stats schema documentation.
num_art = """
select
    database() as wiki,
    ss_good_articles as articles
from site_stats;
"""

# Wikipedia articles

In [3]:
# Wikis in the "wikipedia" group; passed as `wikis=` to `multirun` in the
# next cell.
wps = wmf.utils.list_wikis(["wikipedia"])

In [None]:
# Execute the article-count query for the Wikipedia wiki list and keep the
# combined result (one `wiki` / `articles` row per wiki in the output below).
art = wmf.mariadb.multirun(num_art, wikis=wps)

In [17]:
# Preview the first rows of the per-wiki article counts.
art.head()

Unnamed: 0,wiki,articles
0,aawiki,1
1,abwiki,3408
2,acewiki,7206
3,adywiki,408
4,afwiki,49461


In [5]:
# Total of the `articles` column across all queried Wikipedias.
art["articles"].sum()

47902207

# Wiktionary entries

In [6]:
# Wikis in the "wiktionary" group; passed as `wikis=` to `multirun` in the
# next cell.
wts = wmf.utils.list_wikis(["wiktionary"])

In [None]:
# Reuse the same count query for the Wiktionary wiki list. The result column
# is still called `articles` because the query text is shared.
ent = wmf.mariadb.multirun(num_art, wikis=wts)

In [18]:
# Preview the first rows of the per-wiki Wiktionary counts.
ent.head()

Unnamed: 0,wiki,articles
0,aawiktionary,0
1,abwiktionary,0
2,afwiktionary,20342
3,akwiktionary,0
4,alswiktionary,0


In [None]:
# Total of the `articles` column across all queried Wiktionaries.
ent["articles"].sum()

# Wikidata items

In [9]:
# Wikidata is a single database, so its counter is read directly through a
# database-qualified table name with `run` rather than `multirun`.
wmf.mariadb.run("select ss_good_articles from wikidatawiki.site_stats")

Unnamed: 0,ss_good_articles
0,47218270


# Commons files

In [11]:
# Count Commons file pages per Creative Commons license label.
#
# Query structure, innermost first:
#   * cc_cats  - category titles that start with 'CC-', do not contain
#                'aircraft', and do not contain 'CC-' followed by three
#                uppercase letters or digits (the regexp is unanchored).
#                NOTE(review): these exclusions presumably remove categories
#                that are not license categories -- confirm.
#   * cc_files - one row per file page (cl_type = "file") linked to at least
#                one cc_cats category, with all of its matching category
#                names joined into `cats` by group_concat.
#   * licenses - assigns each file exactly one label: the first CASE branch
#                whose pattern occurs anywhere in `cats`. Patterns that
#                contain another pattern ('CC-BY-NC-ND', 'CC-BY-NC-SA', ...)
#                are listed before the shorter one ('CC-BY-NC', 'CC-BY'), so
#                branch order matters; a file in several CC categories is
#                counted once, under the earliest matching branch.
#   * outer    - number of files per label.
#
# NOTE(review): group_concat output is limited by the server's
# group_concat_max_len setting; a truncated `cats` value could change the
# matched branch. Presumably no file has enough CC categories to reach the
# limit -- confirm.
licenses_q = """
select license, count(*) as files
from
(select
  case
    when cats like '%CC-BY-NC-ND%' then 'CC-BY-NC-ND'
    when cats like '%CC-BY-NC-SA%' then 'CC-BY-NC-SA'
    when cats like '%CC-BY-ND%' then 'CC-BY-ND'
    when cats like '%CC-BY-SA%' then 'CC-BY-SA'
    when cats like '%CC-BY-NC%' then 'CC-BY-NC'
    when cats like '%CC-BY%' then 'CC-BY'
    when cats like '%CC-SA%' then 'CC-SA'
    when cats like '%CC-Zero%' then 'CC-0'
    when cats like '%CC-PD%' then 'CC-PD'
    else 'Other CC'
  end as license
  from
  (select cl_from, group_concat(cl_to) as cats
    from commonswiki.categorylinks
    inner join
      (select cat_title from commonswiki.category where
        cat_title like 'CC-%' and
        cat_title not like '%aircraft%' and
        cat_title not regexp 'CC-[[:upper:][:digit:]]{3}'
      ) cc_cats
    on cat_title = cl_to
    where cl_type = "file"
    group by cl_from
  ) cc_files
) licenses
group by license;
"""

# Every table in the query is qualified with `commonswiki`, so a single `run`
# call is enough.
licenses = wmf.mariadb.run(licenses_q)

In [12]:
# Show the per-license counts ordered by the `license` label.
licenses.sort_values("license")

Unnamed: 0,license,files
0,CC-0,2556807
1,CC-BY,6649957
2,CC-BY-NC,6705
3,CC-BY-NC-SA,231
4,CC-BY-ND,15
5,CC-BY-SA,26282103
6,CC-PD,4269282
7,CC-SA,2799
8,Other CC,1


In [13]:
# Total number of files that received a license label.
licenses["files"].sum()

39767900

In [None]:
# Count CC-categorised Commons files per media type.
#
# cc_files selects the distinct file pages (cl_type = "file") linked to a
# category that passes the same title filter used in licenses_q. Each page id
# is joined to `page` (cl_from = page_id) and then to `image`
# (page_title = img_name) to read img_media_type. Both joins are inner joins,
# so a file page without a matching `page` or `image` row is not counted.
media_types_q = """
select img_media_type, count(*) as files
from
(select distinct cl_from
  from commonswiki.categorylinks
  inner join
    (select cat_title from commonswiki.category where
      cat_title like 'CC-%' and
      cat_title not like '%aircraft%' and
      cat_title not regexp 'CC-[[:upper:][:digit:]]{3}'
    ) cc_cats
  on cat_title = cl_to
  where cl_type = "file"
) cc_files
inner join commonswiki.page on cl_from = page_id
inner join commonswiki.image on page_title = img_name
group by img_media_type;
"""

# Every table in the query is qualified with `commonswiki`, so a single `run`
# call is enough.
media_types = wmf.mariadb.run(media_types_q)

In [15]:
# Display the per-media-type file counts.
media_types

Unnamed: 0,img_media_type,files
0,BITMAP,37784741
1,DRAWING,819140
2,AUDIO,799534
3,VIDEO,103970
4,MULTIMEDIA,4
5,OFFICE,260210
6,3D,281


In [16]:
# Total number of CC-categorised files across all media types.
media_types["files"].sum()

39767880

## Total Commons files

In [10]:
# Read the `ss_images` counter from Commons' site_stats table: the overall
# file total for this section, as opposed to the CC-categorised counts.
wmf.mariadb.run("select ss_images from commonswiki.site_stats")

Unnamed: 0,ss_images
0,46474917
