Files c.f. https://dumps.wikimedia.org/simplewiki/20180220/

In [13]:
import gzip
import re

from sqlalchemy import create_engine

In [2]:
engine = create_engine('mysql+mysqlconnector://claydavi:@/enwiki')

We need the `page` and `categorylinks` tables loaded; we can use stream processing for the links.

`math_page` is a subset of the wikipedia `page` table, with pages in namespace 0 linked to any category matching `%athematic%`

```sql
create temporary table math_page_ids as
select distinct(cl_from) as page_id 
from categorylinks 
where cl_type='page' 
    and cl_to like '%athematic%';
```
```sql
create table math_page as
select page.page_id, page_title, page_is_redirect 
from page 
    join math_page_ids 
    on page.page_id = math_page_ids.page_id 
where page_namespace = 0;
```

In [6]:
result = engine.execute('select page_id from math_page')
page_ids = {x[0] for x in result}

In [7]:
len(page_ids)

15992

In [9]:
result = engine.execute('select page_title, page_id from math_page')
page_titles = {k.decode('utf8'): v for k, v in result}

In [10]:
len(page_titles)

15992

In [5]:
columns = ['pl_from', 'pl_namespace', 'pl_title', 'pl_from_namespace']

In [64]:
links = set()
line_num = 0

In [72]:
rex = re.compile(r"\((\d+),(\d+),'(.+?)',(\d+)\)")
with gzip.open('enwiki-20180220-pagelinks.sql.gz', 'rt', encoding='latin1') as f:
    for idx, line in enumerate(f):
        if idx % 100 == 0:
            print('line %i' % idx)
        # if idx < line_num:
        #     continue
        line_num = idx
        if line.startswith('INSERT INTO'):
            data = rex.findall(line)
            # print(len(data))
            for row in data:
                try:
                    page_id = int(row[0])
                    page_title = row[2]
                except ValueError:
                    print(datum, row)
                    print(line)
                    raise
                if page_id in page_ids and page_title in page_titles:
                    links.add((page_id, page_titles[page_title]))

line 0
line 100
line 200
line 300
line 400
line 500
line 600
line 700
line 800
line 900
line 1000
line 1100
line 1200
line 1300
line 1400
line 1500
line 1600
line 1700
line 1800
line 1900
line 2000
line 2100
line 2200
line 2300
line 2400
line 2500
line 2600
line 2700
line 2800
line 2900
line 3000
line 3100
line 3200
line 3300
line 3400
line 3500
line 3600
line 3700
line 3800
line 3900
line 4000
line 4100
line 4200
line 4300
line 4400
line 4500
line 4600
line 4700
line 4800
line 4900
line 5000
line 5100
line 5200
line 5300
line 5400
line 5500
line 5600
line 5700
line 5800
line 5900
line 6000
line 6100
line 6200
line 6300
line 6400
line 6500
line 6600
line 6700
line 6800
line 6900
line 7000
line 7100
line 7200
line 7300
line 7400
line 7500
line 7600
line 7700
line 7800
line 7900
line 8000
line 8100
line 8200
line 8300
line 8400
line 8500
line 8600
line 8700
line 8800
line 8900
line 9000
line 9100
line 9200
line 9300
line 9400
line 9500
line 9600
line 9700
line 9800
line 9900
line 10000
l

In [27]:
line_num

41061

In [73]:
len(links)

194368

In [48]:
page_titles

{'Wojciech_Zaremba': 51237053,
 'Mathematics_&_Mechanics_of_Solids': 31834571,
 'Allan_J._C._Cunningham': 1572371,
 'Amplitude_integrated_electroencephalography': 49895047,
 'Particular_values_of_the_gamma_function': 4239318,
 'Giulio_Ascoli': 3855752,
 "Lévy's_constant": 1067914,
 'Shoshana_Kamin': 24401034,
 'James_Lockhart_(banker)': 50061166,
 'Lars_Edvard_Phragmén': 29769882,
 'Raymond_Louis_Wilder': 4297377,
 'Paul_Glaister': 49149830,
 'Hekat_(unit)': 9098286,
 'Karl-Theodor_Sturm': 31696372,
 'Red_auxiliary_number': 19985265,
 "Jamshidian's_trick": 31954655,
 'Statistical_arbitrage': 1137949,
 'Cutting_sequence': 33509133,
 'Multivalued_treatment': 54248486,
 'Jonathan_Borwein': 1145010,
 'Ludvig_Faddeev': 4854639,
 'Pieter_Hendrik_Schoute': 6022750,
 'Christopher_Clavius': 633645,
 'Su_Buqing': 209460,
 'Thomas_Urquhart': 734814,
 'Boris_Zilber': 47992829,
 'Raymond_L._Johnson': 53709624,
 'Des_MacHale': 3681770,
 'Hellenic_Mathematical_Society': 945503,
 'Postcondition': 3929

In [74]:
import networkx as nx

In [75]:
D = nx.DiGraph()
D.add_edges_from(links)

In [76]:
len(D)

15414

In [77]:
len(links)

194368

In [78]:
len(page_ids)

15992

In [83]:
min(D.in_degree, key=lambda x: x[1])

(45285384, 0)

In [86]:
for title, page_id in page_titles.items():
    try:
        D.node[page_id]['title'] = title
    except KeyError:
        pass

In [89]:
nx.write_graphml(D, 'enwiki_math.graphml.gz')

# Resolve redirects

In [91]:
result = engine.execute('select page_id from math_page where page_is_redirect = 1')
redirect_ids = [x[0] for x in result]
len(redirect_ids)

411

In [110]:
for n in redirect_ids:
    if n in D:
        successors = list(D.successors(n))
        if len(successors) > 1:
            raise('%i has >1 successor')
        if len(successors) == 1:
            target = successors[0]
            for source in D.predecessors(n):
                D.add_edge(source, target)
        D.remove_node(n)
        

In [113]:
nx.write_graphml(D, 'enwiki_math.graphml.gz')

In [114]:
len(D)

15220

In [118]:
sorted(D.degree, key=lambda x: x[1], reverse=True)[:5]

[(1152126, 5181),
 (18831, 4987),
 (18902, 4198),
 (1485646, 2084),
 (198822, 1683)]

In [122]:
D.node[198822]['title']

'American_Mathematical_Society'