In [48]:
import re

import numpy as np

from config import get_sessionmaker
from models import Page, PageQuality

In [2]:
DATABASE_URI = "postgresql://postgres:postgres@localhost:5432/complete_wikipedia"
Session = get_sessionmaker(db_uri=DATABASE_URI)
s = Session()

In [4]:
pages_added = 0
for row in np_array:
    title = str(row[1])
    if len(title) > 200:
        continue # ignore titles > 200 chars
    page = Page(
        page_id = int(row[0]),
        page_title = title,
        )
    s.add(page)
    pages_added += 1
    if pages_added % 500000 == 0:
        s.commit()
        print(f"{pages_added} added")

500000 added
1000000 added
1500000 added
2000000 added
2500000 added
3000000 added
3500000 added
4000000 added
4500000 added
5000000 added
5500000 added
6000000 added


# Load Page Qualities

In [3]:
categorylinks_array = np.load('categorylinks.npy', allow_pickle=True)


In [8]:
quality_exp = re.compile(b"^(.*?)-Class")
qualities = []

In [9]:
def gen_item():
    num_items = 0
    num_unique_items = 0
    for row in categorylinks_array:
        last_id = 0
        for item in row:
            item_id = int(item[0])
            num_items += 1
            if item_id != last_id:
                page_quality = quality_exp.match(item[1]).group(1).decode("utf-8")
                last_id = item_id
                num_unique_items += 1
                yield([item_id, page_quality])
            if num_items % 250000 == 0:
                print(f"scanned {num_items} items")
                print(f"{num_unique_items} unique items")
        
    

In [10]:
item_iterator = gen_item()
for page in item_iterator:
    qualities.append(page)

scanned 250000 items
50587 unique items
scanned 500000 items
116632 unique items
scanned 750000 items
180979 unique items
scanned 1000000 items
244563 unique items
scanned 1250000 items
315611 unique items
scanned 1500000 items
390855 unique items
scanned 1750000 items
467635 unique items
scanned 2000000 items
538176 unique items


In [11]:
def sortFunc(e):
    return e[0]

qualities.sort(key=sortFunc)

In [13]:
qualities[:20]

[[128, 'B'],
 [354, 'B'],
 [354, 'C'],
 [672, 'B'],
 [672, 'C'],
 [692, 'B'],
 [692, 'C'],
 [714, 'B'],
 [722, 'GA'],
 [774, 'GA'],
 [793, 'B'],
 [793, 'C'],
 [845, 'B'],
 [861, 'C'],
 [934, 'C'],
 [987, 'GA'],
 [1001, 'GA'],
 [1024, 'C'],
 [1039, 'GA'],
 [1040, 'B']]

In [15]:
unique_qualities = [] # if multiple qualities exist, pick higher quality

In [16]:
last_id = 0
for quality in qualities:
    if quality[0] == last_id:
        continue
    else:
        unique_qualities.append(quality)
        last_id = quality[0]

In [17]:
unique_qualities[:20]

[[128, 'B'],
 [354, 'B'],
 [672, 'B'],
 [692, 'B'],
 [714, 'B'],
 [722, 'GA'],
 [774, 'GA'],
 [793, 'B'],
 [845, 'B'],
 [861, 'C'],
 [934, 'C'],
 [987, 'GA'],
 [1001, 'GA'],
 [1024, 'C'],
 [1039, 'GA'],
 [1040, 'B'],
 [1048, 'C'],
 [1054, 'C'],
 [1065, 'C'],
 [1066, 'C']]

In [18]:
len(unique_qualities)

525828

In [52]:
s.rollback()

In [45]:
qualities_added = 0
qualities_skipped = 0
for row in unique_qualities:
    result = s.query(Page).filter(Page.page_id==row[0]).first()
    if result is None:
        qualities_skipped += 1
        if qualities_skipped % 20000 == 0:
            print(f"{qualities_added} added")
            print(f"{qualities_skipped} skipped")
        continue
    quality = PageQuality(
        page_id = row[0],
        page_quality = row[1],
        )
    s.add(quality)
    qualities_added += 1
    if qualities_added % 1000 == 0:
        s.commit()
        print(f"{qualities_added} added")
        print(f"{qualities_skipped} skipped")

0 added
1000 skipped
2 added
2000 skipped
2 added
3000 skipped
2 added
4000 skipped
2 added
5000 skipped
2 added
6000 skipped
2 added
7000 skipped
2 added
8000 skipped
2 added
9000 skipped
2 added
10000 skipped
2 added
11000 skipped
2 added
12000 skipped
2 added
13000 skipped
2 added
14000 skipped
2 added
15000 skipped
2 added
16000 skipped
2 added
17000 skipped
2 added
18000 skipped
2 added
19000 skipped
2 added
20000 skipped
2 added
21000 skipped
2 added
22000 skipped
2 added
23000 skipped
2 added
24000 skipped
2 added
25000 skipped
2 added
26000 skipped
2 added
27000 skipped
2 added
28000 skipped
2 added
29000 skipped
2 added
30000 skipped
2 added
31000 skipped
2 added
32000 skipped
2 added
33000 skipped
2 added
34000 skipped
2 added
35000 skipped
2 added
36000 skipped
3 added
37000 skipped
3 added
38000 skipped
3 added
39000 skipped
3 added
40000 skipped
3 added
41000 skipped
3 added
42000 skipped
3 added
43000 skipped
3 added
44000 skipped
3 added
45000 skipped
3 added
46000 skipp

KeyboardInterrupt: 

In [51]:
s.query(Page).filter(Page.page_title=="Anarky").first().page_id

1192060

In [None]:
for 