In [1]:
from collections import deque, namedtuple
from IPython.display import HTML, display
from IPython.core.debugger import set_trace
from tqdm import tqdm

import pandas as pd
import numpy as np
import requests
import sqlite3
import urllib

In [2]:
SAMPLE_ID = 25224070
START = 20140101
END = 20200301

DB = './db.sqlite'
TABLES = ['PhilosophicalSchool', 'Philosopher', 'NotableIdea', 'MainInterest', 'Era'       , 'Work'      , 'PhilosopherWasBorn'   , 'PhilosopherDied']
FIELDS = ['wikiPageID'         , 'wikiPageID',   'wikiPageID', 'wikiPageID',   'wikiPageID', 'wikiPageID', 'birthPlace_wikiPageID', 'deathPlace_wikiPageID']
TABLE_NAME = 'WikiPagePopularity'
TEMP_FETCH = './_temp_fetch.csv'

In [3]:
def fetch_title(page_id):
    r = requests.get('https://en.wikipedia.org/w/api.php', params={
        'action': 'query',
        'prop': 'info',
        'pageids': page_id,
        'format': 'json'
    })
    try:
         return r.json()['query']['pages'][str(page_id)]['title']
    except Exception as exp:
        print('Error:', page_id)
        display(page_id)
    
title = fetch_title(SAMPLE_ID)
print(title)

Wikipedia:Pageview statistics


In [4]:
def fetch_popularity(page_title):
    page_title = urllib.parse.quote_plus(page_title.replace(' ', '_'))
    r = requests.get(
        'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/'
        f'en.wikipedia.org/all-access/all-agents/{page_title}/monthly/{START}/{END}'
    )
    try:
        res = r.json()['items']
    except Exception as exp:
        print('Error:', page_title)
        display(exp)
        return None
    df = pd.DataFrame(res)
    return df.views.sum()

fetch_popularity(title)

166605

In [5]:
def get_ids():
    try:
        conn = sqlite3.connect(DB)
        ids = set()
        for table_name, field in zip(TABLES, FIELDS):
            df = pd.read_sql(f'SELECT * FROM {table_name}', conn)
            df[field].dropna().apply(lambda val: ids.add(int(val)))
    finally:
        conn.close
    ids = list(ids)
    ids.sort()
    return ids
        
ids = get_ids()

In [6]:
fetched = deque()
Popularity = namedtuple('Popularity', ['wikiPageID', 'total_visits'])

In [7]:
def restore_fetched(ids, fetched):
    df = pd.read_csv(TEMP_FETCH)
    print('Total:', len(ids))
    nones = 0
    for id_, pop in df.itertuples(index = False):
        if pop is not None and pop > 0:
            fetched.append(Popularity(id_, pop))
            ids.remove(id_)
        else:
            nones += 1
    print('Restored:', len(fetched))
    print('None:', nones)
    print('Remaining:', len(ids))
        
restore_fetched(ids, fetched)

Total: 7050
Restored: 6997
None: 53
Remaining: 53


In [8]:
def fetch_ids_popularity(ids, fetched):
    for id_ in tqdm(ids):
        title = fetch_title(id_)
        if title is None:
            fetched.append(Popularity(id_, None))
            continue
        popularity = fetch_popularity(title)
        if popularity is None:
            fetched.append(Popularity(id_, None))
            continue
        fetched.append(Popularity(id_, popularity))
    return fetched

fetched = fetch_ids_popularity(ids, fetched)

  0%|          | 0/53 [00:00<?, ?it/s]

Error: 286153


286153

  2%|▏         | 1/53 [00:01<01:30,  1.75s/it]

Error: 329399


329399

  4%|▍         | 2/53 [00:02<01:15,  1.47s/it]

Error: 1486408


1486408

  6%|▌         | 3/53 [00:03<01:02,  1.26s/it]

Error: 1498741


1498741

  8%|▊         | 4/53 [00:04<00:54,  1.11s/it]

Error: 2879398


2879398

  9%|▉         | 5/53 [00:05<00:51,  1.08s/it]

Error: 6197115


6197115

 11%|█▏        | 6/53 [00:08<01:20,  1.71s/it]

Error: 6792310


6792310

 13%|█▎        | 7/53 [00:14<02:15,  2.95s/it]

Error: 6927115


6927115

 15%|█▌        | 8/53 [00:20<02:55,  3.90s/it]

Error: 7179351


7179351

 17%|█▋        | 9/53 [00:25<03:04,  4.19s/it]

Error: 7655409


7655409

 19%|█▉        | 10/53 [00:26<02:18,  3.23s/it]

Error: 7661785


7661785

 21%|██        | 11/53 [00:28<02:04,  2.97s/it]

Error: 8253337


8253337

 23%|██▎       | 12/53 [00:31<01:57,  2.87s/it]

Error: 8368474


8368474

 25%|██▍       | 13/53 [00:32<01:31,  2.30s/it]

Error: 11401875


11401875

 26%|██▋       | 14/53 [00:32<01:09,  1.79s/it]

Error: 13355860


13355860

 28%|██▊       | 15/53 [00:40<02:12,  3.48s/it]

Error: 18437404


18437404

 30%|███       | 16/53 [00:41<01:41,  2.74s/it]

Error: 20627628


20627628

 32%|███▏      | 17/53 [00:42<01:20,  2.23s/it]

Error: 22695176


22695176

 34%|███▍      | 18/53 [00:48<01:56,  3.34s/it]

Error: 23040717


23040717

 36%|███▌      | 19/53 [00:49<01:30,  2.66s/it]

Error: 23763468


23763468

 38%|███▊      | 20/53 [00:49<01:07,  2.05s/it]

Error: 27557728


27557728

 40%|███▉      | 21/53 [00:52<01:08,  2.14s/it]

Error: 30846343


30846343

 42%|████▏     | 22/53 [00:52<00:52,  1.70s/it]

Error: 32395653


32395653

 43%|████▎     | 23/53 [00:53<00:45,  1.52s/it]

Error: 32633712


32633712

 45%|████▌     | 24/53 [00:54<00:35,  1.24s/it]

Error: 33230227


33230227

 47%|████▋     | 25/53 [00:55<00:29,  1.06s/it]

Error: 33287285


33287285

 49%|████▉     | 26/53 [00:57<00:37,  1.40s/it]

Error: 33529713


33529713

 51%|█████     | 27/53 [01:00<00:50,  1.96s/it]

Error: 33744106


33744106

 53%|█████▎    | 28/53 [01:02<00:51,  2.05s/it]

Error: 35844686


35844686

 55%|█████▍    | 29/53 [01:05<00:50,  2.10s/it]

Error: 37563858


37563858

 57%|█████▋    | 30/53 [01:07<00:49,  2.14s/it]

Error: 38962106


38962106

 58%|█████▊    | 31/53 [01:10<00:56,  2.57s/it]

Error: 39155457


39155457

 60%|██████    | 32/53 [01:12<00:47,  2.28s/it]

Error: 39296255


39296255

 62%|██████▏   | 33/53 [01:15<00:51,  2.58s/it]

Error: 39681168


39681168

 64%|██████▍   | 34/53 [01:17<00:45,  2.40s/it]

Error: 41082682


41082682

 66%|██████▌   | 35/53 [01:19<00:41,  2.32s/it]

Error: 41177736


41177736

 68%|██████▊   | 36/53 [01:25<00:54,  3.22s/it]

Error: 42465510


42465510

 70%|██████▉   | 37/53 [01:26<00:40,  2.55s/it]

Error: 46259924


46259924

 72%|███████▏  | 38/53 [01:28<00:37,  2.47s/it]

Error: 47023314


47023314

 74%|███████▎  | 39/53 [01:29<00:29,  2.11s/it]

Error: 47040274


47040274

 75%|███████▌  | 40/53 [01:31<00:24,  1.87s/it]

Error: 47432404


47432404

 77%|███████▋  | 41/53 [01:33<00:25,  2.15s/it]

Error: 48644451


48644451

 79%|███████▉  | 42/53 [01:39<00:35,  3.26s/it]

Error: 48940433


48940433

 81%|████████  | 43/53 [01:41<00:26,  2.67s/it]

Error: 50511439


50511439

 83%|████████▎ | 44/53 [01:41<00:19,  2.14s/it]

Error: 51021251


51021251

 85%|████████▍ | 45/53 [01:42<00:14,  1.78s/it]

Error: 51109323


51109323

 87%|████████▋ | 46/53 [01:46<00:15,  2.20s/it]

Error: 51290174


51290174

 89%|████████▊ | 47/53 [01:49<00:15,  2.51s/it]

Error: 51360124


51360124

 91%|█████████ | 48/53 [01:51<00:12,  2.48s/it]

Error: 51360363


51360363

 92%|█████████▏| 49/53 [01:53<00:09,  2.29s/it]

Error: 51360495


51360495

 94%|█████████▍| 50/53 [01:56<00:07,  2.63s/it]

Error: 51360731


51360731

 96%|█████████▌| 51/53 [02:05<00:08,  4.31s/it]

Error: 51368906


51368906

 98%|█████████▊| 52/53 [02:11<00:04,  4.95s/it]

Error: 51443180


51443180

100%|██████████| 53/53 [02:12<00:00,  2.50s/it]


In [9]:
def save_fetched(fetched):
    df = pd.DataFrame(fetched)
    df.to_csv(TEMP_FETCH, index=False)
    
save_fetched(fetched)

In [10]:
def add_fetched_to_db(fetched):
    df = pd.DataFrame(fetched)
    df = df.set_index('wikiPageID')
    try:
        conn = sqlite3.connect(DB)
        df.to_sql(TABLE_NAME, conn, if_exists='replace')
        conn.commit()
    finally:
        conn.close()
        
add_fetched_to_db(fetched)