In [1]:
from collections import deque, namedtuple
from IPython.display import HTML, display
from IPython.core.debugger import set_trace
from tqdm import tqdm

import pandas as pd
import numpy as np
import requests
import sqlite3
import urllib

In [2]:
SAMPLE_ID = 25224070
START = 20140101
END = 20200301

DB = './db.sqlite'
TABLES = ['PhilosophicalSchool', 'Philosopher', 'NotableIdea', 'MainInterest', 'Era', 'Work']
TABLE_NAME = 'WikiPagePopularity'
TEMP_FETCH = './_temp_fetch.csv'

In [3]:
def fetch_title(page_id):
    r = requests.get('https://en.wikipedia.org/w/api.php', params={
        'action': 'query',
        'prop': 'info',
        'pageids': page_id,
        'format': 'json'
    })
    try:
         return r.json()['query']['pages'][str(page_id)]['title']
    except Exception as exp:
        print('Error:', page_id)
        display(page_id)
    
title = fetch_title(SAMPLE_ID)
print(title)

Wikipedia:Pageview statistics


In [4]:
def fetch_popularity(page_title):
    page_title = urllib.parse.quote_plus(page_title.replace(' ', '_'))
    r = requests.get(
        'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/'
        f'en.wikipedia.org/all-access/all-agents/{page_title}/monthly/{START}/{END}'
    )
    try:
        res = r.json()['items']
    except Exception as exp:
        print('Error:', page_title)
        display(exp)
        return None
    df = pd.DataFrame(res)
    return df.views.sum()

fetch_popularity(title)

166605

In [5]:
def get_ids():
    try:
        conn = sqlite3.connect(DB)
        ids = set()
        for table_name in TABLES:
            df = pd.read_sql(f'SELECT * FROM {table_name}', conn)
            df['wikiPageID'].dropna().apply(lambda val: ids.add(int(val)))
    finally:
        conn.close
    ids = list(ids)
    ids.sort()
    return ids
        
ids = get_ids()

In [6]:
fetched = deque()
Popularity = namedtuple('Popularity', ['wikiPageID', 'total_visits'])

In [7]:
def restore_fetched(ids, fetched):
    df = pd.read_csv(TEMP_FETCH)
    print('Total:', len(ids))
    nones = 0
    for id_, pop in df.itertuples(index = False):
        if pop is not None and pop > 0:
            fetched.append(Popularity(id_, pop))
            ids.remove(id_)
        else:
            nones += 1
    print('Restored:', len(fetched))
    print('None:', nones)
    print('Remaining:', len(ids))
        
restore_fetched(ids, fetched)

Total: 5102
Restored: 5062
None: 40
Remaining: 40


In [8]:
def fetch_ids_popularity(ids, fetched):
    for id_ in tqdm(ids):
        title = fetch_title(id_)
        if title is None:
            fetched.append(Popularity(id_, None))
            continue
        popularity = fetch_popularity(title)
        if popularity is None:
            fetched.append(Popularity(id_, None))
            continue
        fetched.append(Popularity(id_, popularity))
    return fetched

fetched = fetch_ids_popularity(ids, fetched)

  2%|▎         | 1/40 [00:01<00:58,  1.51s/it]

Error: 286153


286153

  5%|▌         | 2/40 [00:02<00:46,  1.22s/it]

Error: 1486408


1486408

  8%|▊         | 3/40 [00:02<00:36,  1.00it/s]

Error: 1498741


1498741

 10%|█         | 4/40 [00:03<00:32,  1.12it/s]

Error: 6197115


6197115

 12%|█▎        | 5/40 [00:03<00:27,  1.25it/s]

Error: 6927115


6927115

 15%|█▌        | 6/40 [00:06<00:51,  1.51s/it]

Error: 7179351


7179351

 18%|█▊        | 7/40 [00:07<00:41,  1.25s/it]

Error: 7655409


7655409

 20%|██        | 8/40 [00:08<00:33,  1.06s/it]

Error: 7661785


7661785

 22%|██▎       | 9/40 [00:09<00:32,  1.05s/it]

Error: 8253337


8253337

 25%|██▌       | 10/40 [00:09<00:26,  1.14it/s]

Error: 8368474


8368474

 28%|██▊       | 11/40 [00:10<00:23,  1.23it/s]

Error: 11401875


11401875

 30%|███       | 12/40 [00:10<00:20,  1.36it/s]

Error: 13355860


13355860

 32%|███▎      | 13/40 [00:11<00:17,  1.50it/s]

Error: 18437404


18437404

 35%|███▌      | 14/40 [00:12<00:17,  1.47it/s]

Error: 20627628


20627628

 38%|███▊      | 15/40 [00:12<00:16,  1.48it/s]

Error: 22695176


22695176

 40%|████      | 16/40 [00:13<00:16,  1.49it/s]

Error: 23040717


23040717

 42%|████▎     | 17/40 [00:14<00:14,  1.57it/s]

Error: 23763468


23763468

 45%|████▌     | 18/40 [00:14<00:13,  1.60it/s]

Error: 27557728


27557728

 48%|████▊     | 19/40 [00:15<00:13,  1.56it/s]

Error: 30846343


30846343

 50%|█████     | 20/40 [00:15<00:12,  1.61it/s]

Error: 33230227


33230227

 52%|█████▎    | 21/40 [00:16<00:11,  1.61it/s]

Error: 33287285


33287285

 55%|█████▌    | 22/40 [00:17<00:11,  1.60it/s]

Error: 33744106


33744106

 57%|█████▊    | 23/40 [00:18<00:15,  1.07it/s]

Error: 35844686


35844686

 60%|██████    | 24/40 [00:19<00:13,  1.20it/s]

Error: 37563858


37563858

 62%|██████▎   | 25/40 [00:19<00:11,  1.36it/s]

Error: 38962106


38962106

 65%|██████▌   | 26/40 [00:20<00:10,  1.37it/s]

Error: 39155457


39155457

 68%|██████▊   | 27/40 [00:21<00:09,  1.37it/s]

Error: 39681168


39681168

 70%|███████   | 28/40 [00:21<00:08,  1.47it/s]

Error: 41082682


41082682

 72%|███████▎  | 29/40 [00:22<00:07,  1.46it/s]

Error: 41177736


41177736

 75%|███████▌  | 30/40 [00:23<00:06,  1.56it/s]

Error: 47023314


47023314

 78%|███████▊  | 31/40 [00:24<00:07,  1.17it/s]

Error: 47040274


47040274

 80%|████████  | 32/40 [00:25<00:06,  1.30it/s]

Error: 47432404


47432404

 82%|████████▎ | 33/40 [00:25<00:05,  1.22it/s]

Error: 50511439


50511439

 85%|████████▌ | 34/40 [00:26<00:04,  1.31it/s]

Error: 51360124


51360124

 88%|████████▊ | 35/40 [00:27<00:03,  1.37it/s]

Error: 51360363


51360363

 90%|█████████ | 36/40 [00:29<00:04,  1.09s/it]

Error: 51360495


51360495

 92%|█████████▎| 37/40 [00:30<00:03,  1.12s/it]

Error: 51360731


51360731

 95%|█████████▌| 38/40 [00:30<00:01,  1.04it/s]

Error: 51368906


51368906

 98%|█████████▊| 39/40 [00:33<00:01,  1.28s/it]

Error: 51443180


51443180

100%|██████████| 40/40 [00:33<00:00,  1.19it/s]


In [9]:
def save_fetched(fetched):
    df = pd.DataFrame(fetched)
    df.to_csv(TEMP_FETCH, index=False)
    
save_fetched(fetched)

In [11]:
def add_fetched_to_db(fetched):
    df = pd.DataFrame(fetched)
    df = df.set_index('wikiPageID')
    try:
        conn = sqlite3.connect(DB)
        df.to_sql(TABLE_NAME, conn, if_exists='replace')
        conn.commit()
    finally:
        conn.close()
        
add_fetched_to_db(fetched)