In [1]:
from datetime import datetime
import json
import os
import glob
import shutil

from tqdm import tqdm
import requests

In [2]:
QUERY_DIR = 'queries'
DATA_DIR = '_data'

shutil.rmtree(os.path.join(os.getcwd(), DATA_DIR), ignore_errors=True)
os.makedirs(os.path.join(os.getcwd(), DATA_DIR), exist_ok=True)
queries = []
names = []

for filename in glob.glob(f'./{QUERY_DIR}/*.sparql'):
    with open(filename, 'r') as f:
        names.append(os.path.basename(filename))
        queries.append(f.read())
        
names

['work.sparql',
 'school.sparql',
 'main_philosophers.sparql',
 'era.sparql',
 'birthPlace.sparql',
 'notableIdea.sparql',
 'mainInterest.sparql',
 'influenced.sparql']

In [3]:
def params(query, limit, offset):
    q = query + f' LIMIT {limit} OFFSET {offset}'
    return {
        "default-graph-uri": "http://dbpedia.org",
        "query": q,
        "format": "application/sparql-results+json",
        "CXML_redir_for_subjs": 121,
        "CXML_redir_for_hrefs": None,
        "timeout": 30000,
        "run": "Run Query "
    }

params(queries[0], 10000, 10000)

{'default-graph-uri': 'http://dbpedia.org',
 'query': "SELECT * WHERE {\n?object a <http://dbpedia.org/ontology/Philosopher> .\n?object dbo:wikiPageID ?wikiPageID .\n?work dbo:author ?object .\nOPTIONAL {?work foaf:name ?work_name} .\nOPTIONAL {?work dbo:abstract ?work_abstract FILTER(lang(?work_abstract) = 'en')} .\nOPTIONAL {?work dbo:wikiPageID ?work_wikiPageID} .\nOPTIONAL {?work dbp:pubDate ?work_pubDate} .\nOPTIONAL {?work dbp:language ?work_language} .\n} ORDER BY ASC(?object)\n LIMIT 10000 OFFSET 10000",
 'format': 'application/sparql-results+json',
 'CXML_redir_for_subjs': 121,
 'CXML_redir_for_hrefs': None,
 'timeout': 30000,
 'run': 'Run Query '}

In [4]:
def fetch(query, name):
    data = None
    limit = 10000
    i = 0
    with tqdm(desc=name) as bar:
        while True:
            r = requests.get('http://dbpedia.org/sparql/', params=params(query, limit, i * limit))
            if data is None:
                data = r.json()
                upd = len(data["results"]["bindings"])
            else:
                bindings = r.json()["results"]["bindings"]
                upd = len(bindings)
                if len(bindings) > 0:
                    data["results"]["bindings"].extend(bindings)
                else:
                    break
            i += 1
            bar.update(upd)
    return data

In [5]:
def save(query, name):
    name = name + '_' + datetime.now().strftime("%c") + '.json'
    name = os.path.join(DATA_DIR, name)
    data = fetch(query, name)
    with open(name, 'w') as f:
        json.dump(data, f)
    return name

In [6]:
def fetch_all():
    saved = {}
    for name, query in zip(names, queries):
        saved_name = save(query, name)
        saved[name] = saved_name
    with open('last_save.json', 'w') as f:
        json.dump(saved, f)
        
fetch_all()

_data/work.sparql_Sun Mar 15 21:35:07 2020.json: 961it [00:03, 275.40it/s]
_data/school.sparql_Sun Mar 15 21:35:10 2020.json: 2076it [00:11, 181.57it/s]
_data/main_philosophers.sparql_Sun Mar 15 21:35:22 2020.json: 5149it [00:20, 254.81it/s]
_data/era.sparql_Sun Mar 15 21:35:42 2020.json: 1316it [00:06, 217.74it/s]
_data/birthPlace.sparql_Sun Mar 15 21:35:49 2020.json: 4742it [00:10, 452.78it/s]
_data/notableIdea.sparql_Sun Mar 15 21:35:59 2020.json: 1488it [00:05, 248.79it/s]
_data/mainInterest.sparql_Sun Mar 15 21:36:05 2020.json: 4282it [00:20, 206.43it/s]
_data/influenced.sparql_Sun Mar 15 21:36:26 2020.json: 8304it [00:08, 1019.58it/s]
