In [1]:
import requests
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

In [2]:
def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)

In [3]:
def gen_params(start, end):
    start_str = start.strftime('%Y-%m-%d')
    end_str = end.strftime('%Y-%m-%d')
    date_filter_str = f'from_publication_date:{start_str},to_publication_date:{end_str},cited_by_count:>1,is_retracted:False'
        # cited_by_count:>10, \
        # is_retracted:False'

    params = {
        'filter': f'concepts.id:C107457646,{date_filter_str}',
        # 'sort' : 'publication_date',
        'per_page': 200,
    }

    return params

In [4]:
def fetch(params):
    data = []
    for i in range(1, 51):
        r = requests.get(url, params | {'page': i})
        json_data = r.json()
        assert json_data['meta']['count'] < 10000
        if i == 1: print(json_data['meta']['count'])
        if len(json_data['results']) == 0:break
        data.extend(json_data['results'])

    return data

In [5]:
url = 'https://api.openalex.org/works'

# start = datetime.strptime('1980-01-01', '%Y-%m-%d')
# # step = timedelta(days=28)
# step = relativedelta(months=12)
# end = datetime.strptime('2000-12-31', '%Y-%m-%d')

start = datetime.strptime('1980-01-01', '%Y-%m-%d')
# step = timedelta(days=28)
step = relativedelta(months=6)
end = datetime.strptime('2021-12-31', '%Y-%m-%d')

params = gen_params(start, end)
r = requests.get(url, params)
json_data = r.json()
print(json_data['meta'])

data = []
curr_date = start
while curr_date < end:
    print(curr_date, curr_date + step)
    params = gen_params(curr_date, curr_date + step)
    data.extend(fetch(params))
    curr_date = curr_date + step


len(data)


{'count': 273211, 'db_response_time_ms': 286, 'page': 1, 'per_page': 200}
1980-01-01 00:00:00 1980-07-01 00:00:00
185
1980-07-01 00:00:00 1981-01-01 00:00:00
224
1981-01-01 00:00:00 1981-07-01 00:00:00
259
1981-07-01 00:00:00 1982-01-01 00:00:00
241
1982-01-01 00:00:00 1982-07-01 00:00:00
286
1982-07-01 00:00:00 1983-01-01 00:00:00
293
1983-01-01 00:00:00 1983-07-01 00:00:00
294
1983-07-01 00:00:00 1984-01-01 00:00:00
397
1984-01-01 00:00:00 1984-07-01 00:00:00
364
1984-07-01 00:00:00 1985-01-01 00:00:00
448
1985-01-01 00:00:00 1985-07-01 00:00:00
449
1985-07-01 00:00:00 1986-01-01 00:00:00
486
1986-01-01 00:00:00 1986-07-01 00:00:00
567
1986-07-01 00:00:00 1987-01-01 00:00:00
725
1987-01-01 00:00:00 1987-07-01 00:00:00
689
1987-07-01 00:00:00 1988-01-01 00:00:00
757
1988-01-01 00:00:00 1988-07-01 00:00:00
830
1988-07-01 00:00:00 1989-01-01 00:00:00
782
1989-01-01 00:00:00 1989-07-01 00:00:00
883
1989-07-01 00:00:00 1990-01-01 00:00:00
1063
1990-01-01 00:00:00 1990-07-01 00:00:00
1050


337130

In [6]:
len(data)

337130

In [7]:
from pymongo import MongoClient
client = MongoClient('localhost', 27017)

In [8]:
db = client['diva-proj']
papers = db.papers

In [9]:
def format_for_mongo(din):
    from copy import deepcopy
    d = deepcopy(din)
    d['wid'] = d['id'].split('/')[-1]
    d['publication_date'] = datetime.strptime(d['publication_date'], '%Y-%m-%d')
    return d

In [10]:
data_to_insert = [format_for_mongo(d) for d in data]
try:
    papers.insert_many(data_to_insert, ordered=False)
except:
    print('asd')