In [None]:
import pandas as pd
import requests
import numpy as np

In [None]:
def request_institutions(filter_string, email, timeout=30):
    """Fetch all OpenAlex institutions matching a filter, following cursor paging.

    Parameters
    ----------
    filter_string : str
        Value of the OpenAlex ``filter`` query parameter
        (e.g. ``"country_code:" + <ISO country code>``).
    email : str
        Contact address sent as ``mailto`` so requests land in the
        OpenAlex "polite pool".
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per institution record returned by the API, all pages combined.

    Raises
    ------
    requests.HTTPError
        If any page request comes back with an HTTP error status.
    """
    url = "https://api.openalex.org/institutions"
    # Let requests build the query string so every value (e-mail, filter,
    # cursor) is percent-encoded correctly. "*" asks for the first page.
    params = {
        "per-page": 200,
        "filter": filter_string,
        "mailto": email,
        "cursor": "*",
    }

    institutions = []
    # A persistent session reuses the connection between page requests; the
    # context manager guarantees it is closed even if a request fails.
    with requests.Session() as session:
        # The API signals the end of the result set with a null next_cursor.
        while params["cursor"] is not None:
            response = session.get(url, params=params, timeout=timeout)
            # Fail loudly on HTTP errors instead of a KeyError on "meta" below.
            response.raise_for_status()
            page = response.json()
            institutions.extend(page["results"])
            params["cursor"] = page["meta"]["next_cursor"]

    return pd.DataFrame(institutions)

In [None]:
# Load the author records from the processed-data pickle; later cells read the
# `inst_id` and `order` attributes of each row.
# NOTE(review): unpickling can execute arbitrary code — only load this file if
# it comes from a trusted source.
data = pd.read_pickle("../data/processed/authors_disambiguated_truncated.pkl")
data.head()

In [None]:
# Ask for the contact e-mail and the country to query, then download every
# OpenAlex institution registered under that country code.
email = input("Enter e-mail address for OpenAlex API: ").strip()
iso_countrycode = input("Enter ISO 3166-1 alpha-2 country code: ").strip()
be_inst = request_institutions(
    filter_string=f"country_code:{iso_countrycode}",
    email=email,
)

In [None]:
# Preview the first rows of the downloaded institutions table.
be_inst.head()

In [None]:
# Map every institution ID of the selected country to the list of taxonomic
# orders found for it in `data` (duplicates are kept at this stage).
inst_exp = {inst: [] for inst in be_inst["id"]}

for row in data.itertuples():
    # Look the ID up in the dict (constant time) instead of rebuilding and
    # scanning list(be_inst["id"]) for every single row.
    try:
        collected = inst_exp.get(row.inst_id)
    except TypeError:
        # An unhashable inst_id cannot be one of the institution keys.
        continue
    if collected is None:
        # Row belongs to an institution outside the selected country.
        continue
    # `order` may hold several orders (list/set) or a single value.
    if isinstance(row.order, (list, set)):
        collected.extend(row.order)
    else:
        collected.append(row.order)

In [None]:
# Display the institution -> orders mapping built in the previous cell.
# NOTE(review): this prints the complete dict, which may be long.
inst_exp

In [None]:
# One row per institution: ID, display name and the orders collected for it.
# Names are looked up by ID instead of relying on be_inst's row order matching
# the dict's insertion order, so a repeated ID in be_inst cannot cause a length
# mismatch or pair an ID with the wrong name.
name_by_id = dict(zip(be_inst["id"], be_inst["display_name"]))
expertise = pd.DataFrame({
    "inst_id": list(inst_exp),
    "inst_name": [name_by_id[inst_id] for inst_id in inst_exp],
    "orders": list(inst_exp.values()),
})
# Sorted, de-duplicated orders per institution; .tolist() yields native Python
# values rather than NumPy scalars.
expertise["orders_unique"] = [np.unique(x).tolist() for x in expertise["orders"]]
# Show only the institutions for which at least one order was found.
expertise[expertise["orders"].map(len) > 0]

In [None]:
# Sorted list of every distinct order that occurs at any institution.
# The per-institution lists are flattened with a plain comprehension instead of
# the private pandas helper pd.core.common.flatten, and .tolist() returns
# native Python values rather than NumPy scalars.
orders = np.unique(
    [order for inst_orders in expertise["orders"] for order in inst_orders]
).tolist()
orders

In [None]:
# Empty order -> institutions table: indexed by order, with a fresh (unshared)
# empty list in each cell of both columns, to be filled in afterwards.
expertise2 = pd.DataFrame(
    {
        "inst_ids": [[] for _ in orders],
        "inst_names": [[] for _ in orders],
    },
    index=pd.Index(orders, name="order"),
)

In [None]:
# Invert the institution -> orders table into order -> institutions.
# The lists are collected in plain dicts and both columns are then rebuilt
# from scratch, so re-running this cell does not append duplicate entries
# (the earlier version mutated the lists stored in expertise2 in place).
ids_by_order = {order: [] for order in expertise2.index}
names_by_order = {order: [] for order in expertise2.index}

for row in expertise.itertuples():
    for order in row.orders_unique:
        ids_by_order[order].append(row.inst_id)
        names_by_order[order].append(row.inst_name)

expertise2["inst_ids"] = [ids_by_order[order] for order in expertise2.index]
expertise2["inst_names"] = [names_by_order[order] for order in expertise2.index]

In [None]:
# Display the order -> institutions table.
expertise2

In [None]:
# Export the per-institution expertise table to an Excel workbook.
expertise.to_excel("../data/processed/national_taxonomische_expertise_per_instituut.xlsx")

In [None]:
# Export the per-order expertise table to an Excel workbook.
expertise2.to_excel("../data/processed/national_taxonomische_expertise_per_orde.xlsx")