In [16]:
import pandas  as pd
import os
import glob

In [3]:
import sys
# Extend the import search path with the relative '../code' directory
# (presumably where the `libs` package imported below lives — TODO confirm).
sys.path.append('../code')

In [4]:
%load_ext autoreload
%autoreload 2

from libs import constants
from libs import io
from libs import scholar

In [5]:
# Archive passed to io.read_file_from_tar_gz_as_dataframe to load the institutions table.
aps_os_data_tar_gz = '../data/final_dataset.tar.gz'
# Directory searched for one "<id_institution>.json.gz" file per institution.
# NOTE(review): relative path that climbs above this project — confirm it exists before running.
root_source_files = '../../../APS_OA_merge/output/institutions'

In [6]:
# Read the archive member named by constants.APS_OA_INSTITUTIONS_FN into a DataFrame,
# then display its (rows, columns) shape as the cell output.
df_institution = io.read_file_from_tar_gz_as_dataframe(aps_os_data_tar_gz, constants.APS_OA_INSTITUTIONS_FN)
df_institution.shape

(110071, 13)

In [7]:
# Preview the first two institution rows to inspect the available columns.
df_institution.head(2)

Unnamed: 0,id_institution,cited_by_count,country_code,created_date,updated_date,display_name,display_name_acronyms,ror,2yr_mean_citedness,h_index,i10_index,type,works_count
0,368840534,992183,US,2016-06-24,2024-11-06T14:44:38.146674,University of Mississippi,,02teq1165,4.073082,323,15037,education,36888
1,143868143,450049,CN,2016-06-24,2024-11-08T12:36:34.346757,Anhui University,,05th6yx34,4.909063,195,9813,education,36877


In [8]:
# Split institutions by whether their raw "<id_institution>.json.gz" source file
# is present under root_source_files: paths that exist are collected in
# status['exists'], ids without a file in status['not_found'].
status = {'exists': [], 'not_found': []}

# Iterate the id column directly instead of df.iterrows(): iterrows builds a
# Series per row (slow on ~110k rows) and upcasts ints to floats when every
# column is numeric, which would silently change the file name being looked up.
for id_institution in df_institution['id_institution'].tolist():
    fn = io.path_join(root_source_files, f"{id_institution}.json.gz")
    # The path contains no wildcard, so a single existence check replaces the
    # former glob + exists combination.
    if io.exists(fn):
        status['exists'].append(fn)
    else:
        status['not_found'].append(id_institution)

In [9]:
# (number of institutions with a source file, number without one)
len(status['exists']), len(status['not_found'])

(109971, 100)

In [19]:
# Extract (id_institution, city) from every raw institution file found on disk.
# Records are accumulated in a plain list and turned into a DataFrame once:
# calling pd.concat inside the loop copies the whole frame on every iteration
# (quadratic in the number of files).
city_records = []

for fn in status['exists']:
    obj = io.read_compressed_json(fn)
    # The 'id' field is a URI; strip the prefix to keep the numeric part only.
    # A record without an 'id' still raises ValueError here, as before.
    id_institution = int(obj.get('id', '').replace('https://openalex.org/I', ''))
    # 'geo' may be absent or null; both cases fall back to an empty city
    # instead of raising AttributeError on None.
    city = (obj.get('geo') or {}).get('city', '')
    city_records.append({'id_institution': id_institution, 'city': city})

# Explicit columns keep the frame well-formed even when no file was found.
df_city = pd.DataFrame(city_records, columns=['id_institution', 'city'])
df_city.shape

(109971, 2)

In [21]:
# Preview the cities extracted from the local source files.
df_city.head(2)

Unnamed: 0,id_institution,city
0,368840534,Oxford
1,143868143,Hefei


In [22]:
# Peek at the first institution id that has no local source file.
status['not_found'][0]

205783295

In [29]:
import requests

# For institutions without a local source file, look the city up through the
# OpenAlex institutions endpoint (one request per id).
base_url = "https://api.openalex.org/institutions"
URI = 'https://openalex.org/I'
# Seconds to wait for the API before giving up on one id; without a timeout a
# stalled connection would block the cell forever.
REQUEST_TIMEOUT = 30

city_records_new = []

# A Session reuses the underlying connection across the per-id requests.
with requests.Session() as session:
    for id_ in status['not_found']:
        city = None
        try:
            # Make a GET request to the OpenAlex API for each ID
            url = f"{base_url}?filter=openalex:I{id_}"
            response = session.get(url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()

            # Parse the JSON response; 'results' may be missing or null
            data = response.json()
            institutions = data.get('results') or []

            for institution in institutions:
                # Only accept the record whose id matches exactly; a null 'id'
                # is treated as a non-match instead of raising.
                if (institution.get('id') or '').replace(URI, '') != str(id_):
                    continue

                # 'geo' can be null in the payload, so guard before reading 'city'
                city = (institution.get('geo') or {}).get('city')
                break

        # ValueError covers a non-JSON body on requests versions whose decode
        # error is not a RequestException subclass.
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"Error fetching data for ID {id_}: {e}")

        if city is not None:
            city_records_new.append({'id_institution': id_, 'city': city})
        else:
            # Covers request failures, no matching record, and a missing city
            print(id_, 'not found')

# Build the frame once instead of concatenating inside the loop.
df_city_new = pd.DataFrame(city_records_new, columns=['id_institution', 'city'])
df_city_new.shape

(100, 2)

In [30]:
# Show df_city again so its columns can be compared with df_city_new before combining them.
df_city.head(2)

Unnamed: 0,id_institution,city
0,368840534,Oxford
1,143868143,Hefei


In [31]:
# Preview the cities recovered through the API for institutions without a source file.
df_city_new.head(2)

Unnamed: 0,id_institution,city
0,205783295,Ithaca
1,8204097,Munich


In [33]:
# Stack the file-derived and API-derived city tables into one lookup table,
# renumbering the index from 0.
df_cities = pd.concat([df_city, df_city_new], ignore_index=True)
# Display the three shapes together so the combined row count can be checked
# against the sum of its two parts.
df_cities.shape, df_city.shape, df_city_new.shape

((110071, 2), (109971, 2), (100, 2))

In [34]:
# Preview the combined id_institution -> city table.
df_cities.head(2)

Unnamed: 0,id_institution,city
0,368840534,Oxford
1,143868143,Hefei


In [36]:
# Look at the institutions table once more before the city column is merged in.
df_institution.head(2)

Unnamed: 0,id_institution,cited_by_count,country_code,created_date,updated_date,display_name,display_name_acronyms,ror,2yr_mean_citedness,h_index,i10_index,type,works_count
0,368840534,992183,US,2016-06-24,2024-11-06T14:44:38.146674,University of Mississippi,,02teq1165,4.073082,323,15037,education,36888
1,143868143,450049,CN,2016-06-24,2024-11-08T12:36:34.346757,Anhui University,,05th6yx34,4.909063,195,9813,education,36877


In [37]:
# Shape before the merge, for comparison with the shape afterwards.
df_institution.shape

(110071, 13)

In [38]:
# Attach the city to each institution with a left join on id_institution, so
# institutions without a city are kept (their city is NaN).
n_rows_before = len(df_institution)
df_institution = pd.merge(
    # Dropping a pre-existing 'city' column makes the cell safe to re-run:
    # otherwise a second execution would produce 'city_x' / 'city_y' columns.
    df_institution.drop(columns=['city'], errors='ignore'),
    df_cities,
    on='id_institution',
    how='left',
)
# A left join only adds rows when the right side repeats a key; fail loudly
# rather than silently exporting duplicated institutions.
assert len(df_institution) == n_rows_before, "merge changed the row count: duplicate id_institution in df_cities"
df_institution.head(2)

Unnamed: 0,id_institution,cited_by_count,country_code,created_date,updated_date,display_name,display_name_acronyms,ror,2yr_mean_citedness,h_index,i10_index,type,works_count,city
0,368840534,992183,US,2016-06-24,2024-11-06T14:44:38.146674,University of Mississippi,,02teq1165,4.073082,323,15037,education,36888,Oxford
1,143868143,450049,CN,2016-06-24,2024-11-08T12:36:34.346757,Anhui University,,05th6yx34,4.909063,195,9813,education,36877,Hefei


In [39]:
# Shape after the merge: the row count should be unchanged, with one extra column (city).
df_institution.shape

(110071, 14)

In [40]:
# Write the city-enriched institutions table to CSV without the index column.
output_fn = '../data/temp/institution.csv'
# to_csv does not create missing parent directories; make sure the target
# folder exists so a fresh checkout does not fail with OSError.
os.makedirs(os.path.dirname(output_fn), exist_ok=True)
df_institution.to_csv(output_fn, index=False)