In [1]:
import os
import pandas as pd
from tqdm import tqdm

In [2]:
import sys

# Expose the sibling '../code' directory on the import path so the
# `from libs import ...` statements in the following cell can resolve
# (presumably that is where the `libs` package lives — confirm).
# The membership check keeps re-running this cell from stacking duplicates.
if '../code' not in sys.path:
    sys.path.append('../code')

In [18]:
%load_ext autoreload
%autoreload 2

from libs import constants
from libs import io
from libs import helpers

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Relative path of the tar.gz archive; the loading cells below pass it to
# io.read_file_from_tar_gz_as_dataframe together with a member file name.
aps_os_data_tar_gz = '../data/final_dataset.tar.gz'

In [6]:
# authors
# Read the authors table, keep the source identifier under `id_author_oa`
# and add a fresh sequential `id_author` numbered 1..N in row order.
df_author = (
    io.read_file_from_tar_gz_as_dataframe(aps_os_data_tar_gz, constants.APS_OA_AUTHORS_FN)
    .rename(columns={'id_author': 'id_author_oa'})
    .assign(id_author=lambda frame: range(1, len(frame) + 1))
)
print(f'\n df_author: {df_author.shape} \n {df_author.head(2)} \n')


 df_author: (481012, 11) 
    id_author_oa created_date                updated_date        name  \
0    5053051063   2023-07-21  2024-11-04T00:46:12.391095  I. Ben‐Zvi   
1    5067224934   2023-07-21  2024-11-06T10:20:37.304362    T. Roser   

                 orcid  two_year_mean_citedness  h_index  i10_index  \
0  0000-0001-5583-0106                 5.416667       33        129   
1  0000-0001-5603-3192                 0.322581       34         89   

   works_count  cited_by_count  id_author  
0          783            6034          1  
1          632            5487          2   



In [7]:
# affiliations
# Read the institutions table, keep the source identifier under
# `id_institution_oa` and add a sequential `id_institution` numbered 1..N.
df_institution = (
    io.read_file_from_tar_gz_as_dataframe(aps_os_data_tar_gz, constants.APS_OA_INSTITUTIONS_FN)
    .rename(columns={'id_institution': 'id_institution_oa'})
    .assign(id_institution=lambda frame: range(1, len(frame) + 1))
)
print(f'\n df_institution: {df_institution.shape} \n {df_institution.head(2)} \n')



 df_institution: (110071, 14) 
    id_institution_oa  cited_by_count country_code created_date  \
0          368840534          992183           US   2016-06-24   
1          143868143          450049           CN   2016-06-24   

                 updated_date               display_name  \
0  2024-11-06T14:44:38.146674  University of Mississippi   
1  2024-11-08T12:36:34.346757           Anhui University   

  display_name_acronyms        ror  2yr_mean_citedness  h_index  i10_index  \
0                   NaN  02teq1165            4.073082      323      15037   
1                   NaN  05th6yx34            4.909063      195       9813   

        type  works_count  id_institution  
0  education        36888               1  
1  education        36877               2   



In [8]:
# author_institution_year
# Read the (author, institution, year) table and tag both identifier columns
# with the `_oa` suffix in a single rename.
df_author_inst_year = io.read_file_from_tar_gz_as_dataframe(
    aps_os_data_tar_gz, constants.APS_OA_AUTHORS_INSTITUTION_YEAR_FN
).rename(columns={
    'id_author': 'id_author_oa',
    'id_institution': 'id_institution_oa',
})
print(f'\n df_author_inst_year: {df_author_inst_year.shape} \n {df_author_inst_year.head(2)} \n')


 df_author_inst_year: (8374715, 3) 
    id_author_oa  id_institution_oa  year
0    5053051063           59553526  2024
1    5053051063           59553526  2022 



In [9]:
# Attach the sequential ids (`id_author`, `id_institution`) to every
# (author, institution, year) row via the `_oa` identifiers.
#
# * Dropping any id columns left over from a previous run keeps the cell
#   idempotent: merging onto a frame that already carries `id_author` /
#   `id_institution` would otherwise produce `_x` / `_y` suffixed columns.
# * validate='many_to_one' raises if a lookup table contains duplicate keys,
#   instead of silently duplicating rows.
# * how='left' keeps rows without a match; their id stays NaN, which is why
#   `id_institution` ends up as a float column.
df_author_inst_year = (
    df_author_inst_year
    .drop(columns=['id_author', 'id_institution'], errors='ignore')
    .merge(df_author[['id_author_oa', 'id_author']],
           on='id_author_oa', how='left', validate='many_to_one')
    .merge(df_institution[['id_institution_oa', 'id_institution']],
           on='id_institution_oa', how='left', validate='many_to_one')
)
print(f'\n df_author_inst_year: {df_author_inst_year.shape} \n {df_author_inst_year.head(2)} \n')


 df_author_inst_year: (8374715, 5) 
    id_author_oa  id_institution_oa  year  id_author  id_institution
0    5053051063           59553526  2024          1         32316.0
1    5053051063           59553526  2022          1         32316.0 



In [11]:
# Source institution ids that found no match in df_institution, i.e. rows whose
# `id_institution` is NaN after the left merge. `.unique()` keeps first-seen order.
id_institution_oa_missing = df_author_inst_year.loc[
    df_author_inst_year['id_institution'].isna(), 'id_institution_oa'
].unique()
# A bare `len(...)` in the middle of a cell is never displayed; print it explicitly.
print(f'institution ids without a match: {len(id_institution_oa_missing)}')
io.save_list_to_file(id_institution_oa_missing, '../data/temp/id_institution_oa_missing.txt')

In [17]:
# Spot-check: all (institution, year) rows recorded for one example author.
df_author_inst_year[df_author_inst_year['id_author_oa'] == 5016939139]

Unnamed: 0,id_author_oa,id_institution_oa,year,id_author,id_institution
2242626,5016939139,129604602,2024,102236,110053.0
2242627,5016939139,129604602,2023,102236,110053.0
2242628,5016939139,129604602,2022,102236,110053.0
2242629,5016939139,129604602,2021,102236,110053.0
2242630,5016939139,129604602,2020,102236,110053.0
2242631,5016939139,129604602,2019,102236,110053.0
2242632,5016939139,129604602,2018,102236,110053.0
2242633,5016939139,129604602,2017,102236,110053.0
2242634,5016939139,129604602,2016,102236,110053.0
2242635,5016939139,11701301,2024,102236,43463.0


In [19]:
def get_object(id_author_oa, id_author, group):
    """Assemble one author's record with its affiliations grouped by institution.

    Args:
        id_author_oa: Source author identifier; emitted as ``A<id>``.
        id_author: Sequential author id. Numpy scalars are converted to the
            matching built-in Python value so the record can be passed to
            ``json.dumps``.
        group: DataFrame holding this author's rows, with the columns
            ``id_institution_oa``, ``id_institution`` and ``year``.

    Returns:
        dict of the form::

            {"id_author": 1,
             "openalex_author_id": "A5053051063",
             "affiliations": [{"id_affiliation": 32316,
                               "openalex_institution_id": "I59553526",
                               "years": [2022, 2024]}]}

        ``id_affiliation`` is ``''`` when the institution has no
        ``id_institution``; ``years`` is sorted ascending.
    """
    def is_missing(value):
        # pd.isna additionally catches NaN keys in case helpers.is_none
        # only recognises None; int(NaN) would raise otherwise.
        return helpers.is_none(value) or pd.isna(value)

    # dropna=False: groupby discards groups with a NaN key by default, which
    # would silently drop every affiliation lacking an id_institution and make
    # the '' fallback below unreachable.
    by_institution = group.groupby(['id_institution_oa', 'id_institution'], dropna=False)

    affiliations = []
    for (id_institution_oa, id_institution), rows in by_institution:
        affiliations.append({
            'id_affiliation': '' if is_missing(id_institution) else int(id_institution),
            'openalex_institution_id': f'I{id_institution_oa}',
            'years': rows.year.sort_values().to_list(),
        })

    # Group keys arrive as numpy scalars; .item() yields the equivalent
    # built-in value and leaves ordinary Python objects untouched.
    plain_id_author = id_author.item() if hasattr(id_author, 'item') else id_author

    return {
        'id_author': plain_id_author,
        'openalex_author_id': f"A{id_author_oa}",
        'affiliations': affiliations,
    }

In [20]:
# Build the records for two sample authors only (a quick check of get_object).
data = [
    get_object(oa_id, author_id, author_rows)
    for (oa_id, author_id), author_rows in tqdm(
        df_author_inst_year
        .query('id_author_oa in [5000729353,5016939139]')
        .groupby(['id_author_oa','id_author']),
        desc="Processing rows",
    )
]
len(data)

Processing rows: 100%|██████████| 2/2 [00:00<00:00, 373.52it/s]


2

In [21]:
# Display the second generated record to eyeball the structure get_object produces.
data[1]

{'id_author': 102236,
 'openalex_author_id': 'A5016939139',
 'affiliations': [{'id_affiliation': 12015,
   'openalex_institution_id': 'I5795714',
   'years': [2022]},
  {'id_affiliation': 43463,
   'openalex_institution_id': 'I11701301',
   'years': [2024]},
  {'id_affiliation': 43454,
   'openalex_institution_id': 'I99043593',
   'years': [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]},
  {'id_affiliation': 87789,
   'openalex_institution_id': 'I115475287',
   'years': [2015, 2022]},
  {'id_affiliation': 110053,
   'openalex_institution_id': 'I129604602',
   'years': [2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]},
  {'id_affiliation': 32283,
   'openalex_institution_id': 'I154570441',
   'years': [1998, 1999, 2009, 2012, 2023]},
  {'id_affiliation': 36792,
   'openalex_institution_id': 'I1337719021',
   'years': [2020]},
  {'id_affiliation': 101188,
   'openalex_institution_id': 'I4210087660',
   'years': [2020]},
  {'id_affiliation': 13513,
   'openalex_institution_i