In [61]:
import os
import pandas as pd

In [65]:
df = pd.DataFrame({'author':[1,1,1,2], 'year':[2016,2016,2016,2020]})
tmp = df.groupby('author').year.agg(['min','max']).reset_index().rename(columns={'min':'min_year','max':'max_year'})
tmp['aps_years_of_activity'] = tmp.apply(lambda row: sorted(set([row.min_year, row.max_year])), axis=1)
tmp.loc[:,'aps_career_age'] = tmp.apply(lambda row: (row.max_year - row.min_year) + 1, axis=1)
tmp

Unnamed: 0,author,min_year,max_year,aps_years_of_activity,aps_career_age
0,1,2016,2016,[2016],1
1,2,2020,2020,[2020],1


In [41]:
import sys
sys.path.append('../code')

In [42]:
%load_ext autoreload
%autoreload 2

from libs import constants
from libs import io
from libs import scholar

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [43]:
aps_os_data_tar_gz = '../data/final_dataset.tar.gz'
aps_data_path = '../data/aps_20240130'


In [20]:
# 1. Read aps-oa data
df_authors = io.read_file_from_tar_gz_as_dataframe(aps_os_data_tar_gz, constants.APS_OA_AUTHORS_FN) # id_author (oopenalex)
df_authors['ID'] = range(1, len(df_authors) + 1)
df_authors.rename(columns={'name':'display_name', 'id_author':'id_author_oa'}, inplace=True)
print(f'\n df_authors: {df_authors.shape}  \n {df_authors.head(5)} \n')


 df_authors: (481012, 11)  
    id_author_oa created_date                updated_date  display_name  \
0    5053051063   2023-07-21  2024-11-04T00:46:12.391095    I. Ben‐Zvi   
1    5067224934   2023-07-21  2024-11-06T10:20:37.304362      T. Roser   
2    5012146130   2023-07-21  2024-11-01T15:52:34.664125   M. Ferrario   
3    5051894783   2023-07-21  2024-11-01T03:49:19.361803  Jean-Luc Vay   
4    5041648606   2023-07-21  2024-11-02T20:12:56.799035   A. Friedman   

                 orcid  two_year_mean_citedness  h_index  i10_index  \
0  0000-0001-5583-0106                 5.416667       33        129   
1  0000-0001-5603-3192                 0.322581       34         89   
2  0000-0002-1105-0359                 2.535714       45        146   
3  0000-0002-0040-799X                 1.581395       34        101   
4  0000-0003-0421-2476                 0.428571       28         97   

   works_count  cited_by_count  ID  
0          783            6034   1  
1          632          

In [21]:
# 1.1 Read author mapping (mappring oa & aps)
df_author_map = io.read_file_from_tar_gz_as_dataframe(aps_os_data_tar_gz, constants.APS_OA_AUTHORS_MAPPING_FN) # id_author_aps, id_author_oa 
# df_authors = df_authors.merge(df_author_map[['id_author_aps','id_author_oa']], on='id_author_oa', how='left')
print(f'\n df_authors (mapping aps): {df_authors.shape}  \n {df_authors.head(5)} \n')


 df_authors (mapping aps): (481012, 11)  
    id_author_oa created_date                updated_date  display_name  \
0    5053051063   2023-07-21  2024-11-04T00:46:12.391095    I. Ben‐Zvi   
1    5067224934   2023-07-21  2024-11-06T10:20:37.304362      T. Roser   
2    5012146130   2023-07-21  2024-11-01T15:52:34.664125   M. Ferrario   
3    5051894783   2023-07-21  2024-11-01T03:49:19.361803  Jean-Luc Vay   
4    5041648606   2023-07-21  2024-11-02T20:12:56.799035   A. Friedman   

                 orcid  two_year_mean_citedness  h_index  i10_index  \
0  0000-0001-5583-0106                 5.416667       33        129   
1  0000-0001-5603-3192                 0.322581       34         89   
2  0000-0002-1105-0359                 2.535714       45        146   
3  0000-0002-0040-799X                 1.581395       34        101   
4  0000-0003-0421-2476                 0.428571       28         97   

   works_count  cited_by_count  ID  
0          783            6034   1  
1         

In [22]:
df_authors.id_author_oa.nunique(), df_author_map.id_author_oa.nunique()

(481012, 430198)

In [23]:
# 2. Reading APS data
df_aps_authors = io.read_csv(io.path_join(aps_data_path, constants.APS_AUTHORS_FN)) # id_author
df_aps_authors.rename(columns={'id_author':'id_author_aps'}, inplace=True)
print(f'\n df_aps_authors: {df_aps_authors.shape}  \n {df_aps_authors.head(5)} \n')

df_aps_author_names = io.read_csv(io.path_join(aps_data_path, constants.APS_AUTHOR_NAMES_FN)) # id_author, id_author_name
df_aps_author_names.rename(columns={'id_author':'id_author_aps'}, inplace=True)
print(f'\n df_aps_author_names: {df_aps_author_names.shape}  \n {df_aps_author_names.head(5)} \n')

df_aps_publications = io.read_csv(io.path_join(aps_data_path, constants.APS_PUBLICATIONS_FN)) # id_publication, timestamp
df_aps_publications['year'] = df_aps_publications.timestamp.str[:4].astype(int)
print(f'\n df_aps_publications: {df_aps_publications.shape}  \n {df_aps_publications.head(5)} \n')

df_aps_authorships = io.read_csv(io.path_join(aps_data_path, constants.APS_AUTHORSHIPS_FN)) # id_author_name, id_publication
df_aps_authorships = df_aps_authorships.merge(df_aps_author_names[['id_author_aps','id_author_name']], on='id_author_name', how='left')
df_aps_authorships = df_aps_authorships.merge(df_aps_publications[['id_publication','year']], on='id_publication', how='left')
print(f'\n df_aps_authorships: {df_aps_authorships.shape}  \n {df_aps_authorships.head(5)} \n')

df_aps_citations = io.read_csv(io.path_join(aps_data_path, constants.APS_CITATIONS_FN)) # id_publication_citing A-->B id_publication_cited (A cites B; B receives a citation from A)
df_aps_citations = df_aps_citations.merge(df_aps_authorships[['id_publication','id_author_aps']], left_on='id_publication_cited', right_on='id_publication', how='left')
print(f'\n df_aps_citations: {df_aps_citations.shape}  \n {df_aps_citations.head(5)} \n')


 df_aps_authors: (874556, 4)  
    id_author_aps  id_gender  disambiguated  id_gender_nq
0              0          0           True             0
1              1          2           True             0
2              2          0           True             0
3              3          2           True             2
4              4          0           True             0 


 df_aps_author_names: (948771, 4)  
    id_author_name  id_author_aps             name gender_nq
0               0              0       S. Megtert         -
1               1              1     Aleksa Bjeli         -
2               2              1         A. Bjeli         -
3               3              2     J. Przystawa         -
4               4              3  Jerzy Przystawa        gm 


 df_aps_publications: (678916, 5)  
    id_publication  id_journal   timestamp                           doi  year
0               0           0  2004-02-25  10.1103/PhysRevSTAB.7.023501  2004
1               1           1

____

# Missing authors

___

In [56]:
# # auhtors
# df_author = io.read_file_from_tar_gz_as_dataframe(aps_os_data_tar_gz, constants.APS_OA_AUTHORS_FN)
# df_author.rename(columns={'id_author':'id_author_oa'}, inplace=True)
# df_author['id_author'] = range(1, len(df_author) + 1)

# # collaborations
# df_authorship = io.read_file_from_tar_gz_as_dataframe(aps_os_data_tar_gz, constants.APS_OA_AUTHORSHIPS_FN)
# df_authorship.rename(columns={'id_author':'id_author_oa'}, inplace=True)
# df_authorship['id_network'] = range(1, len(df_authorship) + 1)
# df_authorship_author = df_authorship.merge(df_author[['id_author_oa','id_author']], on='id_author_oa', how='left')

df_author.shape, df_authorship.shape, df_authorship.id_author_oa.nunique(), df_authorship_author.id_author_oa.nunique(), df_authorship_author.id_author.nunique()

((481012, 11), (3111978, 6), 481108, 481108, 481012)

In [57]:
random_author_id = 5107829036
id_publications = df_authorship_author.query("id_author_oa == @random_author_id").id_publication.unique().tolist()

coauthors_nan = df_authorship_author.query("id_publication in @id_publications").id_author.values.tolist()
coauthors = df_authorship_author.query("id_publication in @id_publications").id_author.dropna().astype(int).values.tolist()

random_author_id, len(coauthors_nan), len(coauthors)

(5107829036, 10999, 10979)

In [60]:
10999-10979

20

In [59]:
random_author_id = 5107829036
id_publications = df_authorship.query("id_author_oa == @random_author_id").id_publication.unique().tolist()

coauthors_nan = df_authorship.query("id_publication in @id_publications").id_author_oa.values.tolist()
coauthors = df_authorship.query("id_publication in @id_publications").id_author_oa.dropna().astype(int).values.tolist()

random_author_id, len(coauthors_nan), len(coauthors)

(5107829036, 10999, 10999)

In [49]:
df_authorship

[5101884633,
 5076268307,
 5019559553,
 5106460287,
 5101891416,
 5108208323,
 5102802867,
 5027347024,
 5025881282,
 5108099961,
 5108117697,
 5079179983,
 5019047837,
 5106454045,
 5019793319,
 5114376638,
 5103540898,
 5064343039,
 5107860405,
 5066213291,
 5040673747,
 5107829036,
 5000230968,
 5111858404,
 5091039649,
 5084360039,
 5016484747,
 5071449593,
 5033688415,
 5107889068,
 5018783153,
 5106454502,
 5070468840,
 5106266295,
 5107841405,
 5107909389,
 5034018025,
 5077388463,
 5114377228,
 5106469771,
 5105975793,
 5045388885,
 5102845224,
 5048597800,
 5036370117,
 5100599347,
 5107825114,
 5106465057,
 5106460483,
 5068368730,
 5050570144,
 5074784295,
 5012596599,
 5107196302,
 5103060462,
 5105705577,
 5107839622,
 5106458654,
 5017023412,
 5083213123,
 5107857706,
 5039580774,
 5101905248,
 5107839693,
 5079313387,
 5048077517,
 5046631959,
 5044819932,
 5050274158,
 5107839311,
 5105824217,
 5007143662,
 5024618531,
 5013673927,
 5106459165,
 5073887944,
 5106465046,

___

In [24]:
# 1. Read aps-oa data
df_names = io.read_file_from_tar_gz_as_dataframe(aps_os_data_tar_gz, constants.APS_OA_ALTERNATIVE_NAMES_FN) # id_author (oopenalex)
df_names['ID'] = range(1, len(df_names) + 1)
df_names.rename(columns={'id_author':'id_author_oa'}, inplace=True)
print(f'\n df_authors: {df_names.shape}  \n {df_names.head(5)} \n')


 df_authors: (1487098, 3)  
    id_author_oa alternative_name  ID
0    5053051063       I. Ben Zvi   1
1    5053051063       I. Ben‐Zvi   2
2    5053051063         I.B. Zvi   3
3    5053051063     Ilan Ben‐Zvi   4
4    5067224934            Roser   5 



In [31]:
df_names.id_author_oa.nunique(), df_authors.id_author_oa.nunique(), df_authors.shape[0]

(481012, 481012, 481012)

In [33]:
oa_missing_aps = df_authors.query("id_author_oa not in @df_author_map.id_author_oa").id_author_oa.unique()
aps_taken_authors = df_author_map.id_author_aps.unique()

# oa_authors, oa_missing_aps, aps_not_taken_yet
df_authors.shape[0], len(oa_missing_aps), df_aps_authors.id_author_aps.nunique()-len(aps_taken_authors)

(481012, 50874, 6583)

In [37]:
for group, df_alt_names in df_names.query("id_author_oa in @oa_missing_aps").groupby('id_author_oa'):

    anames = df_alt_names.alternative_name.unique()
    r = df_aps_author_names.query("name in @anames")

    print(group)
    print(df_alt_names)
    print(r)
    
    break
     #.join(df_names[['id_author_oa','alternative_name']], on='id_author_oa')



5000006697
        id_author_oa    alternative_name      ID
577113    5000006697  Federico Ronchetti  577114
577114    5000006697        F. Ronchetti  577115
Empty DataFrame
Columns: [id_author_name, id_author_aps, name, gender_nq]
Index: []


In [39]:
df_aps_author_names.query("name.str.contains('Ronchetti')")

Unnamed: 0,id_author_name,id_author_aps,name,gender_nq
250617,250617,198909,Marco Ronchetti,gm


___

In [11]:
# 3. Stats

# work counts
df_work_counts = df_aps_authorships.groupby('id_author_aps').id_publication.nunique().reset_index().rename(columns={'id_publication':'aps_works_count'})
print(f'\n df_work_counts: {df_work_counts.shape}  \n {df_work_counts.head(5)} \n')

# citations
df_citation_counts = df_aps_citations.groupby('id_author_aps').id_publication_citing.count().reset_index().rename(columns={'id_publication_citing':'aps_cited_by_count'})
print(f'\n df_citation_counts: {df_citation_counts.shape}  \n {df_citation_counts.head(5)} \n')


 df_work_counts: (868506, 2)  
    id_author_aps  aps_works_count
0              0                6
1              1                5
2              2                1
3              3                3
4              4                3 


 df_citation_counts: (726653, 2)  
    id_author_aps  aps_cited_by_count
0            0.0                  24
1            1.0                  23
2            3.0                   6
3            4.0                  11
4            5.0                 480 



In [19]:
# compute h-index
# col_name = 'citations'
# df_h_index = df_aps_citations.groupby('id_author_aps').apply(lambda group: scholar.compute_h_index(group.reset_index().groupby('id_publication_cited').id_publication_citing.count().reset_index(col_name),col_name)).reset_index(name='aps_h_index')
# print(f'\n df_h_index: {df_h_index.shape}  \n {df_h_index.head(5)} \n')
def fnc(group):
    citations = group.groupby('id_publication_cited').id_publication_citing.count().values

    h_index = sum(c >= i + 1 for i, c in enumerate(citations))
    print(citations, h_index)
    return None

tmp = df_aps_citations.groupby('id_author_aps').apply(lambda group: fnc(group)).reset_index(name='aps_h_index')

[ 1  5  3 11  4] 4
[ 7  2 14] 3
[2 2 2] 2
[11] 1
[ 9  1  3  4  2  8 11  7  5  6 36 41 46  4 40  1  3  3 32 36  9 14 25 18
 31 85] 14
[ 1  5  3  5  6 14 18  2 11] 8
[36  5 11 17 14  7 35 15  3 29  9 10 19 43 13 54 13  7 19 13  1  7 16  5
 25 13 21 24 20  7  5  6 12 16 12 40 12  1 21  2 10 20 14  2  2 16 39 18
 30 20 11] 15
[10] 1
[14] 1
[14] 1
[11  5  3] 3
[7] 1
[ 3  7 12] 3
[2 2 1] 2
[2] 1
[ 2 10] 2
[18] 1
[14] 1
[11] 1
[  9  25  43   5  22  17   9  15  58  21  11   8  11  16  14  11   3   6
   1  35   2   5   2  24  25  39  45  14   3  18   6  29   2   7   5  26
   2   2   6   4  94  37  22  12   5   7  20 106 283  47   7   7   8  21
   7  40  13  36 250   8  23  26   3 158 204  27] 23
[20  9  3  3 58] 4
[ 3  3  3  1 45  3  7  9  2  3  1  2 23  7  7 58  1  2 23] 9
[13  3 16  5  2  1 13  8  3  5] 6
[ 1  1  3 15  1 15  6  1  1] 4
[5 1 3 7] 3
[5] 1
[2] 1
[3 2 2 2] 2
[6] 1
[3] 1
[3 3 4 7 1 2 3 1 2 1 4] 4
[3] 1
[27  3  2  9  2  1 49  5  4] 4
[11 21  8] 3
[14 10 11 21  1] 4
[ 3 15  7 21 11 

KeyboardInterrupt: 

In [None]:
# compute i10-index
df_i10_index = df_aps_citations.groupby('id_author_aps').apply(lambda group: scholar.compute_i10_index(group.reset_index().groupby('id_publication_cited').id_publication_citing.count().reset_index(col_name),col_name)).reset_index(name='aps_i10_index')
print(f'\n df_i10_index: {df_i10_index.shape}  \n {df_i10_index.head(5)} \n')

In [None]:
# compute e-index
df_e_index = df_aps_citations.groupby('id_author_aps').apply(lambda group: scholar.compute_e_index(group.reset_index(), col_publication='id_publication_cited', col_citation_from='id_publication_citing')).reset_index(name='aps_e_index')
print(f'\n df_e_index: {df_e_index.shape}  \n {df_e_index.head(5)} \n')

In [None]:
# time-related stats
df_time = df_aps_authorships.groupby('id_author_aps').year.agg(['min','max']).reset_index().rename(columns={'min':'min_year','max':'max_year'})
df_time['aps_years_of_activity'] = df_time.apply(lambda row: [row.min_year, row.max_year], axis=1)
df_time['aps_career_age'] = df_time.max_year - df_time.min_year
print(f'\n df_time: {df_time.shape}  \n {df_time.head(5)} \n')

In [None]:
# 4. Final merge
df_authors = df_authors.merge(df_work_counts[['id_author_aps','aps_works_count']], on='id_author_aps', how='left')
df_authors = df_authors.merge(df_citation_counts[['id_author_aps','aps_cited_by_count']], on='id_author_aps', how='left')
df_authors = df_authors.merge(df_time[['id_author_aps','aps_years_of_activity', 'aps_career_age']], on='id_author_aps', how='left')
df_authors = df_authors.merge(df_h_index[['id_author_aps','aps_h_index']], on='id_author_aps', how='left')
df_authors = df_authors.merge(df_i10_index[['id_author_aps','aps_i10_index']], on='id_author_aps', how='left')
df_authors = df_authors.merge(df_e_index[['id_author_aps','aps_e_index']], on='id_author_aps', how='left')
