This notebook contains the code used to generate the CSV files of OpenAlex paper metadata that are available for download at https://zenodo.org/records/8278104.   
This code was used to generate the files:
- 'authoridname.csv'
- 'affiliationidnametype.csv'
- 'paperauthoridaffiliationIFAVAILABLE'
- 'paperyear'
- 'papervolisspages'
- 'paperpmid'
- 'paperjournalid'
- 'paperncitesfrompapers'
- 'paperdoi'
- 'paperauthoridaffiliationIFAVAILABLE.csv'
- 'journalidname.csv'
- 'paperdoi.csv'
- 'paperjournalid.csv'
- 'paperncitesfrompapers.csv'
- 'paperpmid.csv'
- 'papervolisspages.csv'
- 'paperyear.csv'
- 'paperauthoridorder.csv'
  
This code requires the OpenAlex data as flat (tab-separated) files. 

In [17]:
## load packages

import pandas as pd
import glob
# Root folder of the project: the institutions file is read from here and the
# cleaned csv files are written under main_path + "data_to_post/clean_files/".
# NOTE(review): absolute, machine-specific path - adapt before running elsewhere.
main_path = '/home/fs01/spec1142/Emma/PPPs/'

## Affiliation id name type

In [31]:
## load the institutions table, rename its columns, save it as csv

# Only the id, display name and type columns are read; 'type' keeps its name.
institutions = (
    pd.read_csv(
        main_path + 'institutions_up_to_20230817.tsv',
        delimiter='\t',
        usecols=['institution_id', 'display_name', 'type'],
    )
    .rename(columns={'institution_id': 'affiliationid', 'display_name': 'name'})
)

institutions.to_csv(
    main_path + "data_to_post/clean_files/" + 'affiliationidnametype.csv',
    index=False,
)

## Authors

In [12]:
## download the authors files from the Google Cloud bucket

from google.cloud import storage

path = '/home/fs01/spec1142/Emma/'
bucket_name = 'openalex-lee'
prefix = 'OpenAlex/authors'
dl_dir = path + "PPPs/data_to_post/authors/"

# Authenticate with the service-account key stored next to the project folder.
path_to_private_key = path + "openalex-lee-c532eb059285.json"
client_storage = storage.Client.from_service_account_json(json_credentials_path=path_to_private_key)

bucket = client_storage.get_bucket(bucket_or_name=bucket_name)
blobs = bucket.list_blobs(prefix=prefix)  # every object stored under the prefix

for blob in blobs:
    # Flatten the object name ('/' -> '_') so each blob lands directly in dl_dir.
    blob.download_to_filename(dl_dir + blob.name.replace('/', '_'))

In [1]:
## set the base path used by the glob patterns in the cells below
## (nothing is printed here: this cell only imports glob and defines `path`)

import glob 
path = '/home/fs01/spec1142/Emma/'



### Authors

In [16]:
## Select author files only
## (each entry returned by glob already starts with `path`, i.e. it is a full path;
##  glob.glob returns the matches in arbitrary order)

list_files_authors = glob.glob(path + "PPPs/data_to_post/authors/OpenAlex_authors_updated_*" )

In [26]:
## load, rename, save file

import pandas as pd 

# list_files_authors comes from glob.glob(path + ...), so every entry is already a
# full path. Prepending main_path (as was done before) builds a path that does not
# exist, so the files are read directly.
file_authors = pd.concat( [ pd.read_csv( elem , delimiter = "\t", usecols = ["author_id","display_name"]) for elem in list_files_authors])
file_authors = file_authors.rename(columns = { 'author_id' : 'authorid' , 'display_name' : 'authorname' })
# index = False: the per-file row numbers carried over by concat are not written out
file_authors.to_csv(main_path + "data_to_post/clean_files/" + 'authoridname.csv' , index = False) 

In [36]:
file_authors

Unnamed: 0,authorid,authorname
0,A4347464006,Lucero Molina
0,A4330392039,Thibault Mayor
0,A4225353913,Valentina Politi
0,A4379737106,Dina Tarek
1,A4379737109,Yitian Sun
...,...,...
304180,A5091901145,A. I. Bush
304181,A5091907166,Aleksandra Volkova
304182,A5091910692,K. V. Pavelec
304183,A5091913047,F. H. Gladwin


### Authors - works

In [4]:
## select works-authors files only
## NOTE(review): later cells slice this list by position ([:-10] as "small files",
## [-10:] as "big files"), but glob.glob returns matches in arbitrary order -
## confirm that the last 10 entries really are the big files before relying on it.

list_works_authors = glob.glob(path + "PPPs/data_to_post/authors/OpenAlex_authors_works_*" )

In [10]:
## add entity letter in front of the ID. 
## The files are rewritten in place, so this cell must be safe to run more than
## once: an id that already carries its letter is left untouched.

import pandas as pd 
from tqdm import tqdm 


def _prefix_once(value, letter):
    """Return str(value) with `letter` in front, unless it already starts with it."""
    text = str(value)
    return text if text.startswith(letter) else letter + text


for file in tqdm(list_works_authors):
    df = pd.read_csv(  file , delimiter = "\t")
    df['work_id'] = [ _prefix_once(elem, 'W') for elem in df['work_id']]
    df['author_id'] = [ _prefix_once(elem, 'A') for elem in df['author_id']]
    # institution ids may be parsed as floats ("123.0"): keep the integer part only,
    # and leave missing institutions missing instead of writing "Inan"
    df['institution_id'] = [ _prefix_once(str(elem).split('.')[0], 'I') if pd.isna(elem) == False else elem for elem in df['institution_id']]
    df.to_csv(  file , sep = "\t", index = False)

  df = pd.read_csv(  file , delimiter = "\t")
  df = pd.read_csv(  file , delimiter = "\t")
  df = pd.read_csv(  file , delimiter = "\t")
  df = pd.read_csv(  file , delimiter = "\t")
  df = pd.read_csv(  file , delimiter = "\t")
  df = pd.read_csv(  file , delimiter = "\t")
  df = pd.read_csv(  file , delimiter = "\t")
  df = pd.read_csv(  file , delimiter = "\t")
  df = pd.read_csv(  file , delimiter = "\t")
100%|█████████████████████████████████████████| 131/131 [59:51<00:00, 27.42s/it]


In [5]:
## load, rename, save files (small files)

# All but the last 10 files are concatenated and written as a single csv.
file_authors_works = (
    pd.concat([pd.read_csv(tsv_file, delimiter="\t") for tsv_file in list_works_authors[:-10]])
    .rename(columns={
        'work_id': 'oaid',
        'author_id': 'authorid',
        'institution_id': 'affiliationid',
        'institution_name': 'affiliationameifnoid',
    })
)

# Drop the rows whose oaid is the literal 'Wwork_id' (a prefixed column header).
is_header_row = file_authors_works['oaid'] == 'Wwork_id'
file_authors_works = file_authors_works[~is_header_row]

file_authors_works.to_csv(
    main_path + "data_to_post/clean_files/" + 'paperauthoridaffiliationIFAVAILABLE/file1.csv',
    index=False,
)

In [12]:
# rows that carry a free-text affiliation name
file_authors_works.loc[file_authors_works['affiliationameifnoid'].notna()]

Unnamed: 0,oaid,authorid,affiliationid,affiliationameifnoid
27,W2590154271,A2590585227,,"Clinical Instructor, King Fahad Medical City, ..."
29,W2590154271,A2661761770,,"Clinical Instructor, Al Amal Complex for Menta..."
36,W631075702,A1987814616,,"Nurse Manager, Fairleigh Lodge, Auckland, New ..."
67,W2594753679,A2592423679,,"Executive Director, CARE Centre for Internatio..."
96,W2994201468,A2688736441,,WordMeridian Communications.
...,...,...,...,...
94872,W955277603,A4345380707,,"Université Grenoble Alpes - UFR Langage, lettr..."
94927,W96478308,A2551780365,,"Université Grenoble Alpes - UFR Langage, lettr..."
95143,W996452691,A341007127,,"Escuela Superior de Guerra, , Buenos Aires, Ar..."
1259,W4292532348,A2953613218,,Philosophies contemporaines


In [14]:
## load, rename, save files (big files)

# The last 10 files are processed one at a time; each gets its own output file
# (file2.csv, file3.csv, ...), file1.csv being the concatenated small files.
author_works_columns = {
    'work_id': 'oaid',
    'author_id': 'authorid',
    'institution_id': 'affiliationid',
    'institution_name': 'affiliationameifnoid',
}
clean_dir = main_path + "data_to_post/clean_files/"

k = 2
for big_file in list_works_authors[-10:]:
    file_authors_works = pd.read_csv(big_file, delimiter="\t").rename(columns=author_works_columns)
    # drop the rows whose oaid is the literal 'Wwork_id' (a prefixed column header)
    keep_rows = file_authors_works['oaid'] != 'Wwork_id'
    file_authors_works = file_authors_works[keep_rows]
    file_authors_works.to_csv(clean_dir + 'paperauthoridaffiliationIFAVAILABLE/file' + str(k) + '.csv', index=False)
    k += 1
    print(k)  # progress: index of the next output file
    
    

3


  file_authors_works = pd.read_csv(   elem , delimiter = "\t")


4


  file_authors_works = pd.read_csv(   elem , delimiter = "\t")


5


  file_authors_works = pd.read_csv(   elem , delimiter = "\t")


6


  file_authors_works = pd.read_csv(   elem , delimiter = "\t")


7


  file_authors_works = pd.read_csv(   elem , delimiter = "\t")


8


  file_authors_works = pd.read_csv(   elem , delimiter = "\t")


9


  file_authors_works = pd.read_csv(   elem , delimiter = "\t")


10


  file_authors_works = pd.read_csv(   elem , delimiter = "\t")


11


  file_authors_works = pd.read_csv(   elem , delimiter = "\t")


12


## Works

In [109]:
# Base folder, then the first 128 works files returned by glob.
# NOTE(review): glob.glob returns matches in arbitrary order, so which 128 files
# are selected is not guaranteed to be stable across runs/machines; presumably
# these are the "small files" handled in the next cell - TODO confirm.
path = '/home/fs01/spec1142/Emma/'
list_files_works = glob.glob(path + "PPPs/data_to_post/works/*" )[:128]

In [110]:
## load, rename, save files (small files)


import pandas as pd 

works_columns = ['work_id', 'doi', 'pmid', 'venue_or_source', 'first_page', 'last_page',
                 'volume', 'issue', 'cited_by_count', 'publication_date']

# Malformed lines are skipped (on_bad_lines = 'skip'); only the columns that are exported are read.
file_works = pd.concat( [ pd.read_csv(  elem , delimiter = "\t", on_bad_lines = 'skip',lineterminator='\n', usecols = works_columns) for elem in list_files_works])

file_works['work_id'] = [ "W" + str(elem) for elem in file_works['work_id'] ] 

# Prefix real venue ids only: a missing venue stays missing (otherwise it becomes the
# string "Snan" and passes the notnull() filter below), and ids parsed as floats
# ("123.0") keep their integer part only, as is done for institution ids.
file_works['venue_or_source'] = [ "S" + str(elem).split('.')[0] if pd.isna(elem) == False else elem for elem in file_works['venue_or_source'] ] 

# The year is the first four characters of the publication date. It was previously
# sliced from 'venue_or_source', which produced values such as "S123".
file_works['paperyear'] = [ str(elem)[:4] if pd.isna(elem) == False else elem for elem in file_works['publication_date'] ] 

# 'paperIstpage' (capital I) is kept unchanged: it is the header written to the output files.
file_works = file_works.rename(columns = {'work_id' : 'oaid', 'venue_or_source': 'journalid', 'volume': 'papervolume' , 'issue' : 'paperissue' , 'first_page': 'paperIstpage' , 'last_page' : 'paperlastpage' , 'cited_by_count':'numcitesfrompapers'   } ) 

clean_dir = path + 'PPPs/data_to_post/clean_files/'

# One two-column file per field, restricted to the papers where the field is present.
for column, folder in [('paperyear', 'paperyear'),
                       ('doi', 'paperdoi'),
                       ('pmid', 'paperpmid'),
                       ('journalid', 'paperjournalid'),
                       ('numcitesfrompapers', 'paperncitesfrompapers')]:
    file_works.loc[file_works[column].notnull(), ['oaid', column]].to_csv(clean_dir + folder + '/file1.csv' , index = False)

# Volume / issue / pages are written for every paper, missing values included.
file_works[['oaid','papervolume','paperissue','paperIstpage','paperlastpage']].to_csv(clean_dir + 'papervolisspages/file1.csv'  , index = False)



  file_works = pd.concat( [ pd.read_csv(  elem , delimiter = "\t", on_bad_lines = 'skip',lineterminator='\n', usecols = ['work_id', 'doi', 'pmid', 'venue_or_source', 'first_page', 'last_page',


In [106]:
## load, rename, save files (big files)

import pandas as pd 


def clean_files(k):
    """Clean one daily batch of works files and write its per-field csv files.

    Reads every file matching
    ``path + "PPPs/data_to_post/works/OpenAlex_works_updated_date=2023-08-<day>_*"``
    with ``day = k + 8``, adds the entity letters to the ids, derives the
    publication year and writes ``file<k>.csv`` into each sub-folder of
    ``path + "PPPs/data_to_post/clean_files/"`` (paperyear, paperdoi, paperpmid,
    paperjournalid, paperncitesfrompapers, papervolisspages).

    Parameters
    ----------
    k : int
        Batch index. It selects the input day (k + 8) and names the output files.

    Raises
    ------
    ValueError
        If no input file matches the pattern for this batch.
    """

    day = k + 8

    # Zero-pad the day so that batches falling before the 10th still match the
    # YYYY-MM-DD date in the file names (no change for day >= 10).
    pattern = path + "PPPs/data_to_post/works/OpenAlex_works_updated_date=2023-08-" + f"{day:02d}" + '_*'
    list_files_works = glob.glob(pattern)
    if not list_files_works:
        # pd.concat would raise a bare "No objects to concatenate"; say which batch is empty
        raise ValueError("No works file matches " + pattern + " (k=" + str(k) + ")")

    works_columns = ['work_id', 'doi', 'pmid', 'venue_or_source', 'first_page', 'last_page',
                     'volume', 'issue', 'cited_by_count', 'publication_date']

    # Malformed lines are skipped; only the exported columns are read.
    file_works = pd.concat([pd.read_csv(elem, delimiter="\t", on_bad_lines='skip', lineterminator='\n',
                                        usecols=works_columns) for elem in list_files_works])

    file_works['work_id'] = ["W" + str(elem) for elem in file_works['work_id']]
    # Missing venues stay missing (instead of "Snan") and float-parsed ids lose ".0".
    file_works['venue_or_source'] = _prefix_ids(file_works['venue_or_source'], "S")
    # Year = first four characters of the publication date (it was previously sliced
    # from 'venue_or_source', which produced values such as "S123").
    file_works['paperyear'] = [str(elem)[:4] if pd.notna(elem) else elem for elem in file_works['publication_date']]

    # 'paperIstpage' (capital I) is kept unchanged: it is the header written to the output files.
    file_works = file_works.rename(columns={'work_id': 'oaid', 'venue_or_source': 'journalid', 'volume': 'papervolume',
                                            'issue': 'paperissue', 'first_page': 'paperIstpage',
                                            'last_page': 'paperlastpage', 'cited_by_count': 'numcitesfrompapers'})

    clean_dir = path + 'PPPs/data_to_post/clean_files/'
    file_name = '/file' + str(k) + '.csv'

    # One two-column file per field, restricted to the papers where the field is present.
    for column, folder in [('paperyear', 'paperyear'),
                           ('doi', 'paperdoi'),
                           ('pmid', 'paperpmid'),
                           ('journalid', 'paperjournalid'),
                           ('numcitesfrompapers', 'paperncitesfrompapers')]:
        file_works.loc[file_works[column].notnull(), ['oaid', column]].to_csv(clean_dir + folder + file_name, index=False)

    # Volume / issue / pages are written for every paper, missing values included.
    file_works[['oaid', 'papervolume', 'paperissue', 'paperIstpage', 'paperlastpage']].to_csv(
        clean_dir + 'papervolisspages' + file_name, index=False)


def _prefix_ids(values, letter):
    """Return `letter` + the integer part of each value; missing values are returned unchanged."""
    return [letter + str(value).split('.')[0] if pd.notna(value) else value for value in values]
    


In [None]:
## parallelize the code

import warnings

        
from multiprocessing import Process


if __name__ == '__main__':
    # One worker process per batch index k = 2..10, all running at the same time.
    batch_ids = list(range(2,11))

    with warnings.catch_warnings():
        warnings.simplefilter("ignore",UserWarning)
        
        processes = [Process(target=clean_files, args=(k,)) for k in batch_ids]
        
        for process in processes:
            process.start()
            
        for process in processes:
            process.join()

    # A worker that raised (or was killed) only leaves a non-zero exit code behind;
    # report it here, otherwise a missing output file goes unnoticed.
    failed_batches = [k for k, process in zip(batch_ids, processes) if process.exitcode != 0]
    if failed_batches:
        raise RuntimeError("clean_files failed for k in " + str(failed_batches))

## code to merge files in bash

{  
  head -n 1 authorid_workid_1.tsv  
  for part in authorid_workid_*.tsv; do  
    tail -n +2 "$part"  
  done  
} > authorid_workid.tsv