In [10]:
## Packages 

import json, requests 
from tqdm import tqdm
import pandas as pd
import subprocess
import time 
import csv
import gzip
import numpy as np
import psycopg2
import subprocess
import os


In [9]:
main_path = '/home/fs01/spec1142/'
data = 'openalex-snapshot/data'

path = '/home/fs01/spec1142/Emma/GateKeepers/Download_OpenAlex/'

f = open('/home/fs01/spec1142/Emma/GateKeepers/' + "database.txt", "r")
user , password = f.read().split()

In [3]:
## list_tables : list of the folders in the folder "data", which represents OpenAlex tables

list_tables = ! ls {main_path + data}
list_tables

['authors',
 'concepts',
 'funders',
 'institutions',
 'merged_ids',
 'publishers',
 'sources',
 'works']

## Function to create / load data / index tables into the Postgres database

In [1]:
def create_table(table_name, schema, drop_if_exists):

    """
    This function creates a table in a PostgreSQL database with the specified schema, and drops the table if it already exists.

    Parameters:
    table_name (str): The name of the table to be created in the PostgreSQL database.
    schema (str): The SQL schema defining the structure of the table.
    drop_if_exists (bool): A flag indicating whether to drop the table if it already exists before creating it.

    Returns:
    str: A success message if the table is created successfully.

    Note:
    - The function assumes that the `user` and `password` variables are defined elsewhere in the code.
    - The function commits the transaction and closes the database connection after creating the table.
    """

    conn = psycopg2.connect("user=" + user + " password=" + password)
    cursor = conn.cursor()

    if drop_if_exists == True:
        cursor.execute("DROP TABLE IF EXISTS " + table_name)
    
    #Creating table as per requirement
    sql = schema
    
    cursor.execute(sql)
    conn.commit()

    #Closing the connection
    conn.close()

    return "Table created successfully........"



In [2]:
## load data into the table 

def load_data_into_table(table_name, data_path, size):

    """
    This function loads data from a TSV file into a specified table in a PostgreSQL database in chunks.

    Parameters:
    table_name (str): The name of the table in the PostgreSQL database to load data into.
    data_path (str): The file path of the TSV file containing the data to be loaded.
    size (int): The chunk size for loading data into the table.

    Returns:
    str: A success message if the data is loaded successfully.

    Note:
    - The function assumes that the `user` and `password` variables are defined elsewhere in the code.
    - The function iterates over the data chunks, saves each chunk to a temporary TSV file, and then uses the `copy_expert` method to load the data from the temporary file into the specified table.
    - The function commits the transaction and closes the database connection after loading the data.
    """
    
    df_chunks = pd.read_csv(data_path , error_bad_lines = "warn" , chunksize = size ,header = 0 , delimiter = "\t", index_col = 0   ) 
    count = 0 
    
    #establishing the connection
    conn = psycopg2.connect("user=" + user + " password=" + password)
    #Creating a cursor object using the cursor() method
    cursor = conn.cursor()
    
    
    ## add the data chunk by chunk in the database 
    for data in tqdm(df_chunks):
        
        count += 1
        ## create chunk and save chunk
        df = pd.DataFrame(data , dtype = str)
        df.to_csv(data_path + "chunk.tsv" , sep = "\t"  )
    
        ## then, add chunk to the database (double check if you're adding the data to the good table. 
        with open("/home/fs01/spec1142/Emma/Download_OpenAlex/chunk.tsv") as f:
            cursor.copy_expert("COPY " + table_name + " FROM STDIN WITH DELIMITER E'\t' CSV HEADER", f)
                
    ## commit to the database.     
    conn.commit()       
    #Closing the connection
    conn.close()

    return "Data loaded successfully.........."
                

In [3]:
def index_table(table_name , index_columns):

    """
    This function creates an index on each specified column in a table in a PostgreSQL database.

    Parameters:
    table_name (str): The name of the table to be indexed in the PostgreSQL database.
    index_columns (list): A list of column names to create indexes on.

    Returns:
    str: A success message if the table is indexed successfully.

    Note:
    - The function assumes that the `user` and `password` variables are defined elsewhere in the code.
    - The function commits the transaction and closes the database connection after creating the indexes.
    """
    
    #establishing the connection
    conn = psycopg2.connect("user=" + user + " password=" + password)
    
    #Creating a cursor object using the cursor() method
    cursor = conn.cursor()
    
    ## index each column
    for index_column in index_columns:
    
        #Index table as per requirement
        sql ='''CREATE INDEX ''' + index_column + '_' + table_name + ''' ON '''+ table_name +'''(''' + index_column + ''');'''
        cursor.execute(sql)
        
    conn.commit()
    #Closing the connection
    conn.close()
    
    return "Table indexed successfully........"

## Download OpenAlex raw data

In [None]:
## from https://docs.openalex.org/download-all-data/download-to-your-machine
! aws s3 sync "s3://openalex" "openalex-snapshot" --no-sign-request


## OpenAlex Institutions

In [23]:
path = '/home/fs01/spec1142/Emma/GateKeepers/Download_OpenAlex/'


### Flatten the data

In [4]:
## list_folders : list of the folders in the folder "data/institutions/", which represents the institution files

list_folders_institutions = ! ls {main_path + data}/institutions
list_folders_institutions = list_folders_institutions[1:]
list_folders_institutions

['updated_date=2023-06-07',
 'updated_date=2023-06-08',
 'updated_date=2023-06-10',
 'updated_date=2023-06-11',
 'updated_date=2023-06-12',
 'updated_date=2023-06-13',
 'updated_date=2023-06-14',
 'updated_date=2023-06-15',
 'updated_date=2023-06-16',
 'updated_date=2023-06-17',
 'updated_date=2023-06-18',
 'updated_date=2023-06-19',
 'updated_date=2023-06-20',
 'updated_date=2023-06-21',
 'updated_date=2023-06-22',
 'updated_date=2023-06-23',
 'updated_date=2023-06-24',
 'updated_date=2023-06-25',
 'updated_date=2023-06-26',
 'updated_date=2023-06-27',
 'updated_date=2023-06-28',
 'updated_date=2023-06-29',
 'updated_date=2023-06-30',
 'updated_date=2023-07-01',
 'updated_date=2023-07-02',
 'updated_date=2023-07-03',
 'updated_date=2023-07-04',
 'updated_date=2023-07-05',
 'updated_date=2023-07-06',
 'updated_date=2023-07-07',
 'updated_date=2023-07-08',
 'updated_date=2023-07-09',
 'updated_date=2023-07-10',
 'updated_date=2023-07-11',
 'updated_date=2023-07-12',
 'updated_date=2023-

In [5]:
files = ! ls {main_path + data}/institutions/{list_folders_institutions[4]}

In [6]:
## print one row of one file 

file = main_path + data +"/institutions/" + list_folders_institutions[4] + "/" + files[0]

with gzip.open(file, 'rb') as f:
    for line in f:
        print(json.loads(line).keys())

        print(json.loads(line)["associated_institutions"])
        break
    

dict_keys(['image_thumbnail_url', 'counts_by_year', 'homepage_url', 'roles', 'type', 'geo', 'ror', 'repositories', 'summary_stats', 'works_api_url', 'associated_institutions', 'id', 'international', 'x_concepts', 'display_name_alternatives', 'cited_by_count', 'image_url', 'display_name', 'country_code', 'display_name_acronyms', 'ids', 'updated_date', 'created_date', 'works_count', 'updated'])
[{'country_code': 'GB', 'ror': 'https://ror.org/00xkkpn05', 'id': 'https://openalex.org/I4210100473', 'display_name': 'Norfolk Community Health and Care NHS Trust', 'type': 'healthcare', 'relationship': 'parent'}]


In [35]:
## organize the data into flat files 

start = time.time()

## search in each folder

for folder in list_folders_institutions:
    
    print("folder: " , folder)
    
    ##get the files in the folder
    
    files = ! ls {main_path + data}/institutions/{folder}
    print("Files: " , files)
    
    #create dictionary 
    dic = {}
    
    ##search in each file in the folder 
    for file in files:
        
        ##unzip file
        with gzip.open(main_path + data +"/institutions/" + folder + "/" + file, 'rb') as f:
        
        
            
            
            #print("File:" , file)

            #open file 

            #fill out the intermediate dictionary 
            for line in tqdm(f):
                line = json.loads(line)
                
                institution_id = line["id"][21:]
                
                   
                dic[institution_id] = {}
                dic[institution_id]["display_name"] = line["display_name"]
                dic[institution_id]["display_name_alternatives"] = ''
                if line["display_name_acronyms"] != []:
                    dic[institution_id]["display_name_alternatives"] += "; ".join(line["display_name_acronyms"])
                if line["display_name_alternatives"] != []:
                    dic[institution_id]["display_name_alternatives"] += "; ".join(line["display_name_alternatives"])
                dic[institution_id]["homepage_url"] = line["homepage_url"]
                
                dic[institution_id]["works_count"] = line["works_count"]
                dic[institution_id]["cited_by_count"] = line["cited_by_count"]
                
                dic[institution_id]["type"] = line["type"]
                
                
                dic[institution_id]["longitude"] = line["geo"]["longitude"]
                dic[institution_id]["latitude"] = line["geo"]["latitude"]
                dic[institution_id]["city"] = line["geo"]["city"]
                dic[institution_id]["geonames_city_id"] = line["geo"]["geonames_city_id"]
                dic[institution_id]["region"] = line["geo"]["region"]
                dic[institution_id]["country"] = line["geo"]["country"]
                dic[institution_id]["country_code"] = line["geo"]["country_code"]
                
                dic[institution_id]["last_update"] = line["updated"][:10]
                     

    table = pd.DataFrame(dic).T
    table.to_csv(path + "OA_institutions/" + folder + ".tsv", sep = "\t", index_label = "institution_id")
               
            
end = time.time()
print("Time:" , end - start) 




folder:  updated_date=2023-06-07
Files:  ['part_000.gz']


8438it [00:00, 17387.33it/s]


folder:  updated_date=2023-06-08
Files:  ['part_000.gz']


5112it [00:00, 18193.81it/s]


folder:  updated_date=2023-06-10
Files:  ['part_000.gz']


3it [00:00, 199.41it/s]

folder:  updated_date=2023-06-11





Files:  ['part_000.gz']


850it [00:00, 9322.94it/s]

folder:  updated_date=2023-06-12





Files:  ['part_000.gz']


71it [00:00, 2406.10it/s]

folder:  updated_date=2023-06-13





Files:  ['part_000.gz']


6it [00:00, 201.99it/s]

folder:  updated_date=2023-06-14





Files:  ['part_000.gz']


3it [00:00, 36.83it/s]

folder:  updated_date=2023-06-15





Files:  ['part_000.gz']


3it [00:00, 407.79it/s]

folder:  updated_date=2023-06-16





Files:  ['part_000.gz']


2it [00:00, 625.27it/s]

folder:  updated_date=2023-06-17





Files:  ['part_000.gz']


134it [00:00, 2381.97it/s]

folder:  updated_date=2023-06-18





Files:  ['part_000.gz']


134it [00:00, 6608.70it/s]

folder:  updated_date=2023-06-19





Files:  ['part_000.gz']


44it [00:00, 1088.11it/s]

folder:  updated_date=2023-06-20





Files:  ['part_000.gz']


15it [00:00, 997.57it/s]

folder:  updated_date=2023-06-21





Files:  ['part_000.gz']


377it [00:00, 8585.51it/s]

folder:  updated_date=2023-06-22





Files:  ['part_000.gz']


9it [00:00, 849.87it/s]

folder:  updated_date=2023-06-23





Files:  ['part_000.gz']


12it [00:00, 1761.02it/s]

folder:  updated_date=2023-06-24





Files:  ['part_000.gz']


14it [00:00, 757.86it/s]

folder:  updated_date=2023-06-25





Files:  ['part_000.gz']


139it [00:00, 889.22it/s]

folder:  updated_date=2023-06-26





Files:  ['part_000.gz']


66it [00:00, 1962.39it/s]

folder:  updated_date=2023-06-27





Files:  ['part_000.gz']


118it [00:00, 2004.02it/s]

folder:  updated_date=2023-06-28





Files:  ['part_000.gz']


1489it [00:00, 7750.06it/s]


folder:  updated_date=2023-06-29
Files:  ['part_000.gz']


12it [00:00, 315.48it/s]

folder:  updated_date=2023-06-30





Files:  ['part_000.gz']


14it [00:00, 341.85it/s]

folder:  updated_date=2023-07-01





Files:  ['part_000.gz']


15it [00:00, 645.24it/s]

folder:  updated_date=2023-07-02





Files:  ['part_000.gz']


335it [00:00, 4352.11it/s]

folder:  updated_date=2023-07-03





Files:  ['part_000.gz']


87it [00:00, 1911.72it/s]

folder:  updated_date=2023-07-04





Files:  ['part_000.gz']


19it [00:00, 974.26it/s]

folder:  updated_date=2023-07-05





Files:  ['part_000.gz']


65it [00:00, 1952.68it/s]

folder:  updated_date=2023-07-06





Files:  ['part_000.gz']


13it [00:00, 581.23it/s]

folder:  updated_date=2023-07-07





Files:  ['part_000.gz']


12it [00:00, 749.08it/s]

folder:  updated_date=2023-07-08





Files:  ['part_000.gz']


12it [00:00, 159.78it/s]

folder:  updated_date=2023-07-09





Files:  ['part_000.gz']


304it [00:00, 2024.31it/s]

folder:  updated_date=2023-07-10





Files:  ['part_000.gz']


118it [00:00, 2933.10it/s]

folder:  updated_date=2023-07-11





Files:  ['part_000.gz']


223it [00:00, 5261.49it/s]

folder:  updated_date=2023-07-12





Files:  ['part_000.gz']


15it [00:00, 663.93it/s]

folder:  updated_date=2023-07-13





Files:  ['part_000.gz']


9it [00:00, 564.59it/s]

folder:  updated_date=2023-07-14





Files:  ['part_000.gz']


18it [00:00, 1899.45it/s]

folder:  updated_date=2023-07-15





Files:  ['part_000.gz']


34it [00:00, 1644.12it/s]

folder:  updated_date=2023-07-16





Files:  ['part_000.gz']


2473it [00:00, 11704.83it/s]


folder:  updated_date=2023-07-17
Files:  ['part_000.gz']


907it [00:00, 7286.56it/s]


folder:  updated_date=2023-07-18
Files:  ['part_000.gz']


40it [00:00, 2228.70it/s]

folder:  updated_date=2023-07-19





Files:  ['part_000.gz']


35it [00:00, 1235.40it/s]

folder:  updated_date=2023-07-20





Files:  ['part_000.gz']


29it [00:00, 939.29it/s]

folder:  updated_date=2023-07-21





Files:  ['part_000.gz']


3133it [00:00, 7459.99it/s]


folder:  updated_date=2023-07-22
Files:  ['part_000.gz']


300it [00:00, 2859.97it/s]

folder:  updated_date=2023-07-23





Files:  ['part_000.gz']


554it [00:00, 9189.12it/s]


folder:  updated_date=2023-07-24
Files:  ['part_000.gz']


354it [00:00, 5432.76it/s]

folder:  updated_date=2023-07-25





Files:  ['part_000.gz']


30it [00:00, 236.80it/s]

folder:  updated_date=2023-07-27





Files:  ['part_000.gz']


23it [00:00, 1007.29it/s]

folder:  updated_date=2023-07-28





Files:  ['part_000.gz']


1it [00:00, 62.68it/s]

folder:  updated_date=2023-07-29





Files:  ['part_000.gz']


1it [00:00, 61.59it/s]

folder:  updated_date=2023-07-30





Files:  ['part_000.gz']


521it [00:00, 6504.79it/s]

folder:  updated_date=2023-07-31





Files:  ['part_000.gz']


172it [00:00, 3266.96it/s]

folder:  updated_date=2023-08-01





Files:  ['part_000.gz']


3it [00:00, 978.99it/s]

folder:  updated_date=2023-08-02





Files:  ['part_000.gz']


18it [00:00, 548.03it/s]

folder:  updated_date=2023-08-03





Files:  ['part_000.gz']


79it [00:00, 643.01it/s]

folder:  updated_date=2023-08-04





Files:  ['part_000.gz']


116it [00:00, 3658.02it/s]

folder:  updated_date=2023-08-05





Files:  ['part_000.gz']


114it [00:00, 6692.01it/s]

folder:  updated_date=2023-08-06





Files:  ['part_000.gz']


2402it [00:00, 10909.40it/s]


folder:  updated_date=2023-08-07
Files:  ['part_000.gz']


1073it [00:00, 8050.67it/s]


folder:  updated_date=2023-08-08
Files:  ['part_000.gz']


54it [00:00, 4979.28it/s]

folder:  updated_date=2023-08-09





Files:  ['part_000.gz']


60it [00:00, 2216.65it/s]

folder:  updated_date=2023-08-10





Files:  ['part_000.gz']


155it [00:00, 6377.26it/s]

folder:  updated_date=2023-08-11





Files:  ['part_000.gz']


529it [00:00, 10220.82it/s]

folder:  updated_date=2023-08-12





Files:  ['part_000.gz']


523it [00:00, 9771.88it/s]

folder:  updated_date=2023-08-13





Files:  ['part_000.gz']


6568it [00:00, 10583.22it/s]


folder:  updated_date=2023-08-14
Files:  ['part_000.gz']


4671it [00:00, 11031.11it/s]


folder:  updated_date=2023-08-15
Files:  ['part_000.gz']


5408it [00:00, 10455.18it/s]


folder:  updated_date=2023-08-16
Files:  ['part_000.gz']


9749it [00:00, 11047.60it/s]


folder:  updated_date=2023-08-17
Files:  ['part_000.gz']


44119it [00:04, 9897.38it/s] 


Time: 39.756977558135986


### Check the data

In [92]:
table = pd.concat([ pd.read_csv(path + "OA_institutions/" + folder + ".tsv", delimiter = "\t") for folder in list_folders_institutions ] ) 
table

Unnamed: 0,institution_id,display_name,display_name_alternatives,homepage_url,works_count,cited_by_count,type,longitude,latitude,city,geonames_city_id,region,country,country_code,last_update
0,I4210144170,Clear Lake Medical Foundation,,http://www.clearlakemedicalfoundation.com/,2,211,other,-95.260056,29.743965,Houston,4699066.0,Texas,United States,US,2023-06-07
1,I4210164531,Nasson College,Nasson Institute,https://www.nasson.org,20,173,education,-70.799550,43.466022,Springvale,4979580.0,Maine,United States,US,2023-06-07
2,I4210163161,NanOasis Technologies (United States),,https://www.nanoasisinc.fogcitydesign.com/,3,166,company,-122.333450,37.919600,Richmond,5387428.0,California,United States,US,2023-06-07
3,I4210101651,Kenya Institute Of Organic Farming,KIOF,http://www.kiof.net/,4,152,nonprofit,37.013905,-1.089038,Nairobi,184745.0,,Kenya,KE,2023-06-07
4,I4210163681,United African University of Tanzania,UAUT,https://www.uaut.ac.tz/,2,144,education,39.328940,-6.870453,Dar es Salaam,160263.0,,Tanzania,TZ,2023-06-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102553,I4210167279,National Athletic Trainers Association,NATA,https://www.nata.org/,1,0,other,-96.922970,32.938538,Carrollton,4679195.0,Texas,United States,US,2023-08-17
102554,I4210167362,Foundation for the Education and Research in N...,FERNE,http://www.ferne.org/,0,0,nonprofit,-87.671715,41.871350,Chicago,4887398.0,Illinois,United States,US,2023-08-17
102555,I4210167401,American Dysautonomia Institute,ADI,http://www.adiwebsite.org/,0,0,nonprofit,-83.459885,42.530990,Detroit,4990729.0,Michigan,United States,US,2023-08-17
102556,I82930923,Society for Research into Higher Education,SRHE,https://www.srhe.ac.uk/,2,0,other,-0.119311,51.531994,London,2643743.0,,United Kingdom,GB,2023-08-17


In [89]:
table.to_csv(main_path + "Emma/Download_OpenAlex/OA_institutions/institutions_up_to_20230817.tsv" , sep = "\t", index = False )

### Upload the data into the database (Postgres)

In [None]:
## 1) create the table. Drop table if already exists, then create table with the given schema

## pick a name for the table 
table_name = "institutions_OpenAlex"

## create table. Precise table schema and columns names and types 
schema ="""CREATE TABLE  """ + table_name + """ (
   institution_id VARCHAR(20),
   display_name VARCHAR(300),
   display_name_alternatives TEXT ,
   homepage_url TEXT,
   works_count INTEGER,
   cited_by_count INTEGER,
   type VARCHAR(20),
   longitude FLOAT,
   latitude FLOAT,
   city VARCHAR(50),
   geonames_city_id FLOAT,
   region VARCHAR(50),
   country VARCHAR(50),
   country_code VARCHAR(3),
   last_update DATE
   );"""

drop_if_exists = False


#create_table(table_name, schema, drop_if_exists)

In [None]:
## 2) fill out the table with the data. Add the data chunk by chunk for Python RAM memory limitation. 

table_name = "institutions_OpenAlex"

size = 1000000

path= '/home/fs01/spec1142/Emma/GateKeepers/Download_OpenAlex/'
data_path = path + "OA_institutions/institutions_up_to_20230817.tsv"


#load_data_into_table(table_name, data_path, size)


In [None]:
## 3) once everything is in the database, index table. 

## chose the columns to index 
table_name = "institutions_OpenAlex"
index_columns = ['institution_id']

#index_table(table_name , index_columns)

### Test the database 

In [30]:
#establishing the connection
conn = psycopg2.connect("user=" + user + " password=" + password)

#Creating a cursor object using the cursor() method
cursor = conn.cursor()

table_name = "institutions_OpenAlex"



#Creating table as per requirement
sql ="""SELECT institution_id_int , institution_id, longitude, latitude, type,  homepage_url, last_update
        FROM   institutions_OpenAlex 
        WHERE institution_id_int = '""" + '24156275' + """';"""
cursor.execute(sql)
result = cursor.fetchall()



#Closing the connection
conn.close()

## OpenAlex Authors

In [24]:
path = '/home/fs01/spec1142/Emma/GateKeepers/Download_OpenAlex/'


### Flatten the data

In [11]:
## list_folders : list of the folders in the folder "data/authors/", which represents the institution files

cmd = 'ls ' + main_path + data + '/authors'
list_folders_authors = subprocess.check_output(cmd,  shell=True)
list_folders_authors = list_folders_authors.decode().split("\n")
list_folders_authors.remove('')
list_folders_authors = list_folders_authors[1:]
list_folders_authors

['nohup.out',
 'updated_date=2023-04-21',
 'updated_date=2023-05-02',
 'updated_date=2023-05-24',
 'updated_date=2023-06-08',
 'updated_date=2023-06-13',
 'updated_date=2023-06-14',
 'updated_date=2023-06-18',
 'updated_date=2023-06-22',
 'updated_date=2023-06-26',
 'updated_date=2023-06-27',
 'updated_date=2023-06-28',
 'updated_date=2023-06-30',
 'updated_date=2023-07-02',
 'updated_date=2023-07-06',
 'updated_date=2023-07-14',
 'updated_date=2023-07-17',
 'updated_date=2023-07-20',
 'updated_date=2023-07-21',
 'updated_date=2023-07-25',
 'updated_date=2023-07-26',
 'updated_date=2023-07-27',
 'updated_date=2023-07-28',
 'updated_date=2023-07-29',
 'updated_date=2023-07-30',
 'updated_date=2023-07-31',
 'updated_date=2023-08-01',
 'updated_date=2023-08-02',
 'updated_date=2023-08-03',
 'updated_date=2023-08-04',
 'updated_date=2023-08-05',
 'updated_date=2023-08-06',
 'updated_date=2023-08-07',
 'updated_date=2023-08-08',
 'updated_date=2023-08-09',
 'updated_date=2023-08-10',
 'upda

In [12]:
## print files in one of the foler

list_folders_authors = ["updated_date=2023-07-21"]

folder = list_folders_authors[0]
cmd = 'ls ' + main_path + data + '/authors/' + folder
files = subprocess.check_output(cmd,  shell=True)
files = files.decode().split("\n")
files.remove('')
    
print("Files: " , files) 

Files:  ['part_000.gz', 'part_001.gz', 'part_002.gz', 'part_003.gz', 'part_004.gz', 'part_005.gz', 'part_006.gz', 'part_007.gz', 'part_008.gz', 'part_009.gz', 'part_010.gz', 'part_011.gz', 'part_012.gz', 'part_013.gz', 'part_014.gz', 'part_015.gz', 'part_016.gz', 'part_017.gz', 'part_018.gz', 'part_019.gz', 'part_020.gz', 'part_021.gz', 'part_022.gz', 'part_023.gz', 'part_024.gz']


In [None]:
## organize the data into flat files 

start = time.time()
count = 0 

## search in each folder

for folder in list_folders_authors:
    
    print("folder: " , folder)
    
    ##get the files in the folder
    
    cmd = 'ls ' + main_path + data + '/authors/' + folder
    files = subprocess.check_output(cmd,  shell=True)
    files = files.decode().split("\n")
    files.remove('')
    
    print("Files: " , files) 
    

    
    ##search in each file in the folder 
    for file in files:
        
        #create dictionary 
        dic = {}
        
        ##unzip file
        with gzip.open(main_path + data +"/authors/" + folder + "/" + file, 'rb') as f:
        
        
            

            #print("File:" , file)

            #open file 

            #fill out the intermediate dictionary 
            for line in tqdm(f):
                line = json.loads(line)
                
                author_id = line["id"][21:]
                
                
                dic[author_id] = {}
                dic[author_id]["display_name"] = line["display_name"]
                dic[author_id]["display_name_alternatives"] = "; ".join(line["display_name_alternatives"])

                dic[author_id]["orcid"] = line["orcid"]

                if line["last_known_institution"] != None:
                    dic[author_id]["last_known_institution_id"] = line["last_known_institution"]["id"][21:]
                    dic[author_id]["last_known_institution_display_name"] = line["last_known_institution"]["display_name"]
                else:
                    dic[author_id]["last_known_institution_id"] = None
                    dic[author_id]["last_known_institution_display_name"] = None
                dic[author_id]["works_count"] = line["works_count"]
                dic[author_id]["cited_by_count"] = line["cited_by_count"]
                dic[author_id]["last_update"] = line["updated"]
                
                

                     

        table = pd.DataFrame(dic).T
        table.to_csv(path + "OA_authors/" + folder + ".tsv", sep = "\t", index_label = "institution_id", mode = 'a')

            
end = time.time()
print("Time:" , end - start) 



### Check the data

In [5]:
table = pd.read_csv(path + "OA_authors/updated_date=2023-07-28.tsv" , delimiter = "\t")

In [6]:
table

Unnamed: 0,author_id,display_name,display_name_alternatives,orcid,last_known_institution_id,last_known_institution_display_name,works_count,cited_by_count,last_update
0,A5087054413,Marianne Dieckmann,M. Dieckmann; Marianne Dieckmann; M Dieckmann,,I97018004,Stanford University,15,13039,2023-07-28
1,A5043169110,Gerald Klatskin,G Klatskin; Gerald Klatskin; G. Klatskin,,I32971472,Yale University,126,7056,2023-07-28
2,A5062314199,George J. Schroepfer,George J.Schroepfer; G.J. Schroepfer; G. J. Ju...,,I74775410,Rice University,260,6812,2023-07-28
3,A5020422860,Andreas R. Gruentzig,A. R. Gruentzig; Andreas Gruentzig; A R Gruent...,,I2800556163,Emory University Hospital,74,6390,2023-07-28
4,A5089029176,Elinor T. Adman,E. Adman; Elinor Adman; Elinor T. Adman; E T A...,,I201448701,University of Washington,130,6211,2023-07-28
...,...,...,...,...,...,...,...,...,...
12106141,A5091914855,Carmiña Ofelia Vargas Riaño,Carmiña Ofelia Vargas Riaño,,I1332873766,Banco de la República Colombia,3,0,2023-07-28
12106142,A5091914857,Ph Rauh,Ph Rauh,,I161046081,University of Freiburg,1,0,2023-07-28
12106143,A5091914876,Anzhou Liu,Anzhou Liu,,I49232843,Changchun University,1,0,2023-07-28
12106144,A5091914928,Nur Fitriana,Nur Laili Fitriana; Nur Fitriana,,I23758234,State University of Surabaya,3,0,2023-07-28


### Upload the data into the database (Postgres)

In [36]:
## files_to_upload: list of the OpenAlex author's files to upload 

data_path =  '/home/fs01/spec1142/Emma/GateKeepers/Download_OpenAlex/'



cmd = "ls " + data_path + "OA_authors/"
files_to_upload = subprocess.check_output(cmd,  shell=True)
files_to_upload = files_to_upload.decode().split("\n")
files_to_upload.remove('')
files_to_upload


['updated_date=2023-04-21.tsv',
 'updated_date=2023-05-02.tsv',
 'updated_date=2023-05-24.tsv',
 'updated_date=2023-06-08.tsv',
 'updated_date=2023-06-13.tsv',
 'updated_date=2023-06-14.tsv',
 'updated_date=2023-06-18.tsv',
 'updated_date=2023-06-22.tsv',
 'updated_date=2023-06-26.tsv',
 'updated_date=2023-06-27.tsv',
 'updated_date=2023-06-28.tsv',
 'updated_date=2023-06-30.tsv',
 'updated_date=2023-07-02.tsv',
 'updated_date=2023-07-06.tsv',
 'updated_date=2023-07-14.tsv',
 'updated_date=2023-07-17.tsv',
 'updated_date=2023-07-20.tsv',
 'updated_date=2023-07-21.tsv',
 'updated_date=2023-07-25.tsv',
 'updated_date=2023-07-26.tsv',
 'updated_date=2023-07-27.tsv',
 'updated_date=2023-07-28.tsv',
 'updated_date=2023-07-29.tsv',
 'updated_date=2023-07-30.tsv',
 'updated_date=2023-07-31.tsv',
 'updated_date=2023-08-01.tsv',
 'updated_date=2023-08-02.tsv',
 'updated_date=2023-08-03.tsv',
 'updated_date=2023-08-04.tsv',
 'updated_date=2023-08-05.tsv',
 'updated_date=2023-08-06.tsv',
 'update

In [None]:
## 1) create the table. Drop table if already exists, then create table with the given schema

table_name = "authors_OpenAlex"


schema ="""CREATE TABLE  """ + table_name + """ (
   author_id VARCHAR(20),
   display_name VARCHAR(100),
   display_name_reverse VARCHAR(100),
   display_name_alternatives TEXT,
   orcid VARCHAR(100),
   last_known_institution_id VARCHAR(20),
   last_known_institution_display_name TEXT,
   works_count INTEGER,
   cited_by_count INTEGER,
   last_update DATE
   );"""


drop_if_exists = False


#create_table(table_name, schema, drop_if_exists)




Note that columns can be indexed with different types of indexes: 
- names can be indexed with gin (f_unaccent(name::text) gin_trgm_ops)
- ids can be indexed with btree 

In [None]:
## 2) fill out the table with the data. Add the data chunk by chunk for Python RAM memory limitation. 

table_name = "authors_OpenAlex"

size = 5000000

path= '/home/fs01/spec1142/Emma/GateKeepers/Download_OpenAlex/'


for file in files_to_upload:

    data_path =  path + "OA_authors/" + file
    
    #load_data_into_table(table_name, data_path, size)

In [None]:
## 3) once everything is in the database, index table. 

table_name = "authors_OpenAlex"
index_columns = ['author_id']
#index_table(table_name , index_columns)



### Test the database 

In [8]:
import pandas as pd
from tqdm import tqdm
import psycopg2

#establishing the connection
conn = psycopg2.connect("user=" + user + " password=" + password)

#Creating a cursor object using the cursor() method
cursor = conn.cursor()

table_name = "authors_OpenAlex"



#Creating table as per requirement
sql ="""SELECT display_name,display_name_reverse,display_name_alternatives,orcid,last_known_institution_id,last_known_institution_display_name,works_count,cited_by_count,last_update
        FROM   authors_OpenAlex 
        WHERE author_id = '""" + 'A4383185263' + """';"""
cursor.execute(sql)
result = cursor.fetchall()



#Closing the connection
conn.close()

In [4]:
result

[('Laurence Breysse-Chanet',
  'Breysse-Chanet Laurence',
  'nan',
  'nan',
  'I4210098463',
  'Center for Research and Interdisciplinarity',
  1,
  0,
  datetime.date(2023, 7, 6))]

In [160]:
#establishing the connection
conn = psycopg2.connect("user=" + user + " password=" + password)

#Creating a cursor object using the cursor() method
cursor = conn.cursor()

table_name = "authors_OpenAlex"



#Creating table as per requirement
sql ="""SELECT count(author_id)
        FROM   authors_OpenAlex 
        ;"""
cursor.execute(sql)
result = cursor.fetchall()



#Closing the connection
conn.close()

In [161]:
## number of author ids in the database 
result

[(91748427,)]

## OpenAlex Works 

Run the python file "flatten_data_works.py" is used to flatten the works and works-authors data. 

### Upload the data into the database (Postgres) - WORKS

In [2]:
## files_to_upload: list of the OpenAlex author's files to upload 
import subprocess

path = '/home/fs01/spec1142/Emma/GateKeepers/Download_OpenAlex/'



cmd = "ls " + data_path + "OA_works/"
files_to_upload = subprocess.check_output(cmd,  shell=True)
files_to_upload = files_to_upload.decode().split("\n")
files_to_upload.remove('')
files_to_upload



['updated_date=2021-11-03.tsv',
 'updated_date=2021-12-23.tsv',
 'updated_date=2021-12-30.tsv',
 'updated_date=2022-01-30.tsv',
 'updated_date=2022-02-02.tsv',
 'updated_date=2022-02-03.tsv',
 'updated_date=2022-02-17.tsv',
 'updated_date=2022-03-09.tsv',
 'updated_date=2022-03-24.tsv',
 'updated_date=2022-03-31.tsv',
 'updated_date=2023-01-12.tsv',
 'updated_date=2023-04-07.tsv',
 'updated_date=2023-04-08.tsv',
 'updated_date=2023-04-09.tsv',
 'updated_date=2023-04-11.tsv',
 'updated_date=2023-04-12.tsv',
 'updated_date=2023-04-13.tsv',
 'updated_date=2023-04-14.tsv',
 'updated_date=2023-04-15.tsv',
 'updated_date=2023-04-16.tsv',
 'updated_date=2023-04-17.tsv',
 'updated_date=2023-04-24.tsv',
 'updated_date=2023-04-25.tsv',
 'updated_date=2023-04-26.tsv',
 'updated_date=2023-04-27.tsv',
 'updated_date=2023-04-28.tsv',
 'updated_date=2023-04-29.tsv',
 'updated_date=2023-04-30.tsv',
 'updated_date=2023-05-01.tsv',
 'updated_date=2023-05-02.tsv',
 'updated_date=2023-05-03.tsv',
 'update

In [None]:
table_name = "works_OpenAlex"

# 1) create table 

schema ="""CREATE TABLE  """ + table_name + """ (
   work_id VARCHAR(20),
   doi TEXT,
   pmid VARCHAR(20),
   title TEXT,
   abstract TEXT,
   publication_date DATE,
   type VARCHAR(30),
   venue_or_source VARCHAR(30),
   publisher_id VARCHAR(30),
   first_page VARCHAR(50),
   last_page VARCHAR(50),
   volume TEXT,
   issue TEXT,
   cited_by_count INTEGER,
   concepts TEXT,
   referenced_works TEXT,
   last_update DATE
   );"""


drop_if_exists = False
#create_table(table_name, schema, drop_if_exists)


# 2) upload data 
table_name = "works_OpenAlex"
size = 5000000
for file in files_to_upload:

    data_path =  path + "OA_works/" + file
    
    #load_data_into_table(table_name, data_path, size)


# 3) index 
table_name = "works_OpenAlex"
index_columns = ['work_id']
#index_table(table_name , index_columns)

### Upload the data into the database (Postgres) -AUTHORS - WORKS

In [162]:
## files_to_upload: list of the OpenAlex author's files to upload 
import subprocess

data_path  = '/home/fs01/spec1142/Emma/GateKeepers/Download_OpenAlex/'



cmd = "ls " + data_path + "OA_authors_works/"
files_to_upload = subprocess.check_output(cmd,  shell=True)
files_to_upload = files_to_upload.decode().split("\n")
files_to_upload.remove('')
files_to_upload



['updated_date=2021-11-03.tsv',
 'updated_date=2021-12-23.tsv',
 'updated_date=2021-12-30.tsv',
 'updated_date=2022-01-30.tsv',
 'updated_date=2022-02-02.tsv',
 'updated_date=2022-02-03.tsv',
 'updated_date=2022-02-17.tsv',
 'updated_date=2022-03-09.tsv',
 'updated_date=2022-03-24.tsv',
 'updated_date=2022-03-31.tsv',
 'updated_date=2023-01-12.tsv',
 'updated_date=2023-04-07.tsv',
 'updated_date=2023-04-08.tsv',
 'updated_date=2023-04-09.tsv',
 'updated_date=2023-04-11.tsv',
 'updated_date=2023-04-12.tsv',
 'updated_date=2023-04-13.tsv',
 'updated_date=2023-04-14.tsv',
 'updated_date=2023-04-15.tsv',
 'updated_date=2023-04-16.tsv',
 'updated_date=2023-04-17.tsv',
 'updated_date=2023-04-24.tsv',
 'updated_date=2023-04-25.tsv',
 'updated_date=2023-04-26.tsv',
 'updated_date=2023-04-27.tsv',
 'updated_date=2023-04-28.tsv',
 'updated_date=2023-04-29.tsv',
 'updated_date=2023-04-30.tsv',
 'updated_date=2023-05-01.tsv',
 'updated_date=2023-05-02.tsv',
 'updated_date=2023-05-03.tsv',
 'update

In [168]:
table = pd.read_csv(path + "OA_authors_works/updated_date=2021-11-03.tsv" , delimiter = "\t" )
table

Unnamed: 0,work_id,author_id,institution_id,institution_name
0,W1612067013,A2162240685,I185261750,
1,W1612067013,A2630951464,,
2,W1612067013,A1975636128,,
3,W2098696036,A2250517268,I120514687,
4,W2098696036,A2068701579,I120514687,
...,...,...,...,...
101,W3082828695,A2699446244,,
102,W3082828695,A2186928341,,
103,W3112025781,A3111654815,,
104,W3153153405,A2138850483,,


In [None]:
table_name = "authors_works_OpenAlex"

# 1) create table
schema ="""CREATE TABLE  """ + table_name + """ (
   work_id VARCHAR(20),
   author_id VARCHAR(20),
   institution_id VARCHAR(20),
   institution_name TEXT
   );"""


drop_if_exists = False
#create_table(table_name, schema, drop_if_exists)


# 2) upload data 
table_name = "authors_works_OpenAlex"
size = 5000000
for file in files_to_upload:

    data_path =  path + "OA_authors_works/" + file
    
    #load_data_into_table(table_name, data_path, size)


# 3) index 
table_name = "authors_works_OpenAlex"
index_columns = ['work_id', 'author_id']
#index_table(table_name , index_columns)


### Test the database 

In [2]:
import pandas as pd
from tqdm import tqdm
import psycopg2

#establishing the connection
conn = psycopg2.connect("user=" + user + " password=" + password)

#Creating a cursor object using the cursor() method
cursor = conn.cursor()


#Creating table as per requirement
sql ="""SELECT count( work_id)
        FROM works_authors_OpenAlex 
        ;"""
cursor.execute(sql)
result = cursor.fetchall()



#Closing the connection
conn.close()

In [3]:
result

[(679889296,)]

In [7]:
import pandas as pd
from tqdm import tqdm
import psycopg2

#establishing the connection
conn = psycopg2.connect("user=" + user + " password=" + password)

#Creating a cursor object using the cursor() method
cursor = conn.cursor()


#Creating table as per requirement
sql ="""SELECT * 
        FROM  works_authors_OpenAlex 
        WHERE work_id = 'W4235109850';"""

cursor.execute(sql)
result = cursor.fetchall()



#Closing the connection
conn.close()

In [8]:
result

[('W4235109850', 'A5039758892', None, None)]

In [2]:
#establishing the connection
conn = psycopg2.connect("user=" + user + " password=" + password)

#Creating a cursor object using the cursor() method
cursor = conn.cursor()


#Creating table as per requirement
sql ="""SELECT * 
        FROM  works_OpenAlex 
        WHERE work_id = 'W2228461672';"""

cursor.execute(sql)
result = cursor.fetchall()



#Closing the connection
conn.close()

In [3]:
result

[('W2228461672',
  '10.1080/02533839.2015.1091279',
  'nan',
  'Total precision inspection of machine tools with virtual metrology',
  'The technology of virtual metrology (VM) has been applied in the semiconductor industry to convert sampling inspection with metrology delay into real time and online total inspection. The purpose of this study is trying to apply VM for inspecting machining precision of machine tools. However, machining processes will cause severe vibrations that make process data collection, data cleaning, and feature extraction difficult to handle. Thus, the tasks of how to accurately segment essential parts of the raw process data from the original numerical-control file, how to effectively handle raw process/sensor data with low signal-to-noise ratios, and how to properly extract significant features from the segmented and clean raw process data are challenging issues for successfully applying VM to machine tools. These issues are judiciously addressed and successfu

## OpenAlex works journals and author positions

In [None]:
path = '/home/fs01/spec1142/Emma/GateKeepers/Download_OpenAlex/'


### Flatten data

In [6]:

## list_folders : list of the folders in the folder "data/institutions/", which represents the institution files

list_folders_works = ! ls {main_path + data}/works
list_folders_works = list_folders_works[1:]
list_folders_works

['updated_date=2021-11-03',
 'updated_date=2021-12-23',
 'updated_date=2021-12-30',
 'updated_date=2022-01-30',
 'updated_date=2022-02-02',
 'updated_date=2022-02-03',
 'updated_date=2022-02-17',
 'updated_date=2022-03-09',
 'updated_date=2022-03-24',
 'updated_date=2022-03-31',
 'updated_date=2023-01-12',
 'updated_date=2023-04-07',
 'updated_date=2023-04-08',
 'updated_date=2023-04-09',
 'updated_date=2023-04-11',
 'updated_date=2023-04-12',
 'updated_date=2023-04-13',
 'updated_date=2023-04-14',
 'updated_date=2023-04-15',
 'updated_date=2023-04-16',
 'updated_date=2023-04-17',
 'updated_date=2023-04-24',
 'updated_date=2023-04-25',
 'updated_date=2023-04-26',
 'updated_date=2023-04-27',
 'updated_date=2023-04-28',
 'updated_date=2023-04-29',
 'updated_date=2023-04-30',
 'updated_date=2023-05-01',
 'updated_date=2023-05-02',
 'updated_date=2023-05-03',
 'updated_date=2023-05-04',
 'updated_date=2023-05-05',
 'updated_date=2023-05-06',
 'updated_date=2023-05-07',
 'updated_date=2023-

In [7]:
files = ! ls {main_path + data}/works/{list_folders_works[4]}

In [8]:
file = main_path + data +"/works/" + list_folders_works[4] + "/" + files[0]
file

'/home/fs01/spec1142/openalex-snapshot/data/works/updated_date=2022-02-02/part_000.gz'

In [60]:
def get_journals_years(i):

     """
    This function processes gzipped JSON files containing work data, extracts relevant information (publication date, venue or source, publisher ID, and language), and stores it in a TSV file.

    Parameters:
    i (int): The starting index for selecting folders.

    Returns:
    None. The function directly writes the processed data to a TSV file.

    Note:
    - The function assumes that the `list_folders_works`, `main_path`, and `data` variables are defined elsewhere in the code.
    - The function iterates over a single folder (based on the starting index `i`), gets a list of files in the folder, and processes each file.
    - For each file, the function unzips it, creates a dictionary (`dic_works`), and fills it with the extracted data.
    - The function then converts the dictionary to a DataFrame and writes it to a TSV file. The TSV file is named based on the folder name.
    - The function appends to the TSV file if it already exists, except for the first file, where it overwrites the file.
    - The function also prints the time taken to process all the files.
    """

    start = time.time()
    
    ## search in each folder
    
    for folder in list_folders_works[i:i+1]:
        
        #print("folder: " , folder)
        
        ##get the files in the folder
        
        files = ! ls {main_path + data}/works/{folder}
        #print("Files: " , files) 

        
        ##search in each file in the folder 
        for file in files:
            
            ##unzip file
            with gzip.open(main_path + data +"/works/" + folder + "/" + file, 'rb') as f:
                
                #create dictionary 
                dic_works = {}
                dic_authors_works = {}
                count = 0 
            
    
                #print("File:" , file)
    
                #open file 
    
                #fill out the intermediate dictionary 
                for line in f:
                    
                    line = json.loads(line)
                    
                    work_id = line["id"][21:]
                    
                       
                    dic_works[work_id] = {}
    
                    dic_works[work_id]["publication_date"] = line["publication_date"]
    
    
                    ## host_venue / publisher and sources 
                    if "host_venue" in line:
                        if "id" in line["host_venue"] and line["host_venue"]["id"] != None:
                            print(line["host_venue"]["id"][21:])
                            dic_works[work_id]["venue_or_source"] = line["host_venue"]["id"][21:]
                        else:
                            dic_works[work_id]["venue_or_source"] = None
                        dic_works[work_id]["publisher_id"] = None
                    elif "primary_location" in line and line["primary_location"] != None:
                        
                        if "source" in line["primary_location"] and line["primary_location"]["source"] != None and  "id" in line["primary_location"]["source"] and line["primary_location"]["source"]['id'] != None:
                            dic_works[work_id]["venue_or_source"] = line["primary_location"]['source']["id"][21:]
                        else:
                            dic_works[work_id]["venue_or_source"] = None
                            
                        if "source" in line["primary_location"] and line["primary_location"]["source"] != None and  "publisher_id" in line["primary_location"]["source"] and line["primary_location"]["source"]['publisher_id'] != None:
                            dic_works[work_id]["publisher_id"] = line["primary_location"]['source']["publisher_id"][21:]
                        else:
                            dic_works[work_id]["publisher_id"] = None
                        
                        
                    else:
                        dic_works[work_id]["venue_or_source"] = None
                        dic_works[work_id]["publisher_id"] = None
                        
                    if "language" in line:
                        dic_works[work_id]["language"] = line["language"]
                        
    
                if file == files[0]:
    
                    table = pd.DataFrame(dic_works).T
                    table.to_csv(path + "OA_works_journal/" + folder + ".tsv", sep = "\t", index_label = "work_id" , mode='w')

                else:
                    table = pd.DataFrame(dic_works).T
                    table.to_csv(path + "OA_works_journal/" + folder + ".tsv", sep = "\t", index_label = "work_id" , mode='a', header = False)

                    
            
    end = time.time()
    print("Time:" , end - start) 
    




In [61]:
## run code get_journals_years using multiple CPUs. 

import warnings

        
from multiprocessing import Process


if __name__ == '__main__':
    with warnings.catch_warnings():
        warnings.simplefilter("ignore",UserWarning)
        
        processes = [Process(target=get_journals_years, args=(k,)) for k in range(0,138)]
        
        for process in processes:
            process.start()
            
        for process in processes:
            process.join()


Time: 2023.0723605155945
Time: 2215.2538208961487
Time: 3614.933911085129
Time: 3791.784571170807
Time: 4015.3293240070343
Time: 4839.842512130737
Time: 5287.343179941177
Time: 6055.877161741257
Time: 6186.000864267349


In [44]:
table = pd.DataFrame(dic_works).T
table

Unnamed: 0,publication_date,venue_or_source,publisher_id,language
W2107277218,2001-12-01,S73539393,P4310320990,en
W2194775991,2016-06-01,,,en
W4247665917,2013-05-22,,,en
W2024556021,1962-07-01,S185046733,P4310320503,en
W2230728100,1965-11-15,S54862371,P4310320257,en
...,...,...,...,...
W2042214850,2005-01-01,S204030396,P4310320990,en
W2051372804,1994-01-01,S75627607,P4310320262,en
W2053134332,1994-09-01,S45720283,P4310311648,en
W2056568229,1980-01-01,S49861241,P4310320990,en


In [16]:

def get_authors_position(i):
## search in each folder

    index = [ k for k in range(i,len(list_folders_works),12) ] 

    for ind in index:

        folder = list_folders_works[ind]
        
        #print("folder: " , folder)
        
        ##get the files in the folder
        
        files = ! ls {main_path + data}/works/{folder}
        #print("Files: " , files) 
        

        ##search in each file in the folder 
        for file in files:
            
            ##unzip file
            with gzip.open(main_path + data +"/works/" + folder + "/" + file, 'rb') as f:
                
                #create dictionary 
                dic_works = {}
                dic_authors_works = {}
                count = 0 
            
    
                #print("File:" , file)
    
                #open file 
    
                #fill out the intermediate dictionary 
                for line in f:
                    
                    line = json.loads(line)
                    
                    work_id = line["id"][21:]
                                
                    authorships = line["authorships"]
                    for elem in authorships:
                        
                        if "id" in elem["author"] and elem["author"]["id"] != None and 'author_position' in elem:
    
                            dic_authors_works[count] = {}
                            dic_authors_works[count]["oaid"] = work_id
                            dic_authors_works[count]["authorid"] = elem["author"]['id'][21:]
                            
                            dic_authors_works[count]["firstmidlast"] = elem["author_position"]
                            if dic_authors_works[count]["firstmidlast"] == 'middle':
                                dic_authors_works[count]["firstmidlast"] = 'mid'
                                    
                            count += 1

                table = pd.DataFrame(dic_authors_works).T
                if file == files[0]:
                    table = pd.DataFrame(dic_authors_works).T
                    table.to_csv(path + "OA_authors_position/" + folder + ".tsv", sep = "\t", index = False, mode = 'w')
                else:
                    table = pd.DataFrame(dic_authors_works).T
                    table.to_csv(path + "OA_authors_position/" + folder + ".tsv", sep = "\t", index = False, mode = 'a', header = False)
    return table      
        


In [74]:
## run code get_journals_years using multiple CPUs. 


import warnings

        
from multiprocessing import Process


if __name__ == '__main__':
    with warnings.catch_warnings():
        warnings.simplefilter("ignore",UserWarning)
        
        processes = [Process(target=get_authors_position, args=(k,)) for k in range(12)]
        
        for process in processes:
            process.start()
            
        for process in processes:
            process.join()

### Check tables

In [22]:
table = pd.read_csv(path + 'OA_works_journal/updated_date=2023-06-19.tsv' , delimiter = "\t" )
table.head()

Unnamed: 0,work_id,publication_date,venue_or_source,publisher_id,language
0,W2069614918,2001-05-01,,,en
1,W2080626185,2015-05-01,S24284137,P4310320990,en
2,W4200258892,2021-09-15,S4306462964,P4310320561,
3,W4244934795,2010-03-01,S21837735,P4310320443,en
4,W2213700795,2014-06-01,S19887683,P4310319808,en


In [21]:
table = pd.read_csv(path + 'OA_authors_position/updated_date=2021-11-03.tsv' , delimiter = "\t" )
table.head()

Unnamed: 0,oaid,authorid,firstmidlast
0,W1612067013,A2162240685,first
1,W1612067013,A2630951464,mid
2,W1612067013,A1975636128,last
3,W2098696036,A2250517268,first
4,W2098696036,A2068701579,mid


## OpenAlex journals

In [25]:
path = '/home/fs01/spec1142/Emma/GateKeepers/Download_OpenAlex/'


### Flatten data

In [11]:
## list_folders : list of the folders in the folder "data/sources/", which represents the journal's files

list_folders_sources = ! ls {main_path + data}/sources
list_folders_sources = list_folders_sources[1:]
list_folders_sources

['updated_date=2023-06-12',
 'updated_date=2023-06-13',
 'updated_date=2023-06-14',
 'updated_date=2023-06-15',
 'updated_date=2023-06-16',
 'updated_date=2023-06-17',
 'updated_date=2023-06-18',
 'updated_date=2023-06-19',
 'updated_date=2023-06-20',
 'updated_date=2023-06-21',
 'updated_date=2023-06-22',
 'updated_date=2023-06-23',
 'updated_date=2023-06-24',
 'updated_date=2023-06-25',
 'updated_date=2023-06-26',
 'updated_date=2023-06-27',
 'updated_date=2023-06-28',
 'updated_date=2023-06-29',
 'updated_date=2023-06-30',
 'updated_date=2023-07-01',
 'updated_date=2023-07-02',
 'updated_date=2023-07-03',
 'updated_date=2023-07-04',
 'updated_date=2023-07-05',
 'updated_date=2023-07-06',
 'updated_date=2023-07-07',
 'updated_date=2023-07-08',
 'updated_date=2023-07-09',
 'updated_date=2023-07-10',
 'updated_date=2023-07-11',
 'updated_date=2023-07-12',
 'updated_date=2023-07-13',
 'updated_date=2023-07-14',
 'updated_date=2023-07-15',
 'updated_date=2023-07-16',
 'updated_date=2023-

In [12]:
files = ! ls {main_path + data}/sources/{list_folders_sources[4]}

In [13]:
file = main_path + data +"/sources/" + list_folders_sources[4] + "/" + files[0]
file

'/home/fs01/spec1142/openalex-snapshot/data/sources/updated_date=2023-06-16/part_000.gz'

In [19]:
with gzip.open(file, 'rb') as f:
    for line in f:
        print(json.loads(line).keys())

        print(json.loads(line)["issn_l"])
        break
    

dict_keys(['publisher_id', 'host_organization_lineage', 'is_in_doaj', 'publisher_lineage', 'issn_l', 'counts_by_year', 'homepage_url', 'apc_prices', 'type', 'abbreviated_title', 'summary_stats', 'works_api_url', 'id', 'societies', 'host_organization', 'apc_usd', 'x_concepts', 'host_organization_lineage_names', 'cited_by_count', 'publisher_lineage_names', 'display_name', 'host_organization_name', 'country_code', 'issn', 'alternate_titles', 'publisher', 'host_institution_lineage_names', 'ids', 'is_oa', 'updated_date', 'created_date', 'works_count', 'updated', 'host_institution_lineage'])
None


In [21]:
import gzip


#create dictionary 
dic = {}

start = time.time()

## search in each folder

for folder in list_folders_sources:
    
    print("folder: " , folder)
    
    ##get the files in the folder
    
    files = ! ls {main_path + data}/sources/{folder}
    print("Files: " , files)
    
    ##search in each file in the folder 
    for file in files:
        
        ##unzip file
        with gzip.open(main_path + data +"/sources/" + folder + "/" + file, 'rb') as f:


            #fill out the intermediate dictionary 
            for line in tqdm(f):
                line = json.loads(line)
                
                source_id = line["id"][21:]
                
                   
                dic[source_id] = {}
                dic[source_id]["journalname"] = line["display_name"]
                dic[source_id]["issn_l"] = line["issn_l"]
                
                     

table = pd.DataFrame(dic).T
table.to_csv(path + "OA_sources.tsv", sep = "\t", index_label = "journalid")
               
            
end = time.time()
print("Time:" , end - start) 



folder:  updated_date=2023-06-12
Files:  ['part_000.gz']


32467it [00:01, 19973.87it/s]

folder:  updated_date=2023-06-13





Files:  ['part_000.gz']


65645it [00:03, 21730.73it/s]

folder:  updated_date=2023-06-14





Files:  ['part_000.gz']


300it [00:00, 6891.05it/s]

folder:  updated_date=2023-06-15





Files:  ['part_000.gz']


87it [00:00, 9835.96it/s]

folder:  updated_date=2023-06-16





Files:  ['part_000.gz']


63it [00:00, 9227.26it/s]

folder:  updated_date=2023-06-17





Files:  ['part_000.gz']


55it [00:00, 6090.10it/s]

folder:  updated_date=2023-06-18





Files:  ['part_000.gz']


138it [00:00, 10278.70it/s]

folder:  updated_date=2023-06-19





Files:  ['part_000.gz']


2367it [00:00, 13386.55it/s]

folder:  updated_date=2023-06-20





Files:  ['part_000.gz']


105it [00:00, 1821.49it/s]

folder:  updated_date=2023-06-21





Files:  ['part_000.gz']


102it [00:00, 5169.96it/s]

folder:  updated_date=2023-06-22





Files:  ['part_000.gz']


120it [00:00, 2848.70it/s]

folder:  updated_date=2023-06-23





Files:  ['part_000.gz']


109it [00:00, 2088.31it/s]

folder:  updated_date=2023-06-24





Files:  ['part_000.gz']


151it [00:00, 5724.02it/s]

folder:  updated_date=2023-06-25





Files:  ['part_000.gz']


454it [00:00, 5739.56it/s]

folder:  updated_date=2023-06-26





Files:  ['part_000.gz']


2066it [00:00, 12008.02it/s]

folder:  updated_date=2023-06-27





Files:  ['part_000.gz']


166it [00:00, 3984.61it/s]

folder:  updated_date=2023-06-28





Files:  ['part_000.gz']


222it [00:00, 3997.40it/s]

folder:  updated_date=2023-06-29





Files:  ['part_000.gz']


262it [00:00, 5828.33it/s]

folder:  updated_date=2023-06-30





Files:  ['part_000.gz']


287it [00:00, 3186.76it/s]

folder:  updated_date=2023-07-01





Files:  ['part_000.gz']


434it [00:00, 6199.22it/s]

folder:  updated_date=2023-07-02





Files:  ['part_000.gz']


914it [00:00, 11038.47it/s]

folder:  updated_date=2023-07-03





Files:  ['part_000.gz']


3316it [00:00, 15347.44it/s]

folder:  updated_date=2023-07-04





Files:  ['part_000.gz']


122it [00:00, 2434.50it/s]

folder:  updated_date=2023-07-05





Files:  ['part_000.gz']


165it [00:00, 2911.73it/s]

folder:  updated_date=2023-07-06





Files:  ['part_000.gz']


111it [00:00, 2635.94it/s]

folder:  updated_date=2023-07-07





Files:  ['part_000.gz']


150it [00:00, 2710.49it/s]

folder:  updated_date=2023-07-08





Files:  ['part_000.gz']


898it [00:00, 11326.24it/s]

folder:  updated_date=2023-07-09





Files:  ['part_000.gz']


152it [00:00, 1677.62it/s]

folder:  updated_date=2023-07-10





Files:  ['part_000.gz']


3022it [00:00, 13939.90it/s]

folder:  updated_date=2023-07-11





Files:  ['part_000.gz']


212it [00:00, 4693.72it/s]

folder:  updated_date=2023-07-12





Files:  ['part_000.gz']


181it [00:00, 4885.23it/s]

folder:  updated_date=2023-07-13





Files:  ['part_000.gz']


546it [00:00, 4619.98it/s]

folder:  updated_date=2023-07-14





Files:  ['part_000.gz']


591it [00:00, 10748.05it/s]

folder:  updated_date=2023-07-15





Files:  ['part_000.gz']


166it [00:00, 7597.05it/s]

folder:  updated_date=2023-07-16





Files:  ['part_000.gz']


233it [00:00, 8090.61it/s]

folder:  updated_date=2023-07-17





Files:  ['part_000.gz']


3981it [00:00, 13770.47it/s]

folder:  updated_date=2023-07-18





Files:  ['part_000.gz']


252it [00:00, 4767.82it/s]

folder:  updated_date=2023-07-19





Files:  ['part_000.gz']


225it [00:00, 3254.35it/s]

folder:  updated_date=2023-07-20





Files:  ['part_000.gz']


180it [00:00, 2995.98it/s]

folder:  updated_date=2023-07-21





Files:  ['part_000.gz']


23it [00:00, 1447.31it/s]

folder:  updated_date=2023-07-22





Files:  ['part_000.gz']


334it [00:00, 5236.32it/s]


folder:  updated_date=2023-07-23
Files:  ['part_000.gz']


370it [00:00, 2056.16it/s]

folder:  updated_date=2023-07-24





Files:  ['part_000.gz']


4368it [00:00, 15414.80it/s]

folder:  updated_date=2023-07-25





Files:  ['part_000.gz']


912it [00:00, 11120.88it/s]

folder:  updated_date=2023-07-26





Files:  ['part_000.gz']


515it [00:00, 8016.84it/s]

folder:  updated_date=2023-07-27





Files:  ['part_000.gz']


2085it [00:00, 15398.03it/s]

folder:  updated_date=2023-07-28





Files:  ['part_000.gz']


93it [00:00, 4388.48it/s]

folder:  updated_date=2023-07-29





Files:  ['part_000.gz']


1it [00:00, 19.08it/s]

folder:  updated_date=2023-07-30





Files:  ['part_000.gz']


37it [00:00, 4987.44it/s]

folder:  updated_date=2023-07-31





Files:  ['part_000.gz']


2336it [00:00, 10303.00it/s]

folder:  updated_date=2023-08-01





Files:  ['part_000.gz']


18it [00:00, 711.53it/s]

folder:  updated_date=2023-08-02





Files:  ['part_000.gz']


433it [00:00, 8734.48it/s]

folder:  updated_date=2023-08-03





Files:  ['part_000.gz']


1840it [00:00, 15090.52it/s]

folder:  updated_date=2023-08-04





Files:  ['part_000.gz']


1310it [00:00, 13833.70it/s]

folder:  updated_date=2023-08-05





Files:  ['part_000.gz']


1218it [00:00, 10922.55it/s]

folder:  updated_date=2023-08-06





Files:  ['part_000.gz']


582it [00:00, 8641.47it/s]

folder:  updated_date=2023-08-07





Files:  ['part_000.gz']


9811it [00:00, 12984.26it/s]

folder:  updated_date=2023-08-08





Files:  ['part_000.gz']


4136it [00:00, 14670.15it/s]

folder:  updated_date=2023-08-09





Files:  ['part_000.gz']


2136it [00:00, 16091.35it/s]

folder:  updated_date=2023-08-10





Files:  ['part_000.gz']


1990it [00:00, 11515.75it/s]

folder:  updated_date=2023-08-11





Files:  ['part_000.gz']


883it [00:00, 13114.81it/s]

folder:  updated_date=2023-08-12





Files:  ['part_000.gz']


1283it [00:00, 10708.53it/s]

folder:  updated_date=2023-08-13





Files:  ['part_000.gz']


2751it [00:00, 11055.77it/s]

folder:  updated_date=2023-08-14





Files:  ['part_000.gz']


30513it [00:02, 12548.49it/s]

folder:  updated_date=2023-08-15





Files:  ['part_000.gz']


9201it [00:00, 12777.43it/s]

folder:  updated_date=2023-08-16





Files:  ['part_000.gz']


12980it [00:01, 11399.11it/s]

folder:  updated_date=2023-08-17





Files:  ['part_000.gz']


18844it [00:01, 11487.33it/s]


Time: 26.526280403137207


In [22]:
table = pd.read_csv(path + "OA_sources.tsv", delimiter = "\t")
