In [5]:
import os
import json
from tqdm import tqdm
import requests
import urllib.parse
import concurrent.futures

from containers import Paper, Author

In [6]:
class CrossRefFetcher :
    '''
    Looks up a paper on the Crossref REST API by title and stores the best match on the paper.

    args :
        timeout : float
            seconds to wait for Crossref before a request is treated as failed
    '''
    def __init__(self, timeout = 30) :
        # Without a timeout a stalled connection would block a worker thread forever.
        self.timeout = timeout

    @staticmethod
    def _markFailed(paper) :
        '''
        Flag every Crossref-derived field of `paper` as unavailable.
        All failure paths go through here so they leave the paper in the same state.
        '''
        paper.DOI = False
        paper.crossref_json = False
        paper.reference_list = False

    def fetchMetaDatafromTitle(self, paper) :
        '''
        Query Crossref with the paper title and write the first hit onto `paper`.

        args :
            paper : Paper
                expect paper.title

        On success sets
            paper.DOI            : DOI of the first hit (False if the hit carries no DOI)
            paper.reference_list : DOIs of the references that have one (may be empty)
            paper.crossref_json  : the raw Crossref record of the first hit
        On any failure (bad title, network/HTTP error, malformed answer, no hit)
        sets all three fields to False.

        returns :
            None, the result is stored on `paper`
        '''
        try :
            # Quoting happens inside the try so that a bad title marks this paper
            # as failed instead of raising out of the worker thread.
            title = urllib.parse.quote(paper.title)
            url = f'https://api.crossref.org/works?query.bibliographic={title}&rows=1'
            r = requests.get(url, timeout=self.timeout)
            r.raise_for_status()
            items = r.json()['message']['items']
        except Exception :
            # Best effort: a single failing lookup must not abort the whole batch.
            self._markFailed(paper)
            return None

        # An empty result list means Crossref found nothing for this title.
        if len(items) == 0 or len(items[0]) == 0 :
            self._markFailed(paper)
            return None

        metadata = items[0]
        paper.DOI = metadata.get('DOI', False)
        # Not every record has a 'reference' list, and not every reference has a DOI.
        paper.reference_list = [
            reference['DOI']
            for reference in metadata.get('reference', [])
            if 'DOI' in reference
        ]
        paper.crossref_json = metadata
        return None

In [8]:
# Containers filled from the JSON caches below. Each one starts empty so the rest
# of the notebook also works on a first run, when none of the cache files exist yet.
whole_author_list = []
whole_paper_dict = {}
processed_paper_dict = {}

# Authors : a JSON list of keyword dicts, each turned into an Author.
AUTHOR_FILE_PATH = "./author_list.json"
if os.path.exists(AUTHOR_FILE_PATH) :
    with open(AUTHOR_FILE_PATH, "r") as f :
        author_list_raw = json.load(f)
    for author in author_list_raw :
        whole_author_list.append(Author(**author))

# All known papers : a JSON object mapping key -> keyword dict, each turned into a Paper.
WHOLE_PAPER_FILE_PATH = "./whole_paper_dict.json"
if os.path.exists(WHOLE_PAPER_FILE_PATH) :
    with open(WHOLE_PAPER_FILE_PATH, "r") as f :
        whole_paper_dict = json.load(f)
    for k, v in whole_paper_dict.items() :
        whole_paper_dict[k] = Paper(**v)

# Papers saved by earlier runs, same layout as the whole-paper file.
PROCESSED_PAPER_FILE_PATH = "./processed_paper_dict.json"
if os.path.exists(PROCESSED_PAPER_FILE_PATH) :
    with open(PROCESSED_PAPER_FILE_PATH, "r") as f :
        processed_paper_dict = json.load(f)
    for k, v in processed_paper_dict.items() :
        processed_paper_dict[k] = Paper(**v)

def checkAlreadyProcessed(key) :
    '''
    Tell whether `key` already has a usable DOI in processed_paper_dict.

    args :
        key : key into processed_paper_dict

    returns :
        True when the key is present and its DOI is neither False nor None,
        False otherwise
    '''
    if key not in processed_paper_dict :
        return False
    doi = processed_paper_dict[key].DOI
    # Identity checks on purpose : only the markers False and None count as "no DOI".
    return not (doi is False or doi is None)

# Work on a shallow copy of whole_paper_dict. Dict keys are unique already,
# so the copy holds exactly the same entries in the same order.
unique_whole_paper_dict = {k : v for k, v in whole_paper_dict.items()}

# Keep only the keys that do not have a usable DOI from an earlier run.
paper_to_process_keys_list = [
    key
    for key in unique_whole_paper_dict.keys()
    if not checkAlreadyProcessed(key)
]

print(f"whole paper dict size : {len(whole_paper_dict)}, paper to process : {len(paper_to_process_keys_list)}")


whole paper dict size : 15005, paper to process : 10587


In [9]:
# Size of the thread pool used for the Crossref lookups.
max_threads = 30

# One fetcher instance shared by all worker threads.
crossref_fetcher = CrossRefFetcher()

def process_paper(key) :
    '''
    Fetch the Crossref metadata for the paper stored under `key`.
    The result is written onto the Paper object inside whole_paper_dict.
    '''
    crossref_fetcher.fetchMetaDatafromTitle(whole_paper_dict[key])

# Run the lookups in parallel; wrapping the lazy map in tqdm gives a progress bar,
# and list() drains it so that every task actually runs before the pool closes.
with concurrent.futures.ThreadPoolExecutor(max_threads) as executor :
    pending_results = executor.map(process_paper, paper_to_process_keys_list)
    progress_bar = tqdm(pending_results, total=len(paper_to_process_keys_list))
    list(progress_bar)


100%|██████████| 10587/10587 [22:41<00:00,  7.78it/s] 


In [22]:
# Reload what earlier runs saved, overlay the papers fetched in this run, then save.
# On a first run the processed file does not exist yet, so start from an empty dict
# instead of failing with FileNotFoundError.
processed_paper_dict = {}
if os.path.exists(PROCESSED_PAPER_FILE_PATH) :
    with open(PROCESSED_PAPER_FILE_PATH, "r") as f :
        processed_paper_dict = json.load(f)
    for k, v in processed_paper_dict.items() :
        processed_paper_dict[k] = Paper(**v)
for k in paper_to_process_keys_list :
    processed_paper_dict[k] = whole_paper_dict[k]

# Split into papers whose Crossref record carries an "issn-type" entry and the rest.
# Both dicts hold plain dicts (via toDict) so they can be dumped as JSON.
# Papers that land in failed_paper_dict are not written to the processed file.
unique_processed_paper_dict = {}
failed_paper_dict = {}
for k, v in processed_paper_dict.items() :
    if v.crossref_json and "issn-type" in v.crossref_json :
        unique_processed_paper_dict[k] = v.toDict()
    else :
        failed_paper_dict[k] = v.toDict()

with open(PROCESSED_PAPER_FILE_PATH, "w") as f :
    json.dump(unique_processed_paper_dict, f, indent=4, ensure_ascii=False)

with open("./failed_paper_dict.json", "w") as f :
    json.dump(failed_paper_dict, f, indent=4, ensure_ascii=False)

In [23]:
# Count the saved papers whose Crossref record has an "issn-type" entry.
count = sum(
    1
    for record in unique_processed_paper_dict.values()
    if record["crossref_json"] and "issn-type" in record["crossref_json"]
)
count

4633

In [20]:
# Number of entries currently held in unique_processed_paper_dict.
# NOTE(review): this cell's execution count (In [20]) is lower than the merge cell's
# (In [22]), so the displayed value may predate the last merge — re-run to confirm.
len(unique_processed_paper_dict)

10660