In [2]:
#pip install selenium
import os
import random

from selenium import webdriver
#import undetected_chromedriver.v2 as webdriver
#import undetected_chromedriver as webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
from tqdm import tqdm

In [3]:
from bs4 import BeautifulSoup

In [4]:
import json
from dataclasses import dataclass

@dataclass
class Institution :
    """An institution, identified by name, with its Google Scholar and homepage URLs."""
    name :str
    google_scholar_url : str
    homepage_url : str = None

    def toJSON(self) :
        """Return the instance as a key-sorted JSON string indented by 4 spaces."""
        def attribute_dict(obj) :
            # Fallback encoder: objects json cannot encode are replaced
            # by their attribute dictionary.
            return obj.__dict__

        return json.dumps(self, default=attribute_dict, sort_keys=True, indent=4)

    def toDict(self) :
        """Return a plain dict obtained by round-tripping through toJSON()."""
        serialized = self.toJSON()
        return json.loads(serialized)

    def fromDict(self, dic) :
        """Overwrite every field from `dic`; each field name must be a key of `dic`."""
        for field_name in ('name', 'google_scholar_url', 'homepage_url') :
            setattr(self, field_name, dic[field_name])

@dataclass
class Expertise :
    """A research-interest label together with a URL for it."""
    name : str
    url : str

    def toJSON(self) :
        """Return the instance as a key-sorted JSON string indented by 4 spaces."""
        def attribute_dict(obj) :
            # Fallback encoder: objects json cannot encode are replaced
            # by their attribute dictionary.
            return obj.__dict__

        return json.dumps(self, default=attribute_dict, sort_keys=True, indent=4)

    def toDict(self) :
        """Return a plain dict obtained by round-tripping through toJSON()."""
        serialized = self.toJSON()
        return json.loads(serialized)

    def fromDict(self, dic) :
        """Overwrite both fields from `dic`; 'name' and 'url' must be keys of `dic`."""
        for field_name in ('name', 'url') :
            setattr(self, field_name, dic[field_name])

@dataclass
class Author :
    """An author record; only the name and the profile URL are required."""
    name : str
    google_schorlar_profile_url : str
    affiliation : str = None
    expertise_list : list[str] = None
    homepage_url : str = None
    paper_list : list = None
    paper_title_list : list = None

    def toJSON(self) :
        """Return the instance as a key-sorted JSON string indented by 4 spaces."""
        def attribute_dict(obj) :
            # Fallback encoder: objects json cannot encode are replaced
            # by their attribute dictionary.
            return obj.__dict__

        return json.dumps(self, default=attribute_dict, sort_keys=True, indent=4)

    def toDict(self) :
        """Return a plain dict obtained by round-tripping through toJSON()."""
        serialized = self.toJSON()
        return json.loads(serialized)

    def fromDict(self, dic) :
        """Overwrite every field from `dic`; each field name must be a key of `dic`."""
        field_names = (
            'name',
            'google_schorlar_profile_url',
            'affiliation',
            'expertise_list',
            'homepage_url',
            'paper_list',
            'paper_title_list',
        )
        for field_name in field_names :
            setattr(self, field_name, dic[field_name])
        
@dataclass
class Paper :
    """
    A paper record; every field is optional.

    Intended flow, per the original notes: search the title on Google
    Scholar, take the basic metadata (abstract) from Google Scholar and the
    remaining metadata from Crossref.
    """
    DOI : str = None
    crossref_json : dict = None
    google_schorlar_metadata : dict = None
    title : str = None
    authors : list = None
    abstract : str = None
    conference : str = None
    journal : str = None
    year : int = None
    reference_list : list[str] = None
    referenced_list : list[str] = None
    cite_bibtex : str = None

    def toJSON(self) :
        '''convert to JSON recursively (key-sorted, indented by 4 spaces)'''
        def attribute_dict(obj) :
            # Fallback encoder: objects json cannot encode are replaced
            # by their attribute dictionary.
            return obj.__dict__

        return json.dumps(self, default=attribute_dict, sort_keys=True, indent=4)

    def toDict(self) :
        '''convert to dict recursively, by round-tripping through toJSON()'''
        serialized = self.toJSON()
        return json.loads(serialized)

    def fromDict(self, dic) :
        '''overwrite every field from `dic`; each field name must be a key of `dic`'''
        field_names = (
            'DOI',
            'crossref_json',
            'google_schorlar_metadata',
            'title',
            'authors',
            'abstract',
            'conference',
            'journal',
            'year',
            'reference_list',
            'referenced_list',
            'cite_bibtex',
        )
        for field_name in field_names :
            setattr(self, field_name, dic[field_name])

In [5]:
import requests
import urllib.parse

class CrossRefFetcher :
    """Fills Paper-like objects with metadata looked up on the Crossref REST API."""

    def __init__(self) :
        pass

    def fetchMetaDatafromTitle(self, paper) :
        '''
        Query Crossref for the best bibliographic match of paper.title and
        copy the result onto `paper` in place.

        args :
            paper : Paper
                expect paper.title
        return :
            None. On a match, paper.DOI, paper.crossref_json and
            paper.reference_list are set; when Crossref returns no item,
            `paper` is left untouched.
        raises :
            whatever r.raise_for_status() raises for a 4xx/5xx response
        '''
        title = urllib.parse.quote(paper.title)
        url = f'https://api.crossref.org/works?query.bibliographic={title}&rows=1'
        r = requests.get(url)
        # Surface HTTP failures here instead of as an unrelated lookup error
        # further down.
        r.raise_for_status()

        items = r.json().get('message', {}).get('items') or []
        if not items :
            # No match: the emptiness check has to come before anything is
            # read from the (missing) first item.
            return None
        metadata = items[0]

        paper.DOI = metadata.get('DOI')
        paper.crossref_json = metadata

        # 'reference' is absent for many records; keep only the entries
        # that carry a DOI.
        paper.reference_list = [
            reference['DOI']
            for reference in (metadata.get('reference') or [])
            if isinstance(reference, dict) and 'DOI' in reference
        ]
        return None

In [6]:
# Example: build a Paper that only has a title and let Crossref fill in the rest.
paper = Paper(title = "NeRF: Representing Scenes as Neural Radiance Fields for View Synthesis")

crossref_fetcher = CrossRefFetcher()
# The fetcher updates `paper` in place and returns None, so this cell displays nothing.
crossref_fetcher.fetchMetaDatafromTitle(paper)

In [20]:

class GoogleScharlarSearcher :
    """
    Selenium-driven scraper for Google Scholar author profiles.

    Institutions and expertise labels met while scraping are collected in
    self.institution_dict and self.expertise_dict, both keyed by name.
    """
    BASE_URL = "https://scholar.google.com"
    SEARCH_BOX_XPATH = '//*[@id="gs_hdr_tsi"]'
    # Upper bound, in seconds, that find_element() keeps looking for an element.
    IMPLICIT_WAIT_SECONDS = 10

    def __init__(
            self,
            institution_dict = None,
            expertise_dict = None,
        ) :
        """
        Launch a Chrome session and attach the two registries.

        args :
            institution_dict : dict or None
                name -> Institution; a fresh dict is created when None
            expertise_dict : dict or None
                name -> Expertise; a fresh dict is created when None
        """
        chrome_options = webdriver.ChromeOptions()
        #chrome_options.add_argument("--headless")
        # NOTE(review): looks like a leftover from the commented-out
        # undetected_chromedriver import - confirm it is still wanted.
        chrome_options.add_argument("--use_subprocess")

        self.driver = webdriver.Chrome(options=chrome_options)
        # The implicit wait is a setting of the session, so it is configured
        # once here rather than after every navigation.
        self.driver.implicitly_wait(self.IMPLICIT_WAIT_SECONDS)

        # `is None` rather than `or {}`: an empty dict handed in by the caller
        # must stay the very same object so the caller sees what gets added.
        self.institution_dict = {} if institution_dict is None else institution_dict
        self.expertise_dict = {} if expertise_dict is None else expertise_dict

        self.crossref_fetcher = CrossRefFetcher()

    def searchPaperByName(self, name) :
        """
        Open Google Scholar and submit `name` as the search query.

        Nothing is parsed from the result page yet; returns None.
        """
        self.driver.get(self.BASE_URL)
        # search given paper name
        search = self.driver.find_element(by=By.XPATH, value=self.SEARCH_BOX_XPATH)
        search.send_keys(name)
        search.send_keys(Keys.RETURN)
        # TODO: click the first paper

    def searchAuthorByName(self, name, continue_search = False, search_width = 1000) :
        """
        Search Google Scholar for `name` and scrape every author profile
        listed on the result page.

        args :
            name : str
            continue_search, search_width :
                accepted for interface compatibility; the co-author
                expansion they describe is not implemented, so both are
                currently ignored
        return :
            (author_list, whole_paper_dict) :
                list of filled Author instances and a dict title -> Paper
                merged over all of them
        """
        self.driver.get(self.BASE_URL)
        # search by author name
        searcher = self.driver.find_element(by=By.XPATH, value=self.SEARCH_BOX_XPATH)

        time.sleep(0.1)

        # Type one character at a time with a small random pause
        # (presumably to look less like an automated client).
        for char in name :
            searcher.send_keys(char)
            time.sleep(random.randint(1, 10)/200)

        time.sleep(0.2)
        searcher.send_keys(Keys.RETURN)
        time.sleep(0.2)

        soup = BeautifulSoup(self.driver.page_source, 'html.parser')

        author_list = []
        for heading in soup.find_all("h4", class_="gs_rt2") :
            link = heading.find("a")
            if link is None or link.get("href") is None :
                # Without a profile link there is nothing to scrape.
                continue
            author_list.append(Author(
                name = heading.text,
                google_schorlar_profile_url = self.BASE_URL + link["href"],
            ))

        print(f"authors found : {[author.name for author in author_list]}")
        whole_paper_dict = {}
        for author in author_list :
            # fillAuthor mutates `author` in place; only its papers are new here.
            _, paper_dict = self.fillAuthor(author)
            whole_paper_dict.update(paper_dict)

        return author_list, whole_paper_dict

    def addInstitution(
        self,
        html_str
    ) :
        '''
        initialize Institution instance and append to
        self.institution_dict if not exist
        args :
            html_str :
                parsed element whose text is the institution name; an "a"
                child, when present, supplies the institution URL.
                May be None when the page has no such element.
        return :
            institution name, or None when html_str is None
        '''
        if html_str is None :
            return None

        institution_name = html_str.text

        if institution_name not in self.institution_dict :
            anchor = html_str.find("a")
            href = anchor.get("href") if anchor is not None else None
            # Plain-text affiliations carry no link; store None for them.
            google_schorlar_institution_url = (
                self.BASE_URL + href if href is not None else None
            )
            self.institution_dict[institution_name] = Institution(
                name = institution_name,
                google_scholar_url = google_schorlar_institution_url,
                homepage_url = None,
            )
        return institution_name

    def addExpertise(
        self,
        html_str_list
    ) :
        '''
        initialize Expertise instances and append them to
        self.expertise_dict if not exist
        args :
            html_str_list :
                list of parsed link elements; each one's text is the
                expertise name and its "href" the expertise URL
        return :
            list of expertise names, in input order
        '''
        expertise_name_list = []
        for html_str in html_str_list :
            expertise_name = html_str.text
            if expertise_name not in self.expertise_dict :
                self.expertise_dict[expertise_name] = Expertise(
                    name = expertise_name,
                    url = self.BASE_URL + html_str["href"],
                )
            expertise_name_list.append(expertise_name)
        return expertise_name_list

    def fillAuthor(self, author) :
        """
        fill in author instance from its profile page
        args :
            author :
                expected to have name, google_schorlar_profile_url field
        return :
            (author, paper_dict) : the same instance, now with
            expertise_list, affiliation and paper_title_list set, and the
            dict title -> Paper of its papers
        """
        # load page html
        self.driver.get(author.google_schorlar_profile_url)
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')

        # fill in expertise
        expertise_html_list = soup.find_all("a", class_="gsc_prf_inta")
        author.expertise_list = self.addExpertise(expertise_html_list)

        # fill in institution; find() gives None when the div is missing,
        # which addInstitution maps to a None affiliation
        institution_html = soup.find("div", class_="gsc_prf_il")
        author.affiliation = self.addInstitution(institution_html)

        paper_dict = self.makePaperDictFromAuthor(author)

        return author, paper_dict

    def makePaperDictFromAuthor(self, author, search_width_limit = 20) :
        """
        make paper instances from the papers listed on an author's profile
        args :
            author : Author
                expected to have name, google_schorlar_profile_url field;
                its paper_title_list is set as a side effect
            search_width_limit : int
                at most this many rows of the profile's paper table are
                visited
        return :
            paper_dict : dict[str, Paper], keyed by paper title
        """
        # load page html
        self.driver.get(author.google_schorlar_profile_url)

        # NOTE: the "show more" button (id gsc_bpf_more) is not clicked, so
        # only the rows present right after loading are seen.
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        paper_html_list = soup.find_all("tr", class_="gsc_a_tr")[:search_width_limit]

        paper_list = []

        print(f"filling google schorlar metadata of papers from {author.name}...")
        with tqdm(total=len(paper_html_list)) as pbar:
            for paper_html in paper_html_list :
                title_link = paper_html.find("a", class_="gsc_a_at")
                if title_link is None or title_link.get("href") is None :
                    # Row without a detail link: nothing to open.
                    pbar.update(1)
                    continue
                title = title_link.text

                self.driver.get(self.BASE_URL + title_link["href"])
                detail_soup = BeautifulSoup(self.driver.page_source, 'html.parser')

                google_schorlar_metadata = {}
                for metadata in detail_soup.find_all("div", class_="gs_scl") :
                    field = metadata.find("div", class_="gsc_oci_field")
                    value = metadata.find("div", class_="gsc_oci_value")
                    if field is None or value is None :
                        # Not a field/value row.
                        continue
                    google_schorlar_metadata[field.text] = value.text

                paper_list.append(
                    Paper(title = title, google_schorlar_metadata = google_schorlar_metadata)
                )

                pbar.set_postfix_str(title)
                pbar.update(1)

        author.paper_title_list = [paper.title for paper in paper_list]

        # Crossref enrichment (which would key the result by DOI) is not run
        # here; papers are keyed by title.
        return {paper.title : paper for paper in paper_list}

In [27]:
# Restore previously scraped state from the JSON caches, when they exist.
# Every container starts empty if its cache file is missing.

INSTITUTION_FILE_PATH = "./institution_dict.json"
EXPERTISE_FILE_PATH = "./expertise_dict.json"
AUTHOR_FILE_PATH = "./author_list.json"
WHOLE_PAPER_FILE_PATH = "./whole_paper_dict.json"

def _load_json_or(path, fallback) :
    """Return the parsed JSON stored at `path`, or `fallback` if the file does not exist."""
    if not os.path.exists(path) :
        return fallback
    with open(path, "r") as f :
        return json.load(f)

# Each cached record is a dict of constructor keyword arguments.
institution_dict = {
    k : Institution(**v)
    for k, v in _load_json_or(INSTITUTION_FILE_PATH, {}).items()
}
expertise_dict = {
    k : Expertise(**v)
    for k, v in _load_json_or(EXPERTISE_FILE_PATH, {}).items()
}
whole_author_list = [
    Author(**author_raw)
    for author_raw in _load_json_or(AUTHOR_FILE_PATH, [])
]
# Built into a new dict instead of overwriting the raw JSON dict while
# iterating over it.
whole_paper_dict = {
    k : Paper(**v)
    for k, v in _load_json_or(WHOLE_PAPER_FILE_PATH, {}).items()
}

In [28]:
# Names to search for, minus the ones already present in whole_author_list.
with open("./author_name_list.json", "r") as f :
    author_name_list = json.load(f)

pre_existing_author_name_list = [author.name for author in whole_author_list]
# Set lookup keeps the filter linear; the list above is kept for inspection.
# Assumes every entry of author_name_list is a string - TODO confirm.
_pre_existing_author_name_set = set(pre_existing_author_name_list)
author_name_to_append_list = [
    name for name in author_name_list
    if name not in _pre_existing_author_name_set
]


In [29]:
# Only the last expression of a cell is echoed, so just the second length
# (names still to be searched) is displayed; the first value is discarded.
len(author_name_list)
len(author_name_to_append_list)

997

In [24]:

# Scrape every pending author name. Results are merged into the notebook-wide
# containers after each name, so whatever was collected before an interruption
# is kept.
gsearch = GoogleScharlarSearcher(institution_dict, expertise_dict)
for author_name in author_name_to_append_list :
    found_authors, found_papers = gsearch.searchAuthorByName(author_name)
    whole_paper_dict.update(found_papers)
    whole_author_list.extend(found_authors)

authors found : ['Jitendra MALIK']
filling google schorlar metadata of papers from Jitendra MALIK...


100%|██████████| 20/20 [00:09<00:00,  2.07it/s, Large displacement optical flow: descriptor matching in variational motion estimation]                                                    


authors found : []
authors found : []
authors found : []


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=118.0.5993.117)
Stacktrace:
#0 0x5612c7b9cfb3 <unknown>
#1 0x5612c78704a7 <unknown>
#2 0x5612c7849204 <unknown>
#3 0x5612c78decaf <unknown>
#4 0x5612c78f2756 <unknown>
#5 0x5612c78d9713 <unknown>
#6 0x5612c78ac18b <unknown>
#7 0x5612c78acf7e <unknown>
#8 0x5612c7b628d8 <unknown>
#9 0x5612c7b66800 <unknown>
#10 0x5612c7b70cfc <unknown>
#11 0x5612c7b67418 <unknown>
#12 0x5612c7b3442f <unknown>
#13 0x5612c7b8b4e8 <unknown>
#14 0x5612c7b8b6b4 <unknown>
#15 0x5612c7b9c143 <unknown>
#16 0x7fe5abe94ac3 <unknown>


In [25]:
# Write the collected papers to disk as plain dicts.
whole_paper_dict_dict = {
    paper_key : paper_obj.toDict()
    for paper_key, paper_obj in whole_paper_dict.items()
}
with open('whole_paper_dict.json', 'w') as paper_file :
    json.dump(whole_paper_dict_dict, paper_file, indent=4, ensure_ascii=False)

# Same for the collected authors.
whole_author_dict_list = [author_obj.toDict() for author_obj in whole_author_list]
with open("author_list.json", 'w') as author_file :
    json.dump(whole_author_dict_list, author_file, indent=4, ensure_ascii=False)

In [26]:
# Write the searcher's registries to disk as plain dicts.
# The JSON-ready copies get their own names: rebinding `institution_dict` /
# `expertise_dict` to them would leave plain dicts under those names, and a
# re-run of the scraping and dumping cells would then fail on `.toDict()`.
institution_json_dict = {
    key : institution.toDict()
    for key, institution in gsearch.institution_dict.items()
}
with open("institution_dict.json", 'w') as f :
    json.dump(institution_json_dict, f, indent=4, ensure_ascii=False)

expertise_json_dict = {
    key : expertise.toDict()
    for key, expertise in gsearch.expertise_dict.items()
}
with open("expertise_dict.json", 'w') as f :
    json.dump(expertise_json_dict, f, indent=4, ensure_ascii=False)


In [83]:
import urllib
import requests

def getMetaData(title) :
    """
    Query the Crossref works endpoint by title and print the first hit.

    args :
        title : str
    return :
        the parsed JSON response (dict) on HTTP 200, otherwise None
    """
    # URL encode the title to ensure it's in the correct format for a URL
    encoded_title = urllib.parse.quote_plus(title)

    # The Crossref API endpoint for works
    api_url = f"https://api.crossref.org/works?query.title={encoded_title}"

    # Make the GET request to the Crossref API
    response = requests.get(api_url)

    if response.status_code != 200:
        # There is no payload to hand back in this case.
        print(f"Error: {response.status_code}")
        return None

    data = response.json()

    # An absent 'items' key and an empty list both mean "no results".
    items = data.get('message', {}).get('items') or []
    if items:
        first_item = items[0]
        # 'title' is a list and may be missing from a record.
        titles = first_item.get('title') or [None]
        print("Title:", titles[0])
        print("DOI:", first_item.get('DOI'))
    else:
        print("No results found for this title.")

    return data

In [66]:
import urllib
import requests

# Your paper title
# Your paper title
title = "Sound-Guided Semantic Image Manipulation"

# URL encode the title to ensure it's in the correct format for a URL
encoded_title = urllib.parse.quote_plus(title)

# The Crossref API endpoint for works
api_url = f"https://api.crossref.org/works?query.title={encoded_title}"

# Make the GET request to the Crossref API
response = requests.get(api_url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the response to JSON
    data = response.json()

    # An absent 'items' key and an empty list both mean "no results".
    items = data.get('message', {}).get('items') or []
    if items:
        # Report the first hit only.
        item = items[0]
        # 'title' is a list and may be missing from a record.
        titles = item.get('title') or [None]
        print("Title:", titles[0])
        print("DOI:", item.get('DOI'))
    else:
        print("No results found for this title.")
else:
    print(f"Error: {response.status_code}")

Title: Sound-Guided Semantic Image Manipulation
DOI: 10.1109/cvpr52688.2022.00337


In [65]:
import requests
import urllib.parse

def get_doi_from_title(title):
    """Return the DOI of the first Crossref hit for `title`, or None.

    None is returned when the response status is not 200 or when the result
    list is empty. The first result is assumed to be the right paper.
    """
    query = urllib.parse.quote_plus(title)
    response = requests.get(f"https://api.crossref.org/works?query.title={query}")
    if response.status_code != 200:
        return None

    hits = response.json()['message'].get('items', [])
    if not hits:
        return None
    return hits[0].get('DOI')

def get_references_from_doi(doi):
    """Return the 'reference' list Crossref holds for `doi`.

    An empty list is returned when the response status is not 200 or when the
    record has no 'reference' entry.
    """
    # A DOI may contain characters such as '#' or '?' that would otherwise be
    # read as URL syntax; percent-encode it, keeping '/' as the separator.
    encoded_doi = urllib.parse.quote(doi)
    api_url = f"https://api.crossref.org/works/{encoded_doi}"
    response = requests.get(api_url)
    if response.status_code == 200:
        data = response.json()
        references = data['message'].get('reference', [])
        return references
    return []

# Example usage: resolve a title to its DOI, then list the DOI of every
# reference Crossref knows for it. The last assignment is the title in effect.
paper_title = "Example Title of Your Academic Paper"
paper_title = "3D Gaussian Splatting for Real-Time Radiance Field Rendering"
paper_title = "Sound-Guided Semantic Image Manipulation"

paper_doi = get_doi_from_title(paper_title)

if not paper_doi:
    print(f"No DOI found for the paper titled '{paper_title}'")
else:
    print(f"DOI for '{paper_title}': {paper_doi}")
    references = get_references_from_doi(paper_doi)
    if references:
        print("References:")
        for ref in references:
            # Not every reference entry carries a DOI.
            print(ref.get('DOI', 'No DOI provided'))

DOI for 'Sound-Guided Semantic Image Manipulation': 10.1109/cvpr52688.2022.00337
References:
10.1145/2647868.2655045
10.1109/CVPR46437.2021.00232
10.1109/CVPR.2019.00244
No DOI provided
No DOI provided
10.1109/CVPR.2019.00772
No DOI provided
No DOI provided
No DOI provided
10.1109/ICCV48922.2021.00209
No DOI provided
10.1109/WACV48630.2021.00313
No DOI provided
No DOI provided
10.1109/ICCV.2019.00453
No DOI provided
No DOI provided
10.1109/ICCV48922.2021.00212
No DOI provided
10.1109/CVPR42600.2020.00813
No DOI provided
10.1145/3394171.3413624
No DOI provided
No DOI provided
10.1007/s11633-021-1293-0
No DOI provided
10.1109/ICCV.2019.01040
No DOI provided
10.1007/978-90-481-8847-5_10
10.1167/16.12.326
No DOI provided
10.1109/ICASSP.2017.7952261
No DOI provided
10.1109/IJCNN52387.2021.9533654
10.1609/aaai.v32i1.12329
No DOI provided
No DOI provided
No DOI provided
10.1109/ICASSP40776.2020.9053174
10.1109/CVPR.2015.7298698
No DOI provided
10.1145/3126686.3126723
10.1109/CVPR46437.2021.00