In [11]:
#pip install selenium
import os
import random
import json
from bs4 import BeautifulSoup
import platform
# Pick the WebDriver implementation for the current operating system.
os_name = platform.system()

if os_name == "Darwin":
    # macOS uses undetected_chromedriver as a drop-in `webdriver` module.
    import undetected_chromedriver as webdriver
else:
    # Linux, and every other platform: previously `webdriver` was left
    # undefined outside Darwin/Linux, which only surfaced later as a
    # NameError when the searcher was constructed.
    from selenium import webdriver

#from selenium import webdriver
#import undetected_chromedriver.v2 as webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
from tqdm import tqdm

from containers import Institution, Author, Paper, Expertise

In [12]:

class GoogleScharlarSearcher :
    BASE_URL = "https://scholar.google.com"

    def __init__(
            self,
            institution_dict = None,
            expertise_dict = None,
            os_name = None
        ) :

        if False : # os_name == "Darwin":
            self.driver = webdriver.Safari()
        else :
            chrome_options = webdriver.ChromeOptions()
            #chrome_options.add_argument("--headless")
            chrome_options.add_argument("--use_subprocess")

            self.driver = webdriver.Chrome(options=chrome_options)
        self.institution_dict = institution_dict
        self.expertise_dict = expertise_dict

    def searchPaperByName(self, name) :
        self.driver.get(self.BASE_URL)
        self.driver.implicitly_wait(10)
        # search given paper name
        search = self.driver.find_element(by=By.XPATH, value='//*[@id="gs_hdr_tsi"]')
        search.send_keys(name)
        search.send_keys(Keys.RETURN)
        self.driver.implicitly_wait(10)
        # click the first paper

    def searchAuthorByName(
        self,
        name,
        continue_search = False,
        search_width = 1000,
        ask_for_continue = False,
    ) :
        """
        If continue_search is True, search every co-author until search_width
        """
        self.driver.get(self.BASE_URL)
        self.driver.implicitly_wait(10)

        self.checkCaptcha()

        # search by author name
        searcher = self.driver.find_element(by=By.XPATH, value='//*[@id="gs_hdr_tsi"]')
        time.sleep(0.1)

        #searcher.send_keys(name)
        for chr in name :
            searcher.send_keys(chr)
            time.sleep(random.randint(1, 10)/200)
        searcher.send_keys(Keys.RETURN)
        self.driver.implicitly_wait(10)
        time.sleep(0.2)

        self.checkCaptcha()

        if ask_for_continue :
            key_input = input("Press [n] to stop...")
            if key_input in ["n", "N", "no", "No", "NO", "nO"] :
                return [], {}

        page_source = self.driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        author_list = soup.find_all("h4", class_="gs_rt2")

        author_list = list(map(
            lambda author : Author(
                name = author.text,
                google_schorlar_profile_url = self.BASE_URL +author.find("a")["href"],
            ),
            author_list
        ))

        print(f"authors found : {list(map(lambda author : author.name, author_list))}")
        
        auther_href_button_list = self.driver.find_elements(by=By.XPATH, value='//*[@class="gs_rt2"]/a')        
        whole_paper_dict = {}
        for author, auther_href_button in zip(author_list, auther_href_button_list) :
            print(f"filling google schorlar metadata of papers from {author.name}...")
            auther_href_button.click()
            time.sleep(0.2)

            self.checkCaptcha()

            author, paper_dict = self.fillAuthor(author, url_already_loaded = True)
            whole_paper_dict.update(paper_dict)
            self.driver.back()
            time.sleep(0.2)
            self.checkCaptcha()

            break

        #for author in author_list :
            #author, paper_dict = self.fillAuthor(author)
            #whole_paper_dict.update(paper_dict)


        if continue_search and len(author_list) < search_width :
            pass


        return author_list, whole_paper_dict

    def addInstitution(
        self,
        html_str
    ) :
        '''
        initialize Institution instance and append to
        self.instaitution_dict if not exist
        args :
            institution_html :
                expected to have name,
                google_schorlar_institution_url field
        return :
            institution name
        '''
        #institution_name = html_str.find("a").text
        institution_name = html_str.text

        if institution_name not in self.institution_dict :
            try :
                google_schorlar_institution_url = self.BASE_URL + html_str.find("a")["href"]
            except Exception as e :
                google_schorlar_institution_url = None
            homepage_url = None
            self.institution_dict[institution_name] = Institution(
                name = institution_name,
                google_scholar_url = google_schorlar_institution_url,
                homepage_url = homepage_url,
            )
        return institution_name

    def addExpertise(
        self,
        html_str_list
    ) :
        '''
        initialize Expertise instance and append to
        self.expertise_dict if not exist
        args :
            html_str_list :
                list of html_str. each elements are html str
                expected to have name,
                google_schorlar_expertise_url field
        return :
            expertise name
        '''
        expertise_name_list = []
        for html_str in html_str_list :
            expertise_name = html_str.text
            if expertise_name not in self.expertise_dict :
                google_schorlar_expertise_url = self.BASE_URL + html_str["href"]
                self.expertise_dict[expertise_name] = Expertise(
                    name = expertise_name,
                    url = google_schorlar_expertise_url,
                )
            expertise_name_list.append(expertise_name)
        return expertise_name_list

    def fillAuthor(self, author, url_already_loaded = False) :
        """
        fill in author instance
        args :
            author :
                expected to have name, google_schorlar_profile_url field
        """
        # load page html        
        if not url_already_loaded :
            self.driver.get(author.google_schorlar_profile_url)
            self.driver.implicitly_wait(10)
        page_source = self.driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        # fill in expertise
        expertise_html_list = soup.find_all("a", class_="gsc_prf_inta")
        expertise_name_list = self.addExpertise(expertise_html_list)
        author.expertise_list = expertise_name_list

        # fill in institution
        institution_html = soup.find("div", class_="gsc_prf_il")
        try :
            institution_name = self.addInstitution(institution_html)
            author.affiliation = institution_name
        except Exception as e :
            print(e)
            print(soup)
            raise e

        paper_dict = self.makePaperDictFromAuthor(author, url_already_loaded = True)
        #DOI_list = list(paper_dict.keys())
        #author.paper_list = DOI_list

        return author, paper_dict
    

    def makePaperDictFromAuthor(self, author, url_already_loaded = False, search_width_limit = 20) :
        """
        make paper instance from author instance
        args :
            author : Author
                expected to have name, google_schorlar_profile_url field
        return :
            paper_list : list[Paper]
        """

        # load page html        
        if not url_already_loaded :
            self.driver.get(author.google_schorlar_profile_url)
            self.driver.implicitly_wait(10)
        page_source = self.driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        # search papers
        # click "show more" button until it is disabled
        '''
        while True :
            load_more_button = self.driver.find_element(by=By.XPATH, value='//*[@id="gsc_bpf_more"]')
            self.driver.implicitly_wait(10)
            load_more_button.click()
            time.sleep(2)
            if load_more_button.get_property("disabled") :
                break
        '''
        # get papaer html list
        page_source = self.driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        paper_html_list = soup.find_all("tr", class_="gsc_a_tr")
        paper_html_list = paper_html_list[:search_width_limit]

        paper_href_list = self.driver.find_elements(by=By.XPATH, value='//*[@class="gsc_a_t"]/a')


        paper_list = []

        print(f"filling google schorlar metadata of papers from {author.name}...")
        with tqdm(total=len(paper_html_list)) as pbar:
            #for paper_html in paper_html_list :
            for paper_html, paper_href in zip(paper_html_list, paper_href_list) :
                google_schorlar_url = self.BASE_URL + paper_html.find("a", class_="gsc_a_at")["href"]
                title = paper_html.find("a", class_="gsc_a_at").text
                
                #self.driver.get(google_schorlar_url)
                #self.driver.implicitly_wait(10)
                paper_href.click()

                self.checkCaptcha()

                time.sleep(0.2)
                soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                metadata_list = soup.find_all("div", class_="gs_scl")
                html_title = soup.find("a", class_="gsc_oci_title_link")

                google_schorlar_metadata = {}
                for metadata in metadata_list :
                    field = metadata.find("div", class_="gsc_oci_field").text
                    value = metadata.find("div", class_="gsc_oci_value").text
                    google_schorlar_metadata[field] = value
                

                paper = Paper(title = title, google_schorlar_metadata = google_schorlar_metadata)
                paper_list.append(paper)

                pbar.set_postfix_str(title)
                pbar.update(1)

                self.driver.back()
                time.sleep(0.2)
                self.checkCaptcha()

        author.paper_title_list = list(map(lambda paper : paper.title, paper_list))


        paper_dict = {}
        for paper in paper_list :
            paper_dict[paper.title] = paper
        return paper_dict

        # query_crossref
        print(f"fetching crosserf metadata of papers from {author.name}...")
        for paper in tqdm(paper_list) :
            self.crossref_fetcher.fetchMetaDatafromTitle(paper)
            paper_dict[paper.DOI] = paper

        return paper_dict
    
    def checkCaptcha(self) :
        captcha_found = False
        source = self.driver.page_source
        if source.find("사용자가 로봇이 아니라는 확인이 필요합니다.") != -1 :
            print("로봇이 아니라는 확인이 필요합니다 text detected!")
            captcha_found = True
        if source.lower().find("recaptcha") != -1 :
            print("recaptcha detected!")
            captcha_found = True
        try:
            captcha_image = self.driver.find_element_by_xpath("//img[contains(@alt, 'captcha')]")
            if captcha_image:
                print("captcha image detected!")
                captcha_found = True
        except Exception as e:
            pass
        try:
            captcha_text = self.driver.find_element_by_xpath("//*[contains(text(), 'prove you are human')]")
            if captcha_text:
                print("CAPTCHA text detected!")
                captcha_found = True
        except Exception as e:
            pass
        try:
            robot_detection = self.driver.find_element_by_xpath("//*[contains(text(), 'Google의 시스템이 컴퓨터 네트워크에서 비정상적인 트래픽을 감지했습니다.')]")
            if robot_detection:
                print("로봇이 아니라는 확인이 필요합니다 detected!")
                captcha_found = True
        except Exception as e:
            pass
        if captcha_found :
            key_input = input("Press [n] to stop...")
            if key_input in ["n", "N", "no", "No", "NO", "nO"] :
                raise Exception("captcha detected!")
        # reload page
        self.driver.refresh()
        time.sleep(0.5)

In [15]:
# Restore the previously scraped state from disk (when the files exist)
# and work out which author names still have to be searched.

def _load_json_if_exists(path, default) :
    """Return the parsed JSON stored at `path`, or `default` when the file does not exist."""
    if not os.path.exists(path) :
        return default
    with open(path, "r") as f :
        return json.load(f)

INSTITUTION_FILE_PATH = "./institution_dict.json"
EXPERTISE_FILE_PATH = "./expertise_dict.json"
AUTHOR_FILE_PATH = "./author_list.json"
WHOLE_PAPER_FILE_PATH = "./whole_paper_dict.json"

# Each file stores plain dicts; rebuild the container objects from them.
institution_dict = {
    key : Institution(**fields)
    for key, fields in _load_json_if_exists(INSTITUTION_FILE_PATH, {}).items()
}
expertise_dict = {
    key : Expertise(**fields)
    for key, fields in _load_json_if_exists(EXPERTISE_FILE_PATH, {}).items()
}
whole_author_list = [
    Author(**fields)
    for fields in _load_json_if_exists(AUTHOR_FILE_PATH, [])
]
# Built as a new dict rather than overwriting the loaded dict while
# iterating over it.
whole_paper_dict = {
    key : Paper(**fields)
    for key, fields in _load_json_if_exists(WHOLE_PAPER_FILE_PATH, {}).items()
}

# The list of names to search is required: there is no fallback for it.
with open("./author_name_list.json", "r") as f :
    author_name_list = json.load(f)

# Skip the names that already have a scraped author. A set makes each
# membership test constant time instead of scanning the whole list.
pre_existing_author_name_list = [author.name for author in whole_author_list]
pre_existing_author_name_set = set(pre_existing_author_name_list)
author_name_to_append_list = [
    author_name for author_name in author_name_list
    if author_name not in pre_existing_author_name_set
]

print(len(author_name_list))
print(len(author_name_to_append_list))

# Names whose search returned no author at all.
empty_author_name_list = []

# One browser session is shared by every search. The searcher fills
# institution_dict and expertise_dict in place.
gsearch = GoogleScharlarSearcher(institution_dict, expertise_dict, os_name=os_name)
for author_name in author_name_to_append_list :
    # pass ask_for_continue=True to confirm by hand after every search
    author_list, paper_dict = gsearch.searchAuthorByName(author_name)
    if author_list :
        whole_paper_dict.update(paper_dict)
        whole_author_list.extend(author_list)
    else :
        empty_author_name_list.append(author_name)



970
568
authors found : ['Simon Baker']
filling google schorlar metadata of papers from Simon Baker...
recaptcha detected!
filling google schorlar metadata of papers from Simon Baker...


100%|██████████| 20/20 [00:35<00:00,  1.80s/it, A layered approach to stereo reconstruction]                                                                     


recaptcha detected!
authors found : ['Wang, Michael Y', 'Michael Yu Wang', 'Michael Zhuo Wang']
filling google schorlar metadata of papers from Wang, Michael Y...
filling google schorlar metadata of papers from Wang, Michael Y...


100%|██████████| 20/20 [00:36<00:00,  1.82s/it, Clinical and radiographic comparison of mini–open transforaminal lumbar interbody fusion with open transforaminal lumbar interbody fusion in 42 patients with long-term follow-up]


authors found : ['Stephen L. Johnson', 'Stephen R. Johnson']
filling google schorlar metadata of papers from Stephen L. Johnson...
filling google schorlar metadata of papers from Stephen L. Johnson...


100%|██████████| 20/20 [00:35<00:00,  1.77s/it, Genetic variation in the zebrafish]                                                                                                                                              


authors found : ['José A Caballero', 'Jose Ricardo Diaz Caballero', 'Jose Alonso Caballero Márquez']
filling google schorlar metadata of papers from José A Caballero...
filling google schorlar metadata of papers from José A Caballero...


100%|██████████| 20/20 [00:35<00:00,  1.78s/it, Mathematical considerations for nonisothermal kinetics in thermal decomposition]                                               


authors found : ['Hengshuang Zhao']
filling google schorlar metadata of papers from Hengshuang Zhao...
filling google schorlar metadata of papers from Hengshuang Zhao...


100%|██████████| 20/20 [00:35<00:00,  1.79s/it, LAVT: Language-Aware Vision Transformer for Referring Image Segmentation]                 


authors found : ['BVK Vijaya Kumar']
filling google schorlar metadata of papers from BVK Vijaya Kumar...
filling google schorlar metadata of papers from BVK Vijaya Kumar...


100%|██████████| 20/20 [00:35<00:00,  1.80s/it, Joint disentangling and adaptation for cross-domain person re-identification]                      


authors found : []
authors found : ['Subhransu Maji']
filling google schorlar metadata of papers from Subhransu Maji...
filling google schorlar metadata of papers from Subhransu Maji...


 70%|███████   | 14/20 [00:24<00:10,  1.76s/it, 3D shape segmentation with projective convolutional networks]                

In [14]:
# Drop the names whose search returned no author and save the pruned list.
# Filtering (instead of list.remove) keeps the cell safe to run again: a
# name that is already gone no longer raises ValueError. The file is
# written once, not once per removed name. Every occurrence of a dropped
# name is removed. The list is updated in place.
names_without_result = set(empty_author_name_list)
if names_without_result :
    author_name_list[:] = [
        author_name for author_name in author_name_list
        if author_name not in names_without_result
    ]
    with open("./author_name_list.json", "w") as f :
        json.dump(author_name_list, f, indent=4, ensure_ascii=False)

# Save every collection as JSON. Each container is converted to plain
# dicts with toDict() first; ensure_ascii=False keeps non-ASCII names
# readable in the files.

# papers : key -> paper fields
whole_paper_dict_dict = {key : paper.toDict() for key, paper in whole_paper_dict.items()}
with open('whole_paper_dict.json', 'w') as paper_file :
    json.dump(whole_paper_dict_dict, paper_file, indent=4, ensure_ascii=False)

# authors : list of author fields
whole_author_dict_list = [author.toDict() for author in whole_author_list]
with open("author_list.json", 'w') as author_file :
    json.dump(whole_author_dict_list, author_file, indent=4, ensure_ascii=False)

# institutions : taken from the searcher, which owns the live dict.
# Note that the module-level name now refers to plain dicts.
institution_dict = {
    key : institution.toDict()
    for key, institution in gsearch.institution_dict.items()
}
with open("institution_dict.json", 'w') as institution_file :
    json.dump(institution_dict, institution_file, indent=4, ensure_ascii=False)

# expertise : same pattern as the institutions
expertise_dict = {
    key : expertise.toDict()
    for key, expertise in gsearch.expertise_dict.items()
}
with open("expertise_dict.json", 'w') as expertise_file :
    json.dump(expertise_dict, expertise_file, indent=4, ensure_ascii=False)


In [None]:
# Drop the names whose search returned no author.
# Filtering (instead of list.remove) keeps the cell safe to run again: a
# name that is already gone no longer raises ValueError. Every occurrence
# of a dropped name is removed. The list is updated in place.
names_without_result = set(empty_author_name_list)
author_name_list[:] = [
    author_name for author_name in author_name_list
    if author_name not in names_without_result
]

In [32]:
# Manually drop the "Verena" entry from the list of names to search.
# The membership test keeps the cell safe to run again. The stray
# keystroke that followed the statement (it raised NameError) is removed.
if "Verena" in author_name_list :
    author_name_list.remove("Verena")

In [33]:
# Save the current list of names to search; ensure_ascii=False keeps
# non-ASCII names readable in the file.
with open("./author_name_list.json", "w") as name_file :
    json.dump(author_name_list, name_file, indent=4, ensure_ascii=False)

In [20]:
# Show every name starting with "v" or "V". str.startswith also copes with
# an empty string, where indexing the first character raises IndexError.
[author_name for author_name in author_name_list if author_name.startswith(("v", "V"))]

['Vincent Vanhoucke',
 'Vladlen Koltun',
 'Verena',
 'Victor Lempitsky',
 'Vincent Lepetit',
 'Valentina Salvatelli',
 'Vinod Nair',
 'Vishal M. Patel',
 'Vittorio Ferrari',
 'Vijay Badrinarayanan',
 'Visvanathan Ramesh',
 'Vittorio Murino',
 'Volker Blanz',
 'Vaclav Hlavac (Václav Hlaváč)',
 'Vincent Rabaud',
 'Vicente Ordóñez',
 'Vijayan Asari',
 'Vibhav Vineet']

In [22]:

def _dump_json(path, payload) :
    """Write `payload` to `path` as indented JSON, keeping non-ASCII characters as they are."""
    with open(path, 'w') as f :
        json.dump(payload, f, indent=4, ensure_ascii=False)

# Save every collection; each container is converted to plain dicts with
# toDict() right before its file is written.
whole_paper_dict_dict = {key : paper.toDict() for key, paper in whole_paper_dict.items()}
_dump_json('whole_paper_dict.json', whole_paper_dict_dict)

whole_author_dict_list = [author.toDict() for author in whole_author_list]
_dump_json("author_list.json", whole_author_dict_list)

# The searcher owns the live institution / expertise dicts. Note that the
# module-level names now refer to plain dicts.
institution_dict = {
    key : institution.toDict()
    for key, institution in gsearch.institution_dict.items()
}
_dump_json("institution_dict.json", institution_dict)

expertise_dict = {
    key : expertise.toDict()
    for key, expertise in gsearch.expertise_dict.items()
}
_dump_json("expertise_dict.json", expertise_dict)


In [6]:
# Scratch: author name used by the manual search cells below.
name = "Jinyoung Han"

In [7]:
# Scratch: a separate browser session created with default options,
# independent of the searcher's own driver.
driver = webdriver.Chrome()

In [10]:
# Scratch: replay the search step by hand on the standalone driver.
driver.get(GoogleScharlarSearcher.BASE_URL)
driver.implicitly_wait(10)

# search by author name
searcher = driver.find_element(by=By.XPATH, value='//*[@id="gs_hdr_tsi"]')
time.sleep(0.1)

# Type one character at a time with small random pauses.
# The loop variable is not called `chr`: at notebook top level that name
# would replace the builtin chr() for every later cell.
for character in name :
    searcher.send_keys(character)
    time.sleep(random.randint(1, 10)/200)
searcher.send_keys(Keys.RETURN)
driver.implicitly_wait(10)
time.sleep(0.2)

In [11]:
# Scratch: snapshot of the HTML currently shown by the standalone driver.
source = driver.page_source

In [14]:
# Case-sensitive probe. Its result is not shown: a cell only displays the
# value of its last expression.
source.find("reCAPTCHA")

# Case-insensitive probe: index of the first match, -1 when absent.
source.lower().find("recaptcha")

46827

In [None]:
"""
import json
from dataclasses import dataclass

@dataclass
class Institution :
    name :str
    google_scholar_url : str
    homepage_url : str = None

    def toJSON(self) :
        return json.dumps(self, default=lambda o: o.__dict__, 
            sort_keys=True, indent=4)
    def toDict(self):
        return json.loads(self.toJSON())

    def fromDict(self, dic):
        self.name = dic['name']
        self.google_scholar_url = dic['google_scholar_url']
        self.homepage_url = dic['homepage_url']

@dataclass
class Expertise :
    name : str
    url : str

    def toJSON(self):
        return json.dumps(self, default=lambda o: o.__dict__, 
            sort_keys=True, indent=4)
    def toDict(self):
        return json.loads(self.toJSON())
    def fromDict(self, dic):
        self.name = dic['name']
        self.url = dic['url']

@dataclass
class Author :
    name : str
    google_schorlar_profile_url : str
    affiliation : str = None
    expertise_list : list[str] = None
    homepage_url : str = None
    paper_list : list = None
    paper_title_list : list = None

    def toJSON(self):
        return json.dumps(self, default=lambda o: o.__dict__, 
            sort_keys=True, indent=4)
    def toDict(self):
        return json.loads(self.toJSON())
    def fromDict(self, dic):
        self.name = dic['name']
        self.google_schorlar_profile_url = dic['google_schorlar_profile_url']
        self.affiliation = dic['affiliation']
        self.expertise_list = dic['expertise_list']
        self.homepage_url = dic['homepage_url']
        self.paper_list = dic['paper_list']
        self.paper_title_list = dic['paper_title_list']
        
@dataclass
class Paper :
    # After search paper title using Google Schorlar,
    # fill in basic metadata (abstract) from Google Schorlar
    # fill in other metadata from Crossref
    DOI : str = None
    crossref_json : dict = None
    google_schorlar_metadata : dict = None
    title : str = None
    authors : list = None
    abstract : str = None
    conference : str = None
    journal : str = None
    year : int = None
    reference_list : list[str] = None
    referenced_list : list[str] = None
    cite_bibtex : str = None

    def toJSON(self):
        '''convert to JSON recursively'''
        return json.dumps(self, default=lambda o: o.__dict__, 
            sort_keys=True, indent=4)
    def toDict(self):
        '''convert to dict recursively'''
        return json.loads(self.toJSON())
    def fromDict(self, dic) :
        '''convert from dict recursively'''
        self.DOI = dic['DOI']
        self.crossref_json = dic['crossref_json']
        self.google_schorlar_metadata = dic['google_schorlar_metadata']
        self.title = dic['title']
        self.authors = dic['authors']
        self.abstract = dic['abstract']
        self.conference = dic['conference']
        self.journal = dic['journal']
        self.year = dic['year']
        self.reference_list = dic['reference_list']
        self.referenced_list = dic['referenced_list']
        self.cite_bibtex = dic['cite_bibtex']
"""