In [1]:
import os
import json
from dataclasses import dataclass
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup

import time
from tqdm import tqdm

@dataclass
class Expertise:
    """A named expertise together with its URL.

    Attributes:
        name: Label of the expertise.
        url: URL stored for that label.
    """

    name: str
    url: str

    def toJSON(self):
        """Return this object as a JSON string (sorted keys, 4-space indent)."""

        def attribute_dict(obj):
            # json cannot encode arbitrary objects; fall back to their attributes.
            return obj.__dict__

        return json.dumps(self, default=attribute_dict, sort_keys=True, indent=4)

    def toDict(self):
        """Return a plain dict built by parsing the output of ``toJSON()``."""
        serialized = self.toJSON()
        return json.loads(serialized)

    def fromDict(self, dic):
        """Overwrite ``name`` and ``url`` in place from the same-named keys of ``dic``.

        Raises:
            KeyError: if ``dic`` lacks ``'name'`` or ``'url'``.
        """
        # Assign in declaration order so a missing 'url' still leaves 'name' updated.
        for field_name in ("name", "url"):
            setattr(self, field_name, dic[field_name])

# Location of the cached label -> Expertise mapping.
EXPERTISE_FILE_PATH = "./expertise_dict.json"

# Starts empty and stays empty when no cache file exists.
expertise_dict = {}
if os.path.exists(EXPERTISE_FILE_PATH):
    with open(EXPERTISE_FILE_PATH, "r") as f:
        expertise_dict_raw = json.load(f)
    # Each stored value supplies the keyword arguments for one Expertise record.
    expertise_dict.update(
        (label, Expertise(**fields)) for label, fields in expertise_dict_raw.items()
    )


In [2]:
# Display the loaded label -> Expertise mapping.
expertise_dict

{'Computer Vision': Expertise(name='Computer Vision', url='https://scholar.google.com/citations?view_op=search_authors&hl=ko&mauthors=label:computer_vision'),
 'computational materials science': Expertise(name='computational materials science', url='https://scholar.google.com/citations?view_op=search_authors&hl=ko&mauthors=label:computational_materials_science'),
 'computational nano-mechanics': Expertise(name='computational nano-mechanics', url='https://scholar.google.com/citations?view_op=search_authors&hl=ko&mauthors=label:computational_nano_mechanics'),
 'molecular dynamics': Expertise(name='molecular dynamics', url='https://scholar.google.com/citations?view_op=search_authors&hl=ko&mauthors=label:molecular_dynamics'),
 'density functional theory': Expertise(name='density functional theory', url='https://scholar.google.com/citations?view_op=search_authors&hl=ko&mauthors=label:density_functional_theory'),
 'li ion batteries': Expertise(name='li ion batteries', url='https://scholar.go

In [3]:
# Display the plain-dict form of a single entry.
expertise_dict["Robotics"].toDict()

{'name': 'Robotics',
 'url': 'https://scholar.google.com/citations?view_op=search_authors&hl=ko&mauthors=label:robotics'}

In [5]:
# Label whose author listing is scraped. Edit this single value to switch
# targets (e.g. "Computer Vision" or "computer graphics") instead of stacking
# assignments where only the last one takes effect.
TARGET_EXPERTISE_NAME = "Robotics"
TARGET_EXPERTISE_URL = expertise_dict[TARGET_EXPERTISE_NAME].url

# The scraping loop iterates over range(START_IDX, START_IDX + WIDTH), so WIDTH
# is the maximum number of result pages visited.
START_IDX = 0
WIDTH = 1000



# (XPath, message) pairs: when an element matches the XPath, the page is
# treated as a CAPTCHA / bot-check page and the message is printed.
_CAPTCHA_XPATH_PROBES = (
    ("//img[contains(@alt, 'captcha')]", "captcha image detected!"),
    ("//*[contains(text(), 'prove you are human')]", "CAPTCHA text detected!"),
    (
        "//*[contains(text(), 'Google의 시스템이 컴퓨터 네트워크에서 비정상적인 트래픽을 감지했습니다.')]",
        "로봇이 아니라는 확인이 필요합니다 detected!",
    ),
)


def checkCaptcha(driver):
    """Look for CAPTCHA / bot-check markers on the current page, then reload it.

    Two substring checks run against ``driver.page_source`` and three XPath
    probes run against the live DOM. If any marker is found the user is
    prompted on stdin; answering "n" or "no" (any case, surrounding
    whitespace ignored) aborts, any other answer continues. Unless aborted,
    the page is refreshed and the function sleeps for 0.5 seconds.

    Args:
        driver: Selenium WebDriver showing the page to inspect.

    Returns:
        True if at least one marker was detected, otherwise False.

    Raises:
        RuntimeError: if a marker was detected and the user chose to stop.
    """
    captcha_found = False
    source = driver.page_source

    if "사용자가 로봇이 아니라는 확인이 필요합니다." in source:
        print("로봇이 아니라는 확인이 필요합니다 text detected!")
        captcha_found = True
    if "recaptcha" in source.lower():
        print("recaptcha detected!")
        captcha_found = True

    # find_elements() returns an empty list when nothing matches, so no
    # exception handling is needed and real driver errors are not swallowed.
    # NOTE(review): with a non-zero implicit wait, each probe that matches
    # nothing blocks for the full wait before returning.
    for xpath, message in _CAPTCHA_XPATH_PROBES:
        if driver.find_elements(by=By.XPATH, value=xpath):
            print(message)
            captcha_found = True

    if captcha_found:
        key_input = input("Press [n] to stop...")
        if key_input.strip().lower() in ("n", "no"):
            raise RuntimeError("captcha detected!")

    # Python WebDriver exposes refresh() directly; navigate() is the Java API.
    # NOTE(review): the reload runs on every call, as the original indentation
    # implied — confirm it is not meant to run only after a detection.
    driver.refresh()
    time.sleep(0.5)
    return captcha_found


# Author names collected from every visited result page, in visiting order.
author_name_list = []

# The browser is left open after the loop; a later cell still queries `driver`.
driver = webdriver.Chrome()
driver.get(TARGET_EXPERTISE_URL)
time.sleep(1)
checkCaptcha(driver)

# implicitly_wait() sets a session-wide element-lookup timeout rather than
# pausing, so it is configured once here instead of on every iteration.
driver.implicitly_wait(10)

for page_idx in range(START_IDX, START_IDX + WIDTH):
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Each author name sits in an <h3 class="gs_ai_name"> element.
    author_name_list.extend(
        author_html.text for author_html in soup.find_all("h3", class_="gs_ai_name")
    )

    # The last <button> on the page is used as the "next page" control.
    button_list = driver.find_elements(by=By.XPATH, value='//button')
    if not button_list or not button_list[-1].is_enabled():
        # Nothing clickable to advance with: stop instead of raising IndexError
        # or re-collecting the same page on every remaining iteration.
        break
    button_list[-1].click()
    time.sleep(1)
    checkCaptcha(driver)


recaptcha detected!


In [6]:
# Display the collected author names.
author_name_list

['Jian Sun',
 'Jitendra MALIK',
 'Leonidas Guibas',
 'Alexei A. Efros',
 'Richard Szeliski',
 'C. Lawrence Zitnick',
 'Michael J. Black',
 'Demetri Terzopoulos',
 'Vladlen Koltun',
 'Shree Nayar',
 'Jun-Yan Zhu',
 'Marc Levoy',
 'Dinesh Manocha',
 'Hans-Peter Seidel',
 'Martin Wicke',
 'Patrick Hanrahan',
 'Marc Pollefeys',
 'Heung-Yeung Shum',
 'Michael F Cohen',
 'Markus Gross',
 'Fredo Durand',
 'Thomas Funkhouser',
 'Hugues Hoppe',
 'Peter Belhumeur',
 'Ming-Ming Cheng',
 'Steve Seitz',
 'David Forsyth',
 'Ming Lin',
 'Hanan Samet',
 'Ronald Fedkiw',
 'Eli Shechtman',
 'Michael Kass',
 'Timo Aila',
 'Brian Curless',
 'Christian Theobalt',
 'Hanspeter Pfister',
 '李重(Zhong Li)',
 'Daniel Thalmann',
 'Baining Guo',
 'Samuli Laine',
 'Ziwei Liu',
 'Noah Snavely',
 'Iain Matthews',
 'Philipp Krähenbühl',
 'David Salesin',
 'Katsushi Ikeuchi',
 'Sankarshan Ghosh',
 'Yaser Sheikh',
 'Ravi Ramamoorthi',
 'Matthias Nießner',
 'Paul Debevec',
 'Szymon Rusinkiewicz',
 'Jessica Hodgins',
 'Dan

In [7]:
# Persist the collected names as a JSON array indented by 4 spaces.
with open("./author_name_list.json", "w") as f:
    f.write(json.dumps(author_name_list, indent=4))

In [56]:
# Scratch cell for probing the page held by the open `driver`.
# The commented-out lookups below are disabled experiments and do not run.
#driver.find_element(by=By.XPATH, value='//@gsc_pgn_pnx').text
#driver.find_element(by=By.XPATH, value='//@"gs_btnPR gs_in_ib gs_btn_half gs_btn_lsb gs_btn_srt gsc_pgn_pnx"')
#driver.find_elements(by=By.XPATH, value='//div[@class="gs_r gs_or gs_scl"]')


# The string literal below is never assigned or used, so the code inside it
# does not execute.
'''
button_list = driver.find_elements(by=By.XPATH, value='//button')
button_list[-1].click()
for button in button_list :
    print(button.accessible_name)

'''
# Only this lookup runs: it returns every <div> whose class attribute is
# exactly "gs_r gs_or gs_scl" (an empty list when none match).
driver.find_elements(by=By.XPATH, value='//div[@class="gs_r gs_or gs_scl"]')

[]