In [None]:
#pip install selenium
import os
import random
import json
from bs4 import BeautifulSoup
import platform
os_name = platform.system()

if os_name == "Darwin":
    #import undetected_chromedriver as webdriver
    from selenium import webdriver

elif os_name == "Linux":
    from selenium import webdriver

#from selenium import webdriver
#import undetected_chromedriver.v2 as webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
from tqdm import tqdm

from containers import Institution, Author, Paper, Expertise


from dataclasses import dataclass

@dataclass
class JournalConference:
    """
    Plain record describing one publication venue.

    Every field defaults to None, so an instance can be created empty
    and filled in later.
    """
    # venue kind label (free-form string; no allowed values are enforced here)
    type: str = None
    # display name of the venue
    name: str = None
    # ISSN identifier string
    issn: str = None
    # eISSN identifier string
    eissn: str = None
    # publisher name
    publisher: str = None
    # URL associated with the venue
    url: str = None
    

In [None]:

class ISSN_Crawler :
    """
    Selenium-driven crawler that collects authors, their affiliation,
    expertise tags and paper metadata from a browser session.

    NOTE(review): BASE_URL points at the ISSN portal, but every element
    id / class name used below (gs_hdr_tsi, gs_rt2, gsc_*) and the container
    field names refer to Google Scholar pages. TODO confirm the intended
    target site before relying on the search methods.

    attributes :
        driver : selenium webdriver instance used for all page access
        browser_name : "chrome" (the Safari code path is currently disabled)
        institution_dict : dict mapping institution name -> Institution
        expertise_dict : dict mapping expertise name -> Expertise
    """
    BASE_URL = "https://portal.issn.org/"

    # Timeout (seconds) passed to driver.implicitly_wait(). Note that
    # implicitly_wait() only configures how long element lookups keep
    # retrying; it does not pause execution by itself.
    IMPLICIT_WAIT_SECONDS = 10

    # Answers to the interactive prompt that are treated as "stop".
    _STOP_ANSWERS = ("n", "N", "no", "No", "NO", "nO")

    def __init__(
            self,
            institution_dict = None,
            expertise_dict = None,
            os_name = None
        ) :
        """
        args :
            institution_dict :
                existing registry (name -> Institution) to extend;
                a new empty dict is used when None
            expertise_dict :
                existing registry (name -> Expertise) to extend;
                a new empty dict is used when None
            os_name :
                accepted for backward compatibility; it currently has no
                effect because the Safari branch is disabled
        """
        # These registries are read by addInstitution / addExpertise, so they
        # must exist on the instance (they were previously never stored).
        self.institution_dict = {} if institution_dict is None else institution_dict
        self.expertise_dict = {} if expertise_dict is None else expertise_dict

        # Chrome is always used. A Safari variant (webdriver.Safari(), with
        # browser_name = "safari") used to be selectable on Darwin but was
        # switched off; _click() still carries the Safari workaround.
        chrome_options = webdriver.ChromeOptions()
        #chrome_options.add_argument("--headless")
        chrome_options.add_argument("--use_subprocess")
        self.browser_name = "chrome"

        self.driver = webdriver.Chrome(options=chrome_options)

    def crawl_by_issn(self, issn = None) :
        """
        Placeholder for an ISSN based lookup.

        The original definition was left unfinished (no signature or body),
        which made the whole class a syntax error. The parameter is a
        placeholder too.
        raises :
            NotImplementedError : always
        """
        raise NotImplementedError("crawl_by_issn is not implemented yet")

    def _absolute_url(self, href) :
        """Resolve a page-relative href against BASE_URL without doubling '/'."""
        from urllib.parse import urljoin
        return urljoin(self.BASE_URL, href)

    def _click(self, element) :
        """Activate a link element, using RETURN on Safari where click() fails."""
        if self.browser_name == "safari" :
            # clik in safari does not work. so use send_keys
            element.send_keys(Keys.RETURN)
        else :
            element.click()

    def _user_wants_stop(self) :
        """Prompt on stdin and report whether the answer means "stop"."""
        return input("Press [n] to stop...") in self._STOP_ANSWERS

    def searchPaperByName(self, name) :
        """
        Open BASE_URL and submit `name` through the search box.

        Only the search is submitted; nothing is read from the result page
        and nothing is returned.
        """
        self.driver.get(self.BASE_URL)
        self.driver.implicitly_wait(self.IMPLICIT_WAIT_SECONDS)
        # search given paper name
        search = self.driver.find_element(by=By.XPATH, value='//*[@id="gs_hdr_tsi"]')
        search.send_keys(name)
        search.send_keys(Keys.RETURN)
        self.driver.implicitly_wait(self.IMPLICIT_WAIT_SECONDS)
        # TODO: click the first paper

    def searchAuthorByName(
        self,
        name,
        continue_search = False,
        search_width = 1000,
        ask_for_continue = False,
    ) :
        """
        Search authors by name and fill in the first hit.

        args :
            name : text typed into the search box
            continue_search, search_width :
                intended to extend the search to co-authors until
                search_width authors are known; not implemented yet,
                both values are currently ignored
            ask_for_continue :
                when True, prompt on stdin after the search and stop
                if the user answers "n"
        return :
            (author_list, whole_paper_dict)
            author_list holds one Author per search hit; only the first
            one is filled in. whole_paper_dict maps paper title -> Paper.
            ([], {}) is returned when the user stops at the prompt.
        """
        self.driver.get(self.BASE_URL)
        self.driver.implicitly_wait(self.IMPLICIT_WAIT_SECONDS)

        self.checkCaptcha()

        # search by author name
        searcher = self.driver.find_element(by=By.XPATH, value='//*[@id="gs_hdr_tsi"]')
        time.sleep(0.1)

        # type one character at a time with a small random delay
        for character in name :
            searcher.send_keys(character)
            time.sleep(random.randint(1, 10)/200)
        searcher.send_keys(Keys.RETURN)
        self.driver.implicitly_wait(self.IMPLICIT_WAIT_SECONDS)
        time.sleep(0.2)

        self.checkCaptcha()

        if ask_for_continue and self._user_wants_stop() :
            return [], {}

        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        author_list = [
            Author(
                name = author_html.text,
                google_schorlar_profile_url = self._absolute_url(author_html.find("a")["href"]),
            )
            for author_html in soup.find_all("h4", class_="gs_rt2")
        ]

        print(f"authors found : {list(map(lambda author : author.name, author_list))}")

        auther_href_button_list = self.driver.find_elements(by=By.XPATH, value='//*[@class="gs_rt2"]/a')
        print(auther_href_button_list)

        print("nubmer of authors found : ", len(auther_href_button_list))
        whole_paper_dict = {}
        for author, auther_href_button in zip(author_list, auther_href_button_list) :
            print(f"filling google schorlar metadata of papers from {author.name}...")

            self._click(auther_href_button)

            print("clicked")
            self.driver.implicitly_wait(self.IMPLICIT_WAIT_SECONDS)

            self.checkCaptcha()

            author, paper_dict = self.fillAuthor(author, url_already_loaded = True)
            whole_paper_dict.update(paper_dict)
            self.driver.back()
            self.driver.implicitly_wait(self.IMPLICIT_WAIT_SECONDS)
            self.checkCaptcha()

            # NOTE(review): only the first author is processed. The remaining
            # link elements were located before navigating away, so they are
            # presumably stale after back(); re-locate them before removing
            # this break.
            break

        # TODO: when continue_search is set and fewer than search_width
        # authors are known, continue with co-authors (not implemented).

        return author_list, whole_paper_dict

    def addInstitution(
        self,
        html_str
    ) :
        '''
        initialize Institution instance and append to
        self.institution_dict if not exist
        args :
            html_str :
                parsed html element whose text is the institution name;
                may contain an <a href=...> pointing to the institution page.
                None is accepted and means "no institution found".
        return :
            institution name, or None when html_str is None
        '''
        if html_str is None :
            # Previously this path printed the AttributeError and then failed
            # with UnboundLocalError; report it and leave the value empty.
            print("institution element not found")
            return None

        institution_name = html_str.text

        if institution_name not in self.institution_dict :
            try :
                google_schorlar_institution_url = self._absolute_url(html_str.find("a")["href"])
            except (TypeError, KeyError) :
                # no <a> inside the element, or the <a> has no href
                google_schorlar_institution_url = None
            self.institution_dict[institution_name] = Institution(
                name = institution_name,
                google_scholar_url = google_schorlar_institution_url,
                homepage_url = None,
            )
        return institution_name

    def addExpertise(
        self,
        html_str_list
    ) :
        '''
        initialize Expertise instance and append to
        self.expertise_dict if not exist
        args :
            html_str_list :
                list of parsed html <a> elements. each element's text is
                the expertise name and its href is the expertise page
        return :
            list of expertise names, in input order
        '''
        expertise_name_list = []
        for html_str in html_str_list :
            expertise_name = html_str.text
            if expertise_name not in self.expertise_dict :
                self.expertise_dict[expertise_name] = Expertise(
                    name = expertise_name,
                    url = self._absolute_url(html_str["href"]),
                )
            expertise_name_list.append(expertise_name)
        return expertise_name_list

    def fillAuthor(self, author, url_already_loaded = False) :
        """
        fill in author instance (expertise_list, affiliation, paper_title_list)
        args :
            author :
                expected to have name, google_schorlar_profile_url field
            url_already_loaded :
                True when the browser already shows the author's profile,
                so no navigation is needed
        return :
            (author, paper_dict) where paper_dict maps paper title -> Paper
        """
        # load page html
        if not url_already_loaded :
            self.driver.get(author.google_schorlar_profile_url)
            self.driver.implicitly_wait(self.IMPLICIT_WAIT_SECONDS)
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')

        # fill in expertise
        expertise_html_list = soup.find_all("a", class_="gsc_prf_inta")
        author.expertise_list = self.addExpertise(expertise_html_list)

        # fill in institution (None when the profile shows none)
        institution_html = soup.find("div", class_="gsc_prf_il")
        author.affiliation = self.addInstitution(institution_html)

        # the profile page is loaded at this point in either case
        paper_dict = self.makePaperDictFromAuthor(author, url_already_loaded = True)

        return author, paper_dict


    def makePaperDictFromAuthor(self, author, url_already_loaded = False, search_width_limit = 20) :
        """
        make paper instances from author instance
        args :
            author : Author
                expected to have name, google_schorlar_profile_url field.
                author.paper_title_list is set as a side effect.
            url_already_loaded :
                True when the browser already shows the author's profile
            search_width_limit :
                maximum number of papers to visit
        return :
            paper_dict : dict mapping paper title -> Paper
        """
        # load page html
        if not url_already_loaded :
            self.driver.get(author.google_schorlar_profile_url)
            self.driver.implicitly_wait(self.IMPLICIT_WAIT_SECONDS)

        # TODO: the "show more" button (id gsc_bpf_more) is not clicked, so
        # only the papers shown on initial load are considered.

        # get paper html list
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        paper_html_list = soup.find_all("tr", class_="gsc_a_tr")[:search_width_limit]
        paper_href_list = self.driver.find_elements(by=By.XPATH, value='//*[@class="gsc_a_t"]/a')

        paper_list = []

        print(f"filling google schorlar metadata of papers from {author.name}...")
        with tqdm(total=len(paper_html_list)) as pbar:
            # zip stops at the shorter list, so the limit above also bounds
            # the number of links that are clicked
            for paper_html, paper_href in zip(paper_html_list, paper_href_list) :
                title = paper_html.find("a", class_="gsc_a_at").text

                self._click(paper_href)

                self.checkCaptcha()

                time.sleep(0.2)
                detail_soup = BeautifulSoup(self.driver.page_source, 'html.parser')

                google_schorlar_metadata = {}
                for metadata in detail_soup.find_all("div", class_="gs_scl") :
                    field = metadata.find("div", class_="gsc_oci_field")
                    value = metadata.find("div", class_="gsc_oci_value")
                    if field is None or value is None :
                        # row without a field/value pair; nothing to record
                        continue
                    google_schorlar_metadata[field.text] = value.text

                paper_list.append(
                    Paper(title = title, google_schorlar_metadata = google_schorlar_metadata)
                )

                pbar.set_postfix_str(title)
                pbar.update(1)

                self.driver.back()
                time.sleep(0.2)
                self.checkCaptcha()

        author.paper_title_list = [paper.title for paper in paper_list]

        # TODO: a crossref lookup keyed by DOI was sketched here but was
        # unreachable and relied on a crossref_fetcher attribute that this
        # class never creates; papers are keyed by title instead.
        return {paper.title : paper for paper in paper_list}

    def checkCaptcha(self) :
        """
        Look for signs of a captcha / robot check on the current page.

        When something is detected the user is prompted on stdin; the
        crawl continues unless the answer is "n".
        raises :
            Exception : captcha detected and the user chose to stop
        """
        captcha_found = False
        source = self.driver.page_source
        if "사용자가 로봇이 아니라는 확인이 필요합니다." in source :
            print("로봇이 아니라는 확인이 필요합니다 text detected!")
            captcha_found = True
        if "recaptcha" in source.lower() :
            print("recaptcha detected!")
            captcha_found = True

        # (xpath, message printed when at least one element matches)
        xpath_probes = (
            ("//img[contains(@alt, 'captcha')]", "captcha image detected!"),
            ("//*[contains(text(), 'prove you are human')]", "CAPTCHA text detected!"),
            (
                "//*[contains(text(), 'Google의 시스템이 컴퓨터 네트워크에서 비정상적인 트래픽을 감지했습니다.')]",
                "로봇이 아니라는 확인이 필요합니다 detected!",
            ),
        )

        # Probe without implicit waiting: on a normal page none of these
        # elements exist and each lookup would otherwise block for the full
        # implicit-wait timeout.
        self.driver.implicitly_wait(0)
        try :
            for xpath, message in xpath_probes :
                try :
                    # find_elements returns [] instead of raising when nothing
                    # matches (find_element_by_xpath no longer exists in
                    # current Selenium releases).
                    matches = self.driver.find_elements(by=By.XPATH, value=xpath)
                except Exception as e :
                    # best-effort probe: report and keep checking the others
                    print(f"captcha probe failed : {e}")
                    continue
                if matches :
                    print(message)
                    captcha_found = True
        finally :
            self.driver.implicitly_wait(self.IMPLICIT_WAIT_SECONDS)

        if captcha_found and self._user_wants_stop() :
            raise Exception("captcha detected!")

        # give the page a moment before the caller continues
        time.sleep(1)