In [55]:
#pip install selenium
import os
import random
import time
from tqdm import tqdm
import json
from bs4 import BeautifulSoup
import platform
from pprint import pprint

os_name = platform.system()

if os_name == "Darwin":
    #import undetected_chromedriver as webdriver
    from selenium import webdriver

elif os_name == "Linux":
    from selenium import webdriver

#from selenium import webdriver
#import undetected_chromedriver.v2 as webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException


from containers import Institution, Author, Paper, Expertise


from dataclasses import dataclass
@dataclass
class JournalConference :
    type : str = None
    name : str = None
    ISSN : str = None
    eissn : str = None
    publisher : str = None
    URL : str = None
    Country : str = None
    Status : str = None
    url_list : list = None

    def toJSON(self) :
        return json.dumps(self, default=lambda o: o.__dict__, 
            sort_keys=True, indent=4)
    def toDict(self) :
        return json.loads(self.toJSON())


class ISSN_Crawler :
    BASE_URL = "https://portal.issn.org/"

    def __init__(
            self,
            institution_dict = None,
            expertise_dict = None,
            os_name = None
        ) :

        if os_name == "Darwin":
            self.driver = webdriver.Safari()
            self.browser_name = "safari"
        else :
            chrome_options = webdriver.ChromeOptions()
            #chrome_options.add_argument("--headless")
            chrome_options.add_argument("--use_subprocess")
            self.browser_name = "chrome"

            self.driver = webdriver.Chrome(options=chrome_options)
        
    def crawl_by_issn(self, issn) :
        '''
        access issn portal and crawl journal/conference by issn
        args :
            issn : str
        '''
        self.driver.get(self.BASE_URL)
        self.driver.implicitly_wait(10)
        # find input tag with id = "edit-keyword--2"
        searcher = self.driver.find_element(by=By.XPATH, value="//input[@id='edit-keyword--2']")

        searcher.send_keys(issn)
        searcher.send_keys(Keys.RETURN)
        
        self.driver.implicitly_wait(10)

        '''
        try:
            element = WebDriverWait(self.driver, 10).until(self.wait_for_specific_elements)
            # Now one of the elements is present, and you can interact with it
            # 'element' will be either the h3 or h5 element, depending on which appeared first
        except Exception as e:
            print(e)
            print("Neither element appeared within the given time.")
        '''

        page_source = self.driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        failed_message_list = soup.find_all("h3", class_ = "page-title")
        for failed_message in failed_message_list :
            if "The server was unable to fulfill your request" in failed_message.text :
                print("failed to crawl : ", issn)
                return False, None
            
        search_result_list =  soup.find_all("div", class_ = "item-result")

        jc_dict = {}
        for search_result in search_result_list :
            info_dict = {"url_list": []}
            for i in search_result.find_all("p")[:-1] :
                info_dict[i.text.split(":")[0]] = i.text.split(":")[1]
                url_list = []
                href_list = i.find_all("a")
                if len(href_list) > 0 :
                    for href in href_list :
                        url_list.append(href["href"])
                
                info_dict["url_list"] += url_list

            title = search_result.find("h5", class_ = "item-result-title").text.strip()
            jc_dict[title] = info_dict

        journalnal_conference_dict = {}
        for k, v in jc_dict.items() :
            #issn = v["ISSN"]
            title = k
            journalnal_conference_dict[issn] = JournalConference(name = title, **v)

        return True, journalnal_conference_dict
    
    def wait_for_specific_elements(self):
        try:
            # Trying to find the h3 with class 'page-title'
            h3_element = self.driver.find_element(By.CSS_SELECTOR, "h3.page-title")
            if h3_element:
                return h3_element
        except:
            pass
        try:
            # Trying to find the h5 with class 'item-result'
            h5_element = self.driver.find_element(By.CSS_SELECTOR, "h5.item-result")
            if h5_element:
                return h5_element
        except:
            pass
        return False


PROCESSED_PAPER_FILE_PATH = "./processed_paper_dict.json"
if os.path.exists(PROCESSED_PAPER_FILE_PATH) :
    with open(PROCESSED_PAPER_FILE_PATH, "r") as f :
        processed_paper_dict = json.load(f)
for k, v in processed_paper_dict.items() :
    processed_paper_dict[k] = Paper(**v)

unique_processed_paper_dict = {}
for k in list(set(processed_paper_dict.keys())) :
    unique_processed_paper_dict[k] = processed_paper_dict[k]

print(len(unique_processed_paper_dict), len(processed_paper_dict))

11877 11877


In [56]:
issn_list = []
for k, v in unique_processed_paper_dict.items() :
    issn_list += v.crossref_json["ISSN"]
issn_list = sorted(list(set(issn_list)))

In [59]:
issn_crawler = ISSN_Crawler()

result_dict = {}

for issn in tqdm(issn_list) :
    if issn in result_dict.keys() :
        continue
    try :
        success, jc_dict = issn_crawler.crawl_by_issn(issn)
    except Exception as e :
        continue
    if success :
        result_dict.update(jc_dict)

  0%|          | 0/3527 [00:00<?, ?it/s]

In [None]:
pprint(result_dict)

{'0001-0782': JournalConference(type=None,
                                name='Communications of the ACM (Online)',
                                ISSN=' 1557-7317',
                                eissn=None,
                                publisher=None,
                                URL=' books.google.com/books?id ... ',
                                Country=' United States',
                                Status=' Confirmed',
                                url_list=['http://books.google.com/books?id=HT5VAAAAMAAJ']),
 '0001-1541': JournalConference(type=None,
                                name='AIChE journal (Online)',
                                ISSN=' 1547-5905',
                                eissn=None,
                                publisher=None,
                                URL=' www.sciencedirect.com/sci ... ',
                                Country=' United States',
                                Status=' Confirmed',
                                u

In [23]:
journal_conference_dict_dict = {}
for k, v in result_dict.items() :
    journal_conference_dict_dict[k] = v.__dict__
with open("journal_conference_dict.json", "w") as f :
    json.dump(journal_conference_dict_dict, f, indent=4)

In [26]:
ISSN = "0028-1298"


driver = webdriver.Chrome()
driver.get(ISSN_Crawler.BASE_URL)
driver.implicitly_wait(10)
# find input tag with id = "edit-keyword--2"
searcher = driver.find_element(by=By.XPATH, value="//input[@id='edit-keyword--2']")

searcher.send_keys(ISSN)
searcher.send_keys(Keys.RETURN)

In [27]:
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')

In [30]:
failed_message_list = soup.find_all("h3", class_ = "page-title")
for failed_message in failed_message_list :
    if "The server was unable to fulfill your request" in failed_message.text :
        print("failed to crawl : ", ISSN)
        break

In [37]:

search_result_list =  soup.find_all("div", class_ = "item-result")

jc_dict = {}
for search_result in search_result_list :
    info_dict = {}
    for i in search_result.find_all("p")[:-1] :
        info_dict[i.text.split(":")[0]] = i.text.split(":")[1]
        url_list = []
        href_list = i.find_all("a")
        if len(href_list) > 0 :
            for href in href_list :
                url_list.append(href["href"])

    title = search_result.find("h5", class_ = "item-result-title").text.strip()
    jc_dict[title] = info_dict

journalnal_conference_dict = {}
for k, v in jc_dict.items() :
    issn = v["ISSN"]
    title = k
    journalnal_conference_dict[issn] = JournalConference(name = title, **v)

journalnal_conference_dict

http://www.springerlink.com/content/100530


{' 0028-1298': JournalConference(type=None, name="Naunyn-Schmiedeberg's archives of pharmacology", ISSN=' 0028-1298', eissn=None, publisher=None, URL=None, Country=' Germany', Status=' Confirmed'),
 ' 1432-1912': JournalConference(type=None, name="Naunyn-Schmiedeberg's archives of pharmacology (Internet)", ISSN=' 1432-1912', eissn=None, publisher=None, URL=' www.springerlink.com/cont ... ', Country=' Germany', Status=' Confirmed')}

In [101]:
journalnal_conference_dict

{}

In [80]:
jc_dict.keys()

dict_keys(['Journal of accounting and public policy (Print)', 'Journal of accounting and public policy (Online)'])

In [73]:
infor = {}
for i in info[:-1] :
    infor[i.text.split(": ")[0]] = i.text.split(": ")[1]
infor

{'ISSN': '1873-2070', 'Country': 'Netherlands', 'Status': 'Confirmed'}

In [68]:
info[0].text

'ISSN: 1873-2070'

In [33]:
soup.find_all("input", class_="input-main-search form-text required")

[<input class="input-main-search form-text required" id="edit-keyword" maxlength="400" name="keyword" placeholder="Type an ISSN or a title" size="400" type="text" value=""/>,
 <input class="input-main-search form-text required" id="edit-keyword--2" maxlength="400" name="keyword" placeholder="Type an ISSN or a title" size="400" type="text" value=""/>]