In [1]:
#pip install selenium
import os
import random
import time
from tqdm import tqdm
import json
from bs4 import BeautifulSoup
import platform
os_name = platform.system()

if os_name == "Darwin":
    #import undetected_chromedriver as webdriver
    from selenium import webdriver

elif os_name == "Linux":
    from selenium import webdriver

#from selenium import webdriver
#import undetected_chromedriver.v2 as webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException


from containers import Institution, Author, Paper, Expertise


from dataclasses import dataclass
@dataclass
class JournalConference :
    type : str = None
    name : str = None
    ISSN : str = None
    eissn : str = None
    publisher : str = None
    URL : str = None
    Country : str = None
    Status : str = None



PROCESSED_PAPER_FILE_PATH = "./processed_paper_dict.json"
if os.path.exists(PROCESSED_PAPER_FILE_PATH) :
    with open(PROCESSED_PAPER_FILE_PATH, "r") as f :
        processed_paper_dict = json.load(f)
for k, v in processed_paper_dict.items() :
    processed_paper_dict[k] = Paper(**v)

unique_processed_paper_dict = {}
for k in list(set(processed_paper_dict.keys())) :
    unique_processed_paper_dict[k] = processed_paper_dict[k]

print(len(unique_processed_paper_dict), len(processed_paper_dict))

11877 11877


In [2]:
issn_list = []
for k, v in unique_processed_paper_dict.items() :
    issn_list += v.crossref_json["ISSN"]
issn_list = list(set(issn_list))

journal_conference_dict = {}
for issn in tqdm(issn_list) :
    journal_conference_dict[issn] = None

100%|██████████| 3527/3527 [00:00<00:00, 745179.84it/s]


In [20]:

class ISSN_Crawler :
    BASE_URL = "https://portal.issn.org/"

    def __init__(
            self,
            institution_dict = None,
            expertise_dict = None,
            os_name = None
        ) :

        if os_name == "Darwin":
            self.driver = webdriver.Safari()
            self.browser_name = "safari"
        else :
            chrome_options = webdriver.ChromeOptions()
            #chrome_options.add_argument("--headless")
            chrome_options.add_argument("--use_subprocess")
            self.browser_name = "chrome"

            self.driver = webdriver.Chrome(options=chrome_options)
        
    def crawl_by_issn(self, issn) :
        '''
        access issn portal and crawl journal/conference by issn
        args :
            issn : str
        '''
        self.driver.get(self.BASE_URL)
        self.driver.implicitly_wait(10)
        # find input tag with id = "edit-keyword--2"
        searcher = self.driver.find_element(by=By.XPATH, value="//input[@id='edit-keyword--2']")

        searcher.send_keys(issn)
        searcher.send_keys(Keys.RETURN)
        
        self.driver.implicitly_wait(10)

        '''
        try:
            element = WebDriverWait(self.driver, 10).until(self.wait_for_specific_elements)
            # Now one of the elements is present, and you can interact with it
            # 'element' will be either the h3 or h5 element, depending on which appeared first
        except Exception as e:
            print(e)
            print("Neither element appeared within the given time.")
        '''

        page_source = self.driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        failed_message_list = soup.find_all("h3", class_ = "page-title")
        for failed_message in failed_message_list :
            if "The server was unable to fulfill your request" in failed_message.text :
                print("failed to crawl : ", issn)
                return False, None
            
        search_result_list =  soup.find_all("div", class_ = "item-result")


        jc_dict = {}
        for search_result in search_result_list :
            info_dict = {}
            for i in search_result.find_all("p")[:-1] :
                info_dict[i.text.split(":")[0]] = i.text.split(":")[1]

            title = search_result.find("h5", class_ = "item-result-title").text.strip()
            jc_dict[title] = info_dict

        journalnal_conference_dict = {}
        for k, v in jc_dict.items() :
            if "ISSN" in v :
                if issn != v["ISSN"] :
                    print(issn, v["ISSN"])
            #issn = v["ISSN"]
            title = k
            journalnal_conference_dict[issn] = JournalConference(name = title, **v)

        return True, journalnal_conference_dict
    

    def wait_for_specific_elements(self):
        try:
            # Trying to find the h3 with class 'page-title'
            h3_element = self.driver.find_element(By.CSS_SELECTOR, "h3.page-title")
            if h3_element:
                return h3_element
        except:
            pass

        try:
            # Trying to find the h5 with class 'item-result'
            h5_element = self.driver.find_element(By.CSS_SELECTOR, "h5.item-result")
            if h5_element:
                return h5_element
        except:
            pass

        return False

In [15]:
ISSN = issn_list[0]
print(ISSN)

issn_crawler = ISSN_Crawler(os_name = os_name)
jc_dict = issn_crawler.crawl_by_issn(ISSN)

1017-1398
name 'WebDriverWait' is not defined
Neither element appeared within the given time.


In [22]:
issn_crawler = ISSN_Crawler()

result_dict = {}

for issn in tqdm(journal_conference_dict.keys()) :
    if issn in result_dict.keys() :
        continue
    try :
        success, jc_dict = issn_crawler.crawl_by_issn(issn)
    except Exception as e :
        continue
    if success :
        result_dict.update(jc_dict)

  0%|          | 1/3531 [00:12<12:02:19, 12.28s/it]

1017-1398  1017-1398
1017-1398  1572-9265


  0%|          | 2/3531 [00:14<6:30:46,  6.64s/it] 

0929-6212  0929-6212
0929-6212  1572-834X


  0%|          | 4/3531 [00:22<4:33:28,  4.65s/it]

0021-9797  0021-9797
0021-9797  1095-7103


  0%|          | 7/3531 [00:32<3:31:38,  3.60s/it]

0022-1007  0022-1007
0022-1007  1540-9538


  0%|          | 10/3531 [00:43<3:26:16,  3.51s/it]

1383-469X  1383-469X
1383-469X  1572-8153


  0%|          | 12/3531 [00:50<3:11:05,  3.26s/it]

0883-2927  0883-2927
0883-2927  1872-9134


  0%|          | 13/3531 [00:52<3:03:10,  3.12s/it]

0016-5492  0016-5492
0016-5492  1460-3594


  0%|          | 14/3531 [00:56<3:16:19,  3.35s/it]

0142-7164  0142-7164
0142-7164  1469-1817


  0%|          | 15/3531 [00:59<3:05:56,  3.17s/it]

1942-3888  1942-3888
1942-3888  1942-3896


  0%|          | 16/3531 [01:02<3:03:52,  3.14s/it]

2168-6165  2168-6165
2168-6165  2168-6173


  0%|          | 17/3531 [01:05<3:07:26,  3.20s/it]

0350-5596  0350-5596
0350-5596  1854-3871


  1%|          | 18/3531 [01:08<2:53:43,  2.97s/it]

1094-2076  1094-2076
1094-2076  2325-5404


  1%|          | 19/3531 [01:11<2:57:29,  3.03s/it]

0737-8831  0737-8831
0737-8831  2054-166X


In [95]:
ISSN = "0278-0000"


driver = webdriver.Chrome()
driver.get(ISSN_Crawler.BASE_URL)
driver.implicitly_wait(10)
# find input tag with id = "edit-keyword--2"
searcher = driver.find_element(by=By.XPATH, value="//input[@id='edit-keyword--2']")

searcher.send_keys(ISSN)
searcher.send_keys(Keys.RETURN)

In [98]:
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')

In [104]:
failed_message_list = soup.find_all("h3", class_ = "page-title")
for failed_message in failed_message_list :
    if "The server was unable to fulfill your request" in failed_message.text :



failed
failed


In [100]:
search_result_list =  soup.find_all("div", class_ = "item-result")

jc_dict = {}
for search_result in search_result_list :
    info_dict = {}
    for i in search_result.find_all("p")[:-1] :
        info_dict[i.text.split(":")[0]] = i.text.split(":")[1]

    title = search_result.find("h5", class_ = "item-result-title").text.strip()
    jc_dict[title] = info_dict
jc_dict

journalnal_conference_dict = {}
for k, v in jc_dict.items() :
    issn = v["ISSN"]
    title = k
    journalnal_conference_dict[issn] = JournalConference(name = title, **v)

In [101]:
journalnal_conference_dict

{}

In [80]:
jc_dict.keys()

dict_keys(['Journal of accounting and public policy (Print)', 'Journal of accounting and public policy (Online)'])

In [73]:
infor = {}
for i in info[:-1] :
    infor[i.text.split(": ")[0]] = i.text.split(": ")[1]
infor

{'ISSN': '1873-2070', 'Country': 'Netherlands', 'Status': 'Confirmed'}

In [68]:
info[0].text

'ISSN: 1873-2070'

In [33]:
soup.find_all("input", class_="input-main-search form-text required")

[<input class="input-main-search form-text required" id="edit-keyword" maxlength="400" name="keyword" placeholder="Type an ISSN or a title" size="400" type="text" value=""/>,
 <input class="input-main-search form-text required" id="edit-keyword--2" maxlength="400" name="keyword" placeholder="Type an ISSN or a title" size="400" type="text" value=""/>]