In [60]:
#pip install selenium
import os
import random
import time
from tqdm import tqdm
import json
from bs4 import BeautifulSoup
import platform
from pprint import pprint

# Host OS name as reported by platform.system() (e.g. "Darwin", "Linux", "Windows").
os_name = platform.system()

# The Darwin and Linux branches imported exactly the same module, while every
# other platform (e.g. Windows) was left without a `webdriver` name and failed
# later with a NameError. Importing unconditionally keeps the two supported
# platforms unchanged and makes the remaining ones work as well.
#import undetected_chromedriver as webdriver
from selenium import webdriver
#from selenium import webdriver
#import undetected_chromedriver.v2 as webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException


from containers import Paper, JournalConference

11877 11877


### Define ISSN_Crawler class

In [None]:

class ISSN_Crawler :
    """Selenium-driven scraper that looks up journals/conferences on the ISSN portal.

    A browser session is opened in ``__init__`` and reused for every call to
    ``crawl_by_issn``. The session is never closed by this class; call
    ``self.driver.quit()`` when the crawler is no longer needed.
    """

    # Landing page that hosts the search box used by crawl_by_issn.
    BASE_URL = "https://portal.issn.org/"

    def __init__(
            self,
            institution_dict = None,
            expertise_dict = None,
            os_name = None
        ) :
        """Open the browser session used for crawling.

        args :
            institution_dict : accepted for interface compatibility, not used here
            expertise_dict : accepted for interface compatibility, not used here
            os_name : str or None. "Darwin" selects Safari; any other value
                (including the default None) selects Chrome.
        """

        if os_name == "Darwin":
            self.driver = webdriver.Safari()
            self.browser_name = "safari"
        else :
            chrome_options = webdriver.ChromeOptions()
            #chrome_options.add_argument("--headless")
            chrome_options.add_argument("--use_subprocess")
            self.browser_name = "chrome"

            self.driver = webdriver.Chrome(options=chrome_options)

    @staticmethod
    def _split_field(text) :
        """Split a "label: value" string at its FIRST colon.

        returns :
            (label, value) tuple, or None when the text contains no colon.
            Neither part is stripped, so surrounding whitespace is preserved.
        """
        label, separator, value = text.partition(":")
        if not separator :
            return None
        return label, value

    def crawl_by_issn(self, issn) :
        '''
        access issn portal and crawl journal/conference by issn
        args :
            issn : str
        returns :
            (False, None) when the portal shows its "unable to fulfill" error page,
            otherwise (True, dict) mapping each result's "ISSN" field text to a
            JournalConference built from the fields of that result.
        '''
        self.driver.get(self.BASE_URL)
        # Session-wide setting: element lookups poll for up to 10 s. Setting it
        # once per call is enough; it is not a sleep.
        self.driver.implicitly_wait(10)
        # find input tag with id = "edit-keyword--2"
        searcher = self.driver.find_element(by=By.XPATH, value="//input[@id='edit-keyword--2']")

        searcher.send_keys(issn)
        searcher.send_keys(Keys.RETURN)

        # NOTE(review): an explicit wait is available via
        #   WebDriverWait(self.driver, 10).until(self.wait_for_specific_elements)
        # It stays disabled as in the original; with the implicit wait above each
        # lookup of a missing element would presumably block for the full 10 s.

        soup = BeautifulSoup(self.driver.page_source, 'html.parser')

        for failed_message in soup.find_all("h3", class_ = "page-title") :
            if "The server was unable to fulfill your request" in failed_message.text :
                print("failed to crawl : ", issn)
                return False, None

        # First pass: one field dictionary per result, keyed by the result title.
        jc_dict = {}
        for search_result in soup.find_all("div", class_ = "item-result") :
            title_tag = search_result.find("h5", class_ = "item-result-title")
            if title_tag is None :
                # A result without a title cannot be keyed; skip it instead of
                # failing the whole page with an AttributeError.
                continue

            info_dict = {"url_list": []}
            # The last <p> of every result is left out, as in the original code.
            for paragraph in search_result.find_all("p")[:-1] :
                field = self._split_field(paragraph.text)
                if field is not None :
                    # Splitting at the first colon only keeps values that contain
                    # colons themselves (e.g. "https://...") intact.
                    info_dict[field[0]] = field[1]
                # href=True skips anchors that have no href attribute.
                info_dict["url_list"] += [
                    anchor["href"] for anchor in paragraph.find_all("a", href=True)
                ]

            jc_dict[title_tag.text.strip()] = info_dict

        # Second pass: keep only results that expose an "ISSN" field and key by it.
        journal_conference_dict = {}
        for title, info_dict in jc_dict.items() :
            if "ISSN" not in info_dict :
                continue
            journal_conference_dict[info_dict["ISSN"]] = JournalConference(name = title, **info_dict)

        return True, journal_conference_dict

    def wait_for_specific_elements(self, driver = None) :
        """Return the first marker element of a loaded result/error page, else False.

        Usable as a WebDriverWait condition: ``until`` calls the condition with
        the driver as its only argument, hence the optional ``driver`` parameter.

        args :
            driver : webdriver or None. Defaults to ``self.driver``.
        returns :
            the "h3.page-title" element if present, otherwise the
            "h5.item-result-title" element if present, otherwise False.
        """
        if driver is None :
            driver = self.driver
        # "h5.item-result-title" is the class crawl_by_issn reads result titles
        # from; the previous "h5.item-result" selector did not match that markup.
        for selector in ("h3.page-title", "h5.item-result-title") :
            try:
                return driver.find_element(By.CSS_SELECTOR, selector)
            except Exception :
                # Not found (or lookup failed): try the next selector. Unlike a
                # bare except, this lets KeyboardInterrupt propagate.
                continue
        return False

### Read from file

In [None]:
PROCESSED_PAPER_FILE_PATH = "./processed_paper_dict.json"

# Always bind the name first: previously a missing file left
# `processed_paper_dict` undefined (NameError on a fresh kernel) or silently
# reused whatever an earlier run had left in the kernel.
processed_paper_dict = {}
if os.path.exists(PROCESSED_PAPER_FILE_PATH) :
    with open(PROCESSED_PAPER_FILE_PATH, "r") as f :
        raw_paper_dict = json.load(f)
    # Rebuild a Paper object from each stored attribute dictionary.
    processed_paper_dict = {k: Paper(**v) for k, v in raw_paper_dict.items()}
else :
    print("processed paper file not found : ", PROCESSED_PAPER_FILE_PATH)

# Dictionary keys are already unique, so a plain copy is equivalent to the
# former set()-based rebuild (only the iteration order differs).
unique_processed_paper_dict = dict(processed_paper_dict)

print(len(unique_processed_paper_dict), len(processed_paper_dict))

### Get unique list of ISSNs

In [61]:
# Collect every entry of each paper's crossref_json["ISSN"], drop duplicates
# through a set, and keep the result as a sorted list.
issn_list = sorted({
    issn
    for paper in unique_processed_paper_dict.values()
    for issn in paper.crossref_json["ISSN"]
})

### Crawling ISSN portal

In [67]:
issn_crawler = ISSN_Crawler()

# ISSN (as reported by the portal) -> JournalConference
result_dict = {}
# Queried ISSN -> reason it produced no result, kept so failures can be retried.
failed_issn_dict = {}

for issn in tqdm(issn_list) :
    # Already collected as part of an earlier query's results.
    if issn in result_dict :
        continue
    try :
        success, jc_dict = issn_crawler.crawl_by_issn(issn)
    except Exception as e :
        # Record and report the error instead of dropping it silently;
        # tqdm.write prints without breaking the progress bar.
        failed_issn_dict[issn] = repr(e)
        tqdm.write("error while crawling : {} ({})".format(issn, type(e).__name__))
        continue
    if success :
        result_dict.update(jc_dict)
    else :
        # crawl_by_issn already printed "failed to crawl" for this ISSN.
        failed_issn_dict[issn] = "portal error page"

print("crawled : ", len(result_dict), " failed : ", len(failed_issn_dict))

 13%|█▎        | 452/3527 [25:41<15:15:39, 17.87s/it]

failed to crawl :  0102-6445


 13%|█▎        | 453/3527 [26:14<19:08:46, 22.42s/it]

failed to crawl :  0103-9733


 13%|█▎        | 454/3527 [26:47<21:47:17, 25.52s/it]

failed to crawl :  0104-6500


 41%|████▏     | 1456/3527 [1:24:44<10:42:19, 18.61s/it]

failed to crawl :  1360-0532


 41%|████▏     | 1457/3527 [1:25:16<13:09:40, 22.89s/it]

failed to crawl :  1360-0559


 41%|████▏     | 1458/3527 [1:25:49<14:54:39, 25.94s/it]

failed to crawl :  1360-2322


 41%|████▏     | 1459/3527 [1:26:24<16:24:37, 28.57s/it]

failed to crawl :  1361-6382


 41%|████▏     | 1460/3527 [1:27:02<17:56:18, 31.24s/it]

failed to crawl :  1361-6420


 41%|████▏     | 1461/3527 [1:27:39<19:00:43, 33.13s/it]

failed to crawl :  1361-6463


 41%|████▏     | 1462/3527 [1:28:14<19:13:55, 33.53s/it]

failed to crawl :  1361-648X


 41%|████▏     | 1463/3527 [1:28:51<19:49:31, 34.58s/it]

failed to crawl :  1361-6501


 42%|████▏     | 1464/3527 [1:29:28<20:22:33, 35.56s/it]

failed to crawl :  1361-6528


 42%|████▏     | 1465/3527 [1:30:07<20:50:15, 36.38s/it]

failed to crawl :  1361-6536


 42%|████▏     | 1467/3527 [1:31:06<19:03:20, 33.30s/it]

failed to crawl :  1361-6579


 42%|████▏     | 1468/3527 [1:31:44<19:49:36, 34.67s/it]

failed to crawl :  1361-6641


 54%|█████▍    | 1897/3527 [2:05:30<1:47:50,  3.97s/it] 


KeyboardInterrupt: 

### Save to file

In [69]:
# Swap every crawled object for its attribute dictionary so that the whole
# mapping can be handed to the json module.
journal_conference_dict_dict = {
    issn: vars(journal_conference)
    for issn, journal_conference in result_dict.items()
}

with open("journal_conference_dict.json", "w") as out_file :
    json.dump(journal_conference_dict_dict, out_file, indent=4)