In [13]:
import csv
import os
import time
import ujson
from random import randint
from typing import Dict, List, Any
import json

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

In [14]:
def write_authors(list1, file_name):
    with open(file_name, 'w', encoding='utf-8') as f:
        for i in range(0, len(list1)):
            f.write(list1[i] + '\n')

def initCrawlerScraper(seed, page_limit):
    # Initialize driver for Chrome
    webOpt = webdriver.ChromeOptions()
    webOpt.add_experimental_option('excludeSwitches', ['enable-logging'])
    webOpt.add_argument('--ignore-certificate-errors')
    webOpt.add_argument('--incognito')
    webOpt.add_argument('--headless')  # Set headless mode
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=webOpt, service_log_path=os.devnull)

    driver.get(seed)  # Start with the original link

    Links = []  # Array with pureportal profiles URL
    pub_data = []  # To store publication information for each pureportal profile
    next_link = driver.find_element(By.CSS_SELECTOR, ".nextLink")
    next_link_enabled = next_link.is_enabled()

    page_count = 0  # Counter for the number of pages crawled

    while next_link_enabled and page_count < page_limit:
        page_count += 1

        page = driver.page_source
        bs = BeautifulSoup(page, "lxml")

        # Extracting exact URL by accessing the href attribute
        for link in bs.findAll('a', class_='link person'):
            url = link['href']
            Links.append(url)

        # Click on Next button to visit the next page
        try:
            if driver.find_element(By.CSS_SELECTOR, ".nextLink"):
                element = driver.find_element(By.CSS_SELECTOR, ".nextLink")
                driver.execute_script("arguments[0].click();", element)
            else:
                next_link_enabled = False
        except NoSuchElementException:
            break

    print("Crawler has found", len(Links), "pureportal profiles")
    write_authors(Links, 'Authors_URL.txt')

    print("Scraping publication data for", len(Links), "pureportal profiles...")

    # Load the URLs from the file
    with open('Authors_URL.txt', 'r') as f:
        urls = f.read().splitlines()

    # Crawl data from 10 URLs
    for i in range(8):
        if i >= len(urls):
            break

        url = urls[i]

        # Visit each link to get data
        time.sleep(1)
        driver.get(url)
        try:
            if driver.find_elements(By.CSS_SELECTOR, ".portal_link.btn-primary.btn-large span"):
                elements = driver.find_elements(By.CSS_SELECTOR, ".portal_link.btn-primary.btn-large span")
                for span in elements:
                    try:
                        if span.text:
                            if "research output".lower() in span.text.lower():
                                driver.execute_script("arguments[0].click();", span)
                                driver.get(driver.current_url)
                                # Get name of Author
                                name = driver.find_element(By.CSS_SELECTOR, "div[class='header person-details']>h1")
                                r = requests.get(driver.current_url)
                                page = r.content
                                bs = BeautifulSoup(page, "lxml")
                                rows = bs.findAll("div", {"class": "result-container"})
                                for row in rows:
                                    data: Dict[str, Any] = {}
                                    data['name'] = row.h3.a.text.strip()
                                    data['pub_url'] = row.h3.a['href']
                                    date = row.find("span", class_="date")
                                    data['cu_author'] = name.text if name else ''
                                    data['date'] = date.text if date else ''
                                    pub_data.append(data)
                                    print("Publication Name:", data['name'])
                                    print("Publication URL:", data['pub_url'])
                                    print("CU Author:", data['cu_author'])
                                    print("Date:", data['date'])
                                    print("\n")
                    except StaleElementReferenceException:
                        continue

        except NoSuchElementException:
            break

    # Writing publication data to a JSON file
    with open('scraper_results.json', 'w', encoding='utf-8') as json_file:
        json.dump(pub_data, json_file, ensure_ascii=False, indent=4)

seed_link = 'https://pureportal.coventry.ac.uk/en/publications/'
page_limit = 1
initCrawlerScraper(seed_link, page_limit)

  driver = webdriver.Chrome(ChromeDriverManager().install(), options=webOpt, service_log_path=os.devnull)


Crawler has found 70 pureportal profiles
Scraping publication data for 70 pureportal profiles...
Publication Name: Bletchley Park in Myth and Memory
Publication URL: https://pureportal.coventry.ac.uk/en/publications/bletchley-park-in-myth-and-memory
CU Author: Thomas Knowles
Date: 2024


Publication Name: Book Review: Paradoxa# 31: Climate Fictions. Edited by Alison Sperling
Publication URL: https://pureportal.coventry.ac.uk/en/publications/book-review-paradoxa-31-climate-fictions-edited-by-alison-sperlin
CU Author: Thomas Knowles
Date: 2022


Publication Name: J. G. Ballard and the Sciences
Publication URL: https://pureportal.coventry.ac.uk/en/publications/j-g-ballard-and-the-sciences
CU Author: Thomas Knowles
Date: 30 Jan 2020


Publication Name: J. G. Ballard and Making: An Experiment in Collaborative Practice
Publication URL: https://pureportal.coventry.ac.uk/en/publications/j-g-ballard-and-making-an-experiment-in-collaborative-practice
CU Author: Thomas Knowles
Date: 2019


Public

Publication Name: 10 Years on with French Arbitration Law Reform: Does the Judicial Control Frustrate or Facilitate the Enforcement of Arbitral Awards?
Publication URL: https://pureportal.coventry.ac.uk/en/publications/10-years-on-with-french-arbitration-law-reform-does-the-judicial-
CU Author: Margaret Liu
Date: 25 Mar 2023


Publication Name: Anti-suit injunction: paving the way to arbitration of antitrust claims?
Publication URL: https://pureportal.coventry.ac.uk/en/publications/anti-suit-injunction-paving-the-way-to-arbitration-of-antitrust-c
CU Author: Margaret Liu
Date: 20 Oct 2022


Publication Name: Anti-suit Injunctions under OHADA Law: Could this Mechanism Turn the Tide?
Publication URL: https://pureportal.coventry.ac.uk/en/publications/anti-suit-injunctions-under-ohada-law-could-this-mechanism-turn-t
CU Author: Margaret Liu
Date: 20 Nov 2022


Publication Name: The Privity of Contract Under the Contracts (Rights of Third Parties) Act: Frustrate or Facilitate the Participatio

Publication Name: A CFD-Based Numerical Evaluation, Assessment and Optimization of Conjugate Heat Transfer for Aerodynamic Cooling of a Wheel-Hub-Motors in Micro-Mobility Vehicles
Publication URL: https://pureportal.coventry.ac.uk/en/publications/a-cfd-based-numerical-evaluation-assessment-and-optimization-of-c
CU Author: Evangelos Gkanas
Date: 11 Apr 2023


Publication Name: Comparison of the effect of one-way and two-way fire-wind coupling on the modelling of wildland fire propagation dynamics
Publication URL: https://pureportal.coventry.ac.uk/en/publications/comparison-of-the-effect-of-one-way-and-two-way-fire-wind-couplin
CU Author: Evangelos Gkanas
Date: Dec 2022


Publication Name: Development and Testing of Ni-Cu Bimetallic Catalysts for Effective Syngas Production via Low-Temperature Methane Steam Reforming
Publication URL: https://pureportal.coventry.ac.uk/en/publications/development-and-testing-of-ni-cu-bimetallic-catalysts-for-effecti
CU Author: Evangelos Gkanas
Date: 6 Dec 

Publication Name: Modeling and Simulation for Absorption-Desorption Cyclic Process on a Three-Stage Metal Hydride Hydrogen Compressor
Publication URL: https://pureportal.coventry.ac.uk/en/publications/modeling-and-simulation-for-absorption-desorption-cyclic-process--2
CU Author: Evangelos Gkanas
Date: 2013


Publication Name: Nanotechnology and Innovation: Recent Status and the Strategic Implication for the Formation of High Tech Clusters in Greece, in between a Global Economic Crisis
Publication URL: https://pureportal.coventry.ac.uk/en/publications/nanotechnology-and-innovation-recent-status-and-the-strategic-imp-2
CU Author: Evangelos Gkanas
Date: Mar 2013


Publication Name: Polymer-stable magnesium nanocomposites prepared by laser ablation for efficient hydrogen storage
Publication URL: https://pureportal.coventry.ac.uk/en/publications/polymer-stable-magnesium-nanocomposites-prepared-by-laser-ablatio-2
CU Author: Evangelos Gkanas
Date: 30 Aug 2013


