In [1]:
import json
import os
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [3]:
def getProfileURLorNone(url):
    """Return the path part of a site-relative image URL, without its query string.

    Args:
        url: The raw ``src`` value of an image tag (may be empty or None).

    Returns:
        The leading ``/...`` path up to, but not including, the first ``?``;
        or None when ``url`` is empty/None, contains ``"no-content"``
        (presumably the portal's placeholder image -- verify), or does not
        start with ``/``.
    """
    if not url or "no-content" in url:
        return None
    # Match a plain leading "/" followed by everything before the query string.
    # (The previous pattern r"^(\\/[^?]+)" demanded a literal backslash before
    # the slash, so ordinary paths never matched.)
    path_match = re.match(r"^(/[^?]+)", url)
    return path_match.group(1) if path_match else None

In [7]:
def _parseAuthorCard(author, base_url):
    """Build the author record for a single ``div.result-container`` element.

    Args:
        author: A BeautifulSoup tag for one result container.
        base_url: Site root used to resolve relative links.

    Returns:
        dict with keys ``picUrl``, ``name``, ``profileLink``, ``department``
        and ``deptLink``. ``picUrl`` is None when there is no usable image.

    Raises:
        ValueError: If the person link or the organisation link is missing.
        KeyError: If one of those links has no ``href`` attribute.
    """
    authorInfo = {}

    # A missing <img>/src must not discard the whole author; record no picture.
    img = author.select_one("img")
    src = img.get('src') if img is not None else None
    picPath = getProfileURLorNone(src) if src else None
    authorInfo['picUrl'] = urljoin(base_url, picPath) if picPath is not None else None

    # select_one() takes a CSS selector; find()-style ``attrs=`` is not applied
    # by the selector machinery, so the rel filter has to live in the selector.
    name = author.select_one("a[rel~='Person']")
    if name is None:
        raise ValueError("no person link found")
    authorInfo['name'] = name.text.strip()
    # urljoin leaves absolute hrefs untouched and resolves relative ones.
    authorInfo['profileLink'] = urljoin(base_url, name['href'])

    dept = author.select_one(".relations.organisations a[rel~='Organisation']")
    if dept is None:
        raise ValueError("no organisation link found")
    authorInfo['department'] = dept.text.strip()
    authorInfo['deptLink'] = urljoin(base_url, dept['href'])
    return authorInfo


def scrapeAuthors(start_page=1, page_limit=1000):
    """Scrape the school's person listing and save the result as JSON.

    Starting from the listing URL, each page is fetched and parsed, and the
    ``a[rel='next']`` link is followed until a page has no authors, there is
    no next link, a page-level error occurs, or the page counter reaches
    ``page_limit``. The collected records are written to
    ``./scrapedData/authors-<timestamp>.json``.

    Args:
        start_page: Initial value of the page counter. It only affects the
            progress messages and the loop bound; fetching always begins at
            the listing's first URL.
        page_limit: The loop runs while ``page < page_limit``, so at most
            ``page_limit - start_page`` pages are fetched.

    Returns:
        list of author dicts (see ``_parseAuthorCard``). Authors whose card
        cannot be parsed are reported on stdout and skipped.
    """
    page = start_page
    base_url = "https://pureportal.coventry.ac.uk"
    url = f"{base_url}/en/organisations/eec-school-of-computing-mathematics-and-data-sciences-cmds/persons/"
    request_timeout = 30  # seconds; without a timeout requests.get can block forever
    authors = []

    while page < page_limit:
        try:
            response = requests.get(url, timeout=request_timeout)
            # Surface HTTP errors instead of parsing an error page as a listing.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            authorList = soup.select("li.grid-result-item div.result-container")
            if len(authorList) == 0:
                break
            for author in authorList:
                try:
                    authors.append(_parseAuthorCard(author, base_url))
                except Exception as e:
                    # Best effort: one malformed card must not abort the page.
                    print(f"Error processing author: {e}")

            print(f"Finished page {page}")
            page += 1

            # Look for next page link and update the URL
            next_page_link = soup.select_one("a[rel='next']")
            if next_page_link is None:
                break
            # Resolve against the current URL so absolute, root-relative and
            # query-only ("?page=2") links are all handled.
            url = urljoin(url, next_page_link['href'])

        except Exception as e:
            print(f"Error processing page {page}: {e}")
            break

    # Use a safe timestamp format for the filename
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    os.makedirs("./scrapedData", exist_ok=True)
    with open(f"./scrapedData/authors-{timestamp}.json", "w", encoding="utf-8") as f:
        json.dump(authors, f, indent=4)

    return authors

# Kick off a scrape with the default arguments when run as the main program.
if __name__ == "__main__":
    scrapeAuthors()

Finished page 1
