In [43]:
import requests
from bs4 import BeautifulSoup
import json
import time
import os


In [45]:
def getAuthorsAndOtherDocumentInformation(paperInfo, timeout=30):
    """Fetch a paper's detail page and add its metadata to ``paperInfo`` in place.

    The page at ``paperInfo['link']`` is downloaded and parsed, and the
    following keys are written to ``paperInfo`` (in this order):

    * ``doi`` - ``href`` of the first link inside ``div.doi`` (only if present).
    * ``authors`` - comma-separated names from ``p.relations.persons``,
      whitespace-stripped (only if that paragraph is present).
    * ``tags`` - text of every ``li.userdefined-keyword`` element (always set,
      possibly an empty list).
    * ``coventryAuthors`` - ``href`` of every ``<a rel="Person">`` link inside
      the persons paragraph (only if that paragraph is present). Presumably
      these are the institution's own author profile URLs - confirm against
      the portal markup.
    * ``abstract`` - text of the abstract container, or ``None`` if absent.

    Args:
        paperInfo: dict describing one paper; must contain a ``'link'`` key.
        timeout: seconds to wait for the HTTP request before giving up.

    Returns:
        None. The function is best-effort: any exception is caught and
        printed, leaving ``paperInfo`` with whatever keys were set before the
        failure.
    """
    try:
        # A timeout prevents a single stalled connection from hanging the
        # whole scrape indefinitely.
        source = requests.get(paperInfo['link'], timeout=timeout).text
        paperSoup = BeautifulSoup(source, "html.parser")

        doiLink = paperSoup.select_one("div.doi a")
        if doiLink is not None:
            paperInfo['doi'] = doiLink['href']

        persons = paperSoup.select_one("p.relations.persons")
        if persons is not None:
            paperInfo['authors'] = [
                name.strip() for name in persons.text.split(',')
            ]

        paperInfo['tags'] = [
            span.text for span in paperSoup.select("li.userdefined-keyword")
        ]

        if persons is not None:
            # select() takes a CSS selector; an ``attrs=`` keyword is silently
            # ignored, so the rel filter has to live in the selector itself.
            # ``~=`` matches "Person" as one of the whitespace-separated
            # values of ``rel``.
            paperInfo['coventryAuthors'] = [
                a['href'] for a in persons.select('a[rel~="Person"][href]')
            ]

        abstract = paperSoup.select_one(".rendering_researchoutput_abstractportal")
        paperInfo['abstract'] = abstract.text if abstract is not None else None
    except Exception as e:
        # Deliberately broad: one bad page must not abort the whole crawl.
        print(f"Error in getAuthorsAndOtherDocumentInformation: {e}")

In [None]:
# Function to scrape Coventry publications page
def scrapPapers(start_page=1, page_limit=1000, timeout=30):
    """Scrape the paginated research-output listing and save it as JSON.

    Pages ``start_page`` up to, but not including, ``page_limit`` are fetched
    one after another. Crawling stops early at the first page that contains
    no ``.list-result-item`` elements, or when fetching/parsing a listing
    page raises. Whatever was collected up to that point is still written to
    ``./scrapedData/papers-<timestamp>.json``.

    For each listing item the function records, when available: ``link`` and
    ``title`` (from ``h3.title a``), ``journal`` and ``journalLink`` (from the
    ``<a rel="Journal">`` link), and the ``date``, ``volume``, ``pages``,
    ``numberofpages`` and ``type_classification`` spans. It then calls
    ``getAuthorsAndOtherDocumentInformation`` to add detail-page metadata.

    Args:
        start_page: first value substituted into the ``page`` query parameter.
        page_limit: exclusive upper bound on the page number.
        timeout: seconds to wait for each listing-page request.

    Returns:
        list[dict]: one dict per scraped paper, in listing order.
    """
    page = start_page
    url_template = "https://pureportal.coventry.ac.uk/en/organisations/eec-school-of-computing-mathematics-and-data-sciences-cmds?page={}"
    papers = []
    cols = ['date', 'volume', 'pages', 'numberofpages', 'type_classification']

    while page < page_limit:
        try:
            url = url_template.format(page)
            # A timeout keeps one stalled request from blocking the crawl.
            pageSource = requests.get(url, timeout=timeout).text
            soup = BeautifulSoup(pageSource, "html.parser")
            paperLists = soup.select(".list-result-item")

            # An empty listing means we have run past the last page.
            if len(paperLists) == 0:
                break

            for paper in paperLists:
                try:
                    paperInfo = {}
                    link_element = paper.select_one('h3.title a')
                    if link_element:
                        paperInfo['link'] = link_element['href']
                        paperInfo['title'] = link_element.text

                    # select_one() takes a CSS selector; an ``attrs=`` keyword
                    # is silently ignored, which would return the first <a>
                    # in the item regardless of its rel. Filter on rel in the
                    # selector itself.
                    journal = paper.select_one('a[rel~="Journal"]')
                    if journal:
                        paperInfo['journal'] = journal.text
                        paperInfo['journalLink'] = journal['href']

                    for x in cols:
                        element = paper.select_one(f'span.{x}')
                        if not element:
                            continue
                        # Store the raw text first so it survives as the
                        # fallback if numeric conversion fails below.
                        paperInfo[x] = element.text
                        try:
                            if x == 'numberofpages':
                                # Drops the last two characters before
                                # converting - presumably a unit suffix;
                                # TODO confirm against the page markup.
                                paperInfo[x] = int(paperInfo[x][:-2])
                            elif x == 'pages':
                                # Drops the first three characters -
                                # presumably a "p. "-style prefix; TODO confirm.
                                paperInfo[x] = paperInfo[x][3:]
                            elif x == 'volume':
                                paperInfo[x] = int(paperInfo[x])
                        except ValueError:
                            # Non-numeric text: keep the raw string already
                            # stored instead of swallowing every exception.
                            pass

                    # Without a link there is no detail page to fetch; the
                    # listing-level data is still kept.
                    if 'link' in paperInfo:
                        getAuthorsAndOtherDocumentInformation(paperInfo)
                    papers.append(paperInfo)
                except Exception as e:
                    print(f"Error processing paper: {e}")

            print(f"Finished page {page}")
            page += 1
        except Exception as e:
            print(f"An error occurred: {e}")
            break

    # Use a safe timestamp format for the filename
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs("./scrapedData", exist_ok=True)
    with open(f"./scrapedData/papers-{timestamp}.json", "w", encoding="utf-8") as f:
        json.dump(papers, f, indent=4)

    return papers

# Run the scrape with the default page range when this code is executed as the
# main module; the returned list is discarded here (results are also written
# to a JSON file by scrapPapers itself).
if __name__ == '__main__':
    scrapPapers()

Finished page 1
Finished page 2
Finished page 3
Finished page 4
Finished page 5
Finished page 6
Finished page 7
Finished page 8
Finished page 9
Finished page 10
Finished page 11
Finished page 12
Finished page 13
Finished page 14
Finished page 15
Finished page 16
Finished page 17
Finished page 18
Finished page 19
Finished page 20
Finished page 21
Finished page 22
Finished page 23
Finished page 24
Finished page 25
Finished page 26
Finished page 27
Finished page 28
Finished page 29
Finished page 30
Finished page 31
Finished page 32
Finished page 33
Finished page 34
Finished page 35
Finished page 36
Finished page 37
Finished page 38
Finished page 39
Finished page 40
Finished page 41
Finished page 42
Finished page 43
Finished page 44
Finished page 45
Finished page 46
Finished page 47
Finished page 48
Finished page 49
Finished page 50
Finished page 51
Finished page 52
Finished page 53
Finished page 54
Finished page 55
Finished page 56
Finished page 57
Finished page 58
Finished page 59
Finish