This is a web scraper that stays on a specific site and collects all the links from it.

The intention is to save the pages so that they can later be filtered for course info.

In [None]:
import pandas as pd
import bs4
import aiohttp
import httpx
import logging
from datetime import datetime
import asyncio
from collections import deque
import time

In [None]:
# Shared crawl state, read and mutated by the scraper cell.
logging.basicConfig(level=logging.INFO)

# Frontier of URLs still to fetch, and every URL ever queued (for de-duplication).
# Both start out holding the calendar's landing page.
pageQueue = deque(["https://academic-calendar.wlu.ca/"])
pageSet = {"https://academic-calendar.wlu.ca/"}

# One row per scraped page, keyed by URL.
pageData = pd.DataFrame(columns=["url", "title", "timestamp",  "raw"]).set_index("url")

# Running count of pages handed to the scraper, and URLs whose fetch raised.
counter = 0
urlErrors = []

# Substrings an href is matched against; a link is kept when any of them occurs in it.
# "y=85" selects the 2023 calendar year, the site prefix keeps absolute links on the site.
urlWhitelist = ["y=85", "https://academic-calendar.wlu.ca/"]

In [None]:
# outline of the scraper
# 1. get a page from the queue
# 2. scrape the page for links
# 3. add the links to the queue if they are not already in the set
# 4. repeat until the queue is empty
# how can I save the pages? I need to save the links as well as the html. 
# I think I will save the links in a csv file and the html in a folder

In [None]:
limits = httpx.Limits(max_connections=10)
timeout = httpx.Timeout(10.0, connect=60.0)
async with httpx.AsyncClient(limits=limits, timeout=timeout) as client:
    async def scrapePage(url: str):
        if not url.startswith("https://"):
            url = "https://academic-calendar.wlu.ca/" + url
        try:
            data = await client.get(url)
        except Exception as e:
            logging.error(f"Error while scraping {url}: {e}")
            urlErrors.append(url)
            return
        soup = bs4.BeautifulSoup(data.text, "html.parser")
        title = soup.title.string if soup.title is not None else None
        links = filter(lambda x: x is not None and x not in pageSet and any(whitelist in x for whitelist in urlWhitelist), map(lambda x: x.get("href"), soup.find_all("a")))
        for link in links:
            pageQueue.append(link)
            pageSet.add(link)

        pageData.loc[url] = [title, datetime.now(), data.text]  # Assign the string representation of links
    
    while pageQueue:
        tasks = [scrapePage(pageQueue.popleft()) for _ in range(min(len(pageQueue), 10))]
        counter += len(tasks)
        await asyncio.gather(*tasks)
        logging.info(f"Scraped {counter} pages")
        if len(pageData) > 10000:
            pageData.to_csv(f"data/laurier/data_{counter}.csv")
            logging.info(f"Saved {len(pageData)} pages")
            pageData = pd.DataFrame(columns=["url", "title", "timestamp",  "raw"])
            pageData = pageData.set_index("url")