In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [4]:
from queue import Queue
import re

MAX_DEPTH = 2
class Node():
    def __init__(self, url, depth):
        self.url = url
        self.depth = depth
        
    def __eq__(self, other):
        return self.url == other.url and self.depth == other.depth
    
    def is_already_done(self, other):
        return self.url == other.url and self.depth <= other.depth
    
    def __hash__(self):
        return hash((self.url, self.depth))

urls = [
    "https://virginia.edu/apply",
    "https://virginia.edu/alumni",
    "https://virginia.edu/facts",
    "https://virginia.edu/academics",
    "https://virginia.edu/life",
]
q = Queue()
for url in urls:
    q.put(Node(url, 0))
    
nodes_done = set()
visited = set()

def is_node_done(node):
    for fin_node in nodes_done:
        if node.is_already_done(fin_node):
            return True
    return False

def crawl(node):
    print("crawling ", node.url)
    if node.url in visited:
        print("already visited", node.url)
        return

    if is_node_done(node) or node.depth >= MAX_DEPTH:
        print("skipping (done or too deep):", node.url)
        visited.add(node.url)
        return

    visited.add(node.url)
    nodes_done.add(node)
    try:
        response = requests.get(node.url)
        print(response.status_code)
        soup = BeautifulSoup(response.text, "html.parser")
        for link in soup.find_all("a", href=True):
            next_url = urljoin(node.url, link["href"])
            if bool(re.search(r'\b(virginia|uva)\b', next_url, re.IGNORECASE)):
                print("appending ", next_url)
                q.put(Node(next_url, node.depth+1))
                print(q.qsize())
    except Exception as e:
        print(e)

while not q.empty():
    n = q.get()
    print(n.url)
    print(q.qsize())
    crawl(n)
    

print("\n".join(visited))


https://virginia.edu/apply
4
crawling  https://virginia.edu/apply
200
appending  https://virginia.edu/apply#main-content
5
appending  https://virginia.edu/
6
appending  https://sisuva.admin.virginia.edu/ihprd/signon.html
7
appending  https://canvas.virginia.edu
8
appending  http://its.virginia.edu/switchboard/
9
appending  https://hr.virginia.edu/careers-uva/job-openings
10
appending  https://virginia.edu/alumni
11
appending  https://students.virginia.edu/
12
appending  https://virginia.edu/facultystaff
13
appending  https://virginia.edu/parents
14
appending  https://virginia.edu/apply#search-close
15
appending  https://virginia.edu/aboutuva
16
appending  https://virginia.edu/aboutuva
17
appending  https://virginia.edu/life/charlottesville
18
appending  https://virginia.edu/facts
19
appending  https://federalinfo.virginia.edu/
20
appending  https://president.virginia.edu/
21
appending  https://strategicplan.virginia.edu
22
appending  https://virginia.edu/visit
23
appending  https://vir

In [5]:
visited = {
    entry for entry in visited
    if " " not in entry and not entry.startswith(("mailto", "tel"))
}

visited

{'https://virginia.edu/follow-us/',
 'https://engineering.virginia.edu/departments/materials-science-and-engineering/academics/undergraduate',
 'https://news.virginia.edu/content/inside-uva',
 'https://www.scps.virginia.edu/programs/accounting-certificate',
 'https://ehs.virginia.edu',
 'https://engagement.virginia.edu/giving/dxo',
 'https://news.virginia.edu/news-topic/business-%26-government',
 'https://generalfacultycouncil.virginia.edu/',
 'https://summer.virginia.edu/morven-summer-institute',
 'https://giving.virginia.edu/stories',
 'https://virginia.edu/aboutuva#search-close',
 'https://education.virginia.edu/offices-departments/kinesiology',
 'https://research.virginia.edu/initiatives/research-achievement-awards/2021-research-awards-winners',
 'https://giving.virginia.edu/stories/all?field_term_school_target_id=40',
 'https://research.virginia.edu/media/536',
 'https://www.coursera.org/learn/uva-darden-digital-business-strategy',
 'https://giving.virginia.edu/ways-to-give/gift-p

In [7]:
with open("links_2deep.txt", "w") as f:
    for item in visited:
        f.write(f"{item}\n")