In [6]:
import os
import time
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Launch the Chrome session that renders each page before it is parsed.
CHROMEDRIVER_PATH = '/usr/bin/chromedriver'  # chromedriver binary handed to Selenium

options = webdriver.ChromeOptions()
# Uncomment the next line to run without a visible browser window:
# options.add_argument('--headless')
service = Service(CHROMEDRIVER_PATH)
driver = webdriver.Chrome(service=service, options=options)

# Crawl configuration
BASE_URLS = ["https://erincollege.com/"]  # seed URLs; is_valid_link also filters links against these
OUTPUT_DIR = "./"  # root folder that scraped pages and documents are written under
VISITED = set()  # normalized URLs of pages that have already been scraped

# Create the output root up front so later writes can assume it exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def sanitize_path(url):
    """Map a URL to a local file path (no extension added) under OUTPUT_DIR.

    Only the path component of the URL is used; the query string and
    fragment are ignored. Hyphens are replaced with underscores. A URL with
    an empty path (the site root) maps to ``index``.

    Empty, ``.`` and ``..`` path segments are dropped so the returned path
    can never climb out of OUTPUT_DIR.

    Args:
        url: Absolute or relative URL.

    Returns:
        ``os.path.join(OUTPUT_DIR, <sanitized path>)``.
    """
    segments = [
        segment
        for segment in urlparse(url).path.split("/")
        if segment and segment not in (".", "..")
    ]
    # Nothing left means the URL pointed at the site root.
    path = "/".join(segments) or "index"
    path = path.replace("-", "_")
    return os.path.join(OUTPUT_DIR, path)

def normalize_url(url):
    """Return the form of ``url`` used as the key in the VISITED set.

    The URL is joined onto the site root so that relative links become
    absolute, then everything from the first ``#`` onward and any trailing
    slashes are removed.
    """
    # The site root is hard-coded here rather than taken from BASE_URLS.
    site_root = "https://erincollege.com/"
    absolute = urljoin(site_root, url)
    without_fragment, _, _ = absolute.partition("#")
    return without_fragment.rstrip("/")

def is_valid_link(href):
    """Return True if ``href`` points inside one of the BASE_URLS sites.

    The link is normalized with ``normalize_url`` and then compared to each
    base URL component by component: the scheme and host must match exactly
    and the path must lie under the base path on a segment boundary. A raw
    string-prefix test is not enough, because
    ``https://erincollege.com.evil.example/`` starts with
    ``https://erincollege.com``.

    Args:
        href: Link target (absolute or relative); may be empty or None.

    Returns:
        bool: True when the link should be crawled.
    """
    if not href:
        return False

    target = urlparse(normalize_url(href))
    target_host = target.netloc.lower()

    for base in BASE_URLS:
        root = urlparse(base)
        if target.scheme != root.scheme or target_host != root.netloc.lower():
            continue
        base_path = root.path.rstrip("/")
        # An empty base path means the whole host is in scope.
        if (
            not base_path
            or target.path == base_path
            or target.path.startswith(base_path + "/")
        ):
            return True
    return False

def download_document(url, save_path, timeout=30):
    """Download ``url`` and write the response body to ``save_path``.

    The file extension found in the URL path is appended to ``save_path``
    when it is not already there. Failures are reported with a printed
    message instead of being raised, so one bad link does not stop a crawl.

    Args:
        url: Address of the document to fetch.
        save_path: Destination file path; parent directories are created.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl.
    """
    ext = os.path.splitext(urlparse(url).path)[1]
    if not save_path.endswith(ext):
        save_path += ext

    try:
        r = requests.get(url, timeout=timeout)
        # Reject 4xx/5xx responses so an error page is never saved as the document.
        r.raise_for_status()
        parent = os.path.dirname(save_path)
        if parent:  # os.makedirs("") raises, so skip it for bare file names
            os.makedirs(parent, exist_ok=True)
        with open(save_path, "wb") as f:
            f.write(r.content)
        print(f"[DOC] {url} -> {save_path}")
    except (requests.RequestException, OSError) as e:
        print(f"Download failed: {url} ({e})")

def extract_markdown_from_body(body):
    """Render the text of an HTML body as a simple Markdown-style string.

    Walks every descendant of ``body`` and emits one entry per heading
    (h1-h6), paragraph, list item, blockquote and ``<pre>`` block. Entries
    are separated by a blank line. Elements whose text is empty are skipped.

    Text is taken with its original inter-element whitespace and that
    whitespace is then collapsed to single spaces, so words separated only
    by an inline tag boundary (``Hello <b>world</b>``) are not glued
    together.

    Note: each matching element is emitted on its own, and its text includes
    the text of its children. A block nested in another matching block
    (e.g. a ``<p>`` inside an ``<li>``) therefore appears in both entries.

    Args:
        body: A BeautifulSoup tag (typically ``soup.body``).

    Returns:
        str: The extracted text, or ``""`` if nothing was found.
    """
    prefixes = {
        "h1": "# ",
        "h2": "## ",
        "h3": "### ",
        "h4": "#### ",
        "h5": "##### ",
        "h6": "###### ",
        "p": "",
        "li": "- ",
        "blockquote": "> ",
    }
    output = []

    for elem in body.descendants:
        name = elem.name  # None for plain text nodes, which match nothing below
        if name == "pre":
            code = elem.get_text().strip()
            # One entry per fence, so the "\n\n" join below cannot put blank
            # lines between the fence markers and the code.
            output.append(f"```\n{code}\n```")
        elif name in prefixes:
            text = " ".join(elem.get_text().split())
            if text:
                output.append(f"{prefixes[name]}{text}")

    return "\n\n".join(output)

def scrape_page(url):
    """Crawl the site starting at ``url``, saving page text and linked PDFs.

    Each page is loaded in the shared Selenium ``driver``, its body text is
    written to ``sanitize_path(page_url) + ".txt"``, and every link on it is
    examined: PDF links are downloaded, links accepted by ``is_valid_link``
    are queued for crawling. Pages and PDFs are recorded in ``VISITED`` by
    normalized URL, so each is fetched at most once.

    The crawl uses an explicit stack rather than recursion, so a long chain
    of links cannot exhaust Python's recursion limit. Links are still
    followed depth-first in page order.

    Args:
        url: Page to start from. Nothing happens if it was already visited.
    """
    from selenium.common.exceptions import WebDriverException

    pending = [url]
    while pending:
        current = pending.pop()
        norm_url = normalize_url(current)
        if norm_url in VISITED:
            continue
        VISITED.add(norm_url)

        try:
            driver.get(current)
        except WebDriverException as e:
            # One unreachable page should not abort the whole crawl.
            print(f"Page load failed: {current} ({e})")
            continue
        time.sleep(1.5)  # Let JavaScript render
        soup = BeautifulSoup(driver.page_source, "html.parser")
        body = soup.body

        if body:
            for tag in body(["script", "style", "noscript"]):
                tag.decompose()
            markdown_text = extract_markdown_from_body(body)

            if markdown_text:
                rel_path = sanitize_path(current) + ".txt"
                parent = os.path.dirname(rel_path)
                if parent:  # os.makedirs("") raises, so skip it for bare file names
                    os.makedirs(parent, exist_ok=True)
                with open(rel_path, "w", encoding="utf-8") as f:
                    f.write(markdown_text)
                print(f"[PAGE] {current} -> {rel_path}")
            else:
                print(f"No visible text found at {current}")

        # Collect this page's links: download PDFs now, queue pages for later.
        to_follow = []
        for a in soup.find_all("a", href=True):
            abs_url = urljoin(current, a["href"])
            norm_abs_url = normalize_url(abs_url)

            if norm_abs_url in VISITED:
                continue

            # Test the URL path only, so "file.pdf?v=2" or "file.pdf#page=3" still match.
            if urlparse(abs_url).path.lower().endswith(".pdf"):
                # Mark as visited so other pages linking the same PDF skip it.
                VISITED.add(norm_abs_url)
                download_document(abs_url, sanitize_path(abs_url))
            elif is_valid_link(abs_url):
                to_follow.append(abs_url)

        # Push in reverse so the first link on the page is popped (crawled) first.
        pending.extend(reversed(to_follow))

# Start crawling. The browser is shut down in `finally` so that an error
# raised mid-crawl does not leave a Chrome process running.
try:
    for url in BASE_URLS:
        scrape_page(url)
finally:
    driver.quit()


[PAGE] https://erincollege.com/ -> ./index.txt
[PAGE] https://erincollege.com/work-with-us/ -> ./work_with_us.txt
[PAGE] https://erincollege.com/accreditations/ -> ./accreditations.txt
[PAGE] https://erincollege.com/our-buildings/ -> ./our_buildings.txt
[PAGE] https://erincollege.com/mission-vision-and-values/ -> ./mission_vision_and_values.txt
[PAGE] https://erincollege.com/sales-team/ -> ./sales_team.txt
[PAGE] https://erincollege.com/academic-team/ -> ./academic_team.txt
[PAGE] https://erincollege.com/gallery/ -> ./gallery.txt
[PAGE] https://erincollege.com/general-english/ -> ./general_english.txt
[PAGE] https://erincollege.com/price-list/ -> ./price_list.txt
[PAGE] https://erincollege.com/short-stay-junior-programme/ -> ./short_stay_junior_programme.txt
[PAGE] https://erincollege.com/short-term-courses/ -> ./short_term_courses.txt
[PAGE] https://erincollege.com/language-lab/ -> ./language_lab.txt
[PAGE] https://erincollege.com/ielts-preparation-course/ -> ./ielts_preparation_cours

In [9]:
# List the working directory; OUTPUT_DIR is "./", so the scraped files land here.
!ls

Scrape.ipynb
academic_team.txt
accommodation.txt
accreditations.txt
advantage_programme.txt
after_class_activities.txt
author
become_our_partner.txt
category
dublin.txt
exams.txt
faq.txt
find_job.txt
gallery.txt
general_english.txt
get_a_quotation.txt
get_you_ppsn.txt
get_your_leap_card.txt
ielts_preparation_course.txt
inclusivity_and_special_needs_policy.txt
index.txt
language_lab.txt
mandarin_speaker_sales_and_student_advisor.txt
mission_vision_and_values.txt
our_buildings.txt
price_list.txt
privacy_policy_2.txt
promo_of_the_month.txt
sales_advisor_spanish_speaker.txt
sales_team.txt
short_stay_junior_programme.txt
short_term_courses.txt
spanish_sales_and_student_advisor.txt
student_complaints.txt
student_disciplinary_policy.txt
student_handbook_2.txt
visa_information.txt
work_with_us.txt
wp_content
