# In [9]:
from pathlib import Path
from urllib.parse import urlsplit

import requests
import streamlit as st
from bs4 import BeautifulSoup
from fpdf import FPDF

# ExtractHtml class to get BeautifulSoup object from URL
class ExtractHtml:
    """Fetch a web page and parse it into a BeautifulSoup tree."""

    @staticmethod
    def soup(url, timeout=10):
        """Download *url* and return its HTML parsed with ``html.parser``.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout ``requests`` may block indefinitely.

        Returns:
            A ``BeautifulSoup`` object built from the response body.

        Raises:
            requests.RequestException: If the request fails, times out, or
                the server answers with a 4xx/5xx status code.
        """
        response = requests.get(url, timeout=timeout)
        # Fail loudly on HTTP errors instead of silently parsing an error page.
        response.raise_for_status()
        return BeautifulSoup(response.text, "html.parser")

# NBILinkScraper class to scrape links from the base URL
class NBILinkScraper:
    """Collect the card links shown on a listing page of the NBI site."""

    def __init__(self) -> None:
        self.base_url = "https://www.nbi-handelsakademin.se"

    def _build_url(self, pathname, query=""):
        """Join the base URL, *pathname* and *query* without doubled slashes."""
        segments = [self.base_url.rstrip("/"), pathname.strip("/")]
        # A query passed as "/?a=b" must not produce ".../pathname//?a=b".
        return "/".join(part for part in segments if part) + "/" + query.lstrip("/")

    @staticmethod
    def _slug(href):
        """Return the last non-empty path segment of *href*.

        Works with or without a trailing slash and ignores any query string
        or fragment, because only the path component is inspected.
        """
        return urlsplit(href).path.rstrip("/").rsplit("/", 1)[-1]

    def scrape(self, pathname, query=""):
        """Return a ``{slug: href}`` mapping for every card link on the page.

        Args:
            pathname: Path of the listing page below the base URL.
            query: Optional trailing part of the URL (for example a query
                string); a leading slash is tolerated.

        Returns:
            A dict mapping the last path segment of each link to the link's
            ``href`` value. If two links share a slug, the one that appears
            later in the document wins.
        """
        url = self._build_url(pathname, query)
        anchors = ExtractHtml.soup(url).select(
            ".wpgb-card-media-content-bottom > a[href]"
        )
        # Iterating in document order (rather than through a set) keeps the
        # result deterministic when several anchors point at the same slug.
        return {self._slug(anchor["href"]): anchor["href"] for anchor in anchors}

# DataScraper class to scrape data from the extracted links
class DataScraper:
    """Scrape descriptive text from the pages found by ``NBILinkScraper``."""

    def __init__(self, pathname, query="") -> None:
        # Mapping of subject slug -> page URL, as returned by NBILinkScraper.
        link_scraper = NBILinkScraper()
        self.links = link_scraper.scrape(pathname, query)

    def scrape(self, subject):
        """Return the description text of the page registered for *subject*.

        Args:
            subject: Key into ``self.links`` identifying the page to fetch.

        Returns:
            The text of all selected elements joined by single spaces.
            Elements whose text contains a non-breaking space are skipped.

        Raises:
            KeyError: If *subject* is not a key of ``self.links``.
        """
        page = ExtractHtml.soup(self.links[subject])
        fragments = []
        for element in page.select(".wpb_text_column span, p,h4+ul li"):
            if "\xa0" in element.get_text():
                continue
            fragments.append(element.text)
        return " ".join(fragments)

# Specific scrapers for education and course pages
class EducationScraper(DataScraper):
    """``DataScraper`` preconfigured for the ``utbildningar`` listing page,
    requested with the ``_programkurser=program`` query parameter."""

    def __init__(self) -> None:
        # No leading slash here: NBILinkScraper.scrape already places a "/"
        # between pathname and query, so "/?..." produced "utbildningar//?...".
        super().__init__("utbildningar", query="?_programkurser=program")

class CourseScraper(DataScraper):
    """``DataScraper`` bound to the ``kurser`` listing page (empty query)."""

    def __init__(self) -> None:
        super().__init__(pathname="kurser")

# ScrapeFormat class for scraping structured text and lists from a page
class ScrapeFormat:
    """Load one page of the site and pull text out of it by CSS selector."""

    def __init__(self, pathname) -> None:
        site_root = NBILinkScraper().base_url
        # The parsed page is kept so several selectors can be run against it.
        self.soup = ExtractHtml.soup(f"{site_root}/{pathname}")

    def extract_text(self, tag):
        """Return the text of all elements matching selector *tag* as one
        string, joined by single spaces."""
        return " ".join(self.extract_list(tag))

    def extract_list(self, tag):
        """Return the text of each element matching selector *tag*, in the
        order ``select`` yields them."""
        texts = []
        for element in self.soup.select(tag):
            texts.append(element.text)
        return texts

# Specific scrapers for application and FAQ pages
class ApplicationScraper:
    """Expose sections of the application page as read-only properties.

    Each property runs one CSS selector through the wrapped ``ScrapeFormat``.
    """

    def __init__(self, pathname="ansokan") -> None:
        self._scraper = ScrapeFormat(pathname)

    @property
    def description(self):
        """Text of the ``h1`` heading and its following siblings, as one string."""
        selector = "h1, h1~*"
        return self._scraper.extract_text(selector)

    @property
    def time_plan(self) -> list:
        """Texts of the list items (and the paragraph after the list) in ``#tab-tidplan``."""
        selector = "#tab-tidplan ul li, #tab-tidplan ul + p"
        return self._scraper.extract_list(selector)

    @property
    def available_educations(self) -> list:
        """Texts of the list items inside ``#tab-ansok``."""
        selector = "#tab-ansok li"
        return self._scraper.extract_list(selector)

    @property
    def application_steps(self) -> list:
        """Texts of the links found inside ``h3`` headings."""
        selector = "h3 a"
        return self._scraper.extract_list(selector)

class FaqScraper:
    """Expose the FAQ page's toggle entries through a ``ScrapeFormat``."""

    def __init__(self, pathname="faq") -> None:
        self._scraper = ScrapeFormat(pathname)

    @property
    def faq(self) -> list:
        """Texts of the links and paragraphs inside ``.toggle.default`` elements."""
        selector = ".toggle.default a, .toggle.default p"
        return self._scraper.extract_list(selector)

# ExportScrapedText class to save text data into a file
class ExportScrapedText:
    """Write scraped text to a file in the ``data`` directory next to this module."""

    def __init__(self, filename, content) -> None:
        """Create ``data/`` if needed and write *content* to ``data/<filename>``.

        Args:
            filename: Name of the file to create inside the ``data`` directory.
            content: Text to write; an existing file is overwritten.

        The destination is kept in ``self.path`` so callers can locate the
        file afterwards.
        """
        data_path = Path(__file__).parent / "data"
        data_path.mkdir(parents=True, exist_ok=True)
        self.path = data_path / filename

        # Explicit UTF-8: scraped text contains non-ASCII characters and the
        # platform default encoding is not guaranteed to represent them.
        with open(self.path, "w", encoding="utf-8") as file:
            file.write(content)

# PDF creation using fpdf
class PDF(FPDF):
    """``FPDF`` subclass that defines a fixed title header and a page-number footer."""

    def header(self):
        """Draw the centered, bold "Scraped Data" title line."""
        title = "Scraped Data"
        self.set_font("Arial", "B", 12)
        # Width 0, height 10, no border, ln=1, centered.
        self.cell(0, 10, title, 0, 1, "C")

    def footer(self):
        """Draw the centered, italic page number 15 units above the bottom edge."""
        self.set_y(-15)
        self.set_font("Arial", "I", 8)
        page_label = f"Page {self.page_no()}"
        self.cell(0, 10, page_label, 0, 0, "C")

def create_pdf(text, filename):
    """Render *text* into a PDF document and save it as *filename*.

    Characters that cannot be encoded as latin-1 are replaced with ``?``
    before rendering (presumably because the selected font only covers
    latin-1 -- confirm against the fpdf version in use).
    """
    document = PDF()
    document.add_page()
    document.set_auto_page_break(auto=True, margin=15)
    document.set_font("Arial", size=12)

    # Round-trip through latin-1 so unsupported characters become "?".
    printable = text.encode("latin-1", "replace").decode("latin-1")
    document.multi_cell(0, 10, printable)
    document.output(filename, "F")

# Example usage of the combined classes and functions
if __name__ == "__main__":
    # Scrape the description of one education page ("some-education-subject"
    # is the key looked up in the scraped links) and save it as a PDF.
    create_pdf(
        EducationScraper().scrape("some-education-subject"),
        "output.pdf",
    )
