# Website Archiver using Python

This Jupyter Notebook archives a website by downloading a page's HTML and the resources it links to, then saving them locally.
It is based on the Python script provided.

### Features:
- Downloads a webpage and all linked resources (CSS, JS, images, etc.).
- Maintains folder structure based on URL paths.
- Updates links in the downloaded HTML to reference local files.
- Saves the website into a specified directory.

## Installation and Setup

Before running the notebook, ensure you have the required dependencies installed. Run the following command:

```bash
pip install requests beautifulsoup4
```

Before running the notebook, set the `target_url` variable to the URL you want to archive.


In [None]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

In [None]:
class WebsiteArchiver:
    """Archive one web page and the resources it references into a local folder.

    The page at ``url`` is saved as ``index.html`` inside ``output_dir``.
    Every ``<link>``, ``<script>``, ``<img>`` and ``<a>`` target is downloaded
    to a location under ``output_dir`` that mirrors the path component of its
    URL, and the saved HTML is rewritten to point at those local copies.

    Only the URL *path* determines the local file name; the host and the query
    string are ignored, so two URLs that differ only in those parts share one
    local file.

    Args:
        url: Address of the page to archive.
        output_dir: Folder that receives the archive; created if missing.
        timeout: Seconds to wait on each HTTP request before giving up.
    """

    # Schemes that can actually be downloaded; mailto:, javascript:, data:
    # and similar references are left untouched in the HTML.
    _FETCHABLE_SCHEMES = ("http", "https")

    def __init__(self, url, output_dir="CTE_archive", timeout=30):
        self.url = url
        self.output_dir = output_dir
        self.timeout = timeout
        os.makedirs(self.output_dir, exist_ok=True)
        self.visited_urls = set()

    def _local_path(self, url):
        """Map ``url`` to the file path under ``output_dir`` that stores it."""
        url_path = urlparse(url).path
        # Dropping empty, "." and ".." segments keeps a hostile URL such as
        # "https://host/../../etc/passwd" from escaping the archive folder.
        segments = [
            segment
            for segment in url_path.split('/')
            if segment not in ('', '.', '..')
        ]
        # A directory-style URL has no file name of its own; store its
        # content the way a web server would serve it.
        if not segments or url_path.endswith('/'):
            segments.append('index.html')
        return os.path.join(self.output_dir, *segments)

    def save_file(self, url):
        """Download ``url`` into the archive unless a local copy already exists.

        Failures (network errors, bad HTTP status, file-system conflicts) are
        reported on stdout and do not raise, so one broken resource does not
        abort the whole archive.

        Args:
            url: Absolute URL of the resource.

        Returns:
            The local path chosen for the resource. The file may be missing
            if the download failed.
        """
        file_path = self._local_path(url)
        if os.path.exists(file_path):
            return file_path

        try:
            response = requests.get(url, timeout=self.timeout)
            response.raise_for_status()  # Raise an error for bad status codes
            # Create folders only after a successful download so failed
            # requests do not leave empty directories behind.
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            with open(file_path, 'wb') as file:
                file.write(response.content)
        except (requests.exceptions.RequestException, OSError) as e:
            print(f"Failed to download {url}: {e}")
        return file_path

    def _archive_resource(self, resource_url, page_url, html_file):
        """Fetch one resource and return its link relative to ``output_dir``.

        Returns ``None`` when no usable local copy exists, in which case the
        caller must leave the original link in place.
        """
        if resource_url == page_url:
            # A link back to the archived page itself.
            return "index.html"

        local_file = self.save_file(resource_url)
        if not os.path.isfile(local_file):
            return None
        if os.path.abspath(local_file) == os.path.abspath(html_file):
            # A different URL that maps onto the main page's file name; the
            # file there is the archived page, not this resource.
            return None
        # HTML links always use forward slashes, whatever the local OS uses.
        return os.path.relpath(local_file, self.output_dir).replace(os.sep, '/')

    def archive_website(self):
        """Download the page, fetch its linked resources and rewrite its links.

        Raises:
            requests.exceptions.RequestException: If the main page cannot be
                fetched or answers with an HTTP error status.
        """
        response = requests.get(self.url, timeout=self.timeout)
        # Do not archive an error page as if it were the site.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Save the main HTML file first so resources that map onto the same
        # file name cannot overwrite it.
        html_file = os.path.join(self.output_dir, "index.html")
        with open(html_file, 'wb') as file:
            file.write(response.content)

        page_url = urlparse(self.url)._replace(fragment='').geturl()
        # Resource URL -> local link (or None), so every tag that references
        # the same resource is rewritten, not only the first one.
        local_links = {}

        # Find and save all linked resources
        for tag in soup.find_all(['link', 'script', 'img', 'a']):
            attr = 'href' if tag.name in ('link', 'a') else 'src'
            ref = (tag.get(attr) or '').strip()
            if not ref or ref.startswith('#'):
                continue  # nothing to fetch; in-page anchors work as they are

            target = urlparse(urljoin(self.url, ref))
            if target.scheme not in self._FETCHABLE_SCHEMES:
                continue

            resource_url = target._replace(fragment='').geturl()
            if resource_url not in local_links:
                self.visited_urls.add(resource_url)
                local_links[resource_url] = self._archive_resource(
                    resource_url, page_url, html_file
                )

            local_link = local_links[resource_url]
            if local_link is None:
                continue  # keep the original link when no local copy exists
            if target.fragment:
                local_link = f"{local_link}#{target.fragment}"
            tag[attr] = local_link

        # Save the updated HTML with local paths
        with open(html_file, 'w', encoding='utf-8') as file:
            file.write(str(soup))

        print(f"Website archived in {self.output_dir}")

In [None]:
# Page to archive -- change this value before running the cell.
target_url = "https://www2.ed.gov/datastory/cte/index.html"

# Build the archiver with its default output folder, then run the download.
archiver = WebsiteArchiver(url=target_url)
archiver.archive_website()