<a href="https://colab.research.google.com/github/DrKenReid/DeadLinkScraper/blob/main/Website_Deadlink_Finder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import pandas as pd
from datetime import datetime, timedelta
import os
import sys
from google.colab import drive
import time
import traceback

class WebsiteDeadlinkScraper:
    """Crawl one website breadth-first and record dead internal links.

    Dead links are appended to ``deadlinks.csv`` and per-page scan timestamps
    are kept in ``scan_history.csv``, both inside a Google Drive folder, so a
    later run can skip pages that were scanned within the last 14 days.
    """

    def __init__(self, base_url, drive_folder='WebScraperResults'):
        """Initialize the website deadlink scraper.

        Args:
            base_url: URL the crawl starts from; only links on the same host
                are checked and followed.
            drive_folder: Folder below "My Drive" that holds the CSV files.
        """
        self.base_url = base_url
        self.drive_folder = drive_folder
        self.results_file = 'deadlinks.csv'
        self.history_file = 'scan_history.csv'
        self.max_pages = 10000
        self.max_depth = 20
        self.visited_urls = set()
        self.deadlinks = []
        self.current_depth = 0
        self.setup()

    def setup(self):
        """Set up the scraper by mounting drive and loading necessary data."""
        self.mount_drive()
        self.load_history()
        self.load_existing_results()

    def mount_drive(self):
        """Mount Google Drive and set up the drive path.

        Exits the process with status 1 when the mount fails, because nothing
        can be persisted without the drive.
        """
        try:
            drive.mount('/content/drive', force_remount=True)
        except Exception as e:
            print(f"Error mounting Google Drive: {str(e)}")
            sys.exit(1)

        self.drive_path = f'/content/drive/My Drive/{self.drive_folder}/'
        os.makedirs(self.drive_path, exist_ok=True)
        print(f"Drive mounted successfully. Working directory: {self.drive_path}")

    def load_history(self):
        """Load scan history from CSV file or create a new DataFrame if it doesn't exist."""
        history_path = os.path.join(self.drive_path, self.history_file)
        if os.path.exists(history_path):
            self.history_df = pd.read_csv(history_path)
            # CSV stores timestamps as text; convert so they can be compared
            # against datetime.now() in scrape_page.
            self.history_df['LastScanned'] = pd.to_datetime(self.history_df['LastScanned'])
        else:
            self.history_df = pd.DataFrame(columns=['URL', 'LastScanned'])

    def load_existing_results(self):
        """Load existing results from CSV file if it exists."""
        results_path = os.path.join(self.drive_path, self.results_file)
        if os.path.exists(results_path):
            self.deadlinks = pd.read_csv(results_path).to_dict('records')
        else:
            self.deadlinks = []

    @staticmethod
    def _strip_fragment(url):
        """Return *url* without its ``#fragment`` part.

        Fragments only address a position inside a document, so
        ``page#a`` and ``page#b`` are the same page and must not be
        crawled or checked separately.
        """
        return urlparse(url)._replace(fragment='').geturl()

    @staticmethod
    def _is_dead_status(status_code):
        """Return True when an HTTP status code denotes a client or server error."""
        return status_code >= 400

    def is_valid_url(self, url):
        """Check if a URL is an http(s) URL on the same host as the base URL.

        Args:
            url: Absolute URL to test.

        Returns:
            True when the scheme is http or https and the host matches the
            host of ``self.base_url`` (compared case-insensitively).
        """
        parsed_url = urlparse(url)
        # Reject ftp:, mailto:, javascript: etc. -- requests cannot fetch them
        # and they would otherwise be reported as dead links.
        if parsed_url.scheme not in ('http', 'https') or not parsed_url.netloc:
            return False
        # Host names are case-insensitive.
        return parsed_url.netloc.lower() == urlparse(self.base_url).netloc.lower()

    def check_link(self, url):
        """Check if a link is dead.

        A link counts as dead when the final response (after redirects) has a
        status code of 400 or above, or when the request fails altogether.

        Args:
            url: Absolute URL to probe.

        Returns:
            True if the link is dead, False otherwise.
        """
        try:
            response = requests.head(url, allow_redirects=True, timeout=5)
            status = response.status_code
            if status in (405, 501):
                # The server does not implement HEAD; retry with GET.
                # stream=True avoids downloading the response body.
                with requests.get(url, allow_redirects=True, timeout=5, stream=True) as retry:
                    status = retry.status_code
            return self._is_dead_status(status)
        except requests.RequestException:
            return True

    def scrape_page(self, url, depth, force_scan=False):
        """Scrape a single page for deadlinks and collect internal links for further scraping.

        Args:
            url: Page to fetch.
            depth: Crawl depth of this page (the base URL is depth 0).
            force_scan: When True, ignore the 14-day scan history for this page.

        Returns:
            A list of ``(url, depth + 1)`` tuples for live internal links that
            have not been visited yet; empty when the page is skipped or fails.
        """

        if url in self.visited_urls or depth > self.max_depth:
            return []

        self.visited_urls.add(url)
        self.current_depth = max(self.current_depth, depth)

        # Check if the page was scanned in the last 14 days
        if not force_scan and url in self.history_df['URL'].values:
            last_scanned = self.history_df.loc[self.history_df['URL'] == url, 'LastScanned'].iloc[0]
            if datetime.now() - last_scanned < timedelta(days=14):
                return []

        self.update_progress(f"Scanning: {url} (Depth: {depth})")

        try:
            response = requests.get(url, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')

            internal_links = []
            # Navigation/footer links repeat many times per page; check each
            # distinct target only once to avoid redundant network requests.
            seen_on_page = set()
            for link in soup.find_all('a', href=True):
                full_url = self._strip_fragment(urljoin(url, link['href']))
                if full_url in seen_on_page or not self.is_valid_url(full_url):
                    continue
                seen_on_page.add(full_url)

                if self.check_link(full_url):
                    record = {'source': url, 'deadlink': full_url}
                    self.deadlinks.append(record)
                    self.save_result(record)
                elif full_url not in self.visited_urls:
                    internal_links.append((full_url, depth + 1))

            # Update history
            self.update_history(url)
            return internal_links

        except Exception as e:
            # Best effort: one broken page must not abort the whole crawl.
            print(f"DEBUG: Error scanning {url}: {str(e)}")
            return []

    def start_scraping(self):
        """Start the scraping process from the base URL.

        Pages are visited breadth-first until the queue is empty or
        ``max_pages`` pages have been visited.
        """
        # Local import keeps this class independent of the file's import block.
        from collections import deque

        to_visit = deque([(self.base_url, 0)])  # (url, depth)
        # URLs that were ever enqueued; prevents the queue from filling up
        # with duplicates of links that appear on many pages.
        queued = {self.base_url}
        while to_visit and len(self.visited_urls) < self.max_pages:
            url, depth = to_visit.popleft()
            force_scan = (url == self.base_url)  # Force scan for the initial URL
            for link_url, link_depth in self.scrape_page(url, depth, force_scan):
                if link_url not in queued:
                    queued.add(link_url)
                    to_visit.append((link_url, link_depth))
            self.update_progress(f"Queue size: {len(to_visit)}")

        self.update_progress("Scraping completed.")

    def save_result(self, result):
        """Save a single result to the CSV file.

        Args:
            result: Dict with the keys ``source`` and ``deadlink``.
        """
        df = pd.DataFrame([result])
        results_path = os.path.join(self.drive_path, self.results_file)
        # Write the header only when the file is being created.
        df.to_csv(results_path, mode='a', header=not os.path.exists(results_path), index=False)

    def update_history(self, url):
        """Update the scan history for a single URL and save it."""
        if url in self.history_df['URL'].values:
            self.history_df.loc[self.history_df['URL'] == url, 'LastScanned'] = datetime.now()
        else:
            new_row = pd.DataFrame({'URL': [url], 'LastScanned': [datetime.now()]})
            if self.history_df.empty:
                # Concatenating with an empty frame raises a pandas
                # FutureWarning; the new row alone is the whole history.
                self.history_df = new_row
            else:
                self.history_df = pd.concat([self.history_df, new_row], ignore_index=True)

        self.save_history()

    def save_history(self):
        """Save the entire scan history to a CSV file in Google Drive."""
        self.history_df.to_csv(os.path.join(self.drive_path, self.history_file), index=False)

    def update_progress(self, message):
        """Update the progress message on a single line."""
        # The leading carriage return moves the cursor back to the line start
        # so the next message is written over the previous one.
        progress = f"\rScanned: {len(self.visited_urls)} pages, Found: {len(self.deadlinks)} deadlinks, Max Depth: {self.current_depth}, {message}"
        sys.stdout.write(progress)
        sys.stdout.flush()

# Usage: ask for a start URL, then crawl that site and record its dead links.
if __name__ == "__main__":
    try:
        # Strip whitespace that commonly sneaks in when a URL is pasted.
        base_url = input("Enter the base URL to scrape (e.g., https://example.com): ").strip()

        # Fail fast, before mounting Drive, on input that cannot be fetched:
        # without an http(s) scheme and a host every request would fail.
        parsed_base = urlparse(base_url)
        if parsed_base.scheme not in ("http", "https") or not parsed_base.netloc:
            raise ValueError(
                f"Invalid base URL {base_url!r}: expected a full URL such as https://example.com"
            )

        scraper = WebsiteDeadlinkScraper(base_url)
        scraper.start_scraping()
    except Exception as e:
        # Top-level boundary: report the failure with its traceback instead of
        # letting the notebook cell die with a bare exception.
        print(f"\nAn error occurred: {str(e)}")
        print("Traceback:")
        traceback.print_exc()
        print("Please check the error message and ensure Google Drive is properly mounted.")

Enter the base URL to scrape (e.g., https://example.com): https://midas.umich.edu/
Mounted at /content/drive
Drive mounted successfully. Working directory: /content/drive/My Drive/WebScraperResults/
Scanned: 2 pages, Found: 0 deadlinks, Max Depth: 1, Scanning: https://midas.umich.edu/#ajax-content-wrap (Depth: 1)

  self.history_df = pd.concat([self.history_df, new_row], ignore_index=True)


Scanned: 4 pages, Found: 0 deadlinks, Max Depth: 1, Scanning: https://midas.umich.edu/#sidewidgetarea (Depth: 1)