In [1]:
import requests
import os
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import PyPDF2
import io
import time
from datetime import datetime
import hashlib
import logging


In [None]:
# !wget http://nvd.handsonhacking.org/nvd.jsonl #1.2GB snapshot of NVD with refs

In [None]:
from markitdown import MarkItDown

md = MarkItDown()
result = md.convert("test.xlsx")
print(result.text_content)

In [3]:

class ContentCrawler:
    def __init__(self, output_dir="archived_content"):
        self.output_dir = output_dir
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        
        # Setup logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('crawler.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)
        
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

    def generate_filename(self, url, content_type):
        """Generate a unique filename based on URL and timestamp"""
        parsed_url = urlparse(url)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
        
        base = f"{parsed_url.netloc}_{url_hash}_{timestamp}"
        if content_type == "pdf":
            return f"{base}.pdf"
        return f"{base}.html"

    def save_content(self, content, url, content_type="html"):
        """Save the content to a file"""
        filename = self.generate_filename(url, content_type)
        filepath = os.path.join(self.output_dir, filename)
        
        mode = "wb" if content_type == "pdf" else "w"
        encoding = None if content_type == "pdf" else "utf-8"
        
        try:
            with open(filepath, mode, encoding=encoding) as f:
                f.write(content)
            self.logger.info(f"Content saved to {filepath}")
            return filepath
        except Exception as e:
            self.logger.error(f"Error saving content: {str(e)}")
            return None

    def check_wayback_machine(self, url):
        """Try to retrieve content from Wayback Machine"""
        wb_url = f"https://web.archive.org/web/{url}"
        try:
            response = self.session.get(wb_url)
            if response.status_code == 200:
                self.logger.info(f"Content found on Wayback Machine: {wb_url}")
                return response.text
        except Exception as e:
            self.logger.error(f"Error accessing Wayback Machine: {str(e)}")
        return None

    def check_google_cache(self, url):
        """Try to retrieve content from Google Cache"""
        cache_url = f"https://webcache.googleusercontent.com/search?q=cache:{url}"
        try:
            response = self.session.get(cache_url)
            if response.status_code == 200:
                self.logger.info(f"Content found in Google Cache: {cache_url}")
                return response.text
        except Exception as e:
            self.logger.error(f"Error accessing Google Cache: {str(e)}")
        return None

    def fetch_content(self, url):
        """Main method to fetch content from a URL"""
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            
            content_type = response.headers.get('content-type', '').lower()
            
            if 'application/pdf' in content_type:
                return self.handle_pdf(response.content, url)
            else:
                return self.handle_html(response.text, url)
                
        except requests.RequestException as e:
            self.logger.warning(f"Error accessing {url}: {str(e)}")
            
            # Try alternative sources
            content = self.check_google_cache(url)
            if content:
                return self.handle_html(content, url)
                
            content = self.check_wayback_machine(url)
            if content:
                return self.handle_html(content, url)
                
            self.logger.error(f"Could not retrieve content from {url} or any alternative sources")
            return None

    def handle_pdf(self, content, url):
        """Handle PDF content"""
        try:
            pdf_file = io.BytesIO(content)
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            text_content = ""
            
            for page in pdf_reader.pages:
                text_content += page.extract_text() + "\n"
            
            # Save both raw PDF and extracted text
            self.save_content(content, url, "pdf")
            text_filepath = self.save_content(text_content, url + "_text", "html")
            
            return text_filepath
            
        except Exception as e:
            self.logger.error(f"Error processing PDF: {str(e)}")
            return None

    def handle_html(self, content, url):
        """Handle HTML content"""
        try:
            soup = BeautifulSoup(content, 'html.parser')
            
            # Remove script and style elements
            for element in soup(['script', 'style']):
                element.decompose()
            
            # Extract text content
            text_content = soup.get_text()
            
            # Save both raw HTML and cleaned text
            self.save_content(content, url)
            text_filepath = self.save_content(text_content, url + "_text", "html")
            
            return text_filepath
            
        except Exception as e:
            self.logger.error(f"Error processing HTML: {str(e)}")
            return None

    def crawl_urls(self, urls):
        """Crawl a list of URLs"""
        results = {}
        for url in urls:
            self.logger.info(f"Processing: {url}")
            filepath = self.fetch_content(url)
            results[url] = filepath
            time.sleep(1)  # Be nice to servers
        return results

# Example usage
if __name__ == "__main__":
    crawler = ContentCrawler(output_dir="archived_content")
    urls = [
        "https://example.com/article",
        "https://example.com/document.pdf",
        # Add more URLs here
    ]
    
    #TODO DeDup UrLs
    urls = [
        "http://www.oracle.com/technetwork/security-advisory/cpuapr2017-3236618.html",
        "http://www.securityfocus.com/bid/97882",
        "http://www.securitytracker.com/id/1038301"
        
        # Add more URLs here
    ]
    
    urls = [
        "https://chromereleases.googleblog.com/2024/05/stable-channel-update-for-desktop_21.html",
        "https://chromereleases.googleblog.com/2024/05/stable-channel-update-for-desktop_21.html",	
        "https://issues.chromium.org/issues/338908243",	
        "https://issues.chromium.org/issues/338908243",	
        "https://lists.fedoraproject.org/archives/list/package-announce@lists.fedoraproject.org/message/5KEVD4433KTOCYY6V4I7MMYKQ6URUS4L/",
        "https://lists.fedoraproject.org/archives/list/package-announce@lists.fedoraproject.org/message/5KEVD4433KTOCYY6V4I7MMYKQ6URUS4L/",
        "https://lists.fedoraproject.org/archives/list/package-announce@lists.fedoraproject.org/message/FX6IYZ6XF7B2WE66NFPNI2NHWJFI6VDF/",
        "https://lists.fedoraproject.org/archives/list/package-announce@lists.fedoraproject.org/message/FX6IYZ6XF7B2WE66NFPNI2NHWJFI6VDF/"
    ]
    
    results = crawler.crawl_urls(urls)
    
    for url, filepath in results.items():
        if filepath:
            print(f"Successfully archived {url} to {filepath}")
        else:
            print(f"Failed to archive {url}")

2024-12-14 21:28:11,715 - INFO - Processing: https://chromereleases.googleblog.com/2024/05/stable-channel-update-for-desktop_21.html
2024-12-14 21:28:12,627 - INFO - Content saved to archived_content/chromereleases.googleblog.com_cad9cf10_20241214_212812.html
2024-12-14 21:28:12,628 - INFO - Content saved to archived_content/chromereleases.googleblog.com_ccc46054_20241214_212812.html
2024-12-14 21:28:13,629 - INFO - Processing: https://chromereleases.googleblog.com/2024/05/stable-channel-update-for-desktop_21.html
2024-12-14 21:28:13,853 - INFO - Content saved to archived_content/chromereleases.googleblog.com_cad9cf10_20241214_212813.html
2024-12-14 21:28:13,854 - INFO - Content saved to archived_content/chromereleases.googleblog.com_ccc46054_20241214_212813.html
2024-12-14 21:28:14,855 - INFO - Processing: https://issues.chromium.org/issues/338908243
2024-12-14 21:28:15,650 - INFO - Content saved to archived_content/issues.chromium.org_7d40a4ad_20241214_212815.html
2024-12-14 21:28:15

Successfully archived https://chromereleases.googleblog.com/2024/05/stable-channel-update-for-desktop_21.html to archived_content/chromereleases.googleblog.com_ccc46054_20241214_212813.html
Successfully archived https://issues.chromium.org/issues/338908243 to archived_content/issues.chromium.org_652b9da2_20241214_212817.html
Successfully archived https://lists.fedoraproject.org/archives/list/package-announce@lists.fedoraproject.org/message/5KEVD4433KTOCYY6V4I7MMYKQ6URUS4L/ to archived_content/lists.fedoraproject.org_55870a42_20241214_212820.html
Successfully archived https://lists.fedoraproject.org/archives/list/package-announce@lists.fedoraproject.org/message/FX6IYZ6XF7B2WE66NFPNI2NHWJFI6VDF/ to archived_content/lists.fedoraproject.org_773d5a8a_20241214_212823.html
