In [1]:
import requests
from bs4 import BeautifulSoup
from langchain.document_loaders import UnstructuredURLLoader

def get_all_links(base_url):
    """Fetch all links from the base_url."""
    try:
        response = requests.get(base_url)
        response.raise_for_status()  # Check for HTTP errors
        soup = BeautifulSoup(response.content, 'html.parser')
        links = set()

        for a_tag in soup.find_all('a', href=True):
            link = a_tag['href']
            # Ensure the link is a full URL
            if link.startswith('/'):
                link = base_url + link
            elif not link.startswith('http'):
                continue
            links.add(link)

        return links
    except Exception as e:
        print(f"Error fetching {base_url}: {e}")
        return set()

def scrape_website(base_url):
    """Scrape the website starting from base_url."""
    all_links = get_all_links(base_url)
    documents = []

    for link in all_links:
        try:
            loader = UnstructuredURLLoader([link])
            data = loader.load()
            documents.extend(data)
        except Exception as e:
            print(f"Error loading {link}: {e}")

    return documents

# Usage
main_domain = "https://sslwireless.com/"  # Replace with your main domain
all_documents = get_all_links(main_domain)

print(list(all_documents))
print(f"Total documents scraped: {len(all_documents)}")


['https://sslwireless.com/enterprise-solutions/', 'https://sslwireless.com/it-security-2/', 'https://sslwireless.com//contact-us/', 'https://sslwireless.com//entertainment-vas/', 'https://sslwireless.com/marketing-promotion/', 'https://sslwireless.com/our-company/', 'https://sslwireless.com//our-company/', 'https://sslwireless.com//privacy-policy/', 'https://sslwireless.com/', 'https://sslwireless.com/entertainment-vas/', 'https://sslwireless.com//contact-us', 'https://www.facebook.com/SSLWireless', 'https://sslwireless.com/application-developments/', 'https://sslwireless.com/our-certifications/', 'https://sslwireless.com/job-openings/', 'https://sslwireless.com/research/', 'https://sslwireless.com//our-milestones/', 'https://sslwireless.com/news-events/', 'https://sslwireless.com/solutions-platforms-and-tools/', 'https://sslwireless.com/our-milestones/', 'https://www.linkedin.com/company/ssl-wireless', 'https://www.youtube.com/user/sslvtube', 'https://sslwireless.com/payment-services/