In [None]:
import re
from collections import Counter
from urllib.parse import urljoin, urlparse

import nltk
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

In [None]:
# Make sure the NLTK stop-word corpus is available. Only download it when it
# is not already installed, so re-running this cell does not need to contact
# the NLTK download server every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# A set gives constant-time membership tests when filtering page words.
english_stopwords = set(stopwords.words('english'))

In [None]:
# Shared crawl state, filled in by crawl() as pages are fetched.
subdomains = dict()       # network location (host) -> number of pages seen on it
common_words = Counter()  # word -> total occurrences across all crawled pages
word_counts = dict()      # page URL -> number of counted words on that page
unique_urls = set()       # every URL that was fetched and processed

In [None]:
def _in_domain(url, base_domain):
    """Return True when the host of ``url`` is ``base_domain`` or a subdomain of it."""
    host = (urlparse(url).hostname or '').lower()
    domain = base_domain.lower()
    # Compare on the host only: a plain substring test on the whole URL would
    # also accept foreign hosts that merely mention the domain in a path/query.
    return host == domain or host.endswith('.' + domain)


def crawl(url, base_domain, timeout=10):
    """Crawl every page reachable from ``url`` that lives under ``base_domain``.

    Pages are visited depth-first using an explicit stack (not recursion), so
    large sites cannot exhaust Python's recursion limit. For every page that is
    fetched successfully the module-level containers are updated:

    * ``unique_urls``  - the page URL (fragment removed) is added,
    * ``word_counts``  - number of words on the page, excluding English stop
      words and single-character tokens,
    * ``common_words`` - running word frequencies,
    * ``subdomains``   - page count per network location inside ``base_domain``.

    Args:
        url: Page to start crawling from.
        base_domain: Domain that bounds the crawl, e.g. ``'ics.uci.edu'``.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None. Results are accumulated in the module-level containers.
    """
    # URLs tried during this call, including ones whose request failed, so a
    # broken link is not requested again each time another page links to it.
    attempted = set()
    # A '#fragment' only names a position inside a page; drop it so the same
    # page is not fetched and counted once per fragment.
    pending = [url.split('#', 1)[0]]

    while pending:
        current = pending.pop()
        if current in unique_urls or current in attempted:
            continue
        attempted.add(current)

        print(current)
        try:
            response = requests.get(current, timeout=timeout)
        except requests.exceptions.RequestException as exc:
            # Best-effort crawl: report the failure and carry on.
            print(f'Skipping {current}: {exc}')
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        unique_urls.add(current)

        # Count words, excluding stopwords and single-letter words
        words = [
            word
            for word in re.findall(r'\w+', soup.get_text().lower())
            if word not in english_stopwords and len(word) > 1
        ]
        word_counts[current] = len(words)
        common_words.update(words)

        # Tally the page under its network location when it is inside the domain
        if _in_domain(current, base_domain):
            netloc = urlparse(current).netloc
            subdomains[netloc] = subdomains.get(netloc, 0) + 1

        # Collect in-scope links found on the page
        links = []
        for anchor in soup.find_all('a', href=True):
            try:
                # urljoin resolves relative hrefs against the current page and
                # leaves absolute URLs unchanged.
                target = urljoin(current, anchor['href'].strip()).split('#', 1)[0]
                in_scope = (
                    urlparse(target).scheme in ('http', 'https')
                    and _in_domain(target, base_domain)
                )
            except ValueError:
                # Malformed href (e.g. an invalid bracketed host); ignore it.
                continue
            if in_scope and target not in unique_urls and target not in attempted:
                links.append(target)

        # Push in reverse so links are visited in document order (depth-first).
        pending.extend(reversed(links))

In [None]:
# Entry point of the crawl: start at the ICS home page and only follow links
# that stay within the ics.uci.edu domain.
base_url = 'http://ics.uci.edu/'
crawl(url=base_url, base_domain='ics.uci.edu')

In [None]:
# Print results
print(f'Number of unique URLs: {len(unique_urls)}')

# max() raises ValueError on an empty mapping, which happens when no page could
# be fetched (e.g. the very first request failed), so guard that case.
if word_counts:
    longest_page = max(word_counts, key=word_counts.get)
    print(f'Longest page: {longest_page} with {word_counts[longest_page]} words')
else:
    print('Longest page: n/a (no pages were crawled)')

print('50 most common words:', common_words.most_common(50))

# Subdomains are listed alphabetically with the number of pages found on each
print('Subdomains and their page counts:')
for subdomain, count in sorted(subdomains.items()):
    print(f'{subdomain}, {count}')