In [8]:
import os
import random
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def crawl_site(start_url, visited_urls_file, urls_file, not_visiting_urls_file):
    
    # 初始化集合
    not_visiting_urls = set()
    visited_urls = set()
    urls_to_visit = set([start_url])

    # 在函数的开头打开文件，并在整个函数执行期间保持打开状态
    with open(urls_file, 'a+', encoding='utf-8') as urls_f, \
        open(visited_urls_file, 'a+', encoding='utf-8') as visited_urls_f, \
        open(not_visiting_urls_file, 'r', encoding='utf-8') as not_visiting_urls_f:
        # 读取 urls_file 文件并更新 urls_to_visit 集合
        urls_f.seek(0)
        for url in urls_f:
            urls_to_visit.add(url.strip())

        # 读取 visited_urls_file 文件并更新 visited_urls 集合
        visited_urls_f.seek(0)
        for url in visited_urls_f:
            visited_urls.add(url.strip())
            
        not_visiting_urls_f.seek(0)
        for url in not_visiting_urls_f:
            not_visiting_urls.add(url.strip())

        # 从 urls_to_visit 中去除已访问过的 URLs
        urls_to_visit.difference_update(visited_urls)

        while urls_to_visit:
            current_url = urls_to_visit.pop()

            # 只有当current_url不在visited_urls里时才进行爬取
            if current_url not in visited_urls and current_url not in any(not_visiting_urls):
                print(f"Visiting: {current_url}")
                # 在尝试访问链接前，添加随机延迟
                time.sleep(random.randint(1, 3))

                try:
                    response = requests.get(current_url)
                    soup = BeautifulSoup(response.text, 'html.parser')

                    # 提取并存储所有文字内容
                    text_content = soup.get_text()

                    # 移除current_url中的"https:"，并为文件路径准备
                    sanitized_url = current_url.replace('https://', '').replace('http://', '')
                    content_directory = f"{sanitized_url}/"
                    content_file_path = os.path.join("scraping_data/"+content_directory, "content.txt") #加个scraping_data

                    # 确保目录存在
                    os.makedirs(content_directory, exist_ok=True)

                    with open(content_file_path, 'w', encoding='utf-8') as content_f:
                        content_f.write(f"URL: {current_url}\n{text_content}\n{'='*100}\n\n")

                    # 查找并处理所有链接
                    for link in soup.find_all('a', href=True):
                        absolute_link = urljoin(current_url, link['href'])
                        if (absolute_link not in visited_urls) and \
                        (absolute_link not in urls_to_visit) and \
                        ("cis.unimelb" in absolute_link) and \
                        (absolute_link not in any(not_visiting_urls)):
                            urls_to_visit.add(absolute_link)
                            urls_f.write(f"{absolute_link}\n")

                    visited_urls.add(current_url)
                    visited_urls_f.write(f"{current_url}\n")
                except requests.RequestException as e:
                    print(f"Error during requests to {current_url}: {str(e)}")


if __name__ == "__main__":
    start_url = 'https://cis.unimelb.edu.au/'  # 替换成你想爬取的网站的起始URL
    visited_urls_file = 'scraping_urls/visited_urls.txt'
    urls_file = 'scraping_urls/urls.txt'
    not_visiting_urls_file = 'scraping_urls/not_visiting_urls.txt'
    crawl_site(start_url, visited_urls_file, urls_file, not_visiting_urls_file)


Visiting: http://clouds.cis.unimelb.edu.au/broker/2.4/2.4/docs


FileNotFoundError: [Errno 2] No such file or directory: 'scraping_data/clouds.cis.unimelb.edu.au/broker/2.4/2.4/docs/content.txt'