In [2]:
import logging
import re
import time
from pathlib import Path
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup

# Set up root logging (INFO, timestamped lines) and take this module's logger.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


class CSRCDownloader:
    """Crawl a CSRC list section and download the spreadsheet files it links to.

    Starting from ``base_url`` the crawler walks the paginated list, fetches
    every linked detail page (``common_detail`` / ``.shtml`` links) once, and
    saves each ``.csv`` / ``.xls`` / ``.xlsx`` attachment into ``download_dir``.
    URLs that were already downloaded are recorded in ``downloaded_files.txt``
    inside that directory so repeated runs only fetch new files.
    """

    # Name of the history file (one downloaded URL per line) kept in download_dir.
    HISTORY_FILE = "downloaded_files.txt"
    # Extensions treated as downloadable attachments; matched against the URL path.
    _FILE_EXT_RE = re.compile(r"\.(csv|xls|xlsx)$", re.I)
    # Characters replaced with "_" when building a file name.
    _UNSAFE_CHARS_RE = re.compile(r"[\\/:*?\"<>|]")

    def __init__(self, base_url, download_dir="csrc_downloads"):
        """Prepare the download directory, the HTTP session and the history set.

        Args:
            base_url: URL of the first list page to crawl.
            download_dir: Directory that receives the files and the history file;
                created (including missing parents) when it does not exist.
        """
        self.base_url = base_url
        self.download_dir = Path(download_dir)
        self.download_dir.mkdir(parents=True, exist_ok=True)

        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0",
            "Referer": "http://www.csrc.gov.cn"
        })

        self.downloaded = set()
        self._load_downloaded()

    def _load_downloaded(self):
        """Load previously downloaded URLs from the history file, if present."""
        history = self.download_dir / self.HISTORY_FILE
        if history.exists():
            lines = history.read_text(encoding="utf-8").splitlines()
            # Ignore blank lines so a hand-edited history file cannot add "".
            self.downloaded = {line.strip() for line in lines if line.strip()}

    def _save_downloaded(self, url):
        """Append ``url`` to the history file and to the in-memory set."""
        history = self.download_dir / self.HISTORY_FILE
        with history.open("a", encoding="utf-8") as fp:
            fp.write(url + "\n")
        self.downloaded.add(url)

    def fetch(self, url):
        """GET ``url`` and return it parsed as a BeautifulSoup document.

        Raises:
            requests.RequestException: on connection errors or HTTP error status.
        """
        r = self.session.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = "utf-8"  # decode the body as UTF-8 regardless of headers
        return BeautifulSoup(r.text, "lxml")

    def extract_links(self, soup, page_url):
        """Split the anchors of a page into attachment links and detail pages.

        Args:
            soup: Parsed page exposing ``find_all("a", href=True)``.
            page_url: URL of the page, used to resolve relative links.

        Returns:
            ``(files, contents)`` where ``files`` is a list of unique
            ``(absolute_url, file_name)`` tuples and ``contents`` is a list of
            unique absolute detail-page URLs, both in document order.
        """
        files, contents = [], []
        seen_files = set()
        for a in soup.find_all("a", href=True):
            href = a["href"]
            text = a.get_text(strip=True)
            full = urljoin(page_url, href)
            # Match on the URL path so "?query" / "#fragment" suffixes do not
            # hide the file extension.
            path = urlsplit(full).path

            ext = self._FILE_EXT_RE.search(path)
            if ext:
                if full in seen_files:
                    continue
                seen_files.add(full)
                raw_name = text or path.rsplit("/", 1)[-1]
                name = self._UNSAFE_CHARS_RE.sub("_", raw_name)
                # Link text usually has no extension; keep the one from the
                # URL so the saved file opens with the right application.
                if not name.lower().endswith(ext.group(0).lower()):
                    name += ext.group(0)
                files.append((full, name))
            elif "common_detail" in href or path.endswith(".shtml"):
                contents.append(full)

        # dict.fromkeys de-duplicates while keeping document order.
        return files, list(dict.fromkeys(contents))

    def _unique_path(self, name):
        """Return a path in download_dir for ``name`` that does not exist yet.

        Appends ``_1``, ``_2``, ... before the suffix when the plain name is
        taken, so two attachments sharing a link text do not overwrite each other.
        """
        candidate = self.download_dir / name
        stem, suffix = candidate.stem, candidate.suffix
        counter = 1
        while candidate.exists():
            candidate = self.download_dir / f"{stem}_{counter}{suffix}"
            counter += 1
        return candidate

    def download(self, url, name):
        """Download ``url`` into the download directory as ``name``.

        The body is streamed into a temporary ``.part`` file that is renamed
        only after the transfer finished, so an interrupted download never
        leaves a truncated file under the final name.

        Returns:
            ``True`` when the file was fetched, ``False`` when ``url`` was
            already recorded as downloaded and nothing was requested.

        Raises:
            requests.RequestException: on connection errors or HTTP error status.
            OSError: when the file cannot be written.
        """
        if url in self.downloaded:
            return False

        # Path separators are replaced too, so the name cannot leave download_dir.
        safe_name = self._UNSAFE_CHARS_RE.sub("_", str(name)).strip()
        if safe_name in ("", ".", ".."):
            safe_name = "download"

        target = self._unique_path(safe_name)
        partial = target.with_name(target.name + ".part")
        try:
            with self.session.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()
                with partial.open("wb") as f:
                    for chunk in r.iter_content(8192):
                        if chunk:
                            f.write(chunk)
            partial.replace(target)
        finally:
            # Still present only when the transfer failed part-way.
            if partial.exists():
                partial.unlink()

        self._save_downloaded(url)
        logger.info("Downloaded: %s", target.name)
        return True

    def _download_all(self, files):
        """Download every ``(url, name)`` pair, logging failures instead of aborting."""
        for url, name in files:
            try:
                requested = self.download(url, name)
            except (requests.RequestException, OSError) as exc:
                logger.warning("Failed to download %s: %s", url, exc)
                requested = True  # a request was attempted, so still pause
            if requested:
                time.sleep(1)  # be polite to the server between requests

    # ---------- Smart pagination core ----------

    def find_next_button(self, soup, current_url):
        """Return the absolute URL behind a usable "下一页" (next page) link.

        Links that are not plain http(s) URLs (e.g. ``javascript:``) or that
        point back at the current page are ignored. Returns ``None`` when no
        usable link exists.
        """
        here = current_url.split("#", 1)[0]
        for a in soup.find_all("a", href=True):
            if "下一页" not in a.get_text():
                continue
            target = urljoin(current_url, a["href"])
            if urlsplit(target).scheme not in ("http", "https"):
                continue
            if target.split("#", 1)[0] == here:
                continue
            return target
        return None

    def generate_next_url(self, current_url):
        """Guess the next list page from the ``common_list[_N].shtml`` pattern.

        ``common_list.shtml`` is followed by ``common_list_1.shtml`` and
        ``common_list_N.shtml`` by ``common_list_{N+1}.shtml``. Returns
        ``None`` when ``current_url`` does not follow the pattern.
        """
        if current_url.endswith("common_list.shtml"):
            return current_url.replace("common_list.shtml", "common_list_1.shtml")
        m = re.search(r"common_list_(\d+)\.shtml", current_url)
        if m:
            n = int(m.group(1)) + 1
            return current_url.replace(m.group(0), f"common_list_{n}.shtml")
        return None

    def url_exists(self, url):
        """Return ``True`` when a HEAD request for ``url`` answers with status 200."""
        try:
            return self.session.head(url, timeout=10).status_code == 200
        except requests.RequestException:
            return False

    def smart_next(self, soup, current_url):
        """Find the next list page: prefer the page's own link, else a guessed URL.

        Returns ``None`` when neither strategy yields a page.
        """
        nxt = self.find_next_button(soup, current_url)
        if nxt:
            logger.info("Using 下一页 button")
            return nxt

        guess = self.generate_next_url(current_url)
        if guess and self.url_exists(guess):
            logger.info("Using URL pagination")
            return guess

        return None

    # ---------- Main flow ----------

    def run(self, max_pages=None):
        """Crawl list pages from ``base_url`` and download all attachments.

        Args:
            max_pages: Stop after this many list pages; ``None`` (or 0) means
                no limit.

        Individual attachment or detail-page failures are logged and skipped.

        Raises:
            requests.RequestException: when a list page itself cannot be fetched.
        """
        current = self.base_url
        visited = set()
        seen_contents = set()  # detail pages already handled in this run
        page = 1

        while current:
            if current in visited:
                break
            if max_pages and page > max_pages:
                break

            visited.add(current)
            logger.info("Processing page %s: %s", page, current)

            soup = self.fetch(current)
            files, contents = self.extract_links(soup, current)
            self._download_all(files)

            for content_url in contents:
                # Navigation links repeat on every list page; fetch each once.
                if content_url in seen_contents or content_url in visited:
                    continue
                seen_contents.add(content_url)
                try:
                    detail = self.fetch(content_url)
                    detail_files, _ = self.extract_links(detail, content_url)
                except Exception as exc:
                    # Best effort: one broken detail page must not stop the crawl.
                    logger.warning("Skipping %s: %s", content_url, exc)
                    continue
                self._download_all(detail_files)

            current = self.smart_next(soup, current)
            page += 1
            time.sleep(2)

        logger.info("All pages processed.")


In [3]:
import schedule
import threading
import time
import logging

# Module-level logger used by DownloaderScheduler for start/stop messages.
logger = logging.getLogger(__name__)

class DownloaderScheduler:
    """Run ``downloader.run()`` once a day from a background polling thread.

    Jobs are registered on the ``schedule`` library's default scheduler, which
    is cleared on every start and stop, so only one daily job is active.
    """

    def __init__(self, downloader):
        """Store the downloader; nothing is scheduled until ``start_daily``.

        Args:
            downloader: Object whose ``run()`` method performs one download pass.
        """
        self.downloader = downloader
        self.running = False
        # Each polling thread watches its own event so that a restart can
        # retire the previous thread without racing on ``self.running``.
        self._stop_event = threading.Event()

    def _run_job(self):
        """Run one download pass, logging any failure instead of raising.

        An exception escaping the job would both kill the polling thread and
        leave the job un-rescheduled, so it is contained here.
        """
        try:
            self.downloader.run()
        except Exception:
            logger.exception("Scheduled download failed")

    def start_daily(self, hour=2, minute=0):
        """Schedule the download every day at ``hour:minute`` and start polling.

        Calling this again replaces the previous schedule and polling thread.
        """
        self._stop_event.set()  # retire a polling thread from an earlier start
        schedule.clear()
        schedule.every().day.at(f"{hour:02d}:{minute:02d}").do(self._run_job)
        self._stop_event = threading.Event()
        self.running = True
        threading.Thread(target=self.loop, daemon=True).start()
        logger.info("Scheduler started (daily)")

    def loop(self):
        """Poll the scheduler about once per second until stopped."""
        stop_event = self._stop_event  # bind now; a restart swaps in a new event
        while self.running and not stop_event.is_set():
            try:
                schedule.run_pending()
            except Exception:
                logger.exception("Scheduler polling failed")
            stop_event.wait(1)  # sleeps like time.sleep(1) but wakes on stop

    def stop(self):
        """Stop polling and remove the scheduled job."""
        self.running = False
        self._stop_event.set()
        schedule.clear()
        logger.info("Scheduler stopped")


ModuleNotFoundError: No module named 'schedule'

In [1]:
import tkinter as tk
from tkinter import ttk, scrolledtext
import threading
import logging

from csrc_downloader import CSRCDownloader
from scheduler import DownloaderScheduler

class TextHandler(logging.Handler):
    """Logging handler that appends formatted records to a Tk text widget."""

    def __init__(self, widget):
        """Remember the target widget.

        Args:
            widget: Text-like widget providing ``after``, ``insert`` and ``see``.
        """
        super().__init__()
        self.widget = widget

    def emit(self, record):
        """Format ``record`` and queue it for insertion via ``widget.after``.

        Errors are routed to ``handleError`` so that a logging problem never
        propagates into the code that emitted the log message.
        """
        try:
            msg = self.format(record)
            self.widget.after(0, self._append, msg + "\n")
        except Exception:
            self.handleError(record)

    def _append(self, text):
        """Insert ``text`` at the end of the widget and scroll it into view."""
        try:
            self.widget.insert(tk.END, text)
            self.widget.see(tk.END)
        except tk.TclError:
            # The widget was destroyed before the queued callback ran.
            pass

class App:
    """Tk front end: start a one-off download or a daily scheduled one."""

    def __init__(self, root):
        """Build the widgets and route log output into the text area.

        Args:
            root: The Tk root window that hosts the application.
        """
        self.root = root
        root.title("CSRC 自动下载器")
        root.geometry("900x600")

        frame = ttk.Frame(root)
        frame.pack(fill="x", padx=10, pady=5)

        ttk.Label(frame, text="栏目ID").pack(side="left")
        self.section = tk.StringVar(value="c100122")
        ttk.Entry(frame, textvariable=self.section, width=20).pack(side="left", padx=5)

        ttk.Button(frame, text="立即下载", command=self.start).pack(side="left", padx=5)
        ttk.Button(frame, text="每天更新", command=self.daily).pack(side="left", padx=5)

        self.log = scrolledtext.ScrolledText(root)
        self.log.pack(fill="both", expand=True)

        handler = TextHandler(self.log)
        handler.setFormatter(logging.Formatter("%(asctime)s - %(message)s"))
        logging.getLogger().addHandler(handler)

        self.downloader = None
        self.scheduler = None
        self._worker = None  # thread of the running one-off download, if any

    def _list_url(self):
        """Return the list-page URL for the entered section ID.

        Returns ``None`` (after logging a warning) when the field is blank.
        """
        section = self.section.get().strip()
        if not section:
            logging.getLogger(__name__).warning("Section ID is empty")
            return None
        return f"http://www.csrc.gov.cn/csrc/{section}/common_list.shtml"

    def start(self):
        """Run one download pass in a background thread."""
        if self._worker is not None and self._worker.is_alive():
            logging.getLogger(__name__).warning("A download is already running")
            return

        # Read the Tk variable here, on the GUI thread, not in the worker.
        url = self._list_url()
        if url is None:
            return

        def run():
            try:
                self.downloader = CSRCDownloader(url)
                self.downloader.run()
            except Exception:
                # Without this the failure would vanish with the thread.
                logging.getLogger(__name__).exception("Download failed")

        self._worker = threading.Thread(target=run, daemon=True)
        self._worker.start()

    def daily(self):
        """(Re)start the daily schedule for the entered section."""
        url = self._list_url()
        if url is None:
            return

        # Rebuild the downloader when the section changed since it was created.
        if self.downloader is None or getattr(self.downloader, "base_url", None) != url:
            self.downloader = CSRCDownloader(url)

        # Stop the previous schedule so its polling thread does not linger.
        if self.scheduler is not None:
            self.scheduler.stop()
        self.scheduler = DownloaderScheduler(self.downloader)
        self.scheduler.start_daily()

if __name__ == "__main__":
    # Create the main window, attach the application UI to it, and hand
    # control to the Tk event loop until the window is closed.
    root = tk.Tk()
    app = App(root)
    root.mainloop()


ModuleNotFoundError: No module named 'csrc_downloader'