In [24]:
print("Q")

Q


In [32]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import trafilatura
from readability import Document


def scrape_website(base_url):
    # 儲存已處理的 URL
    visited_urls = set()

    # 結果儲存
    scraped_data = []

    def scrape_page(url, should_follow_links=True):
        # 如果 URL 已被訪問，跳過
        # Remove fragment identifier from URL
        url = url.split("#")[0]

        if url in visited_urls:
            return

        # 標記 URL 為已訪問
        visited_urls.add(url)
        print("Scraping", url)

        try:
            # 發送請求
            response = requests.get(url, timeout=10)
            response.raise_for_status()

            # 使用 BeautifulSoup 解析原始內容
            soup = BeautifulSoup(response.text, "html.parser")
            text_raw = soup.get_text(separator=" ", strip=True)

            # 使用 trafilatura 提取主要內容
            text_trafilatura = trafilatura.extract(response.text)

            # 使用 readability 提取主要內容
            doc = Document(response.text)
            text_readability = BeautifulSoup(doc.summary(), "html.parser").get_text(
                separator=" ", strip=True
            )

            # 抓取所有連結
            links = []
            for a_tag in soup.find_all("a", href=True):
                link = urljoin(url, a_tag["href"])  # 解析相對路徑為絕對路徑
                links.append(link)

            # 儲存資料
            scraped_data.append(
                {
                    "url": url,
                    # "text_raw": text_raw,
                    "content": text_trafilatura,
                    # "text_readability": text_readability,
                    "links": links,
                }
            )

            # 只有當 should_follow_links 為 True 且是同域名時才遞迴抓取
            if should_follow_links:
                for link in links:
                    # 檢查是否同域名
                    if is_same_domain(link, base_url):
                        scrape_page(link, should_follow_links=True)
                    else:
                        # 如果不是同域名，只抓取該頁面但不遞迴
                        scrape_page(link, should_follow_links=False)

        except requests.RequestException as e:
            print(f"Failed to scrape {url}: {e}")

    def is_same_domain(link, base_url):
        # 檢查連結是否與基底網址同網域
        base_netloc = urlparse(base_url).netloc
        link_netloc = urlparse(link).netloc
        return base_netloc == link_netloc

    # 開始抓取
    scrape_page(base_url, should_follow_links=True)

    return scraped_data


# 測試爬蟲
if __name__ == "__main__":
    base_url = "https://docs.bucketprotocol.io"
    result = scrape_website(base_url)

    # 輸出結果
    import json

    with open("scraped_data.json", "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=4)

    print("Scraping complete. Data saved to scraped_data.json.")

Scraping https://docs.bucketprotocol.io
Scraping https://docs.bucketprotocol.io/
Scraping https://bucketprotocol.io
Scraping https://twitter.com/bucket_protocol
Scraping https://discord.gg/wJWY5hq3Gx
Scraping https://docs.bucketprotocol.io/introduction/key-advantages
Scraping https://docs.bucketprotocol.io/bucket-campaign/bucket-x-sui-wallet-campaign
Scraping https://docs.bucketprotocol.io/mechanisms/system-overview
Scraping https://docs.bucketprotocol.io/mechanisms/terminology
Scraping https://docs.bucketprotocol.io/mechanisms/borrowing
Scraping https://docs.bucketprotocol.io/mechanisms/buck-savings-rate-bsr-and-sbuck
Scraping https://docs.bucketprotocol.io/mechanisms/tank-and-liquidations
Scraping https://docs.bucketprotocol.io/mechanisms/peg-stability-module
Scraping https://docs.bucketprotocol.io/mechanisms/redemptions
Scraping https://docs.bucketprotocol.io/mechanisms/recovery-mode
Scraping https://docs.bucketprotocol.io/mechanisms/flash-loan
Scraping https://docs.bucketprotocol.i

In [26]:
type(result)

list

In [22]:
from collections import Counter
from collections import defaultdict


def remove_common_sections(scraped_data):
    # 找出所有連續5個以上的文字片段
    def get_ngrams(text, n):
        words = text.split()
        return [" ".join(words[i : i + n]) for i in range(len(words) - n + 1)]

    # 收集所有文件中的文字片段
    all_ngrams = []
    ngram_locations = defaultdict(list)  # 記錄每個片段出現的位置

    for item in scraped_data:
        text = item["text_content"]
        url = item["url"]
        # 取得5個以上文字的片段
        for n in range(5, 11):
            ngrams = get_ngrams(text, n)
            all_ngrams.extend(ngrams)
            # 記錄每個片段出現的URL
            for ngram in ngrams:
                ngram_locations[ngram].append(url)

    # 計算片段出現次數
    ngram_counts = Counter(all_ngrams)
    common_ngrams = {
        ngram
        for ngram, count in ngram_counts.items()
        if count > len(scraped_data) * 0.5
    }

    # 打印重複的部分及其位置
    print("重複出現的文字片段及其位置:")
    for ngram in common_ngrams:
        print(f"- {ngram}")
        print("  出現於:")
        for url in set(ngram_locations[ngram]):  # 使用set去除重複URL
            print(f"  * {url}")
        print()

    # 移除每個文件中的重複片段
    for item in scraped_data:
        text = item["text_content"]
        for ngram in common_ngrams:
            text = text.replace(ngram, "")
        # 清理多餘空格
        item["text_content"] = " ".join(text.split())

    return scraped_data

In [23]:
remove_common_sections(result)

重複出現的文字片段及其位置:
- | Bucket Campaign Savings Mode Flash & Depeg Scenarios
  出現於:
  * https://docs.bucketprotocol.io/price-stability-and-depeg-analysis
  * https://docs.bucketprotocol.io/external-audits-and-analysis
  * https://docs.bucketprotocol.io/external-audits-and-analysis/terms-of-service
  * https://docs.bucketprotocol.io/
  * https://docs.bucketprotocol.io/outro/links
  * https://docs.bucketprotocol.io/external-audits-and-analysis/formal-verification
  * https://docs.bucketprotocol.io/mechanisms
  * https://docs.bucketprotocol.io/external-audits-and-analysis/introduction
  * https://docs.bucketprotocol.io/introduction
  * https://docs.bucketprotocol.io/mechanisms/flash-loan
  * https://docs.bucketprotocol.io/outro/oracles
  * https://docs.bucketprotocol.io/mechanisms/recovery-mode
  * https://docs.bucketprotocol.io/mechanisms/redemptions
  * https://docs.bucketprotocol.io/mechanisms/borrowing
  * https://docs.bucketprotocol.io/mechanisms/buck-savings-rate-bsr-and-sbuck
  * https:

[{'url': 'https://docs.bucketprotocol.io/',
  'text_content': 'Introduction Security this page Introduction Introduction Bucket Protocol is a decentralized stablecoin protocol that enables users to maximize the financial efficiency of their crypto assets without incurring unforeseen interest payments. By depositing crypto assets into a smart contract, users create a Collateralized Debt Position (CDP). This allows them to generate liquidity instantly by borrowing $BUCK, a stablecoin pegged to the US dollar. To ensure the system’s stability, each CDP (or “Bottle”) must maintain a minimum collateral ratio (MCR) of 110%, although this ratio may vary depending on the specific collateral type. At any point, $BUCK holders have the option to exchange their stablecoins for the underlying collateral. The protocol employs an immediate liquidation mechanism designed to encourage stable deposits and automatically rebalance risk between high-risk and low-risk Bottles. This approach enables Bucket Pr