In [1]:
import requests
from bs4 import BeautifulSoup

# 設定PPT熱門看板的網址
url = "https://www.ptt.cc/bbs/hotboards.html"

# 發送HTTP GET請求
response = requests.get(url)

# 使用BeautifulSoup解析HTML
soup = BeautifulSoup(response.text, "html.parser")

# 爬取每個看板的名稱和網址
boards = soup.find_all('div', class_='board-name')
board_links = soup.find_all('a', class_='board')

# 取得看板名稱和網址
for board, link in zip(boards, board_links):
    board_name = board.text
    board_url = "https://www.ptt.cc" + link['href']
    print(f"看板名稱: {board_name}")
    print(f"網址: {board_url}\n")


看板名稱: Gossiping
網址: https://www.ptt.cc/bbs/Gossiping/index.html

看板名稱: Baseball
網址: https://www.ptt.cc/bbs/Baseball/index.html

看板名稱: C_Chat
網址: https://www.ptt.cc/bbs/C_Chat/index.html

看板名稱: HatePolitics
網址: https://www.ptt.cc/bbs/HatePolitics/index.html

看板名稱: Stock
網址: https://www.ptt.cc/bbs/Stock/index.html

看板名稱: LoL
網址: https://www.ptt.cc/bbs/LoL/index.html

看板名稱: Lifeismoney
網址: https://www.ptt.cc/bbs/Lifeismoney/index.html

看板名稱: NBA
網址: https://www.ptt.cc/bbs/NBA/index.html

看板名稱: KoreaStar
網址: https://www.ptt.cc/bbs/KoreaStar/index.html

看板名稱: home-sale
網址: https://www.ptt.cc/bbs/home-sale/index.html

看板名稱: Elephants
網址: https://www.ptt.cc/bbs/Elephants/index.html

看板名稱: car
網址: https://www.ptt.cc/bbs/car/index.html

看板名稱: movie
網址: https://www.ptt.cc/bbs/movie/index.html

看板名稱: Lions
網址: https://www.ptt.cc/bbs/Lions/index.html

看板名稱: Tech_Job
網址: https://www.ptt.cc/bbs/Tech_Job/index.html

看板名稱: Japan_Travel
網址: https://www.ptt.cc/bbs/Japan_Travel/index.html

看板名稱: PC_Shopp

In [1]:
import re
import requests
from bs4 import BeautifulSoup
import os
import json
from datetime import datetime, timedelta

# PTT八卦板首頁網址
base_url = "https://www.ptt.cc/bbs/Gossiping/index.html"

# 設定 cookies 以繞過18禁限制
ptt_session = requests.Session()
ptt_session.cookies.set('over18', '1')

# 模擬瀏覽器的 headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# 爬取幾天內的文章（預設7天）
days_to_crawl = 7
start_date = datetime.now() - timedelta(days=days_to_crawl)

# 創建儲存文章的資料夾
if not os.path.exists('gossiping_posts'):
    os.mkdir('gossiping_posts')

def get_soup(url):
    try:
        response = ptt_session.get(url, headers=headers)
        if response.status_code != 200:
            print(f"HTTP 請求錯誤，狀態碼: {response.status_code}")
            return None
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup
    except Exception as e:
        print(f"抓取頁面時發生錯誤: {e}")
        return None

def sanitize_filename(filename):
    # 移除不合法的字元
    return re.sub(r'[<>:"/\\|?*]', '', filename)

def parse_post(link):
    if not link or not link['href']:
        print(f"無效的連結: {link}")
        return None
    
    post_url = "https://www.ptt.cc" + link['href']
    post_soup = get_soup(post_url)
    
    if not post_soup:
        print(f"無法取得文章頁面: {post_url}")
        return None
    
    # 取得文章資訊
    metadata = post_soup.find_all('span', class_='article-meta-value')
    
    if len(metadata) < 4:
        print(f"跳過無效文章: {post_url}")
        return None
    
    author = metadata[0].text
    title = metadata[2].text
    post_time_str = metadata[3].text
    post_time = datetime.strptime(post_time_str, '%a %b %d %H:%M:%S %Y')

    # 檢查文章是否在7天內
    if post_time < start_date:
        return None
    
    category = title.split(']')[0].strip('[') if ']' in title else '未分類'
    
    # 內文
    content = post_soup.find(id="main-content").text.split('※ 發信站')[0].strip()

    # 抓取留言
    comments = []
    comment_elements = post_soup.find_all('div', class_='push')
    for comment in comment_elements:
        push_userid = comment.find('span', class_='push-userid')
        push_content = comment.find('span', class_='push-content')
        push_time = comment.find('span', class_='push-ipdatetime')
        
        if push_userid and push_content and push_time:
            comments.append({
                'comment_author': push_userid.text.strip(),
                'comment_time': push_time.text.strip(),
                'comment_content': push_content.text.strip(': ').strip()
            })

    # 儲存每篇文章及其留言
    post_data = {
        'author': author,
        'title': title,
        'post_time': post_time_str,
        'content': content,
        'category': category,
        'comments': comments
    }
    
    # 以標題為檔名儲存，處理非法字元
    filename = f"{post_time.strftime('%Y%m%d_%H%M')}_{sanitize_filename(title[:30].strip())}.json"
    with open(os.path.join('gossiping_posts', filename), 'w', encoding='utf-8') as f:
        json.dump(post_data, f, ensure_ascii=False, indent=4)
    
    return post_data

def crawl_gossiping(base_url):
    url = base_url
    page = 0
    while True:
        print(f'正在爬取第 {page+1} 頁...')
        soup = get_soup(url)
        
        if not soup:
            print(f"無法取得頁面內容，停止爬取")
            break
        
        links = soup.find_all('div', class_='title')

        for link in links:
            if link.a:  # 檢查 link.a 是否存在
                #print(f"Current link: {link.a['href']}")
                parse_post(link.a)

        paging_div = soup.find('div', class_='btn-group-paging')
        if paging_div:
            paging_buttons = paging_div.find_all('a')
            if len(paging_buttons) < 2:
                print("無法找到下一頁，停止爬取")
                break
            
            prev_page_link = paging_buttons[1].get('href')
            if not prev_page_link:
                print("無法取得上一頁的連結，停止爬取")
                break
            url = "https://www.ptt.cc" + prev_page_link
            print(prev_page_link)
        else:
            print("無法找到 btn-group-paging 元素，停止爬取")
            break
        
        # 確保最後一篇文章存在且有連結
        if links and links[-1].a:
            last_post = parse_post(links[-1].a)
            if last_post and datetime.strptime(last_post['post_time'], '%a %b %d %H:%M:%S %Y') < start_date:
                print("已超過7天，停止爬取")
                break
        
        page += 1

# 開始爬取八卦版文章
crawl_gossiping(base_url)


正在爬取第 1 頁...
跳過無效文章: https://www.ptt.cc/bbs/Gossiping/M.1725417932.A.55B.html
/bbs/Gossiping/index38851.html
跳過無效文章: https://www.ptt.cc/bbs/Gossiping/M.1725417932.A.55B.html
正在爬取第 2 頁...
/bbs/Gossiping/index38850.html
正在爬取第 3 頁...
/bbs/Gossiping/index38849.html
正在爬取第 4 頁...
/bbs/Gossiping/index38848.html
正在爬取第 5 頁...
/bbs/Gossiping/index38847.html
正在爬取第 6 頁...
/bbs/Gossiping/index38846.html
正在爬取第 7 頁...
/bbs/Gossiping/index38845.html
正在爬取第 8 頁...
/bbs/Gossiping/index38844.html
正在爬取第 9 頁...
/bbs/Gossiping/index38843.html
正在爬取第 10 頁...
/bbs/Gossiping/index38842.html
正在爬取第 11 頁...
/bbs/Gossiping/index38841.html
正在爬取第 12 頁...
/bbs/Gossiping/index38840.html
正在爬取第 13 頁...
/bbs/Gossiping/index38839.html
正在爬取第 14 頁...
/bbs/Gossiping/index38838.html
正在爬取第 15 頁...
/bbs/Gossiping/index38837.html
正在爬取第 16 頁...
/bbs/Gossiping/index38836.html
正在爬取第 17 頁...
/bbs/Gossiping/index38835.html
正在爬取第 18 頁...
/bbs/Gossiping/index38834.html
正在爬取第 19 頁...
/bbs/Gossiping/index38833.html
正在爬取第 20 頁...
/bbs/Gossi

In [3]:
import os
import re
import requests
from bs4 import BeautifulSoup

# 定義抓取每本書的詳細資料
def get_book_content(book_url):
    response = requests.get(book_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # 抓取標題、作者、時間、內文
    title = soup.find('h1').text if soup.find('h1') else "Unknown Title"
    author = soup.find('h2').text if soup.find('h2') else "Unknown Author"
    content_div = soup.find('div', {'class': 'chapter'})
    
    # 抓取出版時間
    time = "Unknown Time"
    time_tag = soup.find(string="Release Date:")
    if time_tag:
        time = time_tag.find_next().text
    
    # 抓取內文
    content = content_div.text if content_div else "No content available"
    
    return title, author, time, content

# 定義主爬取函數
def scrape_books(base_url, book_limit=200):
    response = requests.get(base_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # 找到書籍的所有鏈接
    book_links = soup.find_all('a', href=True)
    
    # 過濾出有效的電子書鏈接
    books = []
    for link in book_links:
        href = link['href']
        if href.startswith('/ebooks/'):
            books.append(f'https://www.gutenberg.org{href}')
        if len(books) >= book_limit:
            break

    # 逐本書爬取內容
    for book_url in books:
        title, author, time, content = get_book_content(book_url)
        
        # 清理標題中的非法字符
        safe_title = re.sub(r'[\\/*?:"<>|]', "", title)
        file_name = f"{safe_title}.txt"
        
        # 存為本地檔案
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write(f"Title: {title}\n")
            f.write(f"Author: {author}\n")
            f.write(f"Release Date: {time}\n")
            f.write("\n" + content)
        print(f"Saved: {file_name}")

# 執行爬取
scrape_books('https://www.gutenberg.org/browse/languages/zh')


Saved: Search.txt
Saved: Search.txt
Saved: Category Bookshelf.txt
Saved: Offline Catalogs and Feeds.txt
Saved: 豆棚閒話 by Ainajushi.txt
Saved: 戲中戲 by Aiyuezhuren.txt
Saved: 比目魚 by Aiyuezhuren.txt
Saved: 比目魚 by Aiyuezhuren.txt
Saved: Study of Inner Cultivation by Anonymous.txt
Saved: 三字經 by Anonymous.txt
Saved: 山水情 by Anonymous.txt
Saved: 山海經 by Anonymous.txt
Saved: 施公案 by Anonymous.txt
Saved: 施公案 by Anonymous.txt
Saved: 易經 by Anonymous.txt
Saved: 木蘭奇女傳 by Anonymous.txt
Saved: 海公案 by Anonymous.txt
Saved: 燕丹子 by Anonymous.txt
Saved: 狄公案 by Anonymous.txt
Saved: 百家姓 by Anonymous.txt
Saved: 禮記 by Anonymous.txt
Saved: 綠牡丹 by Anonymous.txt
Saved: 詩經 by Anonymous.txt
Saved: 麟兒報 by Anonymous.txt
Saved: Study of Inner Cultivation by Anonymous.txt
Saved: Hu Die Mei by Nanyuedaoren.txt
Saved: Qing Lou Meng by Tao Zor.txt
Saved: 天豹圖 by Unknown.txt
Saved: 梁公九諫 by Unknown.txt
Saved: 長恨歌 by Juyi Bai.txt
Saved: 李娃傳 by Xingjian Bai.txt
Saved: 玉樓春 by Baiyundaoren.txt
Saved: 漢書 by Gu Ban.txt
Saved: 引鳳蕭 by Ba