<a href="https://colab.research.google.com/github/41371113h-xian/114-1/blob/main/hw_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==========================================================
#  PTT NBA爬蟲 + Gradio互動介面（最完整Colab版本）
# ==========================================================
!pip -q install -U gradio==4.* requests beautifulsoup4 pandas openpyxl

import os, re, io, time, json, shutil, random, requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import gradio as gr

# Root URL of the PTT web front end. Index-page URLs are built from it, and
# the article hrefs scraped from index pages are appended to it.
BASE_URL = "https://www.ptt.cc"

# -----------------------------
# Session 與 Request設定
# -----------------------------
def make_session():
    """Create a ``requests`` session preconfigured for fetching PTT pages.

    The session sends a desktop Chrome ``User-Agent`` header and carries an
    ``over18`` cookie with value ``'1'`` scoped to the ``ptt.cc`` domain.

    Returns:
        requests.Session: The configured session, ready to be reused for
        every request of a scraping run.
    """
    session = requests.Session()
    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36"
    session.headers["User-Agent"] = user_agent
    session.cookies.set('over18', '1', domain='ptt.cc')
    return session

def http_get(s, url, retries=3, delay=1.2):
    """GET ``url`` through session ``s``, retrying on failure.

    A request counts as failed when it raises a ``requests`` exception or
    when the response status is anything other than 200. Between failed
    attempts the function sleeps ``delay`` seconds so the server is not
    hammered; no sleep happens after the final attempt.

    Args:
        s: Session-like object exposing ``get(url, timeout=...)``.
        url: Address to fetch.
        retries: Maximum number of attempts.
        delay: Seconds to wait between attempts.

    Returns:
        The first response whose ``status_code`` is 200, or ``None`` when
        every attempt failed.
    """
    for attempt in range(retries):
        try:
            r = s.get(url, timeout=10)
        except requests.RequestException:
            # Network-level failure (timeout, connection reset, ...):
            # treat it like a bad response and retry.
            r = None
        if r is not None and r.status_code == 200:
            return r
        # Back off before the next attempt, but not after the last one.
        if attempt < retries - 1:
            time.sleep(delay)
    return None

def get_index(s, board="NBA"):
    """Return the page number of the newest index page of ``board``.

    The board's ``index.html`` is fetched and the number is derived from the
    "previous page" (上頁) link: that link points at ``index<N>.html``, so the
    newest page is ``N + 1``.

    Args:
        s: Session used for the request (see ``make_session``).
        board: Board name as it appears in the URL.

    Returns:
        int: Index number of the newest page.

    Raises:
        RuntimeError: If the index page cannot be fetched, or if it contains
            no "previous page" link at all (e.g. the board does not exist).
    """
    url = f"{BASE_URL}/bbs/{board}/index.html"
    r = http_get(s, url)
    if r is None:
        raise RuntimeError(f"Unable to fetch index page: {url}")

    soup = BeautifulSoup(r.text, "html.parser")
    # A substring match covers any decoration around the label
    # (e.g. "‹ 上頁"), so a separate exact-string lookup is unnecessary.
    up = soup.find("a", string=re.compile("上頁"))
    if up is None:
        raise RuntimeError(f"No previous-page link found on {url}")

    match = re.search(r"index(\d+)\.html", up.get("href") or "")
    if match is None:
        # NOTE(review): presumably the link is rendered without an href when
        # the board has a single index page; treat that page as page 1.
        return 1
    return int(match.group(1)) + 1

# -----------------------------
# 抓取索引頁文章列表
# -----------------------------
def _parse_nrec(text):
    """Convert the push-count badge text of an index row into an integer."""
    if text == "爆":
        return 100
    if text.startswith("X"):
        tail = text[1:]
        if tail.isdigit():
            return -int(tail)
        # "XX" carries no digit; mirror the +100 used for "爆".
        return -100 if text == "XX" else 0
    return int(text) if text.isdigit() else 0


def _div_text(row, css_class):
    """Return the stripped text of ``row``'s child div, or "" if absent."""
    node = row.find("div", class_=css_class)
    return node.text.strip() if node is not None else ""


def parse_index(html):
    """Extract the article list from the HTML of one board index page.

    Each ``div.r-ent`` row becomes a dict. Rows whose title has no link
    get ``link=None`` and the raw title text.

    Args:
        html: HTML source of an index page.

    Returns:
        list[dict]: One dict per row with keys ``title``, ``link``,
        ``author``, ``nrec`` (int) and ``date``.
    """
    soup = BeautifulSoup(html, "html.parser")
    data = []
    for row in soup.find_all("div", class_="r-ent"):
        title_div = row.find("div", class_="title")
        anchor = title_div.find("a") if title_div is not None else None
        if anchor is not None:
            title = anchor.text.strip()
            link = BASE_URL + anchor["href"]
        else:
            # No anchor: keep whatever text the title cell holds, if any.
            title = title_div.text.strip() if title_div is not None else ""
            link = None
        data.append({
            "title": title,
            "link": link,
            "author": _div_text(row, "author"),
            "nrec": _parse_nrec(_div_text(row, "nrec")),
            "date": _div_text(row, "date"),
        })
    return data

# -----------------------------
# 抓取文章內文與推文
# -----------------------------
def parse_article(html):
    """Parse one article page into its header fields, body and comments.

    Args:
        html: HTML source of an article page.

    Returns:
        tuple: ``(title, author, time_str, content, pushes)`` where the
        first three come from the article header lines ("標題", "作者",
        "時間"; empty string when missing), ``content`` is the body text up
        to the first line consisting only of ``--``, and ``pushes`` is a
        list of dicts with keys ``tag``, ``user``, ``content`` and ``time``.
        ``content`` is "" when the page has no ``#main-content`` element.
    """
    soup = BeautifulSoup(html, "html.parser")

    meta = {}
    for line in soup.select('.article-metaline'):
        tag_el = line.select_one('.article-meta-tag')
        value_el = line.select_one('.article-meta-value')
        if tag_el is not None and value_el is not None:
            meta[tag_el.text] = value_el.text
    title = meta.get("標題", "")
    author = meta.get("作者", "")
    time_str = meta.get("時間", "")

    content = ""
    main = soup.find(id="main-content")
    if main is not None:
        # Drop the header lines so they do not leak into the body text.
        for header in main.select('.article-metaline, .article-metaline-right'):
            header.extract()
        text = main.get_text("\n", strip=False)
        # Cut at the first line that is exactly "--". Matching a whole line
        # keeps dashed rules such as "-----" inside the body from
        # truncating it.
        content = re.split(r"(?m)^--[ \t]*$", text, maxsplit=1)[0].strip()

    pushes = []
    selectors = ('.push-tag', '.push-userid', '.push-content', '.push-ipdatetime')
    for p in soup.select('.push'):
        parts = [p.select_one(sel) for sel in selectors]
        if any(el is None for el in parts):
            # Not a regular comment row (some ".push" nodes lack these
            # children); skip it instead of crashing.
            continue
        tag_el, user_el, content_el, time_el = parts
        cont = content_el.text.strip()
        # Remove only the leading ":" separator; colons that belong to the
        # comment itself (e.g. a trailing one) are preserved.
        if cont.startswith(':'):
            cont = cont[1:].strip()
        pushes.append({
            "tag": tag_el.text.strip(),
            "user": user_el.text.strip(),
            "content": cont,
            "time": time_el.text.strip(),
        })
    return title, author, time_str, content, pushes

# -----------------------------
# 主爬蟲邏輯
# -----------------------------
def scrape_ptt(board="NBA", pages=5, delay=1.0, fetch_detail=False,
               progress=gr.Progress(track_tqdm=True), out_dir="/content/ptt_data"):
    """Scrape the newest index pages of a PTT board and save them as CSV.

    Args:
        board: Board name; used in both the request URLs and the output
            file names, so it is validated before use.
        pages: Number of index pages to fetch, newest first.
        delay: Seconds to sleep after each page / article request.
        fetch_detail: When true, every listed article is also downloaded
            and its header, body and comments are collected.
        progress: Gradio progress tracker used to wrap the loops.
        out_dir: Directory the CSV files are written to (created if
            missing).

    Returns:
        tuple: ``(df, pushes_all)`` — a DataFrame with one row per listed
        article and a list of comment dicts (empty unless ``fetch_detail``).

    Raises:
        ValueError: If ``board`` contains characters outside letters,
            digits, ``_``, ``-`` and ``.`` or does not start with a letter
            or digit.
    """
    board = str(board).strip()
    # The board name ends up in a file path and a URL and may come from a
    # publicly shared UI, so refuse anything that could smuggle in path
    # separators or other URL syntax.
    if not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9_.\-]*", board):
        raise ValueError(f"Invalid board name: {board!r}")
    # UI sliders may deliver a float; range() needs an int.
    pages = int(pages)

    s = make_session()
    cur = get_index(s, board)
    results, pushes_all = [], []

    # Never walk past page 1 when the board has fewer pages than requested.
    stop = max(cur - pages, 0)
    for i in progress.tqdm(range(cur, stop, -1), desc="抓取索引頁"):
        html = http_get(s, f"{BASE_URL}/bbs/{board}/index{i}.html")
        if not html:
            continue
        data = parse_index(html.text)
        for d in data:
            d["board"], d["page_index"] = board, i
        results.extend(data)
        time.sleep(delay)

    df = pd.DataFrame(results)
    if fetch_detail:
        for i, row in progress.tqdm(df.iterrows(), total=len(df), desc="抓取文章內容"):
            # Rows without a link have no article page to download.
            if not row.link:
                continue
            art = http_get(s, row.link)
            if not art:
                continue
            title, author, time_full, content, pushes = parse_article(art.text)
            df.loc[i, "title_full"] = title
            df.loc[i, "author_full"] = author
            df.loc[i, "time_full"] = time_full
            df.loc[i, "content"] = content
            for p in pushes:
                p["title"], p["board"] = title, board
            pushes_all.extend(pushes)
            time.sleep(delay)

    # Persist the results; utf-8-sig adds a BOM to the CSV files.
    os.makedirs(out_dir, exist_ok=True)
    df.to_csv(os.path.join(out_dir, f"{board}_articles.csv"), index=False, encoding="utf-8-sig")
    if pushes_all:
        pd.DataFrame(pushes_all).to_csv(
            os.path.join(out_dir, f"{board}_pushes.csv"), index=False, encoding="utf-8-sig"
        )
    return df, pushes_all

# -----------------------------
# Gradio 介面
# -----------------------------
def run_scraper(board, pages, delay, fetch_detail, progress=gr.Progress(track_tqdm=True)):
    """Gradio click handler: run the scraper and build the UI outputs.

    Args:
        board: Board name from the textbox.
        pages: Number of index pages from the slider.
        delay: Per-request delay in seconds from the slider.
        fetch_detail: Whether article bodies and comments are fetched.
        progress: Progress tracker. Declaring it as a defaulted parameter of
            the event handler is the documented way to have Gradio supply
            it; it is forwarded to ``scrape_ptt``.

    Returns:
        tuple: ``(preview, msg)`` — the first 300 rows of the article table
        and a status message.

    Raises:
        gr.Error: If scraping fails, so the failure reason is shown in the
            UI.
    """
    try:
        df, pushes = scrape_ptt(board, pages, delay, fetch_detail, progress=progress)
    except Exception as exc:
        # Boundary between the scraper and the UI: re-raise as gr.Error so
        # the message is displayed to the user.
        raise gr.Error(f"爬取失敗：{exc}") from exc

    msg = f"✅ 已完成爬取 {board} 看板，共 {len(df)} 筆文章"
    if fetch_detail:
        msg += f"，共擷取 {len(pushes)} 筆推文。"
    msg += "\n檔案已儲存於 /content/ptt_data/"
    return df.head(300), msg

# Build the Gradio UI: four input controls, a start button, and two outputs
# (a table preview and a status message).
with gr.Blocks(title="PTT NBA爬蟲（Colab最完整版本）") as demo:
    gr.Markdown("## 🏀 PTT 爬蟲互動介面（Colab版）\n可抓取標題、作者、連結、推文數與內文。")
    # Inputs, in the same order as run_scraper's parameters.
    board = gr.Textbox(value="NBA", label="看板名稱")
    pages = gr.Slider(1, 30, value=5, step=1, label="抓取頁數（最新往前）")
    delay = gr.Slider(0.5, 3.0, value=1.0, step=0.1, label="每頁延遲（秒）")
    fetch_detail = gr.Checkbox(value=False, label="是否抓取文章內文與推文")
    btn = gr.Button("開始爬取 🚀")
    # Outputs, in the same order as run_scraper's return values.
    output_df = gr.Dataframe(label="文章清單（前300筆）")
    output_msg = gr.Markdown()
    # Clicking the button runs run_scraper with the four inputs above.
    btn.click(fn=run_scraper, inputs=[board, pages, delay, fetch_detail], outputs=[output_df, output_msg])

# Colab launch settings: render the app inline in the cell and request a
# public share link.
demo.queue()
# NOTE(review): server_name="0.0.0.0" presumably binds to all network
# interfaces — confirm this is needed alongside share=True.
demo.launch(inline=True, share=True, server_name="0.0.0.0")


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://6469d899f4c781febe.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


