<a href="https://colab.research.google.com/github/41371232H/PL_Repo/blob/main/HW4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# HW4
## 1. 爬蟲資料收集與存儲
- 抓取 PTT 電影版文章的標題、作者、日期與連結
- 資料自動寫入 Google Sheet（「爬蟲資料」工作表）
- 可設定起始頁與抓取頁數，避免爬取過多資料導致延遲

## 2. 詞頻與關鍵詞統計
- 從 Google Sheet 讀取爬取資料
- 使用 jieba 中文分詞
- 計算每個詞的 TF-IDF 平均權重
- 支援停用詞過濾，排除高頻無意義詞
- 前 N 熱門詞結果自動回寫 Google Sheet（「關鍵詞統計」工作表）

## 3. AI 洞察生成
- 使用 Google Gemini 模型（如 gemini-2.0-flash）生成分析摘要
- 統一產生 5 句洞察摘要與一段約 120 字的結論
- API Key 於執行時動態輸入，不寫入程式碼，以保護金鑰安全

## 4. Gradio 介面
- 分頁 1：爬蟲與資料顯示——可設定起始頁與抓取頁數，並顯示抓取到的文章資料
- 分頁 2：熱門詞統計——選擇前 N 名熱門詞，顯示 TF-IDF 統計結果
- 分頁 3：AI 洞察與結論——輸入 API Key 後一鍵生成分析摘要與結論，介面直覺、易於使用

### 試算表連結：<https://docs.google.com/spreadsheets/d/1ZrA1VIiUl2lJ1SXIdSPq1jRvIyqI1LsbiJ77BoVb8Tg/edit?gid=742427183#gid=742427183>

# 最終程式碼

In [57]:
!pip install --quiet gspread google-auth google-generativeai gradio jieba scikit-learn beautifulsoup4 requests
import requests
from bs4 import BeautifulSoup
import re
import jieba
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
import gspread
from google.colab import auth
from google.auth import default
import gradio as gr
import os
import json
import time

# ------------------------------------------------------------
# Google Sheets: worksheet names, authorisation and spreadsheet handle
# ------------------------------------------------------------
RAW_SHEET_NAME = '爬蟲資料'
STAT_SHEET_NAME = '關鍵詞統計'
SHEET_URL = 'https://docs.google.com/spreadsheets/d/1ZrA1VIiUl2lJ1SXIdSPq1jRvIyqI1LsbiJ77BoVb8Tg/edit#gid=0'

# Colab OAuth flow, then a gspread client limited to the Sheets scope.
auth.authenticate_user()
creds, _ = default(scopes=['https://www.googleapis.com/auth/spreadsheets'])
gc = gspread.authorize(creds)
sh = gc.open_by_url(SHEET_URL)

# ------------------------------------------------------------
# Stop words excluded from the keyword statistics
# ------------------------------------------------------------
stopwords = {
    '的', '了', '是', '在', '我', '你', '他', '她', '之', '一個', '和', '討論', '分享',
}

# ------------------------------------------------------------
# HTTP headers used by the PTT crawler (browser-like User-Agent)
# ------------------------------------------------------------
headers = {
    'User-Agent': ' '.join([
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    ]),
}

def extract_index_split(url):
    """Parse the page number out of a PTT listing URL.

    The number is the run of digits between the LAST ``index`` marker and the
    following ``.html`` (e.g. ``.../index10818.html`` -> ``10818``). Using the
    last marker keeps the parse correct when ``index`` also appears earlier in
    the path.

    Args:
        url: Listing page URL.

    Returns:
        The page number as an int, or None when ``url`` is not a string, has
        no ``index`` marker, or the text after it is not a plain run of ASCII
        digits (as with the un-numbered ``index.html``).
    """
    if not isinstance(url, str):
        return None
    _, marker, tail = url.rpartition('index')
    if not marker:
        return None
    digits = tail.split('.html', 1)[0]
    # int() alone would also accept '-5', '1_000' or ' 12 ', none of which
    # is a valid page number.
    if not (digits.isascii() and digits.isdigit()):
        return None
    return int(digits)

def get_last_index(url):
    """Work out the page number of the newest listing page of a board.

    The un-numbered ``index.html`` does not reveal its own number, so it is
    derived from the second link of the paging bar (the "previous page"
    button): newest = previous + 1.

    Args:
        url: URL of the listing page to inspect.

    Returns:
        The page number as an int, or None when the response is not OK, the
        paging bar is missing, or its second link carries no numbered href.

    Raises:
        requests.RequestException: connection errors and timeouts propagate.
    """
    response = requests.get(url, headers=headers, timeout=5)
    if not response.ok:
        return None
    soup = BeautifulSoup(response.text, 'html.parser')
    btns = soup.find('div', class_='btn-group btn-group-paging')
    if btns is None:
        return None
    links = btns.find_all('a')
    if len(links) < 2:
        return None
    # .get() instead of ['href']: a button without a link has no href attribute.
    prev_link = links[1].get('href')
    if not prev_link:
        return None
    # The dot is escaped so only a literal ".html" suffix matches.
    match = re.search(r'index(\d+)\.html', prev_link)
    if match:
        return int(match.group(1)) + 1
    return None

def fetch_ptt_articles(start_url, pages=10):
    """Crawl article metadata from consecutive PTT movie-board listing pages.

    The page number is taken from ``start_url`` (or, when it has none, looked
    up with ``get_last_index``); the crawler then walks towards lower page
    numbers. Pages are always requested from the movie board (``BASE_URL``).

    Args:
        start_url: Listing page URL to start from.
        pages: Number of listing pages to fetch. Coerced with ``int()`` so
            that float values (e.g. from a UI slider) are accepted.

    Returns:
        A list of dicts with keys ``title``, ``author``, ``date`` and
        ``href``. Entries without a link (deleted posts) keep the text shown
        in the title cell and get ``href == "N/A"``. Returns an empty list
        when the starting page number cannot be determined.

    Raises:
        requests.RequestException: connection errors and timeouts propagate.
    """
    articles_data = []
    START_INDEX = extract_index_split(start_url)
    if START_INDEX is None:
        START_INDEX = get_last_index(start_url)
    if START_INDEX is None:
        print("⚠️ 無法取得起始頁頁碼")
        return []

    BASE_URL = "https://www.ptt.cc/bbs/movie/index"
    # range() needs integers; clamp at 0 so that no URL with a zero or
    # negative page number is ever requested.
    stop_index = max(START_INDEX - int(pages), 0)
    for idx in range(START_INDEX, stop_index, -1):
        url = f"{BASE_URL}{idx}.html"
        response = requests.get(url, headers=headers, timeout=5)
        if not response.ok:
            continue  # error pages contain no article rows
        soup = BeautifulSoup(response.text, 'html.parser')
        for article in soup.find_all('div', class_='r-ent'):
            title_div = article.find('div', class_='title')
            if title_div is None:
                continue  # malformed row: nothing to record
            title_tag = title_div.find('a')
            link = title_tag.get('href') if title_tag is not None else None
            if link:
                title = title_tag.text.strip()
                href = "https://www.ptt.cc" + link
            else:
                title = title_div.text.strip()
                href = "N/A"
            author_div = article.find('div', class_='author')
            date_div = article.find('div', class_='date')
            articles_data.append({
                'title': title,
                'author': author_div.text.strip() if author_div is not None else "",
                'date': date_div.text.strip() if date_div is not None else "",
                'href': href
            })
    return articles_data

# ============================================================
# 📊 Google Sheet 操作
# ============================================================
def write_articles_to_sheet(articles_data):
    """Replace the contents of the raw-data worksheet with the given articles.

    The worksheet named ``RAW_SHEET_NAME`` is cleared (or created when it does
    not exist) and filled with a header row followed by one row per article.

    Args:
        articles_data: Iterable of dicts with keys ``title``, ``author``,
            ``date`` and ``href``.
    """
    try:
        worksheet = sh.worksheet(RAW_SHEET_NAME)
        worksheet.clear()
    except gspread.WorksheetNotFound:
        worksheet = sh.add_worksheet(title=RAW_SHEET_NAME, rows="100", cols="20")
    rows = [['標題', '作者', '日期', '連結']]
    rows.extend([a['title'], a['author'], a['date'], a['href']] for a in articles_data)
    # A single batched request instead of one API call per row: faster, and
    # the number of write requests no longer grows with the crawl size.
    worksheet.append_rows(rows)

def read_articles_from_sheet():
    """Load every row of the raw-data worksheet as a list of dicts.

    Returns:
        The result of ``get_all_records()`` for the worksheet named
        ``RAW_SHEET_NAME``, or an empty list when that worksheet does not
        exist.
    """
    records = []
    try:
        records = sh.worksheet(RAW_SHEET_NAME).get_all_records()
    except gspread.WorksheetNotFound:
        pass
    return records

def write_keywords_to_sheet(keywords):
    """Replace the contents of the keyword-statistics worksheet.

    The worksheet named ``STAT_SHEET_NAME`` is cleared (or created when it
    does not exist) and filled with a header row followed by one row per
    keyword.

    Args:
        keywords: Iterable of ``(word, weight)`` pairs.
    """
    try:
        worksheet = sh.worksheet(STAT_SHEET_NAME)
        worksheet.clear()
    except gspread.WorksheetNotFound:
        worksheet = sh.add_worksheet(title=STAT_SHEET_NAME, rows="100", cols="20")
    rows = [['詞彙', '平均 TF-IDF']]
    # Plain str/float so the payload serialises regardless of the numeric
    # type the caller passes in.
    rows.extend([str(w), float(v)] for w, v in keywords)
    # A single batched request instead of one API call per keyword.
    worksheet.append_rows(rows)

# ============================================================
# 🧠 TF-IDF 分析
# ============================================================
def get_top_keywords(records, top_n=10):
    """Rank title words by their mean TF-IDF weight over all records.

    Each title (key ``'標題'``) is stripped of punctuation and segmented with
    jieba. Tokens that are one character long or listed in ``stopwords`` are
    dropped, and the remaining tokens are vectorised with ``TfidfVectorizer``.

    Args:
        records: Iterable of dicts holding the title under ``'標題'``. Values
            are converted with ``str()`` because a sheet reader may return
            numbers for numeric-looking cells.
        top_n: How many words to return. Coerced with ``int()`` so float
            values (e.g. from a UI slider) are accepted.

    Returns:
        A list of ``(word, mean_weight)`` tuples with plain ``str``/``float``
        members, sorted by descending weight. Returns an empty list when no
        usable token is left.
    """
    document_list = []
    for r in records:
        text = str(r['標題'])
        cleaned = re.sub(r'[^\w\s]', '', text)
        tokens = (w.strip() for w in jieba.lcut(cleaned, cut_all=False))
        filtered = [w for w in tokens if len(w) > 1 and w not in stopwords]
        document_list.append(" ".join(filtered))

    if not any(document_list):
        return []

    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform(document_list)
    except ValueError:
        # Raised when the vectorizer ends up with an empty vocabulary.
        return []
    feature_names = vectorizer.get_feature_names_out()

    # Column sums give each word's total weight across all documents.
    num_docs = len(document_list)
    column_sums = tfidf_matrix.toarray().sum(axis=0)
    avg_scores = [
        (str(word), float(total) / num_docs)
        for word, total in zip(feature_names, column_sums)
    ]
    avg_scores.sort(key=lambda item: item[1], reverse=True)
    return avg_scores[:int(top_n)]

# ============================================================
# 🤖 Gemini API 洞察生成
# ============================================================
def generate_insights(keywords, user_api_key):
    """Ask the Gemini API for an insight summary of the given keywords.

    Args:
        keywords: Sequence of ``(word, weight)`` pairs; only the words are
            put into the prompt.
        user_api_key: Gemini API key supplied by the user. Surrounding
            whitespace is ignored.

    Returns:
        The generated text on success, otherwise a human-readable message
        (missing key, non-200 response, request failure or unexpected
        response shape). This function does not raise for API failures.
    """
    api_key = (user_api_key or "").strip()
    if not api_key:
        return "⚠️ 請輸入有效的 Gemini API Key。"

    GEMINI_API_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"
    prompt = (
        f"以下是 PTT 電影版熱門關鍵詞：{', '.join([w for w,_ in keywords])}\n"
        f"請用中文生成：\n"
        f"1. 五句洞察摘要（每句以「•」開頭）\n"
        f"2. 一段約 120 字的結論。"
    )
    # The key is sent as a header, not as a query parameter: exception
    # messages from the HTTP client include the request URL, and those
    # messages are returned to the UI below.
    request_headers = {
        "Content-Type": "application/json",
        "x-goog-api-key": api_key,
    }
    data = {"contents": [{"parts": [{"text": prompt}]}]}

    try:
        response = requests.post(GEMINI_API_URL, headers=request_headers, json=data, timeout=60)
    except requests.RequestException as e:
        return f"❌ API 呼叫失敗：{e}"

    if response.status_code != 200:
        return f"❌ API 錯誤: {response.status_code} - {response.text}"

    try:
        result = response.json()
        return result['candidates'][0]['content']['parts'][0]['text']
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # Body was not JSON or did not contain a text candidate.
        return f"❌ API 呼叫失敗：{e}"

# ============================================================
# ⚙️ Gradio 多分頁介面（限制爬蟲頁數）
# ============================================================
# Three-tab Gradio UI: crawl -> keyword statistics -> AI summary.
with gr.Blocks() as demo:
    gr.Markdown("## 🎬 PTT 電影版 關鍵詞分析系統 (多分頁介面)")

    # Tab 1: crawl listing pages and store the rows in the raw-data sheet.
    with gr.Tab("爬蟲結果"):
        start_url_input = gr.Textbox(label="PTT 起始頁網址", value="https://www.ptt.cc/bbs/movie/index.html")
        page_slider = gr.Slider(1, 5, value=1, step=1, label="爬取頁數（建議 1～3 頁）")
        run_button = gr.Button("🚀 執行爬蟲")
        articles_output = gr.Dataframe(headers=["標題", "作者", "日期", "連結"], label="爬蟲結果")

        def run_crawler(start_url, pages):
            """Crawl ``pages`` pages, save them to the sheet and return table rows.

            Returns an empty table (with a visible warning) when the crawl
            fails or yields nothing, so the user is never left without
            feedback.
            """
            pages = int(pages)  # slider values may arrive as floats
            gr.Info(f"開始爬取 {pages} 頁資料，請稍候...")
            try:
                articles = fetch_ptt_articles(start_url, pages=pages)
            except requests.RequestException as e:
                gr.Warning(f"⚠️ 爬蟲連線失敗：{e}")
                return []
            if not articles:
                gr.Warning("⚠️ 沒有抓到任何文章，請確認起始頁網址。")
                return []
            write_articles_to_sheet(articles)
            data = [[a['title'], a['author'], a['date'], a['href']] for a in articles]
            gr.Info(f"✅ 成功爬取 {len(data)} 筆文章！")
            return data

        run_button.click(run_crawler, inputs=[start_url_input, page_slider], outputs=articles_output)

    # Tab 2: TF-IDF keyword ranking computed from the rows stored in the sheet.
    with gr.Tab("熱門詞分析"):
        top_n_input = gr.Slider(1, 20, value=10, step=1, label="熱門詞前 N 名")
        run_tfidf_button = gr.Button("📊 執行分析")
        keywords_output = gr.Dataframe(headers=["詞彙", "平均 TF-IDF"], label="TF-IDF 結果")

        def run_tfidf(top_n):
            """Compute the top keywords, save them to the sheet and return table rows."""
            gr.Info("📈 正在進行 TF-IDF 分析...")
            records = read_articles_from_sheet()
            if not records:
                gr.Warning("⚠️ 尚未有爬蟲資料，請先到第一頁執行爬蟲！")
                return []
            top_keywords = get_top_keywords(records, top_n=int(top_n))
            if not top_keywords:
                gr.Warning("⚠️ 找不到可分析的詞彙。")
                return []
            write_keywords_to_sheet(top_keywords)
            gr.Info("✅ 分析完成！")
            return [[word, weight] for word, weight in top_keywords]

        run_tfidf_button.click(run_tfidf, inputs=top_n_input, outputs=keywords_output)

    # Tab 3: send the top-10 keywords to Gemini using the key typed by the user.
    with gr.Tab("AI 洞察摘要"):
        api_key_input = gr.Textbox(label="🔑 請輸入你的 Gemini API Key", type="password", placeholder="AIza 或 g- 開頭的金鑰")
        run_ai_button = gr.Button("✨ 生成摘要")
        ai_output = gr.Textbox(label="AI 洞察摘要 + 結論", lines=12)

        def run_ai(api_key):
            """Return the Gemini summary text for the top-10 keywords of the stored rows."""
            gr.Info("🤖 正在請求 Gemini API...")
            records = read_articles_from_sheet()
            if not records:
                gr.Warning("⚠️ 尚未有爬蟲資料，請先到第一頁執行爬蟲！")
                return "⚠️ 尚無資料"
            top_keywords = get_top_keywords(records, top_n=10)
            insights = generate_insights(top_keywords, api_key)
            return insights

        run_ai_button.click(run_ai, inputs=api_key_input, outputs=ai_output)

demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a096c38c2fd1acbd3e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




# 歷程記錄

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Accumulator for the crawled article dicts; the crawl loop in a later cell appends to it.
articles_data = []

In [None]:
# Browser-like User-Agent passed as `headers=` to the requests.get() calls in later cells.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

In [None]:
def get_previous_page_url(soup):
    """Return the absolute URL of the "previous page" (older articles) link.

    Args:
        soup: Parsed PTT listing page (an object offering ``find``, as a
            BeautifulSoup document does).

    Returns:
        The absolute URL as a string, or None when the paging bar is missing,
        holds fewer than two buttons, or the previous-page button carries no
        usable link (no ``href``, or just ``'#'``).
    """
    paging_div = soup.find('div', class_='btn-group btn-group-paging')
    if paging_div is None:
        return None
    # Button order in the paging bar: oldest (0), previous (1), next (2), newest (3).
    buttons = paging_div.find_all('a')
    if len(buttons) < 2:
        return None
    href = buttons[1].get('href')
    if not href or href == '#':
        return None
    return "https://www.ptt.cc" + href

In [None]:
# Download the movie board's index.html and parse it.
responseIndex = requests.get("https://www.ptt.cc/bbs/movie/index.html", headers=headers, timeout=5)
html_contentIndex = responseIndex.text
soupIndex = BeautifulSoup(html_contentIndex, 'html.parser')

In [None]:
# URL of the numbered page linked by the "previous page" button; the crawl starts from it.
current_url = get_previous_page_url(soupIndex)

In [None]:
current_url

'https://www.ptt.cc/bbs/movie/index10818.html'

In [None]:
import re

def extract_index_split(url):
    """Parse the page number out of a PTT listing URL by string splitting.

    Steps:
    1. Take the text after the LAST 'index' marker, e.g. '10808.html'.
    2. Cut it at '.html', leaving '10808'.
    3. Convert the digits to an integer.

    Using the last marker keeps the parse correct when 'index' also appears
    earlier in the path.

    Args:
        url: Listing page URL (a string).

    Returns:
        The page number as an int. Returns None, after printing an error
        message, when the URL has no 'index' marker or the extracted text is
        not a plain run of ASCII digits.
    """
    _, marker, tail = url.rpartition('index')
    if not marker:
        print("錯誤: 網址結構不符合預期 (缺少 'index' 或 '.html')")
        return None
    index_str = tail.split('.html')[0]
    # int() alone would also accept '-5', '1_000' or ' 12 ', none of which
    # is a valid page number.
    if not (index_str.isascii() and index_str.isdigit()):
        print("錯誤: 提取到的內容無法轉換為數字")
        return None
    return int(index_str)

In [None]:
# Page number to start from, parsed from the URL obtained above
START_INDEX = extract_index_split(current_url)
# Number of listing pages to fetch
PAGES_TO_FETCH = 10

# Common URL prefix of the PTT movie-board listing pages
BASE_URL = "https://www.ptt.cc/bbs/movie/index"

# Exclusive lower bound of the loop. range(start, stop, step) never yields
# `stop`, so with START_INDEX = 10808 and 10 pages the bound is 10798 and the
# last page fetched is 10799.
stop_index = START_INDEX - PAGES_TO_FETCH

print(f"--- 正在生成從 {START_INDEX} 到 {stop_index + 1} 的 {PAGES_TO_FETCH} 個網址 ---")

# Count down from START_INDEX to stop_index + 1
for index in range(START_INDEX, stop_index, -1):
    # Build the full page URL, then download and parse it
    url = f"{BASE_URL}{index}.html"
    print(url)
    response = requests.get(url, headers=headers, timeout=5)
    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')
    article_list = soup.find_all('div', class_='r-ent')
    for article in article_list:
        # Title text and link live inside <div class="title">
        title_tag = article.find('div', class_='title').find('a')

        # A title cell without a link is treated as a deleted/inaccessible post
        if title_tag:
            title = title_tag.text.strip()
            href = "https://www.ptt.cc" + title_tag['href']
        else:
            # Keep whatever text the title cell shows and mark the link as unavailable
            title = article.find('div', class_='title').text.strip()
            href = "N/A (已刪除或不可存取)"

        # Author is in <div class="author">
        author = article.find('div', class_='author').text.strip()

        # Date is in <div class="date">
        date = article.find('div', class_='date').text.strip()

        # Store the record in the shared accumulator
        articles_data.append({
            'title': title,
            'date': date,
            'author': author,
            'href': href
        })

--- 正在生成從 10818 到 10809 的 10 個網址 ---
https://www.ptt.cc/bbs/movie/index10818.html
https://www.ptt.cc/bbs/movie/index10817.html
https://www.ptt.cc/bbs/movie/index10816.html
https://www.ptt.cc/bbs/movie/index10815.html
https://www.ptt.cc/bbs/movie/index10814.html
https://www.ptt.cc/bbs/movie/index10813.html
https://www.ptt.cc/bbs/movie/index10812.html
https://www.ptt.cc/bbs/movie/index10811.html
https://www.ptt.cc/bbs/movie/index10810.html
https://www.ptt.cc/bbs/movie/index10809.html


In [None]:
articles_data

[{'title': '[問片] 一部電影開頭有紙鈔上的人頭講話',
  'date': '10/24',
  'author': 'wch1995',
  'href': 'https://www.ptt.cc/bbs/movie/M.1761261468.A.9C4.html'},
 {'title': '[情報] 鏈鋸人劇場版蕾潔篇 爛番茄100',
  'date': '10/24',
  'author': 'vestal',
  'href': 'https://www.ptt.cc/bbs/movie/M.1761270564.A.7B9.html'},
 {'title': '[討論] 周星馳為何這麼欣賞羅志祥？',
  'date': '10/24',
  'author': 'DiCaprio',
  'href': 'https://www.ptt.cc/bbs/movie/M.1761283533.A.798.html'},
 {'title': '[討論] 康斯坦汀:驅魔神探12/05重返大銀幕',
  'date': '10/24',
  'author': 'smilekrtc',
  'href': 'https://www.ptt.cc/bbs/movie/M.1761283695.A.0CA.html'},
 {'title': 'Re: [討論] 大家有哪些心中非常喜歡的電影主題曲？',
  'date': '10/24',
  'author': 'Allen0820',
  'href': 'https://www.ptt.cc/bbs/movie/M.1761286588.A.1E9.html'},
 {'title': '[  有雷] 大婢咒好看',
  'date': '10/24',
  'author': 'ej200078914',
  'href': 'https://www.ptt.cc/bbs/movie/M.1761289653.A.6FA.html'},
 {'title': '[好雷] 鏈鋸人蕾潔篇-大巨蛋秀泰ULTRA 4DX',
  'date': '10/24',
  'author': 'ljw155299',
  'href': 'https://www.ptt.cc/bbs/movie/M

In [11]:
# Sample sentence for the segmentation demo below: the title of the second crawled article.
sentence = articles_data[1]['title']

In [12]:
import jieba

# Show the sentence that is about to be segmented
print(f"原始句子: {sentence}\n")
print("-" * 40)

# --- 1. Precise mode (cut_all=False) ---
# Splits the sentence into one non-overlapping sequence of words.
print("模式一：精確模式 (jieba.cut)")
# jieba.cut returns a lazy iterable, consumed once by the join below
seg_list_precise = jieba.cut(sentence, cut_all=False)
# Join the tokens with '/ ' so the boundaries are visible in the output
print(f"分詞結果: {'/ '.join(seg_list_precise)}")
print("-" * 40)


# --- 2. Full mode (cut_all=True) ---
# Emits every word the dictionary can find, so results may overlap.
print("模式二：全模式 (jieba.cut(..., cut_all=True))")
seg_list_all = jieba.cut(sentence, cut_all=True)
print(f"分詞結果: {'/ '.join(seg_list_all)}")
print("-" * 40)


# --- 3. Search-engine mode ---
# Precise mode plus a further split of long words; intended for building search indexes.
print("模式三：搜尋引擎模式 (jieba.cut_for_search)")
seg_list_search = jieba.cut_for_search(sentence)
print(f"分詞結果: {'/ '.join(seg_list_search)}")
print("-" * 40)


# --- 4. Segmentation with part-of-speech tags ---
# jieba.posseg yields (word, flag) pairs, e.g. n = noun, v = verb
import jieba.posseg as pseg
print("模式四：分詞與詞性標註 (jieba.posseg)")
words = pseg.cut(sentence)
result = []
for word, flag in words:
    result.append(f"{word}/{flag}")

print(f"分詞結果: {' '.join(result)}")
print("-" * 40)


  re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
  re_skip_default = re.compile("(\r\n|\s)", re.U)
  re_skip = re.compile("([a-zA-Z0-9]+(?:\.\d+)?%?)")
Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...


原始句子: [情報] 鏈鋸人劇場版蕾潔篇 爛番茄100

----------------------------------------
模式一：精確模式 (jieba.cut)


Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.541 seconds.
DEBUG:jieba:Loading model cost 0.541 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.
  re_skip_detail = re.compile("([\.0-9]+|[a-zA-Z0-9]+)")
  re_han_internal = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)")
  re_skip_internal = re.compile("(\r\n|\s)")
  re_num = re.compile("[\.0-9]+")


分詞結果: [/ 情報/ ]/  / 鏈鋸人/ 劇場/ 版蕾潔/ 篇/  / 爛/ 番茄/ 100
----------------------------------------
模式二：全模式 (jieba.cut(..., cut_all=True))
分詞結果: [/ 情/ 報/ ]/  / / 鏈/ 鋸/ 人/ 劇/ 場/ 版/ 蕾/ 潔/ 篇/ /  / / 爛/ 番茄/ 100
----------------------------------------
模式三：搜尋引擎模式 (jieba.cut_for_search)
分詞結果: [/ 情報/ ]/  / 鏈鋸人/ 劇場/ 版蕾潔/ 篇/  / 爛/ 番茄/ 100
----------------------------------------
模式四：分詞與詞性標註 (jieba.posseg)
分詞結果: [/x 情報/n ]/x  /x 鏈鋸人/n 劇場/n 版蕾潔篇/n  /x 爛/zg 番茄/n 100/m
----------------------------------------


In [13]:
from collections import Counter

# 1. Counter that accumulates the frequency of every token
word_counts = Counter()

# 2. Look at the articles at indices 1..10 (range's stop value is exclusive)
for i in range(1, 11):
    # Skip indices beyond the crawled data and records without a title
    if i < len(articles_data) and 'title' in articles_data[i]:
        # Title text of this article
        title_text = articles_data[i]['title']

        # --- Text cleaning ---
        # Remove every character that is neither a word character (\w) nor
        # whitespace (\s), which strips most punctuation.
        cleaned_text = re.sub(r'[^\w\s]', '', title_text) # drop most punctuation

        # Segment in precise mode; jieba.cut returns a lazy iterable that is
        # passed straight to Counter.update below.
        words = jieba.cut(cleaned_text, cut_all=False)

        # Counter.update accepts any iterable of tokens
        word_counts.update(words)

# 3. Drop noise: keep only tokens longer than one character after stripping
final_word_counts = Counter()
for word, count in word_counts.items():
    # Filters out empty strings, whitespace and single characters
    if word.strip() and len(word.strip()) > 1:
        final_word_counts[word] = count

# 4. Print the ten most frequent tokens
print("--- 詞頻統計結果 (前 10 名) ---")
for word, count in final_word_counts.most_common(10):
    print(f"'{word}': {count} 次")

--- 詞頻統計結果 (前 10 名) ---
'討論': 4 次
'周星馳': 2 次
'為何': 2 次
'這麼': 2 次
'欣賞羅志祥': 2 次
'Re': 2 次
'情報': 1 次
'鏈鋸人': 1 次
'劇場': 1 次
'版蕾潔': 1 次


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

# A small stop-word list: words that occur very often but say little about
# an article's topic. The TF-IDF preparation in the next cell filters them out.
stopwords = set(['的', '了', '是', '在', '我', '你', '他', '她', '之', '一個', '和', '討論', '分享'])

In [15]:
# One space-joined token string per article title; input for TfidfVectorizer
document_list = []

# Articles at indices 1..10 (inclusive)
for i in range(1, 11):
    if i < len(articles_data) and 'title' in articles_data[i]:
        title_text = articles_data[i]['title']

        # Text cleaning: remove everything that is neither a word character nor whitespace
        cleaned_text = re.sub(r'[^\w\s]', '', title_text)

        # Segment with jieba; lcut() returns a list directly
        words = jieba.lcut(cleaned_text, cut_all=False)

        # Drop stop words, blanks and single-character tokens
        filtered_words = [
            word.strip()
            for word in words
            if word.strip() and len(word.strip()) > 1 and word.strip() not in stopwords
        ]

        # TfidfVectorizer expects each document as one string
        document = " ".join(filtered_words)
        document_list.append(document)

# document_list now holds one processed title string per article

In [16]:
# 1. Create the vectorizer. TfidfVectorizer combines two steps:
#    a. turning documents into a term-count matrix (CountVectorizer)
#    b. turning counts into TF-IDF weights (TfidfTransformer)
vectorizer = TfidfVectorizer()

# 2. Fit and transform: tfidf_matrix is a sparse matrix holding the TF-IDF
#    weight of every term in every document
tfidf_matrix = vectorizer.fit_transform(document_list)

# 3. Vocabulary (feature names), in column order
feature_names = vectorizer.get_feature_names_out()

# 4. Dense NumPy array, easier to iterate over and inspect
tfidf_array = tfidf_matrix.toarray()

In [17]:
# Maps each term to its mean TF-IDF weight over all documents
avg_tfidf_scores = defaultdict(float)

# First sum the weights per term across all documents ...
for doc_weights in tfidf_array:
    # ... column i of a row belongs to feature_names[i]
    for i, weight in enumerate(doc_weights):
        word = feature_names[i]
        avg_tfidf_scores[word] += weight

# ... then divide by the number of documents to get the mean
num_documents = len(document_list)
for word in avg_tfidf_scores:
    avg_tfidf_scores[word] /= num_documents

# Sort by weight, highest first
sorted_avg_tfidf = sorted(avg_tfidf_scores.items(), key=lambda item: item[1], reverse=True)

print("--- 整個文檔集合中詞彙的 TF-IDF 平均權重 (前 10 名) ---")
for word, avg_weight in sorted_avg_tfidf[:10]:
    print(f"'{word}': {avg_weight:.4f}")

--- 整個文檔集合中詞彙的 TF-IDF 平均權重 (前 10 名) ---
'周星馳': 0.0947
'欣賞羅志祥': 0.0947
'為何': 0.0947
'這麼': 0.0947
're': 0.0753
'godofsex': 0.0577
'刪除': 0.0577
'大婢': 0.0577
'好看': 0.0577
'有雷': 0.0577


In [30]:
# ------------------------
# 套件
# ------------------------
!pip install --quiet gspread google-auth openai gradio jieba scikit-learn

import requests
from bs4 import BeautifulSoup
import re
import jieba
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
import gspread
from google.colab import auth
from google.auth import default
import gradio as gr
import openai
import os

# Placeholder: stores the literal string "gemini" as the API key.
# NOTE(review): replace with a real key (or a secret lookup) before running.
os.environ['GEMINI_API_KEY'] = "gemini"

# ------------------------
# Google Sheets OAuth authorisation (Colab)
# ------------------------
auth.authenticate_user()  # Colab authorisation prompt
creds, _ = default(scopes=['https://www.googleapis.com/auth/spreadsheets'])
gc = gspread.authorize(creds)

# Target spreadsheet and the names of the two worksheets used below
SHEET_URL = 'https://docs.google.com/spreadsheets/d/1ZrA1VIiUl2lJ1SXIdSPq1jRvIyqI1LsbiJ77BoVb8Tg/edit#gid=0'
sh = gc.open_by_url(SHEET_URL)
RAW_SHEET_NAME = '爬蟲資料'
STAT_SHEET_NAME = '關鍵詞統計'

# ------------------------
# Stop words excluded from the keyword statistics
# ------------------------
stopwords = set(['的', '了', '是', '在', '我', '你', '他', '她', '之', '一個', '和', '討論', '分享'])

# ------------------------
# PTT crawler: browser-like request headers
# ------------------------
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def extract_index_split(url):
    """Parse the page number out of a PTT listing URL.

    The number is the run of digits between the LAST ``index`` marker and the
    following ``.html`` (e.g. ``.../index10818.html`` -> ``10818``). Using the
    last marker keeps the parse correct when ``index`` also appears earlier in
    the path.

    Args:
        url: Listing page URL.

    Returns:
        The page number as an int, or None when ``url`` is not a string, has
        no ``index`` marker, or the text after it is not a plain run of ASCII
        digits (as with the un-numbered ``index.html``).
    """
    if not isinstance(url, str):
        return None
    _, marker, tail = url.rpartition('index')
    if not marker:
        return None
    digits = tail.split('.html', 1)[0]
    # int() alone would also accept '-5', '1_000' or ' 12 ', none of which
    # is a valid page number.
    if not (digits.isascii() and digits.isdigit()):
        return None
    return int(digits)

def get_last_index(url):
    """Work out the page number of the newest listing page of a board.

    The un-numbered ``index.html`` does not reveal its own number, so it is
    derived from the second link of the paging bar (the "previous page"
    button): newest = previous + 1.

    Args:
        url: URL of the listing page to inspect.

    Returns:
        The page number as an int, or None when the response is not OK, the
        paging bar is missing, or its second link carries no numbered href.

    Raises:
        requests.RequestException: connection errors and timeouts propagate.
    """
    response = requests.get(url, headers=headers, timeout=5)
    if not response.ok:
        return None
    soup = BeautifulSoup(response.text, 'html.parser')
    # Locate the paging bar that holds the "previous page" link
    btns = soup.find('div', class_='btn-group btn-group-paging')
    if btns is None:
        return None
    links = btns.find_all('a')
    if len(links) < 2:
        return None
    # .get() instead of ['href']: a button without a link has no href attribute.
    prev_link = links[1].get('href')
    if not prev_link:
        return None
    # The dot is escaped so only a literal ".html" suffix matches.
    match = re.search(r'index(\d+)\.html', prev_link)
    if match:
        last_index = int(match.group(1)) + 1  # newest page = previous page + 1
        return last_index
    return None

def fetch_ptt_articles(start_url, pages=10):
    """Crawl article metadata from consecutive PTT movie-board listing pages.

    The page number is taken from ``start_url`` (or, when it has none, looked
    up with ``get_last_index``); the crawler then walks towards lower page
    numbers. Pages are always requested from the movie board (``BASE_URL``).

    Args:
        start_url: Listing page URL to start from.
        pages: Number of listing pages to fetch. Coerced with ``int()`` so
            that float values (e.g. from a UI slider) are accepted.

    Returns:
        A list of dicts with keys ``title``, ``author``, ``date`` and
        ``href``. Entries without a link (deleted posts) keep the text shown
        in the title cell and get a placeholder ``href``. Returns an empty
        list when the starting page number cannot be determined.

    Raises:
        requests.RequestException: connection errors and timeouts propagate.
    """
    articles_data = []
    START_INDEX = extract_index_split(start_url)
    if START_INDEX is None:
        START_INDEX = get_last_index(start_url)
    if START_INDEX is None:
        print("無法取得起始頁頁碼")
        return []

    BASE_URL = "https://www.ptt.cc/bbs/movie/index"
    # range() needs integers; clamp at 0 so that no URL with a zero or
    # negative page number is ever requested.
    stop_index = max(START_INDEX - int(pages), 0)
    for idx in range(START_INDEX, stop_index, -1):
        url = f"{BASE_URL}{idx}.html"
        response = requests.get(url, headers=headers, timeout=5)
        if not response.ok:
            continue  # error pages contain no article rows
        soup = BeautifulSoup(response.text, 'html.parser')
        for article in soup.find_all('div', class_='r-ent'):
            title_div = article.find('div', class_='title')
            if title_div is None:
                continue  # malformed row: nothing to record
            title_tag = title_div.find('a')
            link = title_tag.get('href') if title_tag is not None else None
            if link:
                title = title_tag.text.strip()
                href = "https://www.ptt.cc" + link
            else:
                title = title_div.text.strip()
                href = "N/A (已刪除或不可存取)"
            author_div = article.find('div', class_='author')
            date_div = article.find('div', class_='date')
            articles_data.append({
                'title': title,
                'author': author_div.text.strip() if author_div is not None else "",
                'date': date_div.text.strip() if date_div is not None else "",
                'href': href
            })
    return articles_data

# ------------------------
# Sheet 存取
# ------------------------
def write_articles_to_sheet(articles_data):
    """Replace the contents of the raw-data worksheet with the given articles.

    The worksheet named ``RAW_SHEET_NAME`` is cleared (or created when it does
    not exist) and filled with a header row followed by one row per article.

    Args:
        articles_data: Iterable of dicts with keys ``title``, ``author``,
            ``date`` and ``href``.
    """
    try:
        worksheet = sh.worksheet(RAW_SHEET_NAME)
        worksheet.clear()
    except gspread.WorksheetNotFound:
        worksheet = sh.add_worksheet(title=RAW_SHEET_NAME, rows="100", cols="20")
    rows = [['標題', '作者', '日期', '連結']]
    rows.extend(
        [article['title'], article['author'], article['date'], article['href']]
        for article in articles_data
    )
    # A single batched request instead of one API call per row: faster, and
    # the number of write requests no longer grows with the crawl size.
    worksheet.append_rows(rows)

def read_articles_from_sheet():
    """Load every row of the raw-data worksheet as a list of dicts.

    Returns:
        The result of ``get_all_records()`` for the worksheet named
        ``RAW_SHEET_NAME``, or an empty list when that worksheet does not
        exist.
    """
    rows = []
    try:
        rows = sh.worksheet(RAW_SHEET_NAME).get_all_records()
    except gspread.WorksheetNotFound:
        pass
    return rows

def write_keywords_to_sheet(keywords):
    """Replace the contents of the keyword-statistics worksheet.

    The worksheet named ``STAT_SHEET_NAME`` is cleared (or created when it
    does not exist) and filled with a header row followed by one row per
    keyword.

    Args:
        keywords: Iterable of ``(word, weight)`` pairs.
    """
    try:
        worksheet = sh.worksheet(STAT_SHEET_NAME)
        worksheet.clear()
    except gspread.WorksheetNotFound:
        worksheet = sh.add_worksheet(title=STAT_SHEET_NAME, rows="100", cols="20")
    rows = [['詞彙', '平均 TF-IDF']]
    # Plain str/float so the payload serialises regardless of the numeric
    # type the caller passes in.
    rows.extend([str(word), float(weight)] for word, weight in keywords)
    # A single batched request instead of one API call per keyword.
    worksheet.append_rows(rows)

# ------------------------
# TF-IDF 關鍵詞分析
# ------------------------
def get_top_keywords(records, top_n=10):
    """Rank title words by their mean TF-IDF weight over all records.

    Each title (key ``'標題'``) is stripped of punctuation and segmented with
    jieba. Tokens that are one character long or listed in ``stopwords`` are
    dropped, and the remaining tokens are vectorised with ``TfidfVectorizer``.

    Args:
        records: Iterable of dicts holding the title under ``'標題'``. Values
            are converted with ``str()`` because a sheet reader may return
            numbers for numeric-looking cells.
        top_n: How many words to return. Coerced with ``int()`` so float
            values (e.g. from a UI slider) are accepted.

    Returns:
        A list of ``(word, mean_weight)`` tuples with plain ``str``/``float``
        members, sorted by descending weight. Returns an empty list when no
        usable token is left.
    """
    document_list = []
    for record in records:
        title_text = str(record['標題'])
        cleaned_text = re.sub(r'[^\w\s]', '', title_text)
        tokens = (w.strip() for w in jieba.lcut(cleaned_text, cut_all=False))
        filtered_words = [w for w in tokens if len(w) > 1 and w not in stopwords]
        document_list.append(" ".join(filtered_words))

    if not any(document_list):
        return []

    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform(document_list)
    except ValueError:
        # Raised when the vectorizer ends up with an empty vocabulary.
        return []
    feature_names = vectorizer.get_feature_names_out()

    # Column sums give each word's total weight across all documents.
    num_documents = len(document_list)
    column_sums = tfidf_matrix.toarray().sum(axis=0)
    avg_tfidf_scores = [
        (str(word), float(total) / num_documents)
        for word, total in zip(feature_names, column_sums)
    ]
    avg_tfidf_scores.sort(key=lambda item: item[1], reverse=True)
    return avg_tfidf_scores[:int(top_n)]

# ------------------------
# Gemini API 洞察生成
# ------------------------
def generate_insights(keywords):
    """Ask a Gemini model for an insight summary of the given keywords.

    The request goes through the ``openai`` client pointed at Gemini's
    OpenAI-compatible endpoint, using the key stored in the
    ``GEMINI_API_KEY`` environment variable.

    Args:
        keywords: Sequence of ``(word, weight)`` pairs; only the words are
            put into the prompt.

    Returns:
        The generated text on success, otherwise a message starting with
        ``"API 呼叫失敗"``. This function does not raise for API failures.
    """
    api_key = os.environ.get('GEMINI_API_KEY')
    if not api_key:
        return "API 呼叫失敗: 未設定 GEMINI_API_KEY 環境變數"
    try:
        # openai>=1.0 removed the module-level openai.ChatCompletion API, and
        # a Gemini key is not valid for the default OpenAI endpoint, so an
        # explicit client with Gemini's compatible base URL is required.
        client = openai.OpenAI(
            api_key=api_key,
            base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
        )
        prompt = f"以下是文章熱門關鍵詞：{', '.join([w for w,_ in keywords])}\n" \
                 f"請生成 5 句洞察摘要 + 一段 120 字結論。"
        response = client.chat.completions.create(
            model="gemini-2.0-flash",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7
        )
        return response.choices[0].message.content
    except Exception as e:
        # Top-level boundary for the UI: report any failure as text.
        return f"API 呼叫失敗: {e}"

# ------------------------
# Gradio 主程式
# ------------------------
def run_all(start_url="https://www.ptt.cc/bbs/movie/index.html", top_n=10):
    """Run the whole pipeline: crawl, store, rank keywords, generate insights.

    Args:
        start_url: Listing page URL the crawl starts from.
        top_n: Number of top keywords to compute. Coerced with ``int()`` so
            float values from the UI slider are accepted.

    Returns:
        A ``(top_keywords, insights)`` tuple: the list of ``(word, weight)``
        pairs and the generated text. When the crawl yields no articles the
        result is ``([], <message>)`` and both worksheets are left untouched.
    """
    articles = fetch_ptt_articles(start_url, pages=10)
    if not articles:
        # Without this guard the raw-data sheet would be wiped and the
        # keyword analysis would be run on zero documents.
        return [], "⚠️ 未抓到任何文章，請確認起始頁網址。"
    write_articles_to_sheet(articles)

    records = read_articles_from_sheet()
    top_keywords = get_top_keywords(records, top_n=int(top_n))
    write_keywords_to_sheet(top_keywords)

    insights = generate_insights(top_keywords)

    return top_keywords, insights

# ------------------------
# Gradio 介面
# ------------------------
# Single-page Gradio UI: one button runs the whole pipeline (run_all).
# Components are laid out in the order they are created inside the Blocks context.
with gr.Blocks() as demo:
    gr.Markdown("## PTT 電影版文章關鍵詞分析 + 洞察生成 (OAuth)")
    # Inputs: start URL and number of keywords
    start_url_input = gr.Textbox(label="起始頁網址", value="https://www.ptt.cc/bbs/movie/index.html")
    top_n_input = gr.Slider(1, 20, value=10, step=1, label="熱門詞前 N 名")
    run_button = gr.Button("執行")
    # Outputs: keyword table and generated text
    keywords_output = gr.Dataframe(headers=["詞彙", "平均 TF-IDF"])
    insights_output = gr.Textbox(label="洞察摘要 + 結論")

    # run_all returns (keywords, insights), mapped onto the two outputs in order
    run_button.click(
        run_all,
        inputs=[start_url_input, top_n_input],
        outputs=[keywords_output, insights_output]
    )

# Start the Gradio app
demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://aaeb1d4d94d8094beb.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [28]:
# Write the crawled articles to the sheet first.
# NOTE(review): `articles` is not defined in any visible cell — presumably left over from an earlier run; confirm.
write_articles_to_sheet(articles)

# Then read the data back to verify the round trip
records = read_articles_from_sheet()
print(f"Sheet 讀取資料筆數: {len(records)}")
for r in records[:5]:  # print the first five rows as a sanity check
    print(r)

Sheet 讀取資料筆數: 10
{'標題': '[新聞] 「小辣椒」準備回歸漫威！葛妮絲派特洛', '作者': 'abiann', '日期': '10/24', '連結': 'https://www.ptt.cc/bbs/movie/M.1761321124.A.BBB.html'}
{'標題': '[討論] 鏈鋸人 北美提前場 340萬', '作者': 'razzL1225', '日期': '10/25', '連結': 'https://www.ptt.cc/bbs/movie/M.1761321611.A.DD3.html'}
{'標題': '[普好雷] 頭文字D 20週年 4K修復版', '作者': 'yulbin98', '日期': '10/25', '連結': 'https://www.ptt.cc/bbs/movie/M.1761323729.A.67A.html'}
{'標題': '[新聞] 電影賠錢他還是賺 李奧納多《一戰再戰》', '作者': 'godofsex', '日期': '10/25', '連結': 'https://www.ptt.cc/bbs/movie/M.1761325800.A.291.html'}
{'標題': '[好雷] 拿坡里的美麗傳說', '作者': 'steelgate', '日期': '10/25', '連結': 'https://www.ptt.cc/bbs/movie/M.1761326678.A.10C.html'}


In [29]:
# Rank the keywords of the records read back from the sheet and print the top five
top_keywords = get_top_keywords(records, top_n=5)
print("前 5 熱門關鍵詞:")
for word, score in top_keywords:
    print(word, score)

前 5 熱門關鍵詞:
公告 0.11376588207522505
340 0.09309790139542415
北美 0.09309790139542415
提前 0.09309790139542415
鏈鋸人 0.09309790139542415


In [34]:
# ------------------------
# 套件安裝
# ------------------------
!pip install --quiet gspread google-auth openai gradio jieba scikit-learn

import requests
from bs4 import BeautifulSoup
import re
import jieba
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
import gspread
from google.colab import auth
from google.auth import default
import gradio as gr
import os
import openai

# Read the API key from the environment variable named 'gemini'.
# NOTE(review): an earlier cell stores the key under 'GEMINI_API_KEY', not
# 'gemini' — confirm which name is intended; if unset this assigns None.
openai.api_key = os.environ.get('gemini')

# ------------------------
# Google Sheets OAuth authorisation (Colab)
# ------------------------
auth.authenticate_user()
creds, _ = default(scopes=['https://www.googleapis.com/auth/spreadsheets'])
gc = gspread.authorize(creds)

# Target spreadsheet and the names of the two worksheets used below
SHEET_URL = 'https://docs.google.com/spreadsheets/d/1ZrA1VIiUl2lJ1SXIdSPq1jRvIyqI1LsbiJ77BoVb8Tg/edit#gid=0'
sh = gc.open_by_url(SHEET_URL)
RAW_SHEET_NAME = '爬蟲資料'
STAT_SHEET_NAME = '關鍵詞統計'

# ------------------------
# Stop words excluded from the keyword statistics
# ------------------------
stopwords = set(['的', '了', '是', '在', '我', '你', '他', '她', '之', '一個', '和', '討論', '分享'])

# ------------------------
# PTT crawler: browser-like request headers
# ------------------------
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def extract_index_split(url):
    """Return the page number embedded in a PTT index URL, or None.

    The number is the text that follows ``index`` and precedes ``.html``
    (``.../index9527.html`` -> 9527).  Any failure -- no ``index`` in the
    URL, nothing numeric after it, a non-string argument -- yields None
    instead of raising.
    """
    try:
        after_index = url.split('index')[1]
        digits = after_index.partition('.html')[0]
        return int(digits)
    except Exception:
        return None

def get_last_index(url):
    """Work out the page number of the newest index page of a PTT board.

    Downloads ``url``, finds the paging button group and reads the number
    out of its second link (the "previous page" button), then adds one.

    Args:
        url: Address of a board index page, normally ``.../index.html``.

    Returns:
        The newest page number as an int, or None when the page cannot be
        fetched or the paging link is missing or has an unexpected shape.
    """
    try:
        response = requests.get(url, headers=headers, timeout=5)
    except requests.RequestException:
        # The caller already treats None as "start page unknown".
        return None
    if not response.ok:
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    paging = soup.find('div', class_='btn-group btn-group-paging')
    if paging is None:
        return None

    links = paging.find_all('a')
    if len(links) < 2:
        return None
    # A disabled paging button carries no href attribute.
    prev_href = links[1].get('href')
    if not prev_href:
        return None

    # The dot is escaped so that only a literal ".html" suffix matches.
    match = re.search(r'index(\d+)\.html', prev_href)
    if match is None:
        return None
    return int(match.group(1)) + 1  # newest page = previous page + 1

def fetch_ptt_articles(start_url, pages=10):
    """Crawl PTT movie-board index pages, newest first, and collect post metadata.

    Args:
        start_url: Index URL to start from.  When it carries a page number
            (``index1234.html``) crawling starts there; otherwise the newest
            page number is resolved with get_last_index().
        pages: How many index pages to walk backwards.  Coerced to int
            because UI sliders may deliver floats, which range() rejects.

    Returns:
        A list of dicts with the keys ``title``, ``author``, ``date`` and
        ``href``; an empty list when the start page cannot be determined.
    """
    start_index = extract_index_split(start_url)
    if start_index is None:
        start_index = get_last_index(start_url)
    if start_index is None:
        print("無法取得起始頁頁碼")
        return []

    base_url = "https://www.ptt.cc/bbs/movie/index"
    # Clamp at 0 so the loop never requests index0.html or negative pages.
    stop_index = max(start_index - int(pages), 0)

    articles_data = []
    for idx in range(start_index, stop_index, -1):
        response = requests.get(f"{base_url}{idx}.html", headers=headers, timeout=5)
        soup = BeautifulSoup(response.text, 'html.parser')
        for entry in soup.find_all('div', class_='r-ent'):
            title_div = entry.find('div', class_='title')
            if title_div is None:
                # Malformed entry: nothing usable to record.
                continue
            link = title_div.find('a')
            if link is not None:
                title = link.text.strip()
                href = "https://www.ptt.cc" + link['href']
            else:
                # No anchor inside the title div: keep the raw text.
                title = title_div.text.strip()
                href = "N/A (已刪除或不可存取)"
            author_div = entry.find('div', class_='author')
            date_div = entry.find('div', class_='date')
            articles_data.append({
                'title': title,
                'author': author_div.text.strip() if author_div is not None else '',
                'date': date_div.text.strip() if date_div is not None else '',
                'href': href,
            })
    return articles_data

# ------------------------
# Sheet 存取
# ------------------------
def write_articles_to_sheet(articles_data):
    """Replace the contents of the raw-data worksheet with the given articles.

    The worksheet named by RAW_SHEET_NAME is cleared when it exists and
    created (100 rows x 20 columns) when it does not.  A header row is
    written first, then every article in one batched append.

    Args:
        articles_data: Iterable of dicts with ``title``, ``author``,
            ``date`` and ``href`` keys.
    """
    try:
        sheet = sh.worksheet(RAW_SHEET_NAME)
    except gspread.WorksheetNotFound:
        sheet = sh.add_worksheet(title=RAW_SHEET_NAME, rows="100", cols="20")
    else:
        sheet.clear()

    sheet.append_row(['標題', '作者', '日期', '連結'])
    columns = ('title', 'author', 'date', 'href')
    body_rows = [[article[key] for key in columns] for article in articles_data]
    sheet.append_rows(body_rows)

def read_articles_from_sheet():
    """Return every row of the raw-data worksheet as a list of dicts.

    Rows are keyed by the worksheet's header row.  An empty list is
    returned when the worksheet named by RAW_SHEET_NAME does not exist.
    """
    try:
        return sh.worksheet(RAW_SHEET_NAME).get_all_records()
    except gspread.WorksheetNotFound:
        return []

def write_keywords_to_sheet(keywords):
    """Replace the contents of the statistics worksheet with the keyword table.

    The worksheet named by STAT_SHEET_NAME is cleared when it exists and
    created (100 rows x 20 columns) when it does not.  A header row is
    written first, then all keyword rows in one batched append.

    Args:
        keywords: Iterable of ``(word, score)`` pairs.
    """
    try:
        sheet = sh.worksheet(STAT_SHEET_NAME)
    except gspread.WorksheetNotFound:
        sheet = sh.add_worksheet(title=STAT_SHEET_NAME, rows="100", cols="20")
    else:
        sheet.clear()

    sheet.append_row(['詞彙', '平均 TF-IDF'])
    body_rows = [[term, weight] for term, weight in keywords]
    sheet.append_rows(body_rows)

# ------------------------
# TF-IDF 關鍵詞分析
# ------------------------
def get_top_keywords(records, top_n=10):
    """Rank title words by their mean TF-IDF weight across all records.

    Each record's ``'標題'`` value is stripped of punctuation, segmented
    with jieba, and filtered (tokens shorter than two characters and stop
    words are dropped).  The surviving tokens form one document per record.

    Args:
        records: Iterable of dicts that each contain a ``'標題'`` entry.
        top_n: Number of top-ranked words to return.  Coerced to int
            because UI sliders may deliver floats.

    Returns:
        A list of ``(word, mean_tfidf)`` tuples sorted by descending score.
        Empty when there are no records or no token survives filtering.
    """
    documents = []
    for record in records:
        # Sheets may hand back numeric-looking titles as int/float.
        title = str(record['標題'])
        cleaned = re.sub(r'[^\w\s]', '', title)
        tokens = (w.strip() for w in jieba.lcut(cleaned, cut_all=False))
        kept = [w for w in tokens if len(w) > 1 and w not in stopwords]
        documents.append(" ".join(kept))

    # TfidfVectorizer raises "empty vocabulary" when it has nothing to fit.
    if not any(documents):
        return []

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()
    # Column mean = per-word average over every document; tolist() yields
    # plain Python floats.
    mean_weights = tfidf_matrix.toarray().mean(axis=0).tolist()

    ranked = sorted(zip(feature_names, mean_weights),
                    key=lambda item: item[1], reverse=True)
    return ranked[:int(top_n)]

# ------------------------
# Gemini API 洞察生成 (OpenAI >=1.0)
# ------------------------
def generate_insights(keywords):
    """Ask Gemini for five insight sentences plus a ~120-character conclusion.

    The request goes through the OpenAI client (openai>=1.0) pointed at
    Gemini's OpenAI-compatible endpoint.  The default OpenAI endpoint
    accepts neither a Gemini key nor a Gemini model name.

    Args:
        keywords: Iterable of ``(word, score)`` pairs; only the words are
            put into the prompt.

    Returns:
        The generated text, or a string starting with ``API 呼叫失敗`` that
        describes the failure.  This function never raises.
    """
    try:
        api_key = os.environ.get('gemini')
        if not api_key:
            # Colab Secrets are served by google.colab.userdata rather than
            # being exported as environment variables.
            from google.colab import userdata
            api_key = userdata.get('gemini')

        client = openai.OpenAI(
            api_key=api_key,
            base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
        )
        prompt = f"以下是文章熱門關鍵詞：{', '.join([w for w,_ in keywords])}\n" \
                 f"請生成 5 句洞察摘要 + 一段 120 字結論。"

        response = client.chat.completions.create(
            model="gemini-2.0-flash",  # "gemini-1.5" is not a valid model id
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"API 呼叫失敗: {e}"

# ------------------------
# 主程式
# ------------------------
def run_all(start_url="https://www.ptt.cc/bbs/movie/index.html", top_n=10):
    """Run the whole pipeline: crawl, store, rank keywords, generate insights.

    Args:
        start_url: PTT index page to start crawling from.
        top_n: How many top keywords to keep.  Coerced to int because UI
            sliders may deliver floats.

    Returns:
        ``(top_keywords, insights)`` where ``top_keywords`` is a list of
        ``(word, score)`` pairs and ``insights`` is the generated text.
        When nothing could be crawled or read back, the list is empty and
        the text explains why.
    """
    print("抓取文章...")
    articles = fetch_ptt_articles(start_url, pages=10)
    print(f"抓取完成: {len(articles)} 篇文章")
    if not articles:
        # Stop before wiping the sheet; TF-IDF cannot fit an empty corpus.
        return [], "⚠️ 沒有抓到任何文章，請確認起始頁網址。"

    print("寫入 Sheet...")
    write_articles_to_sheet(articles)

    print("讀取 Sheet...")
    records = read_articles_from_sheet()
    if not records:
        return [], "⚠️ 沒有抓到任何文章，請確認起始頁網址。"

    print("TF-IDF 分析熱門詞...")
    top_keywords = get_top_keywords(records, top_n=int(top_n))
    write_keywords_to_sheet(top_keywords)

    print("呼叫 Gemini API 生成洞察...")
    insights = generate_insights(top_keywords)
    print("完成 ✅")

    return top_keywords, insights

# ------------------------
# Gradio 介面
# ------------------------
# Single-page UI: a start URL and a top-N slider feed run_all, whose two
# return values fill the keyword table and the insight text box.
with gr.Blocks() as demo:
    gr.Markdown("## PTT 電影版文章關鍵詞分析 + 洞察生成 (OAuth)")
    start_url_input = gr.Textbox(label="起始頁網址", value="https://www.ptt.cc/bbs/movie/index.html")
    top_n_input = gr.Slider(1, 20, value=10, step=1, label="熱門詞前 N 名")
    run_button = gr.Button("執行")
    keywords_output = gr.Dataframe(headers=["詞彙", "平均 TF-IDF"])
    insights_output = gr.Textbox(label="洞察摘要 + 結論", lines=10)  # multi-line text box

    # Input order matches run_all's parameters; output order matches its return tuple.
    run_button.click(
        run_all,
        inputs=[start_url_input, top_n_input],
        outputs=[keywords_output, insights_output]
    )

demo.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://638a112a44e786f76c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [40]:
!pip install --quiet gspread google-auth google-generativeai gradio jieba scikit-learn beautifulsoup4 requests

In [41]:
import requests
from bs4 import BeautifulSoup
import re
import jieba
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
import gspread
from google.colab import auth
from google.auth import default
import gradio as gr
import google.generativeai as genai
import os

# ============================================================
# 🔑 Gemini API configuration
# ============================================================
# Prerequisite: create a secret named "gemini" in the Colab Secrets panel.
# Colab Secrets are read through google.colab.userdata, not os.environ.
# An environment variable of the same name, when set, still takes precedence.
_gemini_api_key = os.environ.get("gemini")
if not _gemini_api_key:
    try:
        from google.colab import userdata
        _gemini_api_key = userdata.get("gemini")
    except Exception as exc:  # secret missing, access not granted, or not on Colab
        print(f"⚠️ 無法讀取 Gemini API Key：{exc}")
genai.configure(api_key=_gemini_api_key)

# ============================================================
# 🧾 Google Sheet OAuth authorisation (Colab)
# ============================================================
auth.authenticate_user()
creds, _ = default(scopes=['https://www.googleapis.com/auth/spreadsheets'])
gc = gspread.authorize(creds)

# URL of the target spreadsheet
SHEET_URL = 'https://docs.google.com/spreadsheets/d/1ZrA1VIiUl2lJ1SXIdSPq1jRvIyqI1LsbiJ77BoVb8Tg/edit#gid=0'
sh = gc.open_by_url(SHEET_URL)

# Worksheet names for the crawled rows and the keyword statistics
RAW_SHEET_NAME = '爬蟲資料'
STAT_SHEET_NAME = '關鍵詞統計'

# ============================================================
# 🧹 Stop words (tokens dropped before TF-IDF)
# ============================================================
stopwords = set(['的', '了', '是', '在', '我', '你', '他', '她', '之', '一個', '和', '討論', '分享'])

# ============================================================
# 🕷️ PTT crawler
# ============================================================
# Browser-like User-Agent sent with every request to ptt.cc.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def extract_index_split(url):
    """Return the page number embedded in a PTT index URL, or None.

    The number is the text that follows ``index`` and precedes ``.html``
    (``.../index9527.html`` -> 9527).  Any failure -- no ``index`` in the
    URL, nothing numeric after it, a non-string argument -- yields None
    instead of raising.
    """
    try:
        after_index = url.split('index')[1]
        digits = after_index.partition('.html')[0]
        return int(digits)
    except Exception:
        return None

def get_last_index(url):
    """Work out the page number of the newest index page of a PTT board.

    Downloads ``url``, finds the paging button group and reads the number
    out of its second link (expected to be the "previous page" button),
    then adds one.

    Args:
        url: Address of a board index page, normally ``.../index.html``.

    Returns:
        The newest page number as an int, or None when the page cannot be
        fetched or the paging link is missing or has an unexpected shape.
    """
    try:
        response = requests.get(url, headers=headers, timeout=5)
    except requests.RequestException:
        # The caller already treats None as "start page unknown".
        return None
    if not response.ok:
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    paging = soup.find('div', class_='btn-group btn-group-paging')
    if paging is None:
        return None

    links = paging.find_all('a')
    if len(links) < 2:
        return None
    # A disabled paging button carries no href attribute.
    prev_href = links[1].get('href')
    if not prev_href:
        return None

    # The dot is escaped so that only a literal ".html" suffix matches.
    match = re.search(r'index(\d+)\.html', prev_href)
    if match is None:
        return None
    return int(match.group(1)) + 1

def fetch_ptt_articles(start_url, pages=10):
    """Crawl PTT movie-board index pages, newest first, and collect post metadata.

    Args:
        start_url: Index URL to start from.  When it carries a page number
            (``index1234.html``) crawling starts there; otherwise the newest
            page number is resolved with get_last_index().
        pages: How many index pages to walk backwards.  Coerced to int
            because UI sliders may deliver floats, which range() rejects.

    Returns:
        A list of dicts with the keys ``title``, ``author``, ``date`` and
        ``href``; an empty list when the start page cannot be determined.
    """
    start_index = extract_index_split(start_url)
    if start_index is None:
        start_index = get_last_index(start_url)
    if start_index is None:
        print("⚠️ 無法取得起始頁頁碼")
        return []

    base_url = "https://www.ptt.cc/bbs/movie/index"
    # Clamp at 0 so the loop never requests index0.html or negative pages.
    stop_index = max(start_index - int(pages), 0)

    articles_data = []
    for idx in range(start_index, stop_index, -1):
        response = requests.get(f"{base_url}{idx}.html", headers=headers, timeout=5)
        soup = BeautifulSoup(response.text, 'html.parser')
        for entry in soup.find_all('div', class_='r-ent'):
            title_div = entry.find('div', class_='title')
            if title_div is None:
                # Malformed entry: nothing usable to record.
                continue
            link = title_div.find('a')
            if link is not None:
                title = link.text.strip()
                href = "https://www.ptt.cc" + link['href']
            else:
                # No anchor inside the title div: keep the raw text.
                title = title_div.text.strip()
                href = "N/A"
            author_div = entry.find('div', class_='author')
            date_div = entry.find('div', class_='date')
            articles_data.append({
                'title': title,
                'author': author_div.text.strip() if author_div is not None else '',
                'date': date_div.text.strip() if date_div is not None else '',
                'href': href,
            })
    return articles_data

# ============================================================
# 📊 Google Sheet 操作
# ============================================================
def write_articles_to_sheet(articles_data):
    """Replace the contents of the raw-data worksheet with the given articles.

    The worksheet named by RAW_SHEET_NAME is cleared when it exists and
    created (100 rows x 20 columns) when it does not.  The header and all
    article rows go out in one batched append.  Appending row by row costs
    one API call per article and quickly exhausts the Sheets write quota.

    Args:
        articles_data: Iterable of dicts with ``title``, ``author``,
            ``date`` and ``href`` keys.
    """
    try:
        worksheet = sh.worksheet(RAW_SHEET_NAME)
        worksheet.clear()
    except gspread.WorksheetNotFound:
        worksheet = sh.add_worksheet(title=RAW_SHEET_NAME, rows="100", cols="20")

    rows = [['標題', '作者', '日期', '連結']]
    rows.extend([a['title'], a['author'], a['date'], a['href']] for a in articles_data)
    worksheet.append_rows(rows)

def read_articles_from_sheet():
    """Return every row of the raw-data worksheet as a list of dicts.

    Rows are keyed by the worksheet's header row.  An empty list is
    returned when the worksheet named by RAW_SHEET_NAME does not exist.
    """
    try:
        return sh.worksheet(RAW_SHEET_NAME).get_all_records()
    except gspread.WorksheetNotFound:
        return []

def write_keywords_to_sheet(keywords):
    """Replace the contents of the statistics worksheet with the keyword table.

    The worksheet named by STAT_SHEET_NAME is cleared when it exists and
    created (100 rows x 20 columns) when it does not.  The header and all
    keyword rows go out in one batched append instead of one API call per
    row.

    Args:
        keywords: Iterable of ``(word, score)`` pairs.
    """
    try:
        worksheet = sh.worksheet(STAT_SHEET_NAME)
        worksheet.clear()
    except gspread.WorksheetNotFound:
        worksheet = sh.add_worksheet(title=STAT_SHEET_NAME, rows="100", cols="20")

    rows = [['詞彙', '平均 TF-IDF']]
    rows.extend([word, score] for word, score in keywords)
    worksheet.append_rows(rows)

# ============================================================
# 🧠 TF-IDF 關鍵詞分析
# ============================================================
def get_top_keywords(records, top_n=10):
    """Rank title words by their mean TF-IDF weight across all records.

    Each record's ``'標題'`` value is stripped of punctuation, segmented
    with jieba, and filtered (tokens shorter than two characters and stop
    words are dropped).  The surviving tokens form one document per record.

    Args:
        records: Iterable of dicts that each contain a ``'標題'`` entry.
        top_n: Number of top-ranked words to return.  Coerced to int
            because UI sliders may deliver floats.

    Returns:
        A list of ``(word, mean_tfidf)`` tuples sorted by descending score.
        Empty when there are no records or no token survives filtering.
    """
    documents = []
    for record in records:
        # Sheets may hand back numeric-looking titles as int/float.
        title = str(record['標題'])
        cleaned = re.sub(r'[^\w\s]', '', title)
        tokens = (w.strip() for w in jieba.lcut(cleaned, cut_all=False))
        kept = [w for w in tokens if len(w) > 1 and w not in stopwords]
        documents.append(" ".join(kept))

    # TfidfVectorizer raises "empty vocabulary" when it has nothing to fit.
    if not any(documents):
        return []

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()
    # Column mean = per-word average over every document; tolist() yields
    # plain Python floats.
    mean_weights = tfidf_matrix.toarray().mean(axis=0).tolist()

    ranked = sorted(zip(feature_names, mean_weights),
                    key=lambda item: item[1], reverse=True)
    return ranked[:int(top_n)]

# ============================================================
# ✨ Gemini 洞察生成
# ============================================================
def generate_insights(keywords):
    """Ask Gemini (gemini-1.5-flash) for insights about the given keywords.

    Args:
        keywords: Iterable of ``(word, score)`` pairs; only the words are
            put into the prompt.

    Returns:
        The model's text, or a string starting with ``❌ API 呼叫失敗``
        describing whatever exception occurred.  This function never raises.
    """
    try:
        keyword_text = ', '.join([word for word, _ in keywords])
        prompt_lines = [
            f"以下是 PTT 電影版熱門關鍵詞：{keyword_text}\n",
            "請用中文生成：\n",
            "1. 五句洞察摘要（每句以「•」開頭）\n",
            "2. 一段約 120 字的結論。",
        ]
        prompt = "".join(prompt_lines)
        reply = genai.GenerativeModel("gemini-1.5-flash").generate_content(prompt)
        return reply.text
    except Exception as e:
        return f"❌ API 呼叫失敗：{e}"

# ============================================================
# ⚙️ 主流程
# ============================================================
def run_all(start_url="https://www.ptt.cc/bbs/movie/index.html", top_n=10):
    """Run the whole pipeline: crawl, store, rank keywords, generate insights.

    Args:
        start_url: PTT index page to start crawling from.
        top_n: How many top keywords to keep.  Coerced to int because UI
            sliders may deliver floats.

    Returns:
        ``(top_keywords, insights)`` where ``top_keywords`` is a list of
        ``(word, score)`` pairs and ``insights`` is the generated text.
        When nothing could be crawled or read back, the list is empty and
        the text explains why.
    """
    articles = fetch_ptt_articles(start_url, pages=10)
    if not articles:
        # Stop before wiping the sheet; TF-IDF cannot fit an empty corpus.
        return [], "⚠️ 沒有抓到任何文章，請確認起始頁網址。"
    write_articles_to_sheet(articles)

    records = read_articles_from_sheet()
    if not records:
        return [], "⚠️ 沒有抓到任何文章，請確認起始頁網址。"
    top_keywords = get_top_keywords(records, top_n=int(top_n))
    write_keywords_to_sheet(top_keywords)

    insights = generate_insights(top_keywords)
    return top_keywords, insights

# ============================================================
# 🎨 Gradio 介面
# ============================================================
# Single-page UI: a start URL and a top-N slider feed run_all, whose two
# return values fill the keyword table and the insight text box.
with gr.Blocks() as demo:
    gr.Markdown("## 🎬 PTT 電影版 關鍵詞分析 + Gemini 洞察摘要")
    start_url_input = gr.Textbox(label="起始頁網址", value="https://www.ptt.cc/bbs/movie/index.html")
    top_n_input = gr.Slider(1, 20, value=10, step=1, label="熱門詞前 N 名")
    run_button = gr.Button("執行分析")
    keywords_output = gr.Dataframe(headers=["詞彙", "平均 TF-IDF"])
    insights_output = gr.Textbox(label="洞察摘要 + 結論", lines=10)  # multi-line display
    # Input order matches run_all's parameters; output order matches its return tuple.
    run_button.click(run_all, inputs=[start_url_input, top_n_input], outputs=[keywords_output, insights_output])

demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://286fcf5a2c4dcf5b23.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [43]:
import requests
from bs4 import BeautifulSoup
import re
import jieba
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
import gspread
from google.colab import auth
from google.auth import default
import gradio as gr
import os
import json
import time

# ============================================================
# 🧾 Google Sheet OAuth authorisation (Colab)
# ============================================================
auth.authenticate_user()
creds, _ = default(scopes=['https://www.googleapis.com/auth/spreadsheets'])
gc = gspread.authorize(creds)

# Spreadsheet that stores both the crawled rows and the keyword statistics.
SHEET_URL = 'https://docs.google.com/spreadsheets/d/1ZrA1VIiUl2lJ1SXIdSPq1jRvIyqI1LsbiJ77BoVb8Tg/edit#gid=0'
sh = gc.open_by_url(SHEET_URL)
RAW_SHEET_NAME = '爬蟲資料'
STAT_SHEET_NAME = '關鍵詞統計'

# ============================================================
# 🧹 Stop words (tokens dropped before TF-IDF)
# ============================================================
stopwords = set(['的', '了', '是', '在', '我', '你', '他', '她', '之', '一個', '和', '討論', '分享'])

# ============================================================
# 🕷️ PTT crawler
# ============================================================
# Browser-like User-Agent sent with every request to ptt.cc.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def extract_index_split(url):
    """Return the page number embedded in a PTT index URL, or None.

    The number is the text that follows ``index`` and precedes ``.html``
    (``.../index9527.html`` -> 9527).  Any failure -- no ``index`` in the
    URL, nothing numeric after it, a non-string argument -- yields None
    instead of raising.
    """
    try:
        after_index = url.split('index')[1]
        digits = after_index.partition('.html')[0]
        return int(digits)
    except Exception:
        return None

def get_last_index(url):
    """Work out the page number of the newest index page of a PTT board.

    Downloads ``url``, finds the paging button group and reads the number
    out of its second link (expected to be the "previous page" button),
    then adds one.

    Args:
        url: Address of a board index page, normally ``.../index.html``.

    Returns:
        The newest page number as an int, or None when the page cannot be
        fetched or the paging link is missing or has an unexpected shape.
    """
    try:
        response = requests.get(url, headers=headers, timeout=5)
    except requests.RequestException:
        # The caller already treats None as "start page unknown".
        return None
    if not response.ok:
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    paging = soup.find('div', class_='btn-group btn-group-paging')
    if paging is None:
        return None

    links = paging.find_all('a')
    if len(links) < 2:
        return None
    # A disabled paging button carries no href attribute.
    prev_href = links[1].get('href')
    if not prev_href:
        return None

    # The dot is escaped so that only a literal ".html" suffix matches.
    match = re.search(r'index(\d+)\.html', prev_href)
    if match is None:
        return None
    return int(match.group(1)) + 1

def fetch_ptt_articles(start_url, pages=10):
    """Crawl PTT movie-board index pages, newest first, and collect post metadata.

    Args:
        start_url: Index URL to start from.  When it carries a page number
            (``index1234.html``) crawling starts there; otherwise the newest
            page number is resolved with get_last_index().
        pages: How many index pages to walk backwards.  Coerced to int
            because UI sliders may deliver floats, which range() rejects.

    Returns:
        A list of dicts with the keys ``title``, ``author``, ``date`` and
        ``href``; an empty list when the start page cannot be determined.
    """
    start_index = extract_index_split(start_url)
    if start_index is None:
        start_index = get_last_index(start_url)
    if start_index is None:
        print("⚠️ 無法取得起始頁頁碼")
        return []

    base_url = "https://www.ptt.cc/bbs/movie/index"
    # Clamp at 0 so the loop never requests index0.html or negative pages.
    stop_index = max(start_index - int(pages), 0)

    articles_data = []
    for idx in range(start_index, stop_index, -1):
        response = requests.get(f"{base_url}{idx}.html", headers=headers, timeout=5)
        soup = BeautifulSoup(response.text, 'html.parser')
        for entry in soup.find_all('div', class_='r-ent'):
            title_div = entry.find('div', class_='title')
            if title_div is None:
                # Malformed entry: nothing usable to record.
                continue
            link = title_div.find('a')
            if link is not None:
                title = link.text.strip()
                href = "https://www.ptt.cc" + link['href']
            else:
                # No anchor inside the title div: keep the raw text.
                title = title_div.text.strip()
                href = "N/A"
            author_div = entry.find('div', class_='author')
            date_div = entry.find('div', class_='date')
            articles_data.append({
                'title': title,
                'author': author_div.text.strip() if author_div is not None else '',
                'date': date_div.text.strip() if date_div is not None else '',
                'href': href,
            })
    return articles_data

# ============================================================
# 📊 Google Sheet 操作
# ============================================================
def write_articles_to_sheet(articles_data):
    """Replace the contents of the raw-data worksheet with the given articles.

    The worksheet named by RAW_SHEET_NAME is cleared when it exists and
    created (100 rows x 20 columns) when it does not.  The header and all
    article rows go out in one batched append.  Appending row by row costs
    one API call per article and quickly exhausts the Sheets write quota.

    Args:
        articles_data: Iterable of dicts with ``title``, ``author``,
            ``date`` and ``href`` keys.
    """
    try:
        worksheet = sh.worksheet(RAW_SHEET_NAME)
        worksheet.clear()
    except gspread.WorksheetNotFound:
        worksheet = sh.add_worksheet(title=RAW_SHEET_NAME, rows="100", cols="20")

    rows = [['標題', '作者', '日期', '連結']]
    rows.extend([a['title'], a['author'], a['date'], a['href']] for a in articles_data)
    worksheet.append_rows(rows)

def read_articles_from_sheet():
    """Return every row of the raw-data worksheet as a list of dicts.

    Rows are keyed by the worksheet's header row.  An empty list is
    returned when the worksheet named by RAW_SHEET_NAME does not exist.
    """
    try:
        return sh.worksheet(RAW_SHEET_NAME).get_all_records()
    except gspread.WorksheetNotFound:
        return []

def write_keywords_to_sheet(keywords):
    """Replace the contents of the statistics worksheet with the keyword table.

    The worksheet named by STAT_SHEET_NAME is cleared when it exists and
    created (100 rows x 20 columns) when it does not.  The header and all
    keyword rows go out in one batched append instead of one API call per
    row.

    Args:
        keywords: Iterable of ``(word, score)`` pairs.
    """
    try:
        worksheet = sh.worksheet(STAT_SHEET_NAME)
        worksheet.clear()
    except gspread.WorksheetNotFound:
        worksheet = sh.add_worksheet(title=STAT_SHEET_NAME, rows="100", cols="20")

    rows = [['詞彙', '平均 TF-IDF']]
    rows.extend([word, score] for word, score in keywords)
    worksheet.append_rows(rows)

# ============================================================
# 🧠 TF-IDF 關鍵詞分析
# ============================================================
def get_top_keywords(records, top_n=10):
    """Rank title words by their mean TF-IDF weight across all records.

    Each record's ``'標題'`` value is stripped of punctuation, segmented
    with jieba, and filtered (tokens shorter than two characters and stop
    words are dropped).  The surviving tokens form one document per record.

    Args:
        records: Iterable of dicts that each contain a ``'標題'`` entry.
        top_n: Number of top-ranked words to return.  Coerced to int
            because UI sliders may deliver floats.

    Returns:
        A list of ``(word, mean_tfidf)`` tuples sorted by descending score.
        Empty when there are no records or no token survives filtering.
    """
    documents = []
    for record in records:
        # Sheets may hand back numeric-looking titles as int/float.
        title = str(record['標題'])
        cleaned = re.sub(r'[^\w\s]', '', title)
        tokens = (w.strip() for w in jieba.lcut(cleaned, cut_all=False))
        kept = [w for w in tokens if len(w) > 1 and w not in stopwords]
        documents.append(" ".join(kept))

    # TfidfVectorizer raises "empty vocabulary" when it has nothing to fit.
    if not any(documents):
        return []

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()
    # Column mean = per-word average over every document; tolist() yields
    # plain Python floats.
    mean_weights = tfidf_matrix.toarray().mean(axis=0).tolist()

    ranked = sorted(zip(feature_names, mean_weights),
                    key=lambda item: item[1], reverse=True)
    return ranked[:int(top_n)]

# ============================================================
# ✨ Gemini 洞察生成（執行時輸入 API Key）
# ============================================================
def generate_insights(keywords, user_api_key):
    """Call the Gemini REST API and return insights about the given keywords.

    Args:
        keywords: Iterable of ``(word, score)`` pairs; only the words are
            put into the prompt.
        user_api_key: Gemini API key typed in by the user.  Surrounding
            whitespace is ignored.

    Returns:
        The generated text, or a message starting with ``⚠️`` or ``❌``
        explaining why nothing was generated.  This function never raises.
    """
    api_key = (user_api_key or "").strip()
    if not api_key:
        return "⚠️ 請輸入有效的 Gemini API Key。"

    GEMINI_API_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent"
    prompt = (
        f"以下是 PTT 電影版熱門關鍵詞：{', '.join([w for w,_ in keywords])}\n"
        f"請用中文生成：\n"
        f"1. 五句洞察摘要（每句以「•」開頭）\n"
        f"2. 一段約 120 字的結論。"
    )
    # The key travels in a header, not the query string: requests puts the
    # full URL into its exception text, which is echoed back to the UI below.
    request_headers = {"Content-Type": "application/json", "x-goog-api-key": api_key}
    payload = {"contents": [{"parts": [{"text": prompt}]}]}

    try:
        response = requests.post(GEMINI_API_URL, headers=request_headers, json=payload, timeout=60)
        if response.status_code != 200:
            return f"❌ API 錯誤: {response.status_code} - {response.text}"
        candidates = response.json().get('candidates') or []
        parts = candidates[0].get('content', {}).get('parts') or [] if candidates else []
        if not parts or 'text' not in parts[0]:
            # A 200 reply without any text part (e.g. a blocked prompt).
            return "❌ API 呼叫失敗：模型未回傳任何內容。"
        return parts[0]['text']
    except Exception as e:
        return f"❌ API 呼叫失敗：{e}"

# ============================================================
# ⚙️ 主流程
# ============================================================
def run_all(start_url, top_n, api_key):
    """Run the whole pipeline: crawl, store, rank keywords, generate insights.

    Args:
        start_url: PTT index page to start crawling from.
        top_n: How many top keywords to keep.  Coerced to int because UI
            sliders may deliver floats.
        api_key: Gemini API key forwarded to generate_insights().

    Returns:
        ``(top_keywords, insights)`` where ``top_keywords`` is a list of
        ``(word, score)`` pairs and ``insights`` is the generated text.
        When nothing could be crawled or read back, the list is empty and
        the text explains why.
    """
    articles = fetch_ptt_articles(start_url, pages=10)
    if not articles:
        # Stop before wiping the sheet; TF-IDF cannot fit an empty corpus.
        return [], "⚠️ 沒有抓到任何文章，請確認起始頁網址。"
    write_articles_to_sheet(articles)

    records = read_articles_from_sheet()
    if not records:
        return [], "⚠️ 沒有抓到任何文章，請確認起始頁網址。"
    top_keywords = get_top_keywords(records, top_n=int(top_n))
    write_keywords_to_sheet(top_keywords)

    insights = generate_insights(top_keywords, api_key)
    return top_keywords, insights

# ============================================================
# 🎨 Gradio 介面
# ============================================================
# Single-page UI: the API key, start URL and top-N slider feed run_all, whose
# two return values fill the keyword table and the insight text box.
with gr.Blocks() as demo:
    gr.Markdown("## 🎬 PTT 電影版 關鍵詞分析 + Gemini 洞察摘要 (API Key 輸入版)")
    # type="password" masks the key while it is being typed.
    api_key_input = gr.Textbox(label="🔑 請輸入你的 Gemini API Key", type="password", placeholder="AIza 或 g- 開頭的金鑰")
    start_url_input = gr.Textbox(label="起始頁網址", value="https://www.ptt.cc/bbs/movie/index.html")
    top_n_input = gr.Slider(1, 20, value=10, step=1, label="熱門詞前 N 名")
    run_button = gr.Button("🚀 執行分析")
    keywords_output = gr.Dataframe(headers=["詞彙", "平均 TF-IDF"])
    insights_output = gr.Textbox(label="洞察摘要 + 結論", lines=10)

    # Input order matches run_all(start_url, top_n, api_key).
    run_button.click(run_all, inputs=[start_url_input, top_n_input, api_key_input], outputs=[keywords_output, insights_output])

demo.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://162d7484de2755df24.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [55]:
import requests
from bs4 import BeautifulSoup
import re
import jieba
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
import gspread
from google.colab import auth
from google.auth import default
import gradio as gr
import os
import json
import time

# ============================================================
# 🧾 Google Sheet authorisation
# ============================================================
auth.authenticate_user()
creds, _ = default(scopes=['https://www.googleapis.com/auth/spreadsheets'])
gc = gspread.authorize(creds)
# Spreadsheet that stores both the crawled rows and the keyword statistics.
SHEET_URL = 'https://docs.google.com/spreadsheets/d/1ZrA1VIiUl2lJ1SXIdSPq1jRvIyqI1LsbiJ77BoVb8Tg/edit#gid=0'
sh = gc.open_by_url(SHEET_URL)
RAW_SHEET_NAME = '爬蟲資料'
STAT_SHEET_NAME = '關鍵詞統計'

# ============================================================
# 🧹 Stop words (tokens dropped before TF-IDF)
# ============================================================
stopwords = set(['的', '了', '是', '在', '我', '你', '他', '她', '之', '一個', '和', '討論', '分享'])

# ============================================================
# 🕷️ PTT crawler
# ============================================================
# Browser-like User-Agent sent with every request to ptt.cc.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def extract_index_split(url):
    """Return the page number embedded in a PTT index URL, or None.

    The number is the text that follows ``index`` and precedes ``.html``
    (``.../index9527.html`` -> 9527).  Any failure -- no ``index`` in the
    URL, nothing numeric after it, a non-string argument -- yields None
    instead of raising.
    """
    try:
        after_index = url.split('index')[1]
        digits = after_index.partition('.html')[0]
        return int(digits)
    except Exception:
        return None

def get_last_index(url):
    """Work out the page number of the newest index page of a PTT board.

    Downloads ``url``, finds the paging button group and reads the number
    out of its second link (expected to be the "previous page" button),
    then adds one.

    Args:
        url: Address of a board index page, normally ``.../index.html``.

    Returns:
        The newest page number as an int, or None when the page cannot be
        fetched or the paging link is missing or has an unexpected shape.
    """
    try:
        response = requests.get(url, headers=headers, timeout=5)
    except requests.RequestException:
        # The caller already treats None as "start page unknown".
        return None
    if not response.ok:
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    paging = soup.find('div', class_='btn-group btn-group-paging')
    if paging is None:
        return None

    links = paging.find_all('a')
    if len(links) < 2:
        return None
    # A disabled paging button carries no href attribute.
    prev_href = links[1].get('href')
    if not prev_href:
        return None

    # The dot is escaped so that only a literal ".html" suffix matches.
    match = re.search(r'index(\d+)\.html', prev_href)
    if match is None:
        return None
    return int(match.group(1)) + 1

def fetch_ptt_articles(start_url, pages=10):
    """Crawl PTT movie-board index pages, newest first, and collect post metadata.

    Args:
        start_url: Index URL to start from.  When it carries a page number
            (``index1234.html``) crawling starts there; otherwise the newest
            page number is resolved with get_last_index().
        pages: How many index pages to walk backwards.  Coerced to int
            because the UI slider may deliver a float, which range() rejects.

    Returns:
        A list of dicts with the keys ``title``, ``author``, ``date`` and
        ``href``; an empty list when the start page cannot be determined.
    """
    start_index = extract_index_split(start_url)
    if start_index is None:
        start_index = get_last_index(start_url)
    if start_index is None:
        print("⚠️ 無法取得起始頁頁碼")
        return []

    base_url = "https://www.ptt.cc/bbs/movie/index"
    # Clamp at 0 so the loop never requests index0.html or negative pages.
    stop_index = max(start_index - int(pages), 0)

    articles_data = []
    for idx in range(start_index, stop_index, -1):
        response = requests.get(f"{base_url}{idx}.html", headers=headers, timeout=5)
        soup = BeautifulSoup(response.text, 'html.parser')
        for entry in soup.find_all('div', class_='r-ent'):
            title_div = entry.find('div', class_='title')
            if title_div is None:
                # Malformed entry: nothing usable to record.
                continue
            link = title_div.find('a')
            if link is not None:
                title = link.text.strip()
                href = "https://www.ptt.cc" + link['href']
            else:
                # No anchor inside the title div: keep the raw text.
                title = title_div.text.strip()
                href = "N/A"
            author_div = entry.find('div', class_='author')
            date_div = entry.find('div', class_='date')
            articles_data.append({
                'title': title,
                'author': author_div.text.strip() if author_div is not None else '',
                'date': date_div.text.strip() if date_div is not None else '',
                'href': href,
            })
    return articles_data

# ============================================================
# 📊 Google Sheet 操作
# ============================================================
def write_articles_to_sheet(articles_data):
    """Replace the contents of the raw-data worksheet with the given articles.

    The worksheet named by RAW_SHEET_NAME is cleared when it exists and
    created (100 rows x 20 columns) when it does not.  The header and all
    article rows go out in one batched append.  Appending row by row costs
    one API call per article and quickly exhausts the Sheets write quota.

    Args:
        articles_data: Iterable of dicts with ``title``, ``author``,
            ``date`` and ``href`` keys.
    """
    try:
        worksheet = sh.worksheet(RAW_SHEET_NAME)
        worksheet.clear()
    except gspread.WorksheetNotFound:
        worksheet = sh.add_worksheet(title=RAW_SHEET_NAME, rows="100", cols="20")

    rows = [['標題', '作者', '日期', '連結']]
    rows.extend([a['title'], a['author'], a['date'], a['href']] for a in articles_data)
    worksheet.append_rows(rows)

def read_articles_from_sheet():
    """Return every row of the raw-data worksheet as a list of dicts.

    Rows are keyed by the worksheet's header row.  An empty list is
    returned when the worksheet named by RAW_SHEET_NAME does not exist.
    """
    try:
        return sh.worksheet(RAW_SHEET_NAME).get_all_records()
    except gspread.WorksheetNotFound:
        return []

def write_keywords_to_sheet(keywords):
    """Replace the contents of the statistics worksheet with the keyword table.

    The worksheet named by STAT_SHEET_NAME is cleared when it exists and
    created (100 rows x 20 columns) when it does not.  The header and all
    keyword rows go out in one batched append instead of one API call per
    row.

    Args:
        keywords: Iterable of ``(word, score)`` pairs.
    """
    try:
        worksheet = sh.worksheet(STAT_SHEET_NAME)
        worksheet.clear()
    except gspread.WorksheetNotFound:
        worksheet = sh.add_worksheet(title=STAT_SHEET_NAME, rows="100", cols="20")

    rows = [['詞彙', '平均 TF-IDF']]
    rows.extend([word, score] for word, score in keywords)
    worksheet.append_rows(rows)

# ============================================================
# 🧠 TF-IDF 分析
# ============================================================
def get_top_keywords(records, top_n=10):
    """Rank title words by their mean TF-IDF weight across all records.

    Each record's ``'標題'`` value is stripped of punctuation, segmented
    with jieba, and filtered (tokens shorter than two characters and stop
    words are dropped).  The surviving tokens form one document per record.

    Args:
        records: Iterable of dicts that each contain a ``'標題'`` entry.
        top_n: Number of top-ranked words to return.  Coerced to int
            because UI sliders may deliver floats.

    Returns:
        A list of ``(word, mean_tfidf)`` tuples sorted by descending score.
        Empty when there are no records or no token survives filtering.
    """
    documents = []
    for record in records:
        # Sheets may hand back numeric-looking titles as int/float.
        title = str(record['標題'])
        cleaned = re.sub(r'[^\w\s]', '', title)
        tokens = (w.strip() for w in jieba.lcut(cleaned, cut_all=False))
        kept = [w for w in tokens if len(w) > 1 and w not in stopwords]
        documents.append(" ".join(kept))

    # TfidfVectorizer raises "empty vocabulary" when it has nothing to fit.
    if not any(documents):
        return []

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()
    # Column mean = per-word average over every document; tolist() yields
    # plain Python floats.
    mean_weights = tfidf_matrix.toarray().mean(axis=0).tolist()

    ranked = sorted(zip(feature_names, mean_weights),
                    key=lambda item: item[1], reverse=True)
    return ranked[:int(top_n)]

# ============================================================
# 🤖 Gemini API 洞察生成
# ============================================================
def generate_insights(keywords, user_api_key):
    """Ask the Gemini API for an insight summary based on the top keywords.

    Failures are reported as message strings instead of exceptions so the
    caller can show the result directly in the UI.

    Args:
        keywords: Iterable of ``(word, score)`` pairs; only the words are put
            into the prompt.
        user_api_key: Gemini API key supplied by the user at run time.

    Returns:
        The generated text on success, otherwise a warning/error message.
    """
    api_key = (user_api_key or "").strip()
    if not api_key:
        return "⚠️ 請輸入有效的 Gemini API Key。"

    GEMINI_API_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"
    prompt = (
        f"以下是 PTT 電影版熱門關鍵詞：{', '.join(w for w, _ in keywords)}\n"
        f"請用中文生成：\n"
        f"1. 五句洞察摘要（每句以「•」開頭）\n"
        f"2. 一段約 120 字的結論。"
    )
    # The key travels in a header rather than the query string: request
    # exceptions embed the full URL in their message, which would otherwise
    # leak the key into the error text returned below.
    headers = {"Content-Type": "application/json", "x-goog-api-key": api_key}
    data = {"contents": [{"parts": [{"text": prompt}]}]}

    try:
        response = requests.post(GEMINI_API_URL, headers=headers, json=data, timeout=60)
    except requests.RequestException as e:
        return f"❌ API 呼叫失敗：{e}"

    if response.status_code != 200:
        return f"❌ API 錯誤: {response.status_code} - {response.text}"

    try:
        result = response.json()
        return result['candidates'][0]['content']['parts'][0]['text']
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # A 200 response may still lack candidates/parts or carry a
        # non-JSON body; say so instead of surfacing a bare key name.
        return f"❌ API 呼叫失敗：無法解析回應內容 ({e!r})"

# ============================================================
# ⚙️ Gradio 多分頁介面（限制爬蟲頁數）
# ============================================================
with gr.Blocks() as demo:
    gr.Markdown("## 🎬 PTT 電影版 關鍵詞分析系統 (多分頁介面)")

    # Tab 1: crawl PTT, store the articles in the sheet and display them.
    with gr.Tab("🕷️ 爬蟲結果"):
        start_url_input = gr.Textbox(label="PTT 起始頁網址", value="https://www.ptt.cc/bbs/movie/index.html")
        page_slider = gr.Slider(1, 5, value=1, step=1, label="爬取頁數（建議 1～3 頁）")
        run_button = gr.Button("🚀 執行爬蟲")
        articles_output = gr.Dataframe(headers=["標題", "作者", "日期", "連結"], label="爬蟲結果")

        def run_crawler(start_url, pages):
            """Crawl ``pages`` pages from ``start_url``, persist and return the rows."""
            # Normalise the slider value defensively in case it arrives as a float.
            pages = int(pages)
            gr.Info(f"開始爬取 {pages} 頁資料，請稍候...")
            articles = fetch_ptt_articles(start_url, pages=pages)
            if not articles:
                # Tell the user nothing was fetched, as the other tabs do
                # when they have no data, instead of returning silently.
                gr.Warning("⚠️ 未抓取到任何文章，請確認網址後再試一次！")
                return []
            write_articles_to_sheet(articles)
            data = [[a['title'], a['author'], a['date'], a['href']] for a in articles]
            gr.Info(f"✅ 成功爬取 {len(data)} 筆文章！")
            return data

        run_button.click(run_crawler, inputs=[start_url_input, page_slider], outputs=articles_output)

    # Tab 2: compute TF-IDF keywords from the stored articles.
    with gr.Tab("🔠 熱門詞分析"):
        top_n_input = gr.Slider(1, 20, value=10, step=1, label="熱門詞前 N 名")
        run_tfidf_button = gr.Button("📊 執行分析")
        keywords_output = gr.Dataframe(headers=["詞彙", "平均 TF-IDF"], label="TF-IDF 結果")

        def run_tfidf(top_n):
            """Compute the top keywords, write them to the sheet and return them."""
            gr.Info("📈 正在進行 TF-IDF 分析...")
            records = read_articles_from_sheet()
            if not records:
                gr.Warning("⚠️ 尚未有爬蟲資料，請先到第一頁執行爬蟲！")
                return []
            # The result is later used as a slice bound, which must be an int.
            top_keywords = get_top_keywords(records, top_n=int(top_n))
            write_keywords_to_sheet(top_keywords)
            gr.Info("✅ 分析完成！")
            # Hand the Dataframe a list of row lists rather than tuples.
            return [[word, score] for word, score in top_keywords]

        run_tfidf_button.click(run_tfidf, inputs=top_n_input, outputs=keywords_output)

    # Tab 3: send the top keywords to Gemini and show the generated summary.
    with gr.Tab("🤖 AI 洞察摘要"):
        api_key_input = gr.Textbox(label="🔑 請輸入你的 Gemini API Key", type="password", placeholder="AIza 或 g- 開頭的金鑰")
        run_ai_button = gr.Button("✨ 生成摘要")
        ai_output = gr.Textbox(label="AI 洞察摘要 + 結論", lines=12)

        def run_ai(api_key):
            """Generate the AI summary from the top 10 keywords of the stored articles."""
            gr.Info("🤖 正在請求 Gemini API...")
            records = read_articles_from_sheet()
            if not records:
                gr.Warning("⚠️ 尚未有爬蟲資料，請先到第一頁執行爬蟲！")
                return "⚠️ 尚無資料"
            top_keywords = get_top_keywords(records, top_n=10)
            return generate_insights(top_keywords, api_key)

        run_ai_button.click(run_ai, inputs=api_key_input, outputs=ai_output)

demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a76869bbcf46b385c3.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


