<a href="https://colab.research.google.com/github/41371103hjnh/Programming_Project/blob/main/%E6%9C%88%E8%80%81%E7%B1%A4%E8%A9%A9_%E5%BB%BA%E6%AA%94%E7%89%88.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests beautifulsoup4 pandas

In [None]:
import requests
from bs4 import BeautifulSoup

index_url = "https://www.manekineko.com.tw/yuelao"

# Always pass a timeout: without one, requests waits indefinitely on a
# stalled connection and this cell would never finish.
res = requests.get(index_url, timeout=10)
res.encoding = res.apparent_encoding  # use the detected charset to avoid garbled Chinese text

# Show the status code and the start of the body as a quick sanity check.
print("HTTP 狀態碼：", res.status_code)
print(res.text[:1000])


In [None]:
from urllib.parse import urljoin

soup = BeautifulSoup(res.text, "html.parser")

# Keep only anchors that point at a lot article (e.g. /blog/yuelao74) and
# resolve each href against the index URL.
links = [
    {
        "label": anchor.get_text(strip=True),
        "url": urljoin(index_url, anchor["href"]),
    }
    for anchor in soup.find_all("a", href=True)
    if "/blog/yuelao" in anchor["href"]
]

print("一開始抓到的連結數量：", len(links))

# Drop repeated URLs, keeping the first entry seen for each one.
# A dict preserves insertion order, so the original ordering is retained.
first_by_url = {}
for entry in links:
    first_by_url.setdefault(entry["url"], entry)

links = list(first_by_url.values())
print("去重後連結數量：", len(links))

# Show the first few entries as a quick check
links[:5]


In [None]:
test_url = links[0]["url"]   # start with the first link; any other index can be used instead
print("測試網址：", test_url)

# Pass a timeout so a stalled connection cannot hang the cell indefinitely.
test_res = requests.get(test_url, timeout=10)
test_res.encoding = test_res.apparent_encoding

test_soup = BeautifulSoup(test_res.text, "html.parser")
page_text = test_soup.get_text("\n", strip=True)

# Print the first 80 lines, numbered, to inspect the page layout
lines = page_text.split("\n")
for i, line in enumerate(lines[:80]):
    print(i, line)


In [None]:
import re

# Line prefixes that mark the start of the section following the poem text.
_POEM_STOP_PREFIXES = ("⬇", "釋意", "白話文解", "單身", "暗戀", "婚 姻")

# Line prefixes that mark the start of the section following the explanation.
_MEANING_STOP_PREFIXES = (
    "⬇", "單身", "暗戀", "婚 姻", "交往", "復 合",
    "分手後", "感情合盤", "籤詩總結",
)


def _collect_until(lines: list, start: int, stop_prefixes: tuple) -> str:
    """Join lines[start:] up to, but excluding, the first line that starts with a stop prefix."""
    collected = []
    for line in lines[start:]:
        # str.startswith accepts a tuple, so one call checks every prefix.
        if line.startswith(stop_prefixes):
            break
        collected.append(line)
    return "\n".join(collected)


def parse_lot_page(url: str, timeout: float = 10) -> dict:
    """Download one lot page and extract its id, title, poem and explanation.

    Args:
        url: Address of the lot article; the lot number is read from the
            ``yuelao<digits>`` part of this URL.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        A dict with the keys ``lot_id``, ``title``, ``original``,
        ``base_mean`` and ``source_url``. A field that cannot be located on
        the page is returned as an empty string.

    Raises:
        requests.RequestException: If the request fails or times out, or if
            the server answers with a 4xx/5xx status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on error responses; otherwise an error page would be parsed
    # into an all-empty record that looks like a successful result.
    res.raise_for_status()
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, "html.parser")
    text = soup.get_text("\n", strip=True)

    # Split into stripped, non-empty lines
    lines = [line.strip() for line in text.split("\n") if line.strip()]

    # 1) Lot number from the URL, e.g. yuelao74 -> "74"
    m = re.search(r"yuelao(\d+)", url)
    lot_id = m.group(1) if m else ""

    # 2) The first line containing the title marker; the lines after it are
    #    the poem, up to the next section heading.
    title_line = ""
    original = ""
    for i, line in enumerate(lines):
        if "月老籤詩-第" in line:
            title_line = line
            original = _collect_until(lines, i + 1, _POEM_STOP_PREFIXES)
            break

    # 3) The first line containing the plain-language heading; the lines after
    #    it are the explanation (base_mean), up to the next section heading.
    base_mean = ""
    for i, line in enumerate(lines):
        if "白話文解" in line:
            base_mean = _collect_until(lines, i + 1, _MEANING_STOP_PREFIXES)
            break

    return {
        "lot_id": lot_id,
        "title": title_line,
        "original": original,
        "base_mean": base_mean,
        "source_url": url,
    }

# Try the parser on a single page first and display the resulting dict
sample = parse_lot_page(test_url)
sample


In [None]:
# Parse every collected link; a page that raises is reported and skipped so
# one bad page does not abort the whole run.
all_rows = []

for page_url in (entry["url"] for entry in links):
    try:
        row = parse_lot_page(page_url)
        all_rows.append(row)
        print(f"OK: {row['lot_id']}  {row['original']}")
    except Exception as err:
        print(f"❌ 這一篇有問題：{page_url} -> {err}")

print("總共成功解析幾支籤：", len(all_rows))


In [None]:
import pandas as pd

# Build the table from the parsed rows, add the story / love_hint columns
# (empty strings for now), and put the columns in their final order.
df = (
    pd.DataFrame(all_rows)
    .assign(story="", love_hint="")
    [["lot_id", "original", "story", "base_mean", "love_hint", "source_url"]]
)

df.head()


In [None]:
# Write the table to lots.csv without the index column.
# "utf-8-sig" prepends a UTF-8 byte-order mark to the file
# (presumably so spreadsheet tools such as Excel detect the encoding -- confirm).
df.to_csv("lots.csv", index=False, encoding="utf-8-sig")
print("已輸出 lots.csv")


In [None]:
!pip install gspread gspread_dataframe


In [None]:
from google.colab import auth
# Authenticate the current Colab user
# (presumably so later Google API calls in this notebook can obtain credentials -- confirm).
auth.authenticate_user()


In [None]:
import gspread
from google.auth import default

creds, _ = default()
gc = gspread.authorize(creds)

# Open the spreadsheet by its ID (the long token in the middle of its URL).
sheet_id = "106PoiJgG46Pc-wlPRBtCvibLpmrhF1kOCSND-QQRg6g"
sh = gc.open_by_key(sheet_id)

# List the worksheet titles BEFORE looking one up: if the name used below is
# wrong, the lookup raises, and the available titles are then already printed.
print([w.title for w in sh.worksheets()])

# Select the worksheet named "月老籤詩"; change the name here if yours differs.
ws = sh.worksheet("月老籤詩")


In [None]:
# To write every column back to the sheet:
df_to_sheet = df

# To write only a subset of the columns instead:
# df_to_sheet = df[["lot_id", "original", "base_mean", "source_url"]]

df_to_sheet.head()


In [None]:
# Clear the worksheet before the new data is written
# (NOTE(review): this discards whatever the sheet currently holds -- confirm that is intended).
ws.clear()

In [None]:
from gspread_dataframe import set_with_dataframe

# Write df_to_sheet into the worksheet without the DataFrame index.
# (Original author's note: writing starts at A1 and the first row holds the column names.)
set_with_dataframe(ws, df_to_sheet, include_index=False)

print("已經把資料寫到 Google Sheet！")
