<a href="https://colab.research.google.com/github/EasonHu0620/naion_palace_exhibiton/blob/main/palace_museum.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
import os
import re
import csv
import requests as req
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
from requests.utils import requote_uri

# Site root; relative links and image paths from the listing are resolved against it.
base_url = "https://www.npm.gov.tw"
# Current-exhibitions listing page that this script scrapes.
exhs_url = "https://www.npm.gov.tw/Exhibition-Current.aspx?sno=03000060&l=1"
# Value written into the museum column of every output row.
museum_name = "國立故宮博物院"

# Always pass a timeout: requests waits indefinitely by default, which would
# hang the notebook cell if the server stalls.
resp = req.get(exhs_url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page and silently
# ending up with zero exhibitions.
resp.raise_for_status()
html = bs(resp.text, "html.parser")
# Every <li class="mb-8"> on the page is treated as one exhibition entry.
exhs = html.find_all("li", class_="mb-8")

def _first_text(node, tag, class_names):
    """Return the stripped text of the first ``tag`` found under ``node``.

    ``class_names`` are tried in order and the first one that matches an
    element wins. Returns "" when none of them match.
    """
    for class_name in class_names:
        found = node.find(tag, class_=class_name)
        if found is not None:
            return found.text.strip()
    return ""


def _exhibition_date(exh):
    """Return the date text of one exhibition entry, or "" if absent."""
    # First choice: entries that carry an explicit date element.
    dated = exh.find("div", class_="exhibition-list-date")
    if dated is not None:
        return dated.text.strip()

    # Fallback: the first direct child <div> without a class inside
    # card-content-top (the layout used by permanent exhibitions).
    content_top = exh.find("div", class_="card-content-top")
    if content_top is None:
        return ""
    date_div = content_top.find("div", class_=False, recursive=False)
    if date_div is None:
        return ""
    return date_div.get_text(strip=True)


def _exhibition_link(exh):
    """Return the absolute URL of the entry's first link, or ""."""
    anchor = exh.find("a")
    if anchor is None:
        return ""
    href = anchor.get("href")
    if href is None:
        return ""
    return urljoin(base_url, href)


def _exhibition_image(exh):
    """Return the absolute image URL of the entry, or "" if unusable."""
    img_tag = exh.find("img")
    if img_tag is None:
        return ""
    # Prefer data-src; fall back to src only when data-src is missing.
    src = img_tag.get("data-src") or img_tag.get("src")
    # Skip the loader.gif placeholder.
    if not src or "loader.gif" in src:
        return ""
    # Keep only the part before the first "&" (drops extra query parameters).
    return urljoin(base_url, src).split("&")[0]


rows = []  # Collects one row per exhibition.
for exh in exhs:
    # Exhibition title: font-medium heading first, then the card-title h5 one.
    title = _first_text(exh, "h3", ("font-medium", "card-title h5"))

    # Exhibition date.
    ex_date = _exhibition_date(exh)

    # Exhibition category: mt-2 block first, then card-tags.
    ex_tag = _first_text(exh, "div", ("mt-2", "card-tags"))

    # Exhibition location.
    ex_place = _first_text(exh, "div", ("card-content-bottom",))

    # Exhibition link.
    ex_link = _exhibition_link(exh)

    # Exhibition image.
    ex_img = _exhibition_image(exh)

    rows.append([museum_name, title, ex_tag, ex_link, ex_img, ex_date, ex_place])
    print(title, ex_date, ex_place, ex_tag, ex_link, ex_img)

# --- Export the collected rows to a CSV file ---
output_file = "place_museum_exhibitions.csv"

# Column headers, in the same order as the values of each row.
header = ["館別", "展覽名稱", '展覽類別', "展覽連結", "展覽圖片", "展覽日期", "展覽地點"]

# newline="" is what the csv module expects for files it writes; utf-8-sig
# prepends a BOM (presumably so spreadsheet tools detect the encoding).
with open(output_file, "w", newline="", encoding="utf-8-sig") as csv_file:
    out = csv.writer(csv_file)
    # Header row first, then every collected row.
    out.writerow(header)
    out.writerows(rows)

print(f"✅ 已輸出 {len(rows)} 筆資料至 {output_file}")




千年神遇——北宋西園雅集傳奇 2025-10-10~2026-01-07 北部院區　第一展覽館
                                                                202,204,206,208,210,212 #書法　#繪畫 https://www.npm.gov.tw/Exhibition-Content.aspx?sno=04014159&l=1&type=&cat= https://www.npm.gov.tw/NewFileAtt.ashx?name=exbitBig/04014159/34057889.jpg
甲子萬年：國立故宮博物院百年院慶特展 2025-10-04~2026-01-04 北部院區　第一展覽館
                                                                105,107 #書法　#繪畫　#圖書文獻　#器物 https://www.npm.gov.tw/Exhibition-Content.aspx?sno=04014243&l=1&type=&cat= https://www.npm.gov.tw/NewFileAtt.ashx?name=exbitBig/04014243/34058137.jpg
皕宋——故宮宋版圖書觀止 (I) 2025-10-03~2026-01-04 北部院區　第一展覽館
                                                                103,104 #圖書文獻 https://www.npm.gov.tw/Exhibition-Content.aspx?sno=04014209&l=1&type=&cat= https://www.npm.gov.tw/NewFileAtt.ashx?name=exbitBig/04014209/34058027.jpg
看得見的紅樓夢 2024-05-17~2026-05-17 北部院區　第一展覽館
                                                                203 #陶瓷　#玉器　#珍玩　#圖書文獻　#器物 h