<a href="https://colab.research.google.com/github/EasonHu0620/tfam_exh/blob/main/tfam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import re
import csv
import time
from urllib.parse import urljoin

import requests as req
from bs4 import BeautifulSoup as bs

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


BASE = "https://www.tfam.museum/"
HOME = "https://www.tfam.museum/index.aspx?ddlLang=zh-tw"
EXH  = "https://www.tfam.museum/Exhibition/Exhibition.aspx?ddlLang=zh-tw"
CONTAINER_XPATH = '/html/body/form/div[3]/div[3]/div/div[2]'


# ---------- 1) 首頁：館別/地址 ----------
def fetch_basic_info():
    r = req.get(HOME, timeout=20)
    r.raise_for_status()
    html = bs(r.text, "html.parser")

    tfam = html.find("div", class_="footer-info-container")
    tfam_text = tfam.get_text(" ", strip=True)

    info = {}
    info["館別"] = re.search(r"臺北市立美術館", tfam_text).group()

    info["地址"] = re.search(r"104227.+?號", tfam_text).group()

    # 開放時間
    time_blocks = html.find("div", class_="table-cell")
    openday = html.find_all("span", style="box-sizing: border-box; margin: 0px 3rem 0px 0px; padding: 0px; border: 0px; outline: 0px; display: inline-block; vertical-align: text-bottom; width: 8rem;")
    openday_text1 = openday[0].get_text()
    openday_text2 = openday[1].get_text()
    opentime1 = openday[0].find_next_sibling().get_text()
    opentime2 = openday[1].find_next_sibling().get_text()
    open_text = openday_text1 + opentime1 +","+ openday_text2 + opentime2
    info["開館時間"] = open_text

    # 休館日
    offtime = html.find("span", style="font-size: 1.6rem; margin-right: 3rem; display: inline-block; vertical-align: text-bottom; width: 8rem;")
    offtime_text = offtime.get_text()
    info["休館日"] = offtime_text
    return info


# ---------- 2) Selenium：展覽清單 ----------
def get_driver(headless=True):
    from selenium.webdriver.chrome.options import Options
    opts = Options()
    if headless:
      opts.add_argument("--headless=new") #讓瀏覽器在背景執行
    opts.add_argument("--window-size=1400,1000") #設定瀏覽器啟動時的視窗大小
    opts.add_argument("--lang=zh-TW") #設定瀏覽器的語言首選為繁體中文
    opts.add_argument("--disable-gpu") #headless 模式穩定
    opts.add_argument("--no-sandbox") #避免在某些環境無法啟動的錯誤
    return webdriver.Chrome(options=opts)

def scrape_exhibitions():
    driver = get_driver(headless=True)
    rows = []
    try:
        driver.get(EXH)
        wait = WebDriverWait(driver, 20)

        # 等待我要的XPATH出現
        container = wait.until(EC.presence_of_element_located((By.XPATH, CONTAINER_XPATH)))
        items = container.find_elements(By.XPATH, "./div")


        for it in items:
            # 照片
            img_src = ""
            try:
                img = it.find_element(By.XPATH, "./div[1]/img")
                img_src = img.get_attribute("src") or ""
                img_src = urljoin(BASE, img_src)
            except Exception:
                pass

            # 展覽標題與連結
            title= ""
            try:
                a = it.find_element(By.XPATH, "./div[2]/h3/a")
                title = (a.text or "").strip()
            except Exception:
                pass

            # 展覽時間
            ex_time = ""
            try:
                ex_time = it.find_element(By.XPATH, "./div[2]/p[1]").text.strip()
            except Exception:
                pass

            # 展覽地點
            ex_place = ""
            try:
                ex_place = it.find_element(By.XPATH, "./div[2]/p[2]").text.strip()
            except Exception:
                pass

            # 展覽連結
            ex_link = ""
            try:
                link = it.find_element(By.XPATH, "./div[2]/div")
                link_num = link.get_attribute("id")[-3:] or ""
                ex_link = f"{BASE}Exhibition/Exhibition_Special.aspx?ddlLang=zh-tw&id={link_num}"
            except Exception:
                pass

            if any([title, ex_time, ex_place, img_src, ex_link]):
                rows.append({
                    "照片": img_src,
                    "展覽": title,
                    "展覽時間": ex_time,
                    "展覽地點": ex_place,
                    "展覽連結": ex_link,
                })
        return rows
    finally:
        driver.quit()


# ---------- 3) CSV ----------
def save_csv(path, rows):
    if not rows:
        print(f"沒有資料可寫入 CSV：{path}")
        return
    keys = list(rows[0].keys())
    with open(path, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        writer.writerows(rows)
    print(f"CSV 已輸出：{path}（{len(rows)} 筆）")


if __name__ == "__main__":
    # A) 美術館基本資料 → 單列 CSV
    info = fetch_basic_info()
    print(info)
    save_csv("tfam_museum_info.csv", [info])   # 注意用 [info] 包成 list

    # B) 展覽資訊 + 館別欄位 → 多列 CSV
    data = scrape_exhibitions()
    museum_name = info.get("館別", "臺北市立美術館")
    for r in data:
        r["館別"] = museum_name
    for r in data:
        print(r)
    save_csv("tfam_exh.csv", data)

{'館別': '臺北市立美術館', '地址': '104227臺北市中山區中山北路三段181號', '開館時間': '週二至週日9:30-17:30,週六9:30-20:30', '休館日': '週一'}
CSV 已輸出：tfam_museum_info.csv（1 筆）
{'照片': 'https://www.tfam.museum/File/Exhibition/Main/794/2025072118142524950161.jpg', '展覽': '不發音字母—翻閱165頁厚度', '展覽時間': '2025/08/09 - 2025/11/16', '展覽地點': '三樓3A', '展覽連結': 'https://www.tfam.museum/Exhibition/Exhibition_Special.aspx?ddlLang=zh-tw&id=794', '館別': '臺北市立美術館'}
{'照片': 'https://www.tfam.museum/File/Exhibition/Main/798/2025071416094935860567.png', '展覽': '力求失真的嗓音', '展覽時間': '2025/08/09 - 2025/11/16', '展覽地點': '三樓3A', '展覽連結': 'https://www.tfam.museum/Exhibition/Exhibition_Special.aspx?ddlLang=zh-tw&id=798', '館別': '臺北市立美術館'}
{'照片': 'https://www.tfam.museum/File/Exhibition/Main/796/2025071010382802670547.jpg', '展覽': '未完成之作：彭弘智個展', '展覽時間': '2025/08/09 - 2025/11/16', '展覽地點': '三樓3B', '展覽連結': 'https://www.tfam.museum/Exhibition/Exhibition_Special.aspx?ddlLang=zh-tw&id=796', '館別': '臺北市立美術館'}
{'照片': 'https://www.tfam.museum/File/Exhibition/Main/795/202507220

In [None]:
import os
import re
import csv
import requests as req
from bs4 import BeautifulSoup as bs

url = "https://www.tfam.museum/index.aspx?ddlLang=zh-tw"

r = req.get(url)
r.raise_for_status()
html = bs(r.text, "html.parser")

tfam = html.find("div", class_="footer-info-container")
tfam_text = tfam.get_text("\n", strip=True)
info = {}
info["館別"]= re.search(r"臺北市立美術館", tfam_text).group()
info["地址"] = re.search(r"104227.+?號", tfam_text).group()

print(info)


{'館別': '臺北市立美術館', '地址': '104227臺北市中山區中山北路三段181號'}


In [None]:
import os
import re
import csv
import requests as req
from bs4 import BeautifulSoup as bs

url = "https://www.tfam.museum/index.aspx?ddlLang=zh-tw"

r = req.get(url)
r.raise_for_status()
html = bs(r.text, "html.parser")
time_blocks = html.find("div", class_="table-cell")
openday = html.find_all("span", style="box-sizing: border-box; margin: 0px 3rem 0px 0px; padding: 0px; border: 0px; outline: 0px; display: inline-block; vertical-align: text-bottom; width: 8rem;")
openday_text1 = openday[0].get_text()
openday_text2 = openday[1].get_text()
opentime1 = openday[0].find_next_sibling().get_text()
opentime2 = openday[1].find_next_sibling().get_text()
open_text = openday_text1 + opentime1 +","+ openday_text2 + opentime2


print(open_text)

offtime = html.find("span", style="font-size: 1.6rem; margin-right: 3rem; display: inline-block; vertical-align: text-bottom; width: 8rem;")
offtime_text = offtime.get_text()

print(offtime_text)

#print(time_blocks)

週二至週日9:30-17:30,週六9:30-20:30
週一


In [None]:
import os
import re
import csv
import requests as req
from bs4 import BeautifulSoup as bs
headers = {
    "User-Agent": "Mozilla/5.0",
    "Referer": "https://www.tfam.museum/Exhibition/Exhibition.aspx?ddlLang=zh-tw",
    "X-Requested-With": "XMLHttpRequest",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Accept": "text/html, application/json;q=0.9, */*;q=0.8",
}

# ↓ 這些鍵名請用 DevTools → Form Data 看到的為準
form = {
    "Method": "GetList",   # 例：GetList / GetExList / Search …
    "Status": "current",
    "Page": "1",
    "PageSize": "12"
}

exh_url = "https://www.tfam.museum//Exhibition/Exhibition.aspx"
Request_URL = "https://www.tfam.museum/ashx/Exhibition.ashx?ddlLang=zh-tw"
session = req.Session()
resp = session.post(Request_URL, headers=headers, data=form, timeout=20)

html = resp.text,
#current_exh = html.find_all("div" , class_="row Exhibition_list")

print(html)

('<!DOCTYPE html>\r\n<html lang="zh-TW" xmlns="http://www.w3.org/1999/xhtml">\r\n<head runat="server">\r\n    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\r\n    <title>æ\xad¤ç¶²é\xa0\x81ä¸\x8då\xad\x98å\x9c¨</title>\r\n</head>\r\n<body>\r\n    <form id="form1" runat="server">\r\n        <div>\r\n            <h1>æ\xad¤ç¶²é\xa0\x81ä¸\x8då\xad\x98å\x9c¨ï¼\x8cè«\x8bæ\x8c\x89æ\xad¤<a href="https://www.tfam.museum/">å\x9b\x9eé¦\x96é\xa0\x81</a></h1>\r\n        </div>\r\n    </form>\r\n</body>\r\n</html>',)
