In [None]:
! pip install requests beautifulsoup4




In [None]:
import requests
import urllib.request as req
from bs4 import BeautifulSoup as bs

# Site root and the relative path of the first catalogue listing page.
base_url = "http://books.toscrape.com/"
page_url = "catalogue/page-1.html"

# Bound the wait and fail loudly on an HTTP error instead of parsing an error page.
response = requests.get(base_url + page_url, timeout=10)
response.raise_for_status()

# Hand BeautifulSoup the raw bytes so it detects the document encoding itself.
# Parsing response.text produced mojibake: prices were printed as 'Â£51.77'
# (UTF-8 bytes decoded as Latin-1) instead of '£51.77'.
soup = bs(response.content, 'html.parser')


#### 擷取每一本書的基本資訊

In [None]:
# Each book on the listing page lives in its own <article class="product_pod">.
books = soup.find_all('article', class_='product_pod')
divider = "-" * 40

for book in books:
    # Title comes from the `title` attribute of the link inside the <h3>.
    title = book.h3.a['title']
    # Price text exactly as it appears in the page.
    price = book.find('p', class_='price_color').text
    # Second CSS class of the first <p> is the rating word, e.g. 'Three'.
    rating_class = book.find('p')['class'][1]
    # Stock status with the surrounding whitespace removed.
    availability = book.find('p', class_='instock availability').text.strip()

    # One field per line, followed by a separator line.
    print(
        f"書名：{title}",
        f"價格：{price}",
        f"評價：{rating_class}",
        f"庫存：{availability}",
        divider,
        sep="\n",
    )


書名：A Light in the Attic
價格：Â£51.77
評價：Three
庫存：In stock
----------------------------------------
書名：Tipping the Velvet
價格：Â£53.74
評價：One
庫存：In stock
----------------------------------------
書名：Soumission
價格：Â£50.10
評價：One
庫存：In stock
----------------------------------------
書名：Sharp Objects
價格：Â£47.82
評價：Four
庫存：In stock
----------------------------------------
書名：Sapiens: A Brief History of Humankind
價格：Â£54.23
評價：Five
庫存：In stock
----------------------------------------
書名：The Requiem Red
價格：Â£22.65
評價：One
庫存：In stock
----------------------------------------
書名：The Dirty Little Secrets of Getting Your Dream Job
價格：Â£33.34
評價：Four
庫存：In stock
----------------------------------------
書名：The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull
價格：Â£17.93
評價：Three
庫存：In stock
----------------------------------------
書名：The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics
價格：Â£22.60
評價：Four
庫存：In stock
--------------

#### 擷取50頁每一本書的基本資訊

In [None]:
# Walk catalogue pages 1..50 and print the basic fields of every book found.
for page_num in range(1, 51):
    url = f"http://books.toscrape.com/catalogue/page-{page_num}.html"
    # Bound the wait and stop on an HTTP error rather than silently printing nothing.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # Parse the raw bytes so BeautifulSoup picks the encoding; response.text
    # yielded mojibake prices such as 'Â£51.77' in the earlier output.
    soup = bs(response.content, 'html.parser')

    books = soup.find_all('article', class_='product_pod')

    for book in books:
        # Title comes from the `title` attribute of the link inside the <h3>.
        title = book.h3.a['title']
        price = book.find('p', class_='price_color').text
        # Second CSS class of the first <p> is the rating word, e.g. 'Three'.
        rating = book.find('p')['class'][1]
        availability = book.find('p', class_='instock availability').text.strip()

        print(f"書名：{title}")
        print(f"價格：{price}")
        print(f"評價：{rating}")
        print(f"庫存：{availability}")
        print("-" * 40)


書名：A Light in the Attic
價格：Â£51.77
評價：Three
庫存：In stock
----------------------------------------
書名：Tipping the Velvet
價格：Â£53.74
評價：One
庫存：In stock
----------------------------------------
書名：Soumission
價格：Â£50.10
評價：One
庫存：In stock
----------------------------------------
書名：Sharp Objects
價格：Â£47.82
評價：Four
庫存：In stock
----------------------------------------
書名：Sapiens: A Brief History of Humankind
價格：Â£54.23
評價：Five
庫存：In stock
----------------------------------------
書名：The Requiem Red
價格：Â£22.65
評價：One
庫存：In stock
----------------------------------------
書名：The Dirty Little Secrets of Getting Your Dream Job
價格：Â£33.34
評價：Four
庫存：In stock
----------------------------------------
書名：The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull
價格：Â£17.93
評價：Three
庫存：In stock
----------------------------------------
書名：The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics
價格：Â£22.60
評價：Four
庫存：In stock
--------------

#### 擷取圖片

#### 抓取所有分類的網址

In [None]:
import json
from urllib.parse import urljoin

import pandas as pd

# Fetch the home page explicitly instead of reusing whatever `soup` an earlier
# cell left behind. The links in the side menu are relative to the page they
# were read from, so they must be resolved against that page's URL. Prefixing
# them with the site root produced URLs without 'catalogue/' that did not load.
home_response = requests.get(base_url, timeout=10)
home_response.raise_for_status()
home_soup = bs(home_response.content, 'html.parser')

# Second-level entries of the side menu are the individual categories.
category_tags = home_soup.select('.side_categories ul li ul li a')

# Map category name -> absolute URL of the category's first page.
category_urls = {}
for tag in category_tags:
    category_name = tag.text.strip()
    category_urls[category_name] = urljoin(home_response.url, tag['href'])

print(category_urls)  # show every category name with its URL

{'Travel': 'http://books.toscrape.com/category/books/travel_2/index.html', 'Mystery': 'http://books.toscrape.com/category/books/mystery_3/index.html', 'Historical Fiction': 'http://books.toscrape.com/category/books/historical-fiction_4/index.html', 'Sequential Art': 'http://books.toscrape.com/category/books/sequential-art_5/index.html', 'Classics': 'http://books.toscrape.com/category/books/classics_6/index.html', 'Philosophy': 'http://books.toscrape.com/category/books/philosophy_7/index.html', 'Romance': 'http://books.toscrape.com/category/books/romance_8/index.html', 'Womens Fiction': 'http://books.toscrape.com/category/books/womens-fiction_9/index.html', 'Fiction': 'http://books.toscrape.com/category/books/fiction_10/index.html', 'Childrens': 'http://books.toscrape.com/category/books/childrens_11/index.html', 'Religion': 'http://books.toscrape.com/category/books/religion_12/index.html', 'Nonfiction': 'http://books.toscrape.com/category/books/nonfiction_13/index.html', 'Music': 'http:

#### 抓取每一分類的所有書籍資訊與圖片連結

In [None]:
import os
import re
from urllib.parse import urljoin

# For every category, walk its pages and save each book's cover image under
# images/<category>/.
for category, url in category_urls.items():
    print(f"正在抓取分類：{category}")
    page_num = 1

    while True:
        # Page 1 is index.html; later pages are page-2.html, page-3.html, ...
        page_url = url.replace('index.html', f'page-{page_num}.html') if page_num > 1 else url
        res = requests.get(page_url, timeout=10)
        if res.status_code != 200:
            break  # no further page for this category

        # Parse bytes so BeautifulSoup detects the encoding (avoids mojibake titles).
        soup = bs(res.content, 'html.parser')
        books = soup.find_all('article', class_='product_pod')
        if not books:
            break

        # Make sure the category folder exists before writing into it.
        os.makedirs(f'images/{category}', exist_ok=True)

        for book in books:
            title = book.h3.a['title']
            # Resolve the relative <img src> against the page it came from.
            image_url = urljoin(res.url, book.find('img')['src'])
            print(f"書名：{title}")
            print(f"圖片連結：{image_url}")

            # Download the image; skip it if the server did not return it, so an
            # error page is never saved as a .jpg.
            img_res = requests.get(image_url, timeout=10)
            if img_res.status_code != 200:
                print(f"圖片下載失敗：{image_url}")
                continue

            # Truncate the title and replace characters that are not allowed in
            # file names on common platforms (titles contain ':' for example).
            safe_title = re.sub(r'[\\/:*?"<>|]', '-', title[:30]).strip()
            filename = f'images/{category}/{safe_title}.jpg'
            with open(filename, 'wb') as img_file:
                img_file.write(img_res.content)

        page_num += 1


正在抓取分類：Travel
正在抓取分類：Mystery
正在抓取分類：Historical Fiction
正在抓取分類：Sequential Art
正在抓取分類：Classics
正在抓取分類：Philosophy
正在抓取分類：Romance
正在抓取分類：Womens Fiction
正在抓取分類：Fiction
正在抓取分類：Childrens
正在抓取分類：Religion
正在抓取分類：Nonfiction
正在抓取分類：Music
正在抓取分類：Default
正在抓取分類：Science Fiction
正在抓取分類：Sports and Games
正在抓取分類：Add a comment
正在抓取分類：Fantasy
正在抓取分類：New Adult
正在抓取分類：Young Adult
正在抓取分類：Science
正在抓取分類：Poetry
正在抓取分類：Paranormal
正在抓取分類：Art
正在抓取分類：Psychology
正在抓取分類：Autobiography
正在抓取分類：Parenting
正在抓取分類：Adult Fiction
正在抓取分類：Humor
正在抓取分類：Horror
正在抓取分類：History
正在抓取分類：Food and Drink
正在抓取分類：Christian Fiction
正在抓取分類：Business
正在抓取分類：Biography
正在抓取分類：Thriller
正在抓取分類：Contemporary
正在抓取分類：Spirituality
正在抓取分類：Academic
正在抓取分類：Self Help
正在抓取分類：Historical
正在抓取分類：Christian
正在抓取分類：Suspense
正在抓取分類：Short Stories
正在抓取分類：Novels
正在抓取分類：Health
正在抓取分類：Politics
正在抓取分類：Cultural
正在抓取分類：Erotica
正在抓取分類：Crime


#### 儲存為 CSV

In [None]:
import csv
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

base_url = "http://books.toscrape.com/"
main_page = requests.get(base_url, timeout=10)
main_page.raise_for_status()
# Parse bytes so BeautifulSoup detects the encoding; parsing .text turns '£' into 'Â£'.
main_soup = BeautifulSoup(main_page.content, 'html.parser')

# Step 1: collect every category URL, resolved against the page the links came from.
category_tags = main_soup.select('.side_categories ul li ul li a')
category_urls = {
    tag.text.strip(): urljoin(main_page.url, tag['href']) for tag in category_tags
}

# Step 2: prepare the output locations.
os.makedirs('images', exist_ok=True)

# The `with` block guarantees the CSV is flushed and closed even if a request
# fails part-way through the crawl.
with open('books.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['Category', 'Title', 'Price', 'Rating', 'Availability', 'Image URL', 'Image Filename'])

    # Step 3: crawl each category page by page.
    for category, url in category_urls.items():
        print(f"📚 抓取分類：{category}")
        page_num = 1
        while True:
            # Page 1 is index.html; later pages are page-2.html, page-3.html, ...
            page_url = url.replace('index.html', f'page-{page_num}.html') if page_num > 1 else url
            res = requests.get(page_url, timeout=10)
            if res.status_code != 200:
                break  # no further page for this category

            soup = BeautifulSoup(res.content, 'html.parser')
            books = soup.find_all('article', class_='product_pod')
            if not books:
                break

            # Make sure the category's image folder exists.
            os.makedirs(f'images/{category}', exist_ok=True)

            for book in books:
                title = book.h3.a['title'].strip()
                price = book.find('p', class_='price_color').text.strip()
                # Second CSS class of the first <p> is the rating word, e.g. 'Three'.
                rating = book.find('p')['class'][1]
                availability = book.find('p', class_='instock availability').text.strip()
                # Resolve the relative <img src> against the page it came from.
                img_url = urljoin(res.url, book.find('img')['src'])

                # Truncate the title and replace characters that are not allowed
                # in file names on common platforms (titles contain ':' for example).
                safe_title = re.sub(r'[\\/:*?"<>|]', '-', title[:30]).strip()
                filename = f'images/{category}/{safe_title}.jpg'

                # Download the image; never save an error response as a .jpg.
                img_res = requests.get(img_url, timeout=10)
                if img_res.status_code == 200:
                    with open(filename, 'wb') as f:
                        f.write(img_res.content)
                else:
                    print(f"圖片下載失敗：{img_url}")
                    filename = ''  # nothing was saved, so record no file name

                # Append the book's row to the CSV.
                writer.writerow([category, title, price, rating, availability, img_url, filename])

            page_num += 1

print("✅ 所有資料已完成並儲存到 books.csv")


📚 抓取分類：Travel
📚 抓取分類：Mystery
📚 抓取分類：Historical Fiction
📚 抓取分類：Sequential Art
📚 抓取分類：Classics
📚 抓取分類：Philosophy
📚 抓取分類：Romance
📚 抓取分類：Womens Fiction
📚 抓取分類：Fiction
📚 抓取分類：Childrens
📚 抓取分類：Religion
📚 抓取分類：Nonfiction
📚 抓取分類：Music
📚 抓取分類：Default
📚 抓取分類：Science Fiction
📚 抓取分類：Sports and Games
📚 抓取分類：Add a comment
📚 抓取分類：Fantasy
📚 抓取分類：New Adult
📚 抓取分類：Young Adult
📚 抓取分類：Science
📚 抓取分類：Poetry
📚 抓取分類：Paranormal
📚 抓取分類：Art
📚 抓取分類：Psychology
📚 抓取分類：Autobiography
📚 抓取分類：Parenting
📚 抓取分類：Adult Fiction
📚 抓取分類：Humor
📚 抓取分類：Horror
📚 抓取分類：History
📚 抓取分類：Food and Drink
📚 抓取分類：Christian Fiction
📚 抓取分類：Business
📚 抓取分類：Biography
📚 抓取分類：Thriller
📚 抓取分類：Contemporary
📚 抓取分類：Spirituality
📚 抓取分類：Academic
📚 抓取分類：Self Help
📚 抓取分類：Historical
📚 抓取分類：Christian
📚 抓取分類：Suspense
📚 抓取分類：Short Stories
📚 抓取分類：Novels
📚 抓取分類：Health
📚 抓取分類：Politics
📚 抓取分類：Cultural
📚 抓取分類：Erotica
📚 抓取分類：Crime
✅ 所有資料已完成並儲存到 books.csv
