In [1]:
import csv
import os
import re
import time
from collections import Counter
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def extract_genre(breadcrumb):
    """
    Pick the category name out of a breadcrumb navigation element.

    The text of every ``li`` inside ``breadcrumb`` is collected and the last
    entry (the book title) is discarded before a level is chosen.

    Args:
        breadcrumb: Element exposing ``select('li')``, whose results expose
            a ``text`` attribute.

    Returns:
        str: The third remaining entry when at least three remain, the
        second entry when exactly two remain, otherwise "Uncategorized".
    """
    labels = []
    for entry in breadcrumb.select('li'):
        labels.append(entry.text.strip())

    # The final breadcrumb entry is the book title, not a category level.
    category_path = labels[:-1]
    depth = len(category_path)

    if depth < 2:
        return "Uncategorized"
    if depth == 2:
        # Only a second-level category is present.
        return category_path[1]
    # The third entry is the category we are after.
    return category_path[2]


def clean_price(price_str):
    """Clean and format a price string.

    Every character that is neither a digit nor a '.' is dropped, and the
    remaining characters are returned behind a full-width pound sign.
    """
    kept = [ch for ch in price_str if ch == '.' or ch.isdigit()]
    # The prefix is deliberately the full-width "￡" symbol.
    return "￡" + "".join(kept)


def _fetch_soup(session, url, timeout):
    """Download ``url`` with ``session`` and return it as a parsed document."""
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    # Hand the raw bytes to the parser so it can honour the charset declared
    # in the document instead of relying on the header-based guess that
    # backs ``response.text``.
    return BeautifulSoup(response.content, 'html.parser')


def _collect_book_links(session, start_url, max_pages, timeout):
    """Walk the paginated listing; return (book detail URLs, pages visited)."""
    book_links = []
    page_count = 0
    current_page = start_url

    while current_page and page_count < max_pages:
        page_count += 1
        print(f"scrape No.{page_count} page: {current_page}")

        soup = _fetch_soup(session, current_page, timeout)

        page_book_links = []
        for article in soup.select('article.product_pod'):
            anchor = article.select_one('h3 a')
            # Skip malformed entries rather than aborting the whole run.
            if anchor is None or not anchor.get('href'):
                continue
            # Listing links are relative to the page they appear on.
            page_book_links.append(urljoin(current_page, anchor['href']))

        print(f"  find {len(page_book_links)} book")
        book_links.extend(page_book_links)

        next_button = soup.select_one('li.next a')
        if next_button is not None and page_count < max_pages:
            current_page = urljoin(current_page, next_button['href'])
        else:
            current_page = None
            # Report the pages actually visited, which may be fewer than
            # max_pages when the listing runs out first.
            print(f"done {page_count} page")

    return book_links, page_count


def _table_value(book_soup, label, default="Unknown"):
    """Return the ``td`` text that follows the ``th`` named ``label``."""
    header = book_soup.find('th', string=label)
    if header is None:
        return default
    cell = header.find_next_sibling('td')
    return cell.text.strip() if cell is not None else default


def _parse_book(book_soup, book_url):
    """Build one CSV row (a list of column values) from a book detail page."""
    title = book_soup.select_one('h1').text.strip()
    price = clean_price(book_soup.select_one('p.price_color').text.strip())

    # The rating is read from the second CSS class of the star-rating tag.
    rating_class = book_soup.select_one('p.star-rating')['class']
    rating = rating_class[1] if len(rating_class) > 1 else "Not rated"

    author = _table_value(book_soup, 'Author')

    breadcrumb = book_soup.select_one('.breadcrumb')
    if breadcrumb is None:
        print("not found breadcrumb")
        genre = "Uncategorized"
    else:
        # Use the shared helper so the trailing book title is never
        # mistaken for a category on short breadcrumbs.
        genre = extract_genre(breadcrumb)
        if genre == "Uncategorized":
            print(f"lack of breadcrumb levels: {len(breadcrumb.select('li'))}")

    # Publication year: first four-digit run in the publication date field.
    pub_date = _table_value(book_soup, 'Publication date')
    year_match = re.search(r'\d{4}', pub_date)
    year = year_match.group(0) if year_match else "Unknown"

    # Stock count (used as the popularity column): first integer found in
    # the availability text, 0 when there is none.
    availability = _table_value(book_soup, 'Availability')
    stock_match = re.search(r'(\d+)', availability)
    stock_count = int(stock_match.group(1)) if stock_match else 0

    description_element = book_soup.select_one('#product_description + p')
    description = (
        description_element.text.strip()
        if description_element is not None
        else "No description available"
    )

    return [
        title, author, genre, year,
        stock_count, rating, price,
        description, book_url,
    ]


def scrape_books(save_path=None, max_pages=5, timeout=10):
    """Scrape book details from books.toscrape.com and save them to a CSV.

    Listing pages are followed through their "next" link to gather book
    detail URLs; every detail page is then fetched and parsed into one CSV
    row. A book whose page fails to download or parse is reported and
    skipped. A summary of books per category is printed at the end.

    Args:
        save_path: Destination CSV file. Defaults to the path this script
            has always written to.
        max_pages: Maximum number of listing pages to visit.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot hang the run indefinitely.

    Returns:
        tuple: ``(True, total_books)`` on success, where ``total_books`` is
        the number of book links found, or ``(False, 0)`` on failure. Both
        outcomes are two-element tuples so callers can always unpack them.
    """
    if save_path is None:
        save_path = 'C:/Users/dayining/Desktop/book_Project/data/book.csv'
    print(f"CSV file save to: {save_path}")

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    base_url = "http://books.toscrape.com/"

    try:
        books_data = []

        # One session reuses the connection across all requests.
        with requests.Session() as session:
            session.headers.update(headers)

            all_book_links, page_count = _collect_book_links(
                session, base_url, max_pages, timeout
            )
            total_books = len(all_book_links)
            print(f"\nTotal find {total_books} books, from {page_count} pages")

            for i, book_url in enumerate(all_book_links, 1):
                # Keep progress output concise: every 10th book and the last.
                if i % 10 == 0 or i == total_books:
                    print(f"dealing with book {i}/{total_books}")

                try:
                    book_soup = _fetch_soup(session, book_url, timeout)
                    books_data.append(_parse_book(book_soup, book_url))
                except Exception as e:
                    # Best effort: one bad page must not abort the run.
                    print(f"deal with {book_url} error: {e}")
                    continue

                # Be polite to the server between successful requests.
                time.sleep(0.5)

        # utf-8-sig writes a BOM at the start of the file.
        with open(save_path, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.writer(f)
            writer.writerow([
                'Title', 'Author', 'Genre', 'Publication Year',
                'Popularity (Stock)', 'Rating', 'Price',
                'Description', 'URL'
            ])
            writer.writerows(books_data)

        print(f"succeed scrape {len(books_data)} book date!")
        print(f"file save to: {save_path}")

        # Per-category totals, most common first.
        print("\nCategory summary:")
        genre_counts = Counter(row[2] for row in books_data)
        for genre, count in genre_counts.most_common():
            print(f"{genre}: {count} book")

        return True, total_books

    except requests.exceptions.RequestException as e:
        print(f"network error: {e}")
        return False, 0
    except Exception as e:
        print(f"scrape error: {e}")
        return False, 0

if __name__ == "__main__":
    # Script entry point: run the scraper and report the elapsed time.
    print("start to scrape...")
    start_time = time.time()

    # Accept either a (success, total) tuple or a bare boolean, so that a
    # failure result can never break the unpacking with a TypeError.
    result = scrape_books()
    if isinstance(result, tuple):
        success, total_books = result
    else:
        success, total_books = bool(result), 0

    end_time = time.time()
    elapsed = end_time - start_time

    if success:
        print(f"Done! Timing: {elapsed:.2f}秒")
    else:
        print("Failed, please check error message")

start to scrape...
CSV file save to: C:/Users/dayining/Desktop/book_Project/data/book.csv
scrape No.1 page: http://books.toscrape.com/
  find 20 book
scrape No.2 page: http://books.toscrape.com/catalogue/page-2.html
  find 20 book
scrape No.3 page: http://books.toscrape.com/catalogue/page-3.html
  find 20 book
scrape No.4 page: http://books.toscrape.com/catalogue/page-4.html
  find 20 book
scrape No.5 page: http://books.toscrape.com/catalogue/page-5.html
  find 20 book
done 5 page

Total find 100 books, from 5 pages
dealing with book 10/100
dealing with book 20/100
dealing with book 30/100
dealing with book 40/100
dealing with book 50/100
dealing with book 60/100
dealing with book 70/100
dealing with book 80/100
dealing with book 90/100
dealing with book 100/100
succeed scrape 100 book date!
file save to: C:/Users/dayining/Desktop/book_Project/data/book.csv

Category summary:
Sequential Art: 14 book
Nonfiction: 12 book
Default: 9 book
Poetry: 7 book
Fiction: 5 book
Food and Drink: 5 bo