In [None]:
# 基礎數據處理
import pandas as pd
import re
from datetime import datetime
import time

# 網頁爬蟲相關
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# 圖片處理
from PIL import Image
from io import BytesIO
import matplotlib.pyplot as plt

# 其他工具
import pyautogui
import requests
import html

# 設定 ChromeDriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
import undetected_chromedriver as uc


In [None]:
# Create the ChromeDriver service from the path returned by
# ChromeDriverManager().install()
service = Service(ChromeDriverManager().install())

# Configure Chrome options
options = uc.ChromeOptions()

# Browser behaviour flags
options.add_argument('--headless')
options.add_argument('--disable-blink-features=AutomationControlled')
options.add_argument('--disable-infobars')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--no-sandbox')
options.add_argument('--disable-gpu')

# Set the user agent. Adjacent string literals are used instead of backslash
# line continuations inside a single literal: a continuation keeps the next
# line's leading indentation as part of the string, which put long runs of
# spaces into the user-agent value.
options.add_argument(
    '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/91.0.4472.124 Safari/537.36'
)

# Start the browser
driver = webdriver.Chrome(service=service, options=options)

# Open the target URL (Google search results for 香連)
target_url = 'https://www.google.com/search?q=%E9%A6%99%E9%80%A3&rlz=1C1CHBF_zh-TWTW1068TW1068&oq=%E9%A6%99%E9%80%A3&gs_lcrp=EgZjaHJvbWUqBggAEEUYOzIGCAAQRRg7MgYIARBFGDwyBwgCEAAYgAQyBwgDEAAYgAQyBwgEEAAYgAQyBggFEEUYQTIGCAYQRRg8MgYIBxBFGD3SAQg1ODEwajBqN6gCALACAA&sourceid=chrome&ie=UTF-8#lrd=0x3442a96ae8df6059:0xff0eda2fd6a5e216,1,,,'
driver.get(target_url)

In [None]:
# Column name -> list of scraped values; every column starts with its own
# independent empty list.
data = {column: [] for column in ('評等', '評論', '圖片url')}

In [None]:
def wait_for_element(driver, by, value, timeout=10):
    """Wait until an element matching the locator is present and return it.

    Args:
        driver: WebDriver instance to poll.
        by: Locator strategy (for example ``By.ID``).
        value: Locator value used together with ``by``.
        timeout: Value passed to ``WebDriverWait`` as its timeout.

    Returns:
        Whatever ``WebDriverWait.until`` returns for the presence condition.
        Any exception raised by the wait propagates to the caller.
    """
    waiter = WebDriverWait(driver, timeout)
    locator = (by, value)
    element_is_present = EC.presence_of_element_located(locator)
    return waiter.until(element_is_present)

def extract_review_data(review):
    """Collect the star, comment and image elements of one review element.

    Args:
        review: Parsed review element exposing ``find_all(tag, attrs)``.

    Returns:
        A 3-tuple ``(stars, comments, images)`` holding the ``find_all``
        results for, in order: ``span`` with class ``lTi8oc z3HNkc``,
        ``span`` carrying a ``data-expandable-section`` attribute, and
        ``div`` with class ``JrO5Xe``.
    """
    # (tag name, attribute filter) for each returned slot, in return order.
    selectors = (
        ('span', {'class': 'lTi8oc z3HNkc'}),
        ('span', {'data-expandable-section': True}),
        ('div', {'class': 'JrO5Xe'}),
    )
    return tuple(review.find_all(tag, attrs) for tag, attrs in selectors)

def _extract_image_url(image_html):
    """Return the URL inside the first CSS ``url(...)`` of the markup, or None."""
    found = re.search(r'url\((.*?)\)', str(image_html))
    if not found:
        return None
    # The serialised markup may carry HTML entities (e.g. &amp;, &quot;) and
    # CSS allows quotes around the URL; undo both before requesting it.
    cleaned = html.unescape(found.group(1)).strip().strip('\'"')
    return cleaned or None


def process_image_urls(images, timeout=10):
    """Download, display and collect the images referenced by review elements.

    For each element, the first ``url(...)`` found in its string form is
    requested. When the response status is 200 the image is shown with
    matplotlib and its URL is recorded. Failures are printed and skipped so
    one bad image does not stop the rest.

    Args:
        images: Iterable of image elements (anything whose ``str()`` is the
            markup to search).
        timeout: Seconds passed to ``requests.get`` so a stalled download
            cannot block the scraper indefinitely.

    Returns:
        List of the URLs that were downloaded and displayed successfully.
    """
    image_urls = []
    for image_html in images:
        img_url = _extract_image_url(image_html)
        if img_url is None:
            continue
        try:
            response = requests.get(img_url, timeout=timeout)
            if response.status_code == 200:
                img = Image.open(BytesIO(response.content))
                plt.imshow(img)
                plt.axis('off')
                plt.show()
                image_urls.append(img_url)
        except Exception as e:
            # Deliberate best-effort: report the problem and keep going.
            print(f"圖片處理錯誤: {e}")
    return image_urls

def scroll_reviews(driver):
    """Scroll the review list once so that more reviews can load.

    Waits for the element with id ``reviewSort``, scrolls it into view with
    ``scrollIntoView(false)``, sleeps five seconds, then locates the
    ``div`` whose class is ``loris`` and performs a drag-and-drop on it with
    an offset of (0, 50).

    Args:
        driver: WebDriver instance showing the reviews.

    Returns:
        True when every step completed, False when any step raised (the
        error is printed).
    """
    try:
        sort_anchor = wait_for_element(driver, By.ID, 'reviewSort')
        driver.execute_script("arguments[0].scrollIntoView(false);", sort_anchor)
        time.sleep(5)

        loader = driver.find_element(By.XPATH, '//div[@class="loris"]')
        ActionChains(driver).drag_and_drop_by_offset(loader, 0, 50).perform()
    except Exception as exc:
        print(f"滾動錯誤: {exc}")
        return False
    return True

def _parse_star_rating(star):
    """Return the rating text from a star element's ``aria-label``, or None.

    Accepts decimal ("4.0") and integer ("4") ratings after the ``評等：``
    prefix. Returns None when the attribute is missing or does not match, so
    one unexpected label cannot abort the whole scrape.
    """
    label = star.get("aria-label") or ""
    found = re.search(r'評等：(\d+(?:\.\d+)?)', label)
    return found.group(1) if found else None


def main():
    """Scrape reviews from the page loaded in the module-level ``driver``.

    Runs up to 100 rounds of: parse the current DOM, record every review
    whose comment text has not been recorded yet, then scroll for more.
    Stops early when scrolling fails or an unexpected error is raised (the
    error is printed).

    Returns:
        dict with keys '評論', '評等' and '圖片url'; each value is a list with
        one entry per recorded review (comment text, rating string or None,
        list of image URLs).
    """
    # Initialise the result columns
    data = {
        '評論': [],
        '評等': [],
        '圖片url': []
    }
    # Comment texts already recorded; a set keeps the duplicate check O(1).
    seen_comments = set()

    # Wait for the page to load
    wait_for_element(driver, By.ID, 'reviewSort')

    # Main scraping loop
    for _ in range(100):
        try:
            # Snapshot the current page content
            page_source = driver.execute_script("return document.documentElement.outerHTML")
            soup = BeautifulSoup(page_source, 'html.parser')
            reviews = soup.find_all('div', class_='WMbnJf vY6njf gws-localreviews__google-review')

            # Handle each review
            for review in reviews:
                stars, comments, images = extract_review_data(review)

                # A review needs a rating element and a comment to be recorded.
                # Images are optional: zipping the three lists together would
                # drop every review that has no photo.
                if not stars or not comments:
                    continue

                # Extract the comment text
                comment_text = comments[0].get_text(separator=", ").strip(',')

                # Skip reviews that were already recorded in an earlier round
                if comment_text in seen_comments:
                    continue
                seen_comments.add(comment_text)

                # Extract the star rating (None when the label is unexpected)
                star_rating = _parse_star_rating(stars[0])

                # Download/display the review's images and collect their URLs
                image_urls = process_image_urls(images)

                # Store the row
                data['評論'].append(comment_text)
                data['評等'].append(star_rating)
                data['圖片url'].append(image_urls)

                # Print the result
                print(f"評等: {star_rating}")
                print(f"評論: {comment_text}")
                print(f"圖片 URLs: {image_urls}\n")

            # Scroll to load the next batch; stop when scrolling fails
            if not scroll_reviews(driver):
                break

        except Exception as e:
            print(f"主程序錯誤: {e}")
            break

    return data

# Run the scraper when this code is executed as the main module; this rebinds
# the module-level `data` to the dict returned by main().
if __name__ == "__main__":
    data = main()

In [None]:
# Tabulate the scraped columns (one column per key of `data`). The bare name
# on the last line is the cell's final expression, so the notebook displays
# the frame.
df = pd.DataFrame.from_dict(data)
df