# 1. 開啟瀏覽器，進到蝦皮搜尋首頁，並輸入關鍵字

## 1.1 引用套件，並準備一些瀏覽器基礎設定

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import time
import re
from bs4 import BeautifulSoup
from lxml import etree
import pandas as pd


# Turn off browser notifications via Chrome's content-setting preferences.
prefs = {'profile.default_content_setting_values': {'notifications': 2}}

options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', prefs)
options.add_argument("disable-infobars")

## 1.2 開啟瀏覽器，在蝦皮以關鍵字進行搜尋 

In [None]:
# Start Selenium. Make sure the chromedriver file is in the same folder as this
# Python file. Download it here: https://chromedriver.chromium.org/downloads
from urllib.parse import quote

driver = webdriver.Chrome(options=options)
keyword = "三多葉黃素"

# Percent-encode the keyword so characters such as spaces, "&" or "#" cannot
# corrupt the query string. `keyword` itself is left untouched for later use.
search_url = (
    "https://shopee.tw/search"
    f"?keyword={quote(keyword, safe='')}&order=asc&page=0&sortBy=price"
)
driver.get(search_url)
time.sleep(4)  # fixed pause before continuing, to give the page time to load

## 1.3 往下滑三次，確保60筆搜尋結果都有顯現出來 (效果意外地不理想，待解決)

- 好像滑過頭，沒有觸發到讀取資料

In [None]:
# Scroll down one viewport at a time instead of jumping straight to the bottom.
# Jumping to document.body.scrollHeight skips over the middle of the page, so
# content that only loads when it comes into view may never be requested.
scroll_pause = 1          # seconds to wait after each step
max_scroll_steps = 50     # safety cap in case the page keeps growing

for _ in range(max_scroll_steps):
    driver.execute_script("window.scrollBy(0, window.innerHeight)")
    time.sleep(scroll_pause)
    # Re-check after the pause: the page height may have grown while waiting.
    at_bottom = driver.execute_script(
        "return window.innerHeight + window.pageYOffset"
        " >= document.body.scrollHeight - 1"
    )
    if at_bottom:
        break
print("滑好了")

# 2. 擷取網頁資料

## 2.1 擷取並解析 HTML

In [None]:
# Read the rendered page once so both parsers work on the same snapshot
# (previously driver.page_source was read twice, once per parser).
page_source = driver.page_source
html = etree.HTML(page_source)
soup = BeautifulSoup(page_source, "html.parser")

## 2.2 選擇搜尋結果

In [None]:
# Collect every <div> whose class is "shopee-search-item-result" and report
# how many were found.
all_ = soup.find_all("div", attrs={"class": "shopee-search-item-result"})
print(len(all_))

In [None]:
# Text of the pager's "total" label inside the first result container.
all_total = all_[0].find_all(
    "span", attrs={"class": "shopee-mini-page-controller__total"}
)
total = all_total[0].get_text()
print(total)

# Text of the pager's "current" label inside the first result container.
all_current = all_[0].find_all(
    "span", attrs={"class": "shopee-mini-page-controller__current"}
)
current = all_current[0].get_text()
print(current)

## 2.3 獲取商品資訊

### 2.3.1 確認商品數量

In [None]:
# Collect the individual product cards from the first result container and
# print how many there are.
# (An earlier version of this cell matched on class "VTjd7p whIxGK" instead.)
all_result = all_[0].find_all(
    "div", attrs={"class": "col-xs-2-4 shopee-search-item-result__item"}
)
print(len(all_result))

### 2.3.2 指定其中一個產品，測試抓取資料是否成功

In [None]:
# Choose which product card to inspect (index into all_result).
product = all_result[10]

# Each lookup takes the first matching element; a missing element raises.
name = product.find_all("div", attrs={"class": "ie3A+n bM+7UW Cve6sh"})[0].get_text()
price = product.find_all("div", attrs={"class": ["vioxXd", "rVLWG6h"]})[0].get_text()
sold = product.find_all("div", attrs={"class": ["r6HknA", "uEPGHT"]})[0].get_text()
img = product.find_all("img", attrs={"class": ["_7DTxhh", "vc8g9F"]})[0]["src"]
href = product.find_all("a", attrs={"data-sqe": "link"})[0]["href"]

# Remove the leading "已售出" label: drops the first four characters
# (presumably the label plus one separator character - TODO confirm).
sold = sold[4:] if sold != "" else sold

for line in (name, price, sold, img, f"https://shopee.tw{href}"):
    print(line)

### 2.3.3 抓取這一頁全部的產品資料，並存成 excel 和 html 檔

#### 優先處理：
- 自動往下滑的部分要解決
- 改用 Colab
    - 目前問題：可以用selenium得到資料，但沒有視窗開啟，且因為沒有往下滑，資料並不完整
    - 神奇的是，有60筆資料
    - 應該是中後半段有些標籤內容抓不到

#### TODO list
- 價格低於某個數值，將表格填色
- 如果是廣告，標註起來
- 價格呈現優化

#### 無關緊要的TODO list
- find_all 應該可以換成 find
- 過長標題自動換行 (e.g. 用`<br>`)
- 關鍵字 highlight (e.g. 用`<mark></mark>`)
    


In [None]:
def path_to_image_html(path):
    """Return an ``<img>`` tag, 100 px wide, whose ``src`` is *path*.

    The path is HTML-escaped before being placed in the attribute, so a value
    containing ``"``, ``&``, ``<`` or ``>`` cannot break out of the tag.

    Args:
        path: Image URL or path as a string.

    Returns:
        The HTML markup for the image as a string.
    """
    # Imported locally because this file rebinds the module-level name `html`
    # to a parsed lxml tree.
    from html import escape

    return '<img src="' + escape(path, quote=True) + '" width="100" >'

def href_to_full_path(href):
    """Return an ``<a>`` tag labelled "link" pointing at ``https://shopee.tw`` + *href*.

    The full URL is HTML-escaped before being placed in the attribute, so a
    value containing ``"``, ``&``, ``<`` or ``>`` cannot break out of the tag.

    Args:
        href: Site-relative path (e.g. ``/some-item``) to append to the domain.

    Returns:
        The HTML markup for the hyperlink as a string.
    """
    # Imported locally because this file rebinds the module-level name `html`
    # to a parsed lxml tree.
    from html import escape

    url = escape(f"https://shopee.tw{href}", quote=True)
    return f'<a href="{url}">link</a>'

def _first_match_text(node, tag, attrs):
    """Return the text of the first ``tag`` under ``node`` matching ``attrs``, or "" if none."""
    match = node.find(tag, attrs)
    return "" if match is None else match.text


def _first_match_attr(node, tag, attrs, attr_name):
    """Return ``attr_name`` of the first matching ``tag`` under ``node``, or "" if absent."""
    match = node.find(tag, attrs)
    return "" if match is None else match.get(attr_name, "")


name_list = []
img_list = []
price_list = []
sold_list = []
href_list = []

for product in all_result:
    # Any field whose element (or attribute) is missing becomes "" instead of
    # raising, so one incomplete product card does not abort the whole page.
    name = _first_match_text(product, "div", {"class": "ie3A+n bM+7UW Cve6sh"})
    img = _first_match_attr(product, "img", {"class": ["_7DTxhh", "vc8g9F"]}, "src")
    price = _first_match_text(product, "div", {"class": ["vioxXd", "rVLWG6h"]})
    sold = _first_match_text(product, "div", {"class": ["r6HknA", "uEPGHT"]})
    href = _first_match_attr(product, "a", {"data-sqe": "link"}, "href")

    # Remove the leading "已售出" label: drops the first four characters
    # (presumably the label plus one separator - TODO confirm). "" stays "".
    sold = sold[4:]

    name_list.append(name)
    # Leave the cell blank when nothing was found, rather than emitting a
    # broken <img src=""> or a link that only points at the site root.
    img_list.append(path_to_image_html(img) if img else "")
    price_list.append(price)
    sold_list.append(sold)
    href_list.append(href_to_full_path(href) if href else "")

data = {
    "name": name_list,
    "img": img_list,
    "price": price_list,
    "sold": sold_list,
    "href": href_list,
}

df = pd.DataFrame(data)

df.to_excel("data.xlsx")

# escape=False keeps the <img>/<a> markup built above as real HTML in the file.
# No per-column formatter is passed: the img column already holds finished
# tags (the previous formatter was keyed on a non-existent "Country" column).
# NOTE(review): with escape=False the scraped name/price/sold text is written
# unescaped as well - confirm that is acceptable for this report.
df.to_html(f"{keyword}.html", escape=False)

df