In [None]:
import os
import shutil
import re
import subprocess
import urllib
import zipfile
import requests


"""
Scrapes and installs Chromium from the Linux Mint 21.3 (Virginia) packages site.
Link: http://packages.linuxmint.com/pool/upstream/c/chromium/
Scrapes and installs chromedriver from Chrome for Testing page.
Link: https://googlechromelabs.github.io/chrome-for-testing/
"""

class CantGetLatestChromiumVersionError(Exception):
    """Raised when no Chromium .deb link can be found in the package index page."""

class ChromiumInstallationFailedException(Exception):
    """
    Raised when the post-install check of the Chromium package fails.

    If this happens, inspect the downloaded Chromium .deb file.
    """

class CantGetChromeDriverError(Exception):
    """Raised when no matching chromedriver download URL is found on the page."""

# Package pool directory that is scraped for, and serves, the Chromium .deb files.
main_url = "http://packages.linuxmint.com/pool/upstream/c/chromium/"
# Directory where downloaded files are stored and unpacked.
work_dir = "/content"

def get_chromium_latest_version() -> str:
    """Return the file name of the newest Chromium .deb listed at ``main_url``.

    The directory listing is downloaded and every link whose target looks like
    ``chromium_<version>...linuxmint1%2Bvirginia_amd64.deb`` is collected. The
    name is returned exactly as it appears in the link's ``href`` (i.e. still
    URL-encoded), so it can be appended to ``main_url`` to build the download
    URL.

    Returns:
        The link target of the build with the highest version number.

    Raises:
        Exception: if the index page does not answer with HTTP 200.
        CantGetLatestChromiumVersionError: if no matching link is found.
    """
    # A timeout keeps an unresponsive mirror from hanging the notebook forever.
    r = requests.get(main_url, timeout=30)
    if r.status_code != 200:
        raise Exception("status_code code not 200!")

    # Raw string so that "\s" is a regex escape rather than an invalid string
    # escape; the dot before "deb" is escaped to match only a literal dot.
    pattern = r'<a\shref="(chromium_[^"]+linuxmint1%2Bvirginia_amd64\.deb)'
    candidates = re.findall(pattern, r.text)
    if not candidates:
        raise CantGetLatestChromiumVersionError("Failed to get latest chromium version!")

    def version_key(file_name: str) -> tuple:
        # "chromium_121.0.6167.160~..." -> (121, 0, 6167, 160)
        found = re.match(r"chromium_(\d+(?:\.\d+)*)", file_name)
        if found is None:
            return ()
        return tuple(int(part) for part in found.group(1).split("."))

    # The index may list several builds. Compare the versions numerically
    # instead of trusting the page order (the first link is not necessarily
    # the newest, and "99" sorts after "100" as text).
    return max(candidates, key=version_key)

def install_chromium(latest_version: str, deb_file: str, quiet: bool):
    # Full url of deb file
    url = f"{main_url}{latest_version}"

    # Download deb file
    if quiet:
        command = f"wget -q -O {work_dir}/{deb_file} {url}"
    else:
        command = f"wget -O {work_dir}/{deb_file} {url}"
    print(f"Downloading: {deb_file}")
    # os.system(command)
    !$command

    # Install deb file
    if quiet:
        command = f"apt-get install {work_dir}/{deb_file} >> apt.log"
    else:
        command = f"apt-get install {work_dir}/{deb_file}"
    print(f"Installing: {deb_file}")
    # os.system(command)
    !$command

def check_chromium_installation(deb_file: str):
    """Verify that Chromium is installed, then delete the downloaded package.

    Args:
        deb_file: name of the package file inside ``work_dir`` that is removed
            once the installation has been confirmed.

    Raises:
        ChromiumInstallationFailedException: if no ``chromium`` executable is
            found on the PATH.
    """
    # Look the executable up instead of launching the browser: starting
    # Chromium just to see whether it exists blocks until the browser exits.
    if shutil.which("chromium") is None:
        raise ChromiumInstallationFailedException("Chromium Installation Failed!")
    print("Chromium installation successfull.\n")

    # Installation confirmed, so the deb file is no longer needed. A package
    # file that is already gone must not be reported as a failed installation.
    try:
        os.remove(f"{work_dir}/{deb_file}")
    except FileNotFoundError:
        pass

def get_chromedriver_url(deb_file: str) -> str:
    """Return a linux64 chromedriver download URL for the given Chromium build.

    The Chrome for Testing page is downloaded and searched for the first
    ``chromedriver-linux64.zip`` URL whose version starts with the same major
    version as the Chromium package.

    Args:
        deb_file: Chromium package file name, e.g.
            ``chromium_121.0.6167.160~linuxmint1+virginia_amd64.deb``.

    Returns:
        The matching chromedriver zip URL.

    Raises:
        Exception: if the page does not answer with HTTP 200.
        CantGetChromeDriverError: if no matching URL is found.
    """
    url = "https://googlechromelabs.github.io/chrome-for-testing/"
    # A timeout keeps an unresponsive server from hanging the notebook forever.
    r = requests.get(url, timeout=30)
    if r.status_code != 200:
        raise Exception("status_code code not 200!")

    # Major version taken from the deb file's name: "chromium_121.0..." -> "121"
    version_number = deb_file.split("chromium_")[-1].split(".")[0]

    # Example: https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/121.0.6167.85/linux64/chromedriver-linux64.zip
    # The major version must be followed by a literal dot so that it is matched
    # as a whole path component prefix (e.g. "12" must not match "121.0...").
    pattern = (
        rf'https://[^<]+/{re.escape(version_number)}\.[^<]+'
        r'/linux64/chromedriver-linux64\.zip'
    )
    found = re.search(pattern, r.text)
    if found is None:
        raise CantGetChromeDriverError("Failed to get chromedriver!")
    return found.group()

def install_chromedriver(deb_file: str, quiet: bool):
    url = get_chromedriver_url(deb_file)
    file_name = url.split("/")[-1]
    # Download chromedriver
    chromedriver_zip = f"{work_dir}/{file_name}"
    if quiet:
        command = f"wget -q -O {chromedriver_zip} {url}"
    else:
        command = f"wget -O {chromedriver_zip} {url}"
    print(f"Downloading: {file_name}")
    # os.system(command)
    !$command

    # Extract chromedriver from zip
    with zipfile.ZipFile(chromedriver_zip) as zpf:
        zpf.extract(member="chromedriver-linux64/chromedriver", path=work_dir)

    # Remove chromedriver-linux64.zip file
    os.remove(chromedriver_zip)

    # Move extracted chromedriver binary file to /usr/bin directory
    source = f"{work_dir}/chromedriver-linux64/chromedriver"
    destination = "/usr/bin/chromedriver"
    os.rename(source, destination)

    # Make chromedriver binary executable
    os.system(f"chmod +x {destination}")

    # Remove empty chromedriver-linux64 folder
    shutil.rmtree(f"{work_dir}/chromedriver-linux64")

    print("Chromedriver installed")

def install_selenium_package(quiet: bool):
    """Install the ``selenium`` package with pip.

    Args:
        quiet: if true, run pip with ``-qq`` and append its output to
            ``pip.log``; otherwise run pip with its normal output.
    """
    if quiet:
        !pip install selenium -qq >> pip.log
    else:
        !pip install selenium

def main(quiet: bool):
    """Install Chromium, a matching chromedriver and the selenium package.

    Args:
        quiet: passed on to every installation step to reduce their output.
    """
    # "import urllib" alone does not guarantee that the "parse" submodule is
    # loaded, so import the function that is needed explicitly.
    from urllib.parse import unquote

    # Get the latest version of chromium from linux mint packages site
    latest_version = get_chromium_latest_version()
    # Name of the deb file: the link target with its URL-encoding removed
    deb_file = unquote(latest_version, "utf-8")
    # Download and install chromium for ubuntu 22.04
    install_chromium(latest_version, deb_file, quiet)
    # Check that the installation succeeded
    check_chromium_installation(deb_file)
    # Install chromedriver
    install_chromedriver(deb_file, quiet)
    # Finally install selenium package
    install_selenium_package(quiet)

if __name__ == '__main__':
    quiet = True # True: quiet wget, and send apt-get/pip output to log files
    main(quiet)

Downloading: chromium_130.0.6723.116~linuxmint1+virginia_amd64.deb


In [None]:
# Scrape weather data from the Central Weather Administration (CWA) observation map
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_driver_path = '/usr/bin/chromedriver'
driver = webdriver.Chrome(service=Service(chrome_driver_path), options=chrome_options)
# try/finally: always shut the browser down, even if loading or the lookup fails
try:
    driver.get('https://www.cwa.gov.tw/V8/C/W/OBS_Map.html')
    # Element lookups retry for up to 10 seconds while the page renders
    driver.implicitly_wait(10)

    # Print the text of every active "tem-C" label
    # (presumably the Celsius temperature readings - confirm against the page)
    for reading in driver.find_elements(By.CSS_SELECTOR, "span.tem-C.is-active"):
        print(reading.text)
finally:
    driver.quit()





















26.7
27.1
26.3
26.9
26.5
24.8
24.6
24.6
20.9
26.4
25.8
25.0
24.9
25.0
26.1
25.5
25.1
26.0
27.9
23.1
26.2
28.8


In [None]:
# Scrape product data (title and price) from momo shop search results
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import unittest
import time
import re
import sys
from getpass import getpass
from bs4 import BeautifulSoup
import time, json

user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15"
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument(f"user-agent={user_agent}")
chrome_driver_path = '/usr/bin/chromedriver'
driver = webdriver.Chrome(service=Service(chrome_driver_path), options=chrome_options)

items = []
page = 1
btn_css = "div > div.page-btn.page-next"
# try/finally: always shut the browser down, even if scraping fails midway
try:
    driver.get('https://www.momoshop.com.tw/search/searchShop.jsp?keyword=nike NBA&searchType=1&curPage=1&_isFuzzy=0&showType=chessboardType')
    driver.implicitly_wait(10)

    while True:
        print("抓取: 第" + str(page) + "頁 網路資料中...")
        page = page + 1
        soup = BeautifulSoup(driver.page_source, "lxml")
        tag_ul = soup.select_one("div.listArea > ul")
        if tag_ul is None:
            # No product list on this page: nothing more to collect
            break

        for tag_li in tag_ul.find_all("li"):
            title = tag_li.find("h3", class_="prdName")
            price_box = tag_li.find("span", class_="price")
            price = price_box.find("b") if price_box is not None else None
            if title is None or price is None:
                # Not a complete product entry; skip it instead of crashing
                continue
            count = len(items) + 1
            items.append({"id": count,
                          "title": title.text,
                          "price": price.text})
            print("已經擷取:", count, "筆")

        buttons = driver.find_elements(By.CSS_SELECTOR, btn_css)
        for btn in buttons:
            print(btn.text)
        # Stop when there is no pager button at all (this case used to loop
        # forever) or when the last button is not the "next page" button.
        if not buttons or buttons[-1].text != "下一頁":
            break
        buttons[-1].click()
        # Give the next result page time to load before parsing it
        time.sleep(10)
finally:
    driver.quit()

with open("momo_items.json", "w", encoding="utf-8") as fp:
    json.dump(items,fp,indent=2,sort_keys=True,ensure_ascii=False)

[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m
          "image": "https://img1.momoshop.com.tw/goodsimg/0005/167/741/5167741_OL.jpg?t=1692629759",
          "description": "滿1件享84折",
          "url": "https://www.momoshop.com.tw/goods/GoodsDetail.jsp?i_code=5167741&Area=search&mdiv=403&oid=12_11&cid=index&kw=nike+NBA",
          "offers": {
            "@type": "Offer",
            "price": "487",
            "priceCurrency": "TWD",
            "availability": "https://schema.org/InStock"
          }
          
        }
              ]
            }
        }</script><div class="web header-fixed"><div class="headcontent"><div class="headcontentinner01"><div id="bt_0_150_01"><ul class="leftMenu"><li><a href="https://www.momoshop.com.tw/main/Main.jsp?cid=memb&amp;oid=back2hp&amp;mdiv=1099800000-bt_0_150_01-bt_0_150_01_e1&amp;ctype=B" title="回首頁"><p></p>回首頁</a></li><li><a href="https://www.momoshop.com.tw/fuli/redirect.jsp?redirect_url=https://www.momo5188.com" title="momo富立保險"><p></p>momo富立保險</a><

KeyboardInterrupt: 

In [None]:
# Scrape data from the covid-19.nchc.org.tw confirmed-cases page for one city
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_driver_path = '/usr/bin/chromedriver'
driver = webdriver.Chrome(service=Service(chrome_driver_path), options=chrome_options)
# try/finally: always shut the browser down, even if loading or the lookup fails
try:
    driver.get('https://covid-19.nchc.org.tw/2023_city_confirmed.php?mycity=%E6%96%B0%E5%8C%97%E5%B8%82')
    # Element lookups retry for up to 10 seconds while the page renders
    driver.implicitly_wait(10)

    # The table is looked up inside the first frame of the page
    driver.switch_to.frame(0)

    xpath = '//*[@id="8"]/tbody'
    table = driver.find_element(By.XPATH, xpath)
    print(table.text)
finally:
    driver.quit()

2023-09-07 新北市 全區 2 2,107,845 0.29
2023-07-25 新北市 全區 5 2,107,843 1.14
2023-07-21 新北市 全區 2 2,107,838 0.43
2023-07-20 新北市 全區 1 2,107,836 0.14
2023-07-07 新北市 全區 1 2,107,835 0.29
2023-07-03 新北市 全區 1 2,107,834 0.29
2023-06-27 新北市 全區 1 2,107,833 0.14
2023-06-14 新北市 全區 426 2,107,832 60.86
2023-06-05 新北市 全區 1 2,107,406 0.43
2023-06-01 新北市 全區 2 2,107,405 0.29
2023-05-25 新北市 全區 1 2,107,403 0.14
2023-04-28 新北市 全區 1 2,107,402 0.14
2023-04-21 新北市 全區 2 2,107,401 0.29
2023-03-24 新北市 全區 8 2,107,399 489.86
2023-03-23 新北市 全區 4 2,107,391 715.57
2023-03-22 新北市 全區 18 2,107,387 955.71
2023-03-21 新北市 全區 53 2,107,369 1,200.43
2023-03-20 新北市 全區 514 2,107,316 1,483.86
2023-03-19 新北市 全區 1,108 2,106,802 1,688.14
2023-03-18 新北市 全區 1,724 2,105,694 1,704.57
2023-03-17 新北市 全區 1,588 2,103,970 1,708.29
2023-03-16 新北市 全區 1,685 2,102,382 1,710.43
2023-03-15 新北市 全區 1,731 2,100,697 1,713.14
2023-03-14 新北市 全區 2,037 2,098,966 1,715.00
2023-03-13 新北市 全區 1,944 2,096,929 1,717.29
2023-03-12 新北市 全區 1,223 2,094,985 1,713.71
2023-

In [None]:
# Open lvr.land.moi.gov.tw, fill in the city/town fields, submit the form and
# print the page source before and after the submission.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException

user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15"
# Script injected into every new document so that navigator.webdriver reads
# as undefined.
hide_webdriver_js = """
    Object.defineProperty(navigator, 'webdriver', {
      get: () => undefined
    })
  """
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument(f"user-agent={user_agent}")
chrome_driver_path = '/usr/bin/chromedriver'
driver = webdriver.Chrome(service=Service(chrome_driver_path), options=chrome_options)
# try/finally: always shut the browser down, even if a step below fails
try:
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": hide_webdriver_js
    })
    driver.get('https://lvr.land.moi.gov.tw/')
    driver.implicitly_wait(5)

    # The form is looked up inside the first frame of the page
    driver.switch_to.frame(0)
    print(driver.page_source)
    driver.find_element(By.NAME, 'city').send_keys('台中市')
    driver.find_element(By.NAME, 'town').send_keys('北區')

    btn = '#main_form > div:nth-child(2) > div.form-group.mt-0.form-check-inline.qry_general > div > font:nth-child(1) > a'
    button = driver.find_elements(By.CSS_SELECTOR, btn)
    if not button:
        # Fail with a clear message instead of an IndexError on button[0]
        raise NoSuchElementException(f"No element matches the submit selector: {btn}")
    print(button[0].text)
    button[0].submit()
    # NOTE(review): implicitly_wait only sets the timeout used by element
    # lookups; it does not pause here. If the intent is to wait for the result
    # page, an explicit wait on a result element is needed - confirm.
    driver.implicitly_wait(6)
    print(driver.page_source)
finally:
    driver.quit()


<html lang="zh-TW"><head><script type="text/javascript" async="" src="https://www.googletagmanager.com/gtag/js?id=G-6LQ53RXLY4&amp;l=dataLayer&amp;cx=c" nonce=""></script><script src="https://embed.tawk.to/_s/v4/app/6723acfb8c3/js/twk-main.js" charset="UTF-8" crossorigin="*"></script><script src="https://embed.tawk.to/_s/v4/app/6723acfb8c3/js/twk-vendor.js" charset="UTF-8" crossorigin="*"></script><script src="https://embed.tawk.to/_s/v4/app/6723acfb8c3/js/twk-chunk-vendors.js" charset="UTF-8" crossorigin="*"></script><script src="https://embed.tawk.to/_s/v4/app/6723acfb8c3/js/twk-chunk-common.js" charset="UTF-8" crossorigin="*"></script><script src="https://embed.tawk.to/_s/v4/app/6723acfb8c3/js/twk-runtime.js" charset="UTF-8" crossorigin="*"></script><script src="https://embed.tawk.to/_s/v4/app/6723acfb8c3/js/twk-app.js" charset="UTF-8" crossorigin="*"></script><script type="text/javascript" async="" src="https://www.googletagmanager.com/gtag/js?id=G-5XSPE56B4R&amp;cx=c&amp;_slc=1" n