## 參展店家

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Exhibition directory page; the cells that follow download it and pull the exhibitor cards out of it.
url = "https://cybersec.ithome.com.tw/2024/exhibitionDirectory"

In [None]:
# Step 1: fetch the page; response.text holds the complete HTML + JS as one string.
response = requests.get(url)
response

<Response [200]>

In [None]:
# Number of characters in the downloaded page text.
len(response.text)

717633

In [None]:
# Step 2: hand the downloaded string to BeautifulSoup for parsing.
soup = BeautifulSoup(response.text, "html.parser")

# Step 3: collect every <div class="exd-card"> element.
exd_cards = soup.find_all("div", class_="exd-card")
len(exd_cards)

374

In [None]:
url_prefix = "https://cybersec.ithome.com.tw"
exd_cards_info = []

# Turn every exhibitor card into a dict with its page link, name and booth number.
for exd_card in exd_cards:
    # Link: the href of the card's first <a>, appended to the site prefix.
    href = url_prefix + exd_card.a["href"]

    # Exhibitor name: text of the card's first <h5>.
    exd_name = exd_card.h5.text

    # Booth number: the segment of the <h6> text right after the first full-width colon.
    # Cards without an <h6>, or whose <h6> text contains no "：", get an empty string
    # instead of raising IndexError.
    exd_id = ""
    if exd_card.h6:
        h6_parts = exd_card.h6.text.split("：")
        if len(h6_parts) > 1:
            exd_id = h6_parts[1]

    exd_cards_info.append({
        'exd_link': href,
        'exd_name': exd_name,
        'exd_id': exd_id,
    })

In [None]:
# Number of exhibitor records collected.
len(exd_cards_info)

374

In [None]:
# Peek at the first five records.
exd_cards_info[0:5]

[{'exd_link': 'https://cybersec.ithome.com.tw/2024/exhibition-page/2054',
  'exd_name': 'A10 Networks',
  'exd_id': 'C307'},
 {'exd_link': 'https://cybersec.ithome.com.tw/2024/exhibition-page/2226',
  'exd_name': '三甲科技',
  'exd_id': 'T37'},
 {'exd_link': 'https://cybersec.ithome.com.tw/2024/exhibition-page/2079',
  'exd_name': 'ABPSecurite',
  'exd_id': 'P302'},
 {'exd_link': 'https://cybersec.ithome.com.tw/2024/exhibition-page/2249',
  'exd_name': 'ACAD 安碁學苑股份有限公司',
  'exd_id': 'CT08'},
 {'exd_link': 'https://cybersec.ithome.com.tw/2024/exhibition-page/2000',
  'exd_name': '鼎峰亞太',
  'exd_id': 'P216'}]

In [None]:
import pandas as pd
data = pd.DataFrame(exd_cards_info) # convert the list of dicts into a DataFrame
# NOTE(review): to_csv writes the row index as an unnamed first column by default;
# pass index=False if that column is not wanted in the file.
data.to_csv('cybersec_exd.csv')

In [None]:
# Show the first rows of the DataFrame.
data.head()

Unnamed: 0,exd_link,exd_name,exd_id
0,https://cybersec.ithome.com.tw/2024/exhibition...,A10 Networks,C307
1,https://cybersec.ithome.com.tw/2024/exhibition...,三甲科技,T37
2,https://cybersec.ithome.com.tw/2024/exhibition...,ABPSecurite,P302
3,https://cybersec.ithome.com.tw/2024/exhibition...,ACAD 安碁學苑股份有限公司,CT08
4,https://cybersec.ithome.com.tw/2024/exhibition...,鼎峰亞太,P216


## 參展店家: 取得更進一步的資訊

In [None]:
# Old approach: download the first exhibitor's page, parse it, and look for
# the <span class="info-tel"> element.
url = exd_cards_info[0]['exd_link']
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
tel = soup.find('span', attrs={'class': 'info-tel'})

In [None]:
# Raises TypeError when find() returned None, i.e. no <span class="info-tel"> was found in the page.
len(tel)

TypeError: object of type 'NoneType' has no len()

In [None]:
# Same lookup for the <span class="info-mail"> element.
mail = soup.find('span', attrs={'class': 'info-mail'})

In [None]:
# Raises TypeError when find() returned None, i.e. no <span class="info-mail"> was found in the page.
len(mail)

TypeError: object of type 'NoneType' has no len()

### (參考)新方法一: regular expression 正則表達式

In [None]:
# Download the exhibitor page again; the regex cells search response.text directly.
response = requests.get(url)
response

<Response [200]>

In [None]:
# Number of characters in the exhibitor page text.
len(response.text)

28922

In [None]:
# Sample of the contact fields the regex cells look for in the page text
# (NOTE(review): the regexes match `key:"value"` with no space after the colon — confirm against the raw text):
# contact_email: "apac@a10networks.com",
# contact_phone: "886-2-2785-2729",
# link_website: "http:\u002F\u002Fwww.a10networks.com",
# link_facebook: "https:\u002F\u002Fwww.facebook.com\u002Fa10networksTW",
# link_instagram: b,
# link_youtube: b,
# link_linkedin: "https:\u002F\u002Fwww.linkedin.com\u002Fcompany\u002Fa10networks\u002F",
# link_twitter: "https:\u002F\u002Ftwitter.com\u002Fa10networks"

In [None]:
# prompt: please generate regular-expression code that extracts contact_email
# Raw format: contact_email:"apac@a10networks.com"

import re
# (.*?) lazily captures everything between the double quotes.
pattern = r'contact_email:"(.*?)"'
email = re.findall(pattern, response.text)
print(email)


['apac@a10networks.com']


In [None]:
# Same approach for the phone number.
pattern = r'contact_phone:"(.*?)"'
phone = re.findall(pattern, response.text)
print(phone)

['886-2-2785-2729']


In [None]:
# Same approach for the website link; the captured text contains the literal
# characters \u002F wherever the URL has a "/".
pattern = r'link_website:"(.*?)"'
website = re.findall(pattern, response.text)
print(website)

['http:\\u002F\\u002Fwww.a10networks.com']


In [None]:
# Same approach for the Facebook link.
# NOTE(review): the result is stored in `website` although it is the Facebook link.
pattern = r'link_facebook:"(.*?)"'
website = re.findall(pattern, response.text)
print(website)

['https:\\u002F\\u002Fwww.facebook.com\\u002Fa10networksTW']


In [None]:
# Same approach for the LinkedIn link.
# NOTE(review): the result is stored in `website` although it is the LinkedIn link.
# NOTE(review): the saved output of this cell (AttributeError about `re.find`) is stale —
# the code calls re.findall; re-run the cell to refresh it.
pattern = r'link_linkedin:"(.*?)"'
website = re.findall(pattern, response.text)
print(website)

AttributeError: module 're' has no attribute 'find'

In [None]:
# First three records, shown before the contact fields are added.
exd_cards_info[:3]

[{'exd_link': 'https://cybersec.ithome.com.tw/2024/exhibition-page/2054',
  'exd_name': 'A10 Networks',
  'exd_id': 'C307'},
 {'exd_link': 'https://cybersec.ithome.com.tw/2024/exhibition-page/2226',
  'exd_name': '三甲科技',
  'exd_id': 'T37'},
 {'exd_link': 'https://cybersec.ithome.com.tw/2024/exhibition-page/2079',
  'exd_name': 'ABPSecurite',
  'exd_id': 'P302'}]

In [None]:
import re
import time


def _first_group(pattern, text):
    """Return group 1 of the first match of ``pattern`` in ``text``, or "" when nothing matches."""
    found = re.search(pattern, text)
    return found.group(1) if found else ""


# Visit every exhibitor page and add its contact details to the matching dict.
for exd_info in exd_cards_info:

    time.sleep(3)  # wait 3 s before each request so the site does not block us

    url = exd_info['exd_link']
    # The timeout keeps a single stalled connection from hanging the whole loop.
    response = requests.get(url, timeout=30)

    email = _first_group(r'contact_email:"(.*?)"', response.text)
    phone = _first_group(r'contact_phone:"(.*?)"', response.text)

    # In the page text every "/" of the URL appears as the six literal characters
    # \u002F (the escape for U+002F); turn them back into forward slashes.
    website = _first_group(r'link_website:"(.*?)"', response.text).replace("\\u002F", "/")

    # Store the newly collected fields on the exhibitor's dict.
    exd_info['email'] = email
    exd_info['phone'] = phone
    exd_info['website'] = website


In [None]:
# First three records again, now including email / phone / website.
exd_cards_info[:3]

[{'exd_link': 'https://cybersec.ithome.com.tw/2024/exhibition-page/2054',
  'exd_name': 'A10 Networks',
  'exd_id': 'C307',
  'email': 'apac@a10networks.com',
  'phone': '886-2-2785-2729',
  'website': 'http:\\\\www.a10networks.com'},
 {'exd_link': 'https://cybersec.ithome.com.tw/2024/exhibition-page/2226',
  'exd_name': '三甲科技',
  'exd_id': 'T37',
  'email': 'service@aaasec.com.tw',
  'phone': '04-2452-4234',
  'website': ''},
 {'exd_link': 'https://cybersec.ithome.com.tw/2024/exhibition-page/2079',
  'exd_name': 'ABPSecurite',
  'exd_id': 'P302',
  'email': 'sales@abpsecurite.com',
  'phone': '',
  'website': 'https:\\\\www.abpsecurite.com\\'}]

In [None]:
import pandas as pd
data = pd.DataFrame(exd_cards_info) # convert the list of dicts into a DataFrame
# NOTE(review): hard-coded absolute path under /content/drive — presumably requires
# Google Drive to be mounted in Colab first; confirm before running elsewhere.
# NOTE(review): to_csv writes the row index as an unnamed first column by default.
data.to_csv('/content/drive/MyDrive/上課教材/Python/網路爬蟲 Scraper/cybersec_exd.csv')

### 新方法二: 模擬瀏覽器 Selenium

In [None]:
# https://selenium-python-zh.readthedocs.io/en/latest/
# This part is carried out in VS Code instead.

Collecting Selenium
  Downloading selenium-4.23.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from Selenium)
  Downloading trio-0.26.1-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from Selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting outcome (from trio~=0.17->Selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->Selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->Selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.23.1-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.26.1-py3-none-any.whl (475 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.7/475.7 kB[0m [31m25.

## 與會講者

In [None]:
# Download the speaker listing page.
speaker_url = "https://cybersec.ithome.com.tw/2024/speaker"
response = requests.get(speaker_url)
response

<Response [200]>

In [None]:
# Number of characters in the speaker page text.
len(response.text)

214961

In [None]:
# Parse the speaker page and look for the speaker card <div>s.
soup = BeautifulSoup(response.text, "html.parser")
# NOTE(review): a multi-class string in attrs only matches when the element's class
# attribute is exactly "col-md-3 sp-card-area". The saved run found 0 cards; presumably
# the cards are rendered by JavaScript and absent from the static HTML — TODO confirm.
speaker_cards = soup.find_all("div", attrs={"class": "col-md-3 sp-card-area"})
len(speaker_cards)

0

In [None]:
# Shell commands (IPython "!" syntax): install the chromium-chromedriver package and Selenium.
!apt-get update
!apt install chromium-chromedriver
# Copy the chromedriver binary into /usr/bin (presumably so it is found on PATH).
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
# NOTE(review): the Selenium version is not pinned.
!pip install selenium

In [None]:
# Import and configure Selenium.
from selenium import webdriver
from selenium.webdriver.common.by import By

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # run Chrome without opening a browser window
chrome_options.add_argument('--no-sandbox')  # disable Chrome's sandbox (this is not incognito mode)
# Headless mode is requested only through the --headless argument above; the
# deprecated `headless` attribute of the options object is not used.
driver = webdriver.Chrome(options=chrome_options)