-
Notifications
You must be signed in to change notification settings - Fork 0
/
網路爬蟲專案 II
48 lines (33 loc) · 1.81 KB
/
網路爬蟲專案 II
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import csv
import time

import requests
from bs4 import BeautifulSoup

# Wait 3 seconds before issuing the request (delay kept from the original
# script; presumably a politeness pause toward the target site).
time.sleep(3)

url = 'https://movies.yahoo.com.tw/chart.html?cate=year'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}

# Without a timeout requests may block forever on an unresponsive server.
resp = requests.get(url, headers=headers, timeout=30)
# Fail right here on 4xx/5xx instead of parsing an error page and crashing
# later with an unrelated IndexError while looking up chart cells.
resp.raise_for_status()

# Force UTF-8 decoding so Chinese text is not garbled when the server does
# not declare a charset; resp.text is decoded using resp.encoding.
resp.encoding = 'utf-8'
raw_html = resp.text
soup = BeautifulSoup(raw_html, 'html.parser')
# Scraped records are collected here, one dict per chart row, and written to
# the CSV file afterwards.
performance_list = []

# CSS selector for one row of the ranking table; the row position is filled
# in per iteration.
# NOTE(review): the original hard-coded nth-child(2) for every iteration, so
# this assumes data rows start at child 2 (child 1 being a header row) --
# confirm against the live page.
ROW_SELECTOR = '#content_l > div > div.rank_list.table.rankstyle1 > div:nth-child({position})'


def _first_text(selector):
    """Return the text of the first element matching *selector*, or None."""
    matches = soup.select(selector)
    return matches[0].text if matches else None


# Read chart entries 1-9. The original never used the loop variable in the
# selectors, so the same row was appended nine times.
for i in range(1, 10):
    print('i', i)
    row = ROW_SELECTOR.format(position=i + 1)

    rank = _first_text(row + ' > div:nth-child(1)')
    if rank is None:
        # The page has no such row; skip it instead of raising IndexError.
        continue

    name = _first_text(row + ' > div:nth-child(4) > a > dl > dd > h2')
    if name is None:
        # NOTE(review): rows other than the first may not use the
        # dl > dd > h2 markup; fall back to the text of the cell's link.
        # Verify this fallback against the live page.
        name = _first_text(row + ' > div:nth-child(4) > a') or ''

    performance_dict = {}
    performance_dict['rank'] = rank
    performance_dict['name'] = name
    performance_dict['date'] = _first_text(row + ' > div:nth-child(5)') or ''
    # Append this row's record to the result list.
    performance_list.append(performance_dict)
# Column names for the first CSV row: rank, name, date. A record containing a
# key that is not listed here makes DictWriter raise ValueError, so keep this
# list in sync with the keys of the dicts in performance_list.
fieldnames = ['rank', 'name', 'date']

# newline='' lets the csv module control line endings (otherwise blank lines
# appear between rows on Windows); an explicit UTF-8 encoding keeps Chinese
# titles from depending on the platform's default codec.
with open('performance.csv', 'w', newline='', encoding='utf-8') as output_file:
    dict_writer = csv.DictWriter(output_file, fieldnames=fieldnames)
    # Write the header row.
    dict_writer.writeheader()
    # Write one CSV row per collected record.
    dict_writer.writerows(performance_list)