# toutiao_spider.py
# -*- coding:utf-8 -*-
import datetime
import json
import requests
from multiprocessing import Pool
from urllib.parse import urlencode
from pymongo import MongoClient
from requests.exceptions import RequestException
from config import *
# Module-level MongoDB handles shared by the whole script:
# database 'toutiao', collection 'news'.
# NOTE(review): assumes a mongod listening on localhost:27017 — confirm deployment.
client = MongoClient('localhost', 27017)
db = client.toutiao
news = db.news
def get_data(offset):
    """Fetch one page of Toutiao search results as a raw JSON string.

    Args:
        offset: pagination offset passed to the search API.

    Returns:
        The response body (JSON text) on HTTP 200, otherwise None.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '江歌',
        'autoload': 'true',
        'count': '20',
        'cur_tab': 1,
        'from': 'search_tab'
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # requests.get is the call that raises RequestException, so it must
        # live inside the try block (the original called it before the try,
        # making the handler unreachable). A timeout keeps workers from
        # hanging forever on a dead connection.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('data is error!')
        return None
def parse_data(html):
    """Parse the JSON payload returned by get_data into news records.

    Args:
        html: JSON string from the search API, or None when the fetch failed.

    Returns:
        A list of dicts with 'title', 'url' and a 'datetime' fetch timestamp.
        Empty list when the payload is missing or carries no 'data' key
        (the original implicitly returned None in that case, which crashed
        the downstream insert).
    """
    if not html:
        return []
    data = json.loads(html)  # decode the JSON string into a Python object
    news_list = []
    if data and 'data' in data:
        # Renamed the loop dict from 'news' to 'entry': the original
        # shadowed the module-level Mongo collection of the same name.
        for item in data.get('data'):
            # .get() avoids KeyError when some keys are absent in an item
            entry = {
                'title': item.get('title'),
                'url': item.get('url'),
                'datetime': datetime.datetime.now()
            }
            news_list.append(entry)
    return news_list
def save_to_mongo(data):
    """Insert parsed news records into the module-level 'news' collection.

    Args:
        data: list of news dicts from parse_data; may be None or empty.

    Always prints the payload afterwards (kept from the original for
    debugging visibility).
    """
    if not data:
        # pymongo raises on an empty insert; skip straight to the error path
        # the original reached via its exception handler.
        print('Save Error!')
        print(data)
        return
    try:
        # insert_many replaces the deprecated Collection.insert.
        if news.insert_many(data):
            print('Save successfully!')
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate and can stop the worker pool.
        print('Save Error!')
    finally:
        print(data)
def main(offset):
    """Fetch, parse and persist one page of search results.

    Args:
        offset: pagination offset handed to get_data.
    """
    html = get_data(offset)
    if html is None:
        # Network failure or non-200 response — nothing to parse or store.
        return
    data = parse_data(html)
    # save_to_mongo returns None; the original bound that None to a
    # misspelled 'datebase' variable and printed it on every page.
    save_to_mongo(data)
if __name__ == '__main__':
    # The API serves 20 items per request (count='20' in get_data), so
    # consecutive pages start at multiples of 20. The original used x * 2,
    # which re-fetched heavily overlapping pages.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool(8)
    try:
        pool.map(main, groups)
    finally:
        # Ensure worker cleanup even if a task raises.
        pool.close()
        pool.join()
    print('All is done!')