scraper0507.py
# -*- coding: utf-8 -*-
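"""Scraper for legal Q&A pages on http://china.findlaw.cn/ask/.

Three-stage pipeline, each stage caching its results in a separate Redis db:
  1. page_links()     -> db 0: for every year/month index, the list of listing-page URLs
  2. question_links() -> db 1: for every listing page, the links to individual questions
  3. content_links()  -> db 2: for every question page, its title/content/date/classify
"""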
import requests, re, time, datetime, json, zlib, csv, http.client
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from random import choice, randrange
from datetime import timedelta
from redis import StrictRedis
seedurl = 'http://china.findlaw.cn/ask/'
# year/month listing page URL format: http://china.findlaw.cn/ask/d201803_page1/
# question page URL format: http://china.findlaw.cn/ask/question_43837794.html
urltest = 'http://chnaa.findlaw.cn/ask/browse/'
YEARS = [str(years) for years in range(2004, 2018)]
MONTHS = ['{:02}'.format(months) for months in range(1, 13)]
USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'
PROXIES = {'http' : 'http://myproy.net:8888'}
# print (YEARS, MONTHS)
class Throttle:
    '''
    Add a delay between downloads to the same domain
    '''
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (time.time() - last_accessed)
            if sleep_secs > 0:
                # domain has been accessed recently (< delay), so need to sleep
                time.sleep(sleep_secs)
        # update the last accessed time
        self.domains[domain] = time.time()
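# Illustrative usage sketch (not part of the scraping flow): one Throttle shared
# across requests enforces a minimum gap between hits to the same host:
#   throttle = Throttle(delay=1)
#   for url in ['http://china.findlaw.cn/ask/', 'http://china.findlaw.cn/ask/']:
#       throttle.wait(url)   # sleeps if china.findlaw.cn was hit less than 1s ago
#       requests.get(url)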
class Downloader:
    # seedurl = 'http://china.findlaw.cn/ask/'
    """ Downloader class to use cache and requests for downloading pages.
    For the constructor, pass:
        delay (int): # of secs delay between requests (default: 1)
        user_agent (str): user agent string (default: module-level USER_AGENT)
        proxies (dict): proxy dict with http / https keys and proxy URL values
        cache (dict or dict-like obj): keys: urls, values: dicts with keys (html, code)
    The bsparser argument of __call__ must be one of ['gettotalpages', 'getLinks', 'getContent'].
    """
    def __init__(self, delay=1, user_agent=USER_AGENT, cache=None, proxies=None):
        # instance variables
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        # avoid a shared mutable default argument: use a fresh dict per instance
        self.cache = cache if cache is not None else {}
        self.proxies = proxies
    def __call__(self, url, bsparser=None):
        """ Call the downloader class, which will return HTML from cache
        or download it
        args:
            url (str): url to download
        kwargs:
            bsparser (str): parser to apply to the downloaded page, one of
                ['gettotalpages', 'getLinks', 'getContent']
        returns:
            int: HTTP status code of the cached or freshly downloaded result
        """
        try:
            result = self.cache[url]
            if result:
                result.update({'url': url})
                print('Loaded from cache:', url, result['code'])
        except (KeyError, UnicodeDecodeError):
            result = None
        if result and result['code'] != 200 and result['code'] != 404:
            # server error so ignore result from cache
            # and re-download
            result = None
        if (result and result['code'] == 200 and isinstance(result['html'], dict)
                and '法律咨询' in result['html'].get('classify', '')):
            # cached item only carries the generic "legal consultation" category,
            # i.e. the data is incomplete, so re-download
            result = None
        if result is None:
            # result was not loaded from cache, need to download
            self.throttle.wait(url)
            headers = {'User-Agent': self.user_agent}
            result = self.download(url, headers, bsparser)
            self.cache[url] = result
        return result['code']
    def download(self, url, headers, bsparser, num_retries=2):
        """ Download a URL and return the page content
        args:
            url (str): URL
            headers (dict): dict of headers (like user_agent)
            bsparser (str): method to parse html, must be one of ['gettotalpages', 'getLinks', 'getContent']
            num_retries (int): # of times to retry on 408/5xx responses (default: 2)
        Proxies are taken from self.proxies (dict w/ keys 'http'/'https', values
        are strs, i.e. 'http(s)://IP').
        """
        # print('Downloading:', url, 'with proxy IP {}'.format(self.proxies['http']))
        print('Downloading:', url)
        try:
            resp = requests.get(url, headers=headers, proxies=self.proxies, timeout=5)
            print('Downloading status:', resp.status_code)
            html = resp.text
            if resp.status_code == 200:
                return {'html': self.applybs(html, bsparser), 'code': resp.status_code, 'url': url}
            elif resp.status_code == 404:
                return {'html': None, 'code': resp.status_code, 'url': url}
            elif (resp.status_code == 408 or 500 <= resp.status_code < 600) and num_retries:
                # pass the decremented counter down so retries actually run out
                print('Retry download after 3 sec.')
                time.sleep(3)
                return self.download(url, headers, bsparser, num_retries - 1)
            elif resp.status_code == 403:
                print('Retry download after 5 mins.')
                time.sleep(303)
                return self.download(url, headers, bsparser, num_retries)
            # any other status code: record it without html so the caller can decide
            return {'html': None, 'code': resp.status_code, 'url': url}
        except (requests.exceptions.RequestException,
                requests.exceptions.ChunkedEncodingError,
                requests.ConnectionError, http.client.IncompleteRead, http.client.HTTPException) as e:
            print('Download {} error:'.format(url), e)
            return {'html': None, 'code': 403, 'url': url}
    def getTotalpages(self, html):
        '''function that returns all listing-page URLs for a specified year and month
        Args:
            html (str): raw HTML of the year/month index page
        Returns:
            list of listing-page URLs, or None if the page could not be parsed
        '''
        print('getTotalpages called @ {}'.format(
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
        if html is None:
            return None
        else:
            bsObj = BeautifulSoup(html, 'html.parser')
            try:
                # the link whose text is 尾页 ("last page") carries the highest page number
                lastpagetag = bsObj.find('a', string=u'尾页')
                if 'href' in lastpagetag.attrs:
                    urlprefix = re.search(re.compile(
                        '.*d[0-9]{6}_page'), lastpagetag.attrs['href'])
                    totalpages = re.search(re.compile(
                        '[0-9]+/$'), lastpagetag.attrs['href'])
                    if totalpages:
                        return [urlprefix.group() + str(link) for link in range(1, int(totalpages.group()[:-1]) + 1)]
            except AttributeError as e:
                print('Attribute Error in getTotalPages:', e)
            return None
    def getLinks(self, html):
        ''' This function returns all the links to the individual questions
        Args:
            html (str): raw HTML of a listing page
        Returns:
            list: the question-page links, or None if the page could not be parsed
        '''
        print('getLinks called @ {}'.format(
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
        pages = list()
        if html is None:
            pages = None
        else:
            bsObj = BeautifulSoup(html, 'html.parser')
            try:
                linklist = bsObj.find_all('a', class_='rli-item item-link')
                for link in linklist:
                    if 'href' in link.attrs:
                        pages.append(link.attrs['href'])
            except AttributeError as e:
                print('Attribute Error in getLinks:', e)
                pages = None
        return pages
    def abnormalchar(self, content):
        # replace characters outside the Basic Multilingual Plane (4-byte UTF-8)
        # with '?' so the text can be stored and encoded safely
        return ''.join([c if len(c.encode('utf-8')) < 4 else '?' for c in content])
    def getContent(self, html):
        ''' This function gets the title/content/time and classify
        Args:
            html (str): raw HTML of a question page
        Returns:
            dict: title, content, date, classify
        '''
        print('getContent called @ {}'.format(
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
        # contents = {}
        # statuscode = re.compile('tip-item tip-item-.')
        if html is None:
            contents = None
        else:
            bsObj = BeautifulSoup(html, 'html.parser')
            try:
                title = bsObj.find('h1', class_='q-title').get_text()
                content = bsObj.find('p', class_='q-detail').get_text()
                date = bsObj.find('span', class_='about-item').get_text()
                # the breadcrumb link just before the 正文 ("main text") span is the question's category
                classify = bsObj.find('span', string=u'正文', class_='loc-text loc-link').find_previous('a').get_text()
                contents = {'title': self.abnormalchar(title), 'content': re.sub(
                    r'\s+', '', self.abnormalchar(content)), 'date': date, 'classify': classify}
            except AttributeError as e:
                print('Attribute Error in getContent:', e)
                contents = None
        return contents
    def applybs(self, html, bsparser):
        if bsparser in ['gettotalpages', 'getLinks', 'getContent']:
            return {'gettotalpages': self.getTotalpages, 'getLinks': self.getLinks, 'getContent': self.getContent}[bsparser](html)
        else:
            raise KeyError('bsparser must be specified.')
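# Illustrative usage sketch (mirrors the commented examples under __main__);
# assumes a local Redis server is running on the default port:
#   cache = RedisCache()
#   D = Downloader(delay=1, cache=cache)
#   code = D('http://china.findlaw.cn/ask/d201703', bsparser='gettotalpages')
#   pages = cache['http://china.findlaw.cn/ask/d201703']['html']  # list of listing-page URLs on success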
class RedisCache:
    def __init__(self, client=None, encoding='utf-8', db=0, compress=True):
        # if a client object is not passed then try
        # connecting to redis at the default localhost port
        self.client = StrictRedis(host='localhost', port=6379, db=db) if client is None else client
        # self.expires = expires
        self.encoding = encoding
        self.compress = compress

    def __getitem__(self, url):
        '''
        Load value from Redis for the given URL
        '''
        record = self.client.get(url)
        if record:
            if self.compress:
                record = zlib.decompress(record)
            try:
                rec = record.decode(self.encoding)
            except UnicodeDecodeError:
                rec = bytes(json.dumps({'html': None, 'code': 403}), self.encoding)
            return json.loads(rec)
        else:
            raise KeyError(url + ' does not exist.')

    def __setitem__(self, url, result):
        '''
        Save value in Redis for the given URL
        '''
        data = bytes(json.dumps(result), self.encoding, errors='ignore')
        if self.compress:
            data = zlib.compress(data)
        self.client.set(url, data)

    def __len__(self):
        return self.client.dbsize()

    def erase(self):
        self.client.flushdb()
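# Illustrative usage sketch; assumes Redis is reachable at localhost:6379:
#   cache = RedisCache(db=2, compress=False)
#   cache['http://china.findlaw.cn/ask/question_38586569.html'] = {'html': None, 'code': 404}
#   cache['http://china.findlaw.cn/ask/question_38586569.html']  # -> {'html': None, 'code': 404}
#   len(cache)                                                   # number of keys in db 2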
def page_links():
    start = time.time()
    D = Downloader(cache=RedisCache())
    for year in YEARS:
        for month in MONTHS:
            D(seedurl + 'd' + year + month, bsparser='gettotalpages')
            end = time.time()
            if end - start > 15:
                time.sleep(10)
                start = time.time()
def question_links():
    links_cache = RedisCache(db=0)
    question_cache = RedisCache(db=1)
    start = time.time()
    D = Downloader(cache=question_cache)
    for key in links_cache.client.scan_iter():
        if links_cache[key]['html'] is None:
            continue
        else:
            for link in links_cache[key]['html']:
                D(link, bsparser='getLinks')
                end = time.time()
                if end - start > randrange(8, 10):
                    time.sleep(randrange(10, 15))
                    start = time.time()
def content_links():
    question_cache = RedisCache(db=1, compress=False)
    content_cache = RedisCache(db=2, compress=False)
    start = time.time()
    D = Downloader(cache=content_cache)
    for key in question_cache.client.scan_iter():
        if question_cache[key]['html'] is None:
            continue
        else:
            for question in question_cache[key]['html']:
                # print (question)
                D(question, bsparser='getContent')
                # print (code)
                content = content_cache[question]['html']
                if content is None:
                    continue
                else:
                    # writer.writerow(content.update({'url' : question}))
                    pass
                end = time.time()
                if end - start > randrange(8, 10):
                    time.sleep(randrange(10, 13))
                    start = time.time()
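# Illustrative sketch of the CSV export hinted at by the commented writer.writerow
# call above; the export_contents name and fieldnames are assumptions, not part of
# the original script. It walks the content cache in db 2 and writes one row per question:
#   def export_contents(path='contents.csv'):
#       content_cache = RedisCache(db=2, compress=False)
#       with open(path, 'w', newline='', encoding='utf-8') as f:
#           writer = csv.DictWriter(f, fieldnames=['url', 'title', 'content', 'date', 'classify'])
#           writer.writeheader()
#           for key in content_cache.client.scan_iter():
#               url = key.decode('utf-8')
#               content = content_cache[url]['html']
#               if content is None:
#                   continue
#               row = dict(content)
#               row['url'] = url
#               writer.writerow(row)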
if __name__ == '__main__':
    # pass
    # page_links()
    # question_links()
    content_links()
    # D = Downloader(cache = RedisCache())
    # totalpages = D('http://china.findlaw.cn/ask/d201703', bsparser = 'gettotalpages')
    # print (len(totalpages))
    # D = Downloader()
    # links = D('http://china.findlaw.cn/ask/d201703_page3829', bsparser='getLinks')
    # print (len(links))
    # D = Downloader()
    # content = D('http://china.findlaw.cn/ask/question_38586569.html', bsparser = 'getContent')
    # print (content)