Skip to content

Commit b6237de

Browse files
committed
add some picture crawler
1 parent fc1a104 commit b6237de

File tree

3 files changed

+63
-3
lines changed

3 files changed

+63
-3
lines changed

51_job_bs.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,8 @@ def getData(self, work='Python'):
7777
#存在p节点
7878
for p in ps:
7979
p_data.append(p.string)
80-
else:
81-
p_data.append(div.get_text().strip())
80+
else:
81+
p_data.append(div.get_text().strip())
8282
except Exception as e:
8383
print(str(e))
8484
p_data.append("暂无")

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
11
#### 2018-3-9
2-
##### *This is a repository of Python crawlers and data-processing scripts, with examples inside.*
2+
##### *This is a repository of Python crawlers and data-processing scripts, with examples inside.*
3+
4+
>>[Code walkthrough on CSDN](http://blog.csdn.net/tenderness4/article/details/79504086) or [on the blog](http://wirjx.top/Python3.6%E7%88%AC%E8%99%AB%E6%80%BB%E7%BB%93)

mm_req_html_pic.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
#coding=utf-8
2+
"""
3+
@author:JianxiongRao
4+
@date:2018/3/12
5+
@version:Python3.6
6+
"""
7+
from requests_html import HTMLSession
8+
import os
9+
import time
10+
11+
class MM(object):
    """Crawler that saves the list-page images of mm131.com's "qingchun" section.

    Each list page is fetched with a ``requests_html`` session, the ``<img>``
    inside every ``<dd>`` of the first ``<dl>`` under ``.main`` is collected,
    and the image is written to ``imagePath`` as ``<alt text>.jpg``.

    Args:
        imagePath: Directory the images are written to; created if missing.
            Defaults to the original hard-coded ``D:/Photo/MM``.
        pageLimit: Exclusive upper bound of the list-page counter; the
            default of 12 crawls pages 1 through 11, as before.
    """

    # The first list page has no page number in its URL; later pages do.
    _INDEX_URL = "http://www.mm131.com/qingchun/"
    _PAGE_URL = "http://www.mm131.com/qingchun/list_1_{}.html"
    # Characters that Windows (the default target, D:/...) rejects in file names.
    _BAD_NAME_CHARS = '\\/:*?"<>|'

    def __init__(self, imagePath=r'D:/Photo/MM', pageLimit=12):
        self.__page = 1
        self.__pageLimit = pageLimit
        self.__url = self._PAGE_URL
        self.__session = HTMLSession()
        self.__headers = {
            'Referer': 'http://www.mm131.com/qingchun/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
        }
        self.__imagePath = imagePath
        self.__confirmPath()

    def __confirmPath(self):
        """Create the output directory (and any parents) if it does not exist."""
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.__imagePath, exist_ok=True)

    @staticmethod
    def _safeName(fileName):
        """Return ``fileName`` with characters that are illegal in file names replaced by ``_``.

        An empty (or whitespace-only) result falls back to ``'untitled'`` so a
        file is never named just ``.jpg``.
        """
        cleaned = ''.join(
            '_' if ch in MM._BAD_NAME_CHARS else ch for ch in fileName
        ).strip()
        return cleaned or 'untitled'

    def download(self, link, fileName):
        """Fetch ``link`` and store the body as ``<fileName>.jpg`` in the image directory.

        Failures are reported on stdout and swallowed so that one bad image
        does not abort the crawl. Nothing is written unless the server
        answers with HTTP 200.
        """
        try:
            # Request first: opening the file before the request would leave
            # an empty .jpg behind whenever the request fails.
            response = self.__session.request(
                'get', link, headers=self.__headers, allow_redirects=False)
            # Redirects are not followed, so a 3xx/4xx/5xx body is not an image.
            if response.status_code != 200:
                print("skip {}: HTTP {}".format(link, response.status_code))
                return
            target = os.path.join(
                self.__imagePath, self._safeName(fileName) + '.jpg')
            with open(target, 'wb') as f:
                f.write(response.content)
        except Exception as e:
            print(str(e))

    def __thumbnails(self, html):
        """Return ``(src, title)`` pairs for the images listed on one page.

        Missing containers or attributes yield fewer (or zero) pairs instead
        of raising, so a changed or empty page is simply skipped.
        """
        main = html.find(".main", first=True)
        if main is None:
            return []
        dls = main.find('dl')
        if not dls:
            return []
        pairs = []
        # The last <dd> is deliberately excluded, as in the original code
        # (presumably it holds the pagination links - TODO confirm).
        for dd in dls[0].find('dd')[:-1]:
            imgs = dd.find('img')
            if not imgs:
                continue
            attrs = imgs[0].attrs
            src = attrs.get('src')
            if not src:
                continue
            # Without alt text, fall back to the base name of the image URL.
            title = attrs.get('alt') or os.path.splitext(os.path.basename(src))[0]
            pairs.append((src, title))
        return pairs

    def parseData(self):
        """Crawl list pages from the current page up to ``pageLimit`` and download their images.

        The page counter is kept on the instance, so a second call continues
        where the first one stopped. Prints the elapsed time when finished.
        """
        start = time.time()
        while self.__page < self.__pageLimit:
            if self.__page == 1:
                self.__url = self._INDEX_URL
            else:
                self.__url = self._PAGE_URL.format(self.__page)
            r = self.__session.get(self.__url)
            for imageLink, title in self.__thumbnails(r.html):
                self.download(imageLink, title)
            self.__page += 1
        end = time.time() - start
        print("爬取时间:", end)
54+
if __name__ == "__main__":
    # Script entry point: build the crawler with its defaults and run it.
    crawler = MM()
    crawler.parseData()
57+
58+

0 commit comments

Comments
 (0)