@@ -14,7 +14,9 @@ def __init__(self):
1414 self .__job = 'Python'
1515 self .__url = 'http://search.51job.com/list/000000,000000,0000,00,9,99,{},2,{}.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
1616 self .__count = 1
17- self .__page = 672
17+ #总共672页
18+ self .__page = 1
19+ #总共672页
1820 self .__createSheet ()
1921
2022 def __createSheet (self ):
@@ -46,13 +48,17 @@ def __getDetails(self, site):
4648 headers = {
4749 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
4850 }
49- res = requests .get (site , headers = headers )
50- res .encoding = 'gbk'
51- pattern2 = re .compile ('<div class.*?"bmsg job_msg inbox">(.*?)<div>' , re .S )
52- details = re .findall (pattern2 , res .text )
53- if details is not None and len (details ) != 0 :
54- details = re .sub (re .compile (r'<[^>]+>' , re .S ), '' , details )
55- else :
51+ try :
52+ res = requests .get (site , headers = headers ,timeout = 1 )
53+ res .encoding = 'gbk'
54+ pattern2 = re .compile ('<div class.*?bmsg.*?job.*?msg.*?inbox">(.*?)</div>' , re .S )
55+ details = re .findall (pattern2 , res .text )
56+ print (details )
57+ if details is not None and len (details ) != 0 :
58+ details = re .sub (re .compile (r'<[^>]+>' , re .S ), '' , details [0 ])
59+ else :
60+ details = "暂无数据"
61+ except :
5662 details = "暂无数据"
5763 return details
5864
@@ -64,11 +70,15 @@ def getData(self, work='Python'):
6470 }
6571 while self .__page <= 672 :
6672 url = self .__url .format (self .__job , self .__page )
67- response = requests .get (url , headers = headers ,timeout = 3 )
73+ response = requests .get (url , headers = headers ,timeout = 1 )
6874 response .encoding = 'gbk'
6975 content = response .text
70- pattern = re .compile ('<div class.*?el.*?<a.*?title="(.*?)".*?href.*?"(.*?)".*?<a.*?title.*?"(.*?)".*?class.*?"t3">(.*?)</span>.*?class.*?"t4">(.*?)</span>' ,re .S )
76+ print (content )
77+ print ("Start" )
78+ pattern = re .compile ('<div class="el">.*?<a.*?title="(.*?)".*?href="(.*?)".*?<a.*?title="(.*?)".*?class.*?"t3">(.*?)</span>.*?class.*?"t4">(.*?)</span>.*?</div>' ,re .S )
79+ print ("go on" )
7180 datas = re .findall (pattern ,content )
81+ print ("end" )
7282 for each in datas :
7383 title = ""
7484 address = ""
@@ -84,14 +94,12 @@ def getData(self, work='Python'):
8494 name = each [2 ]
8595 salary = each [4 ]
8696 site = each [1 ]
87- print (site )
8897 details = self .__getDetails (site )
8998
9099 self .__saveDataToExcel (title , address , name , salary , details , site )
91100 self .__page += 1
92101 except Exception as e :
93102 print (str (e ))
94- raise e
95103
96104
97105if __name__ == '__main__' :
0 commit comments