Skip to content

Commit 580c79a

Browse files
committed
update re code
1 parent cd66e2d commit 580c79a

File tree

1 file changed

+20
-12
lines changed

1 file changed

+20
-12
lines changed

51_job.re.py

Lines changed: 20 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,9 @@ def __init__(self):
1414
self.__job = 'Python'
1515
self.__url = 'http://search.51job.com/list/000000,000000,0000,00,9,99,{},2,{}.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
1616
self.__count = 1
17-
self.__page = 672
17+
#总共672页
18+
self.__page = 1
19+
#总共672页
1820
self.__createSheet()
1921

2022
def __createSheet(self):
@@ -46,13 +48,17 @@ def __getDetails(self, site):
4648
headers = {
4749
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
4850
}
49-
res = requests.get(site, headers=headers)
50-
res.encoding = 'gbk'
51-
pattern2 = re.compile('<div class.*?"bmsg job_msg inbox">(.*?)<div>', re.S)
52-
details = re.findall(pattern2, res.text)
53-
if details is not None and len(details) != 0:
54-
details = re.sub(re.compile(r'<[^>]+>', re.S), '', details)
55-
else:
51+
try:
52+
res = requests.get(site, headers=headers,timeout=1)
53+
res.encoding = 'gbk'
54+
pattern2 = re.compile('<div class.*?bmsg.*?job.*?msg.*?inbox">(.*?)</div>', re.S)
55+
details = re.findall(pattern2, res.text)
56+
print(details)
57+
if details is not None and len(details) != 0:
58+
details = re.sub(re.compile(r'<[^>]+>', re.S), '', details[0])
59+
else:
60+
details = "暂无数据"
61+
except:
5662
details = "暂无数据"
5763
return details
5864

@@ -64,11 +70,15 @@ def getData(self, work='Python'):
6470
}
6571
while self.__page <= 672:
6672
url = self.__url.format(self.__job, self.__page)
67-
response = requests.get(url, headers=headers,timeout=3)
73+
response = requests.get(url, headers=headers,timeout=1)
6874
response.encoding = 'gbk'
6975
content = response.text
70-
pattern = re.compile('<div class.*?el.*?<a.*?title="(.*?)".*?href.*?"(.*?)".*?<a.*?title.*?"(.*?)".*?class.*?"t3">(.*?)</span>.*?class.*?"t4">(.*?)</span>',re.S)
76+
print(content)
77+
print("Start")
78+
pattern = re.compile('<div class="el">.*?<a.*?title="(.*?)".*?href="(.*?)".*?<a.*?title="(.*?)".*?class.*?"t3">(.*?)</span>.*?class.*?"t4">(.*?)</span>.*?</div>',re.S)
79+
print("go on")
7180
datas = re.findall(pattern,content)
81+
print("end")
7282
for each in datas:
7383
title = ""
7484
address = ""
@@ -84,14 +94,12 @@ def getData(self, work='Python'):
8494
name = each[2]
8595
salary = each[4]
8696
site = each[1]
87-
print(site)
8897
details = self.__getDetails(site)
8998

9099
self.__saveDataToExcel(title, address, name, salary, details, site)
91100
self.__page += 1
92101
except Exception as e:
93102
print(str(e))
94-
raise e
95103

96104

97105
if __name__ == '__main__':

0 commit comments

Comments
 (0)