jobs.py (forked from IcingTomato/BOSS-ZHIPIN_Spider)
import asyncio
import random

from lxml import etree
from pyppeteer import launch
import pandas as pd
import openpyxl  # required as the Excel writer engine for DataFrame.to_excel

class ss_xz(object):
    def __init__(self):
        # One dict per scraped job posting
        self.data_list = list()

    def screen_size(self):
        """Get the screen size with tkinter."""
        # Alternatively, hard-code: width, height = 1366, 768
        import tkinter
        tk = tkinter.Tk()
        width = tk.winfo_screenwidth()
        height = tk.winfo_screenheight()
        tk.destroy()
        return width, height
    async def main(self):
        try:
            browser = await launch(headless=False, userDataDir="./config",
                                   args=['--disable-infobars', '--window-size=1366,768', '--no-sandbox'])
            page = await browser.newPage()
            width, height = self.screen_size()
            await page.setViewport({'width': width, 'height': height})
            # Mask the webdriver flag before navigating, so the site does not detect automation
            await page.evaluateOnNewDocument(
                '''() =>{ Object.defineProperties(navigator, { webdriver: { get: () => false } }) }''')
            # await page.goto(
            #     'https://www.zhipin.com/changzhou/?ka=city-sites-101191100')  # Changzhou
            await page.goto(
                'https://www.zhipin.com/?city=100010000&ka=city-sites-100010000')  # nationwide
            await asyncio.sleep(5)
            # Type the search keyword (Business English positions in this run)
            await page.type(
                '#wrap > div.column-search-panel > div > div > div.search-form > form > div.search-form-con > p > input',
                '商务英语', {'delay': self.input_time_random() - 50})
            await asyncio.sleep(2)
            # Click the search button
            await page.click('#wrap > div.column-search-panel > div > div > div.search-form > form > button')
            await asyncio.sleep(5)
            # print(await page.content())
            # Fetch and parse each result page
            i = 0
            while True:
                await asyncio.sleep(2)
                content = await page.content()
                html = etree.HTML(content)
                # Parse the listings on the current page
                self.parse_html(html)
                # Go to the next page
                await page.click('#wrap > div.page-job-wrapper > div.page-job-inner > div > div.job-list-wrapper > div.search-job-result > div > div > div > a:nth-child(10)')
                await asyncio.sleep(3)
                i += 1
                print(i)
                # BOSS Zhipin caps pagination at 10 pages, so scrape per province / in batches
                if i >= 10:
                    break
            df = pd.DataFrame(self.data_list)
            # df['职位'] = df.职位.str.extract(r'[(.*?)]', expand=True)
            df.to_excel('./data/jobs_全国.xlsx', index=False)
            # df.to_csv('./data/jobs_test.csv', index=False)
            print(df)
            await browser.close()
        except Exception as a:
            print(a)

    def input_time_random(self):
        # Random typing delay in milliseconds
        return random.randint(100, 151)

    def parse_html(self, html):
        # Column keys are kept as in the source: 职位=job title, 薪酬=salary, 公司名称=company name,
        # 工作经验=experience, 学历要求=education, 地区=location, 福利=benefits, 技能要求=skills,
        # 公司类型及规模=company type & size, 工作经验及学历要求=experience & education
        li_list = html.xpath('//div[@class="search-job-result"]//ul[@class="job-list-box"]/li')
        for li in li_list:
            # Extract the text fields of one job posting
            items = {}
            items['职位'] = li.xpath('.//span[@class="job-name"]/text()')[0]
            items['薪酬'] = li.xpath('.//div[@class="job-info clearfix"]/span/text()')[0]
            items['公司名称'] = li.xpath('.//div[@class="company-info"]//h3/a/text()')[0]
            items['工作经验'] = li.xpath('.//div[@class="job-info clearfix"]/ul/li/text()')[0]
            items['学历要求'] = li.xpath('.//div[@class="job-info clearfix"]/ul/li/text()')[1]
            items['地区'] = li.xpath('.//span[@class="job-area"]/text()')[0]
            items['福利'] = li.xpath('.//div[@class="info-desc"]/text()')
            span_list = li.xpath('.//div[@class="job-card-footer clearfix"]/ul[@class="tag-list"]')
            for span in span_list:
                items['技能要求'] = span.xpath('./li/text()')
            ul_list = li.xpath('.//ul[@class="company-tag-list"]')
            for ul in ul_list:
                items['公司类型及规模'] = ul.xpath('./li/text()')
            xl_list = li.xpath('.//div[@class="job-info clearfix"]/ul[@class="company-tag-list"]')
            for xl in xl_list:
                items['工作经验及学历要求'] = xl.xpath('./li/text()')
            self.data_list.append(items)

    def run(self):
        asyncio.get_event_loop().run_until_complete(self.main())
        # Convert the list of dictionaries to a pandas DataFrame
        df = pd.DataFrame(self.data_list)
        # Join the list-valued columns '福利', '技能要求', '公司类型及规模' into comma-separated strings
        df['福利'] = df['福利'].str.join(",")
        df['技能要求'] = df['技能要求'].str.join(",")
        df['公司类型及规模'] = df['公司类型及规模'].str.join(",")
        # Save the cleaned data
        # df.to_csv('./data/clean.csv', index=False)
        df.to_excel('./data/clean_全国.xlsx', index=False)

if __name__ == '__main__':
    comment = ss_xz()
    comment.run()
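
Note: pandas.DataFrame.to_excel does not create missing directories, so both Excel writes above assume a ./data folder next to the script (and openpyxl installed as the writer engine). A minimal guard one could add before the first write is sketched below; the os.makedirs call is an addition for illustration, not part of the original script.

import os

# Hypothetical guard (not in the original): ensure the output directory
# exists before DataFrame.to_excel tries to write into it.
os.makedirs('./data', exist_ok=True)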