crowler104.py

# coding=utf-8
import re, time, requests
import json
import csv
import pprint
from lxml import etree
from multiprocessing import Pool
from datetime import date
import os


class cachefilename():
    def __init__(self):
        # cache file named after today's date, e.g. ./2020-09-01.json
        self.name = './{}.json'.format(date.today().strftime("%Y-%m-%d"))

# Build one set of search-query parameters per result page.
def make_params(sceng, area, scmin, scmax):
    params_list = []
    my_params = {'scneg': sceng,  # search parameter
                 'area': area,    # target area code
                 'scmin': scmin,  # salary lower bound
                 'scmax': scmax   # salary upper bound
                 }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
        'Referer': 'https://www.104.com.tw/job/'}
    searchpageurl = 'https://www.104.com.tw/jobs/search/?'
    ss = requests.session()
    res = ss.get(url=searchpageurl, headers=headers, params=my_params)
    html = etree.HTML(res.text)
    try:
        # pull the total page count out of the page's inline JS
        totalpage = int(html.xpath('//script[contains(text(),"totalPage")]/text()')[0]
                        .split('totalPage":')[-1].split(',"totalCount"')[0])
        print("totalPage:", totalpage)
    except Exception as e:
        print(e)
        totalpage = 1
    for page in range(1, totalpage + 1):
        params_list.append(dict(my_params, page=page))
    return params_list
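
# A minimal usage sketch for make_params (not in the original script); the
# argument values below are hypothetical placeholders:
#   pages = make_params(sceng='1', area='6001001001', scmin='40000', scmax='60000')
#   # -> [{'scneg': '1', 'area': '6001001001', 'scmin': '40000',
#   #      'scmax': '60000', 'page': 1}, ...]  (one dict per result page)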

# Feed a my_params dict to the search page and return the list of job IDs
# (the last path segment of each job link).
def index(my_params):
    joburl = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
        'Referer': 'https://www.104.com.tw/job/'}
    searchpageurl = 'https://www.104.com.tw/jobs/search/?'
    ss = requests.session()
    try:
        res = ss.get(url=searchpageurl, headers=headers, params=my_params)  # allow_redirects=False
    except Exception as e:
        print(e, "continuing...")
        return joburl  # request failed; return the (empty) list instead of crashing on res
    try:
        if res.status_code == 200:
            html = etree.HTML(res.text)
            for ajax_content in html.xpath('//a[contains(@class,"js-job-link")]/@href'):
                joburl.append(ajax_content.split('?')[0].split('/')[-1])
        else:
            print("error status code:", res.status_code)
    except Exception as exc:
        print(exc)
    return joburl  # list
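
# Illustration only (hypothetical ID, not taken from the site): if a result
# page contains links like https://www.104.com.tw/job/6xkzq?jobsource=...,
# index(...) returns ['6xkzq', ...].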

# Save the scraped job IDs into a JSON file named after today's date.
def dump_json_file(query_dict):
    dumped_json_cache = json.dumps(query_dict)
    filename = date.today().strftime("%Y-%m-%d")
    with open('./{}.json'.format(filename), "w") as fw:
        fw.write(dumped_json_cache)
    print('dumped the data successfully')

# Fetch one job's JSON payload.
def crowl(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
        'Referer': 'https://www.104.com.tw/job/'}
    ss = requests.session()
    res = ss.get(url=url, headers=headers)
    if res.status_code == 200:
        try:
            return res.json()
        except json.decoder.JSONDecodeError:
            print(res.text)  # response was not valid JSON; show the raw body
    else:
        print('JSON request failed, status code:', res.status_code)

# Clean and restructure the raw payload into a flat per-job record.
def extract(joburl, data_dict):
    job_dict = {joburl: {}}
    print('PID:{} , task:{}'.format(os.getpid(), joburl))
    try:
        data_dict = data_dict['data']
        record = job_dict[joburl]
        header = data_dict['header']
        detail = data_dict['jobDetail']
        condition = data_dict['condition']
        # job title
        record['jobName'] = header['jobName']
        record['appearDate'] = header['appearDate']
        # company detail
        record['companyName'] = header['custName']
        record['companyUrl'] = header['custUrl']
        record['industry'] = data_dict['industry']
        record['addressRegion'] = detail['addressRegion']
        record['longitude'] = detail['longitude']
        record['latitude'] = detail['latitude']
        # conditions
        for key in ('acceptRole', 'workExp', 'edu', 'major', 'language',
                    'skill', 'certificate', 'other'):
            record[key] = condition[key]
        # job detail
        for key in ('jobDescription', 'jobCategory', 'jobType', 'manageResp',
                    'businessTrip', 'workPeriod', 'vacationPolicy',
                    'startWorkingDay', 'needEmp'):
            record[key] = detail[key]
        # salary
        for key in ('salary', 'salaryMin', 'salaryMax', 'salaryType'):
            record[key] = detail[key]
        record['welfare'] = data_dict['welfare']['welfare']
        return job_dict
    except Exception as e:
        print(e)
        print(joburl)
        return job_dict

def write_json(filename, tmpdict):
    # pprint.pprint(tmpdict)
    with open('./' + filename, 'w') as f:
        f.write(json.dumps(tmpdict))
    print("wrote {} successfully".format(filename))


def open_json_file(CACHE_FNAME):
    try:
        with open(CACHE_FNAME, 'r') as cache_file:
            cache_contents = cache_file.read()
        CACHE_DICTION = json.loads(cache_contents)
        return CACHE_DICTION
    except (IOError, json.decoder.JSONDecodeError):
        print("no cache found")
        CACHE_DICTION = {}
        return CACHE_DICTION

if __name__ == '__main__':
    arealist = ['6001001001']  # for testing
    # import os
    #
    # cpus = os.cpu_count()
    # print(cpus)
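
    # A sketch (not part of the original script, which ends here) of how the
    # helpers above might be chained; the ajax endpoint URL, pool size, and
    # parameter values are assumptions for illustration, left commented out
    # like the cpu-count probe above.
    # params_list = make_params(sceng='1', area=arealist[0],
    #                           scmin='40000', scmax='60000')
    # job_ids = []
    # for params in params_list:
    #     job_ids.extend(index(params))
    # dump_json_file(job_ids)  # cache today's job IDs
    # urls = ['https://www.104.com.tw/job/ajax/content/{}'.format(i)
    #         for i in job_ids]
    # with Pool(4) as pool:
    #     payloads = pool.map(crowl, urls)
    # results = {}
    # for job_id, payload in zip(job_ids, payloads):
    #     if payload:
    #         results.update(extract(job_id, payload))
    # write_json(cachefilename().name.split('/')[-1], results)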