scraper.py
import aiohttp
import asyncio
import json
import time
import urllib.parse

from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


class JobScraper:
    def __init__(self, skills, place, job_type):
        self.jobs = {}
        self.skills = skills
        self.place = place
        self.job_type = job_type
        self.user_agent = UserAgent(platforms='pc')

        # Generate a random User-Agent for this session
        user_agent_str = self.user_agent.random
        print(f"User-Agent: {user_agent_str}")

        # Configure headless Chrome for Selenium
        options = ChromeOptions()
        options.add_argument('--incognito')
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument(f'user-agent={user_agent_str}')
        self.driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)

    async def get_info(self, session, url, skill):
        user_agent_str = self.user_agent.random
        retries = 3  # Number of attempts before giving up on this query
        for attempt in range(retries):
            try:
                # Quick reachability/status check with aiohttp before loading the page in Selenium
                async with session.get(url, headers={'User-Agent': user_agent_str}) as response:
                    response.raise_for_status()
                self.driver.get(url)
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, '//*[@id="rso"]/div/div/div/div/div[2]/div/div/div/div/infinity-scrolling/div[1]/div[1]/div/div[1]'))
                )
                break
            except (aiohttp.ClientError, aiohttp.ClientConnectorError) as e:
                print(f"Attempt {attempt + 1} - Network error: {e}")
                if attempt == retries - 1:
                    return
                await asyncio.sleep(5)  # Wait before retrying
            except Exception as e:
                print(f"Attempt {attempt + 1} - Timeout waiting for the first job on the page to load: {e}")
                if attempt == retries - 1:
                    return
                await asyncio.sleep(5)  # Wait before retrying

        for i in range(3):  # Limit to 3 jobs per skill
            # Re-locate the job cards on each iteration to avoid stale element references after clicking
            job_elements = self.driver.find_elements(By.XPATH, '//*[@class="L5NwLd"]')
            if i < len(job_elements):
                job_elements[i].click()
                time.sleep(2)
                try:
                    title = self.driver.find_element(By.XPATH, '//*[@id="Sva75c"]/div[2]/div[2]/div/div[2]/c-wiz/div/c-wiz[1]/c-wiz/c-wiz/div[2]/h1').text.strip()
                except Exception:
                    continue  # Skip this posting if the title cannot be read
                try:
                    company = self.driver.find_element(By.XPATH, '//*[@id="Sva75c"]/div[2]/div[2]/div/div[2]/c-wiz/div/c-wiz[1]/c-wiz/c-wiz/div[1]/div/div[1]/div/div[2]/span/div').text.strip()
                except Exception:
                    company = 'Error'
                try:
                    details = self.driver.find_element(By.XPATH, '//*[@id="Sva75c"]/div[2]/div[2]/div/div[2]/c-wiz/div/c-wiz[1]/c-wiz/c-wiz/div[2]/div[1]').text.strip()
                    split_text = [part.strip() for part in details.split('•')]
                    place = split_text[1]
                except Exception:
                    place = 'Not mentioned'
                try:
                    salary = self.driver.find_element(By.XPATH, '//*[@id="Sva75c"]/div[@class="A8mJGd NDuZHe"]/div[@class="LrPjRb"]/div/div[@class="BIB1wf EIehLd fHE6De"]/c-wiz/div/c-wiz[1]/c-wiz/c-wiz/div[@class="JmvMcb"]/div[@class="mLdNec"]/div[(contains(., "MXN"))]/span[@class="RcZtZb"]').text.strip()
                except Exception:
                    salary = 'Not mentioned'
                try:
                    job_type = self.driver.find_element(By.XPATH, '//*[@id="Sva75c"]/div[@class="A8mJGd NDuZHe"]/div[@class="LrPjRb"]/div/div[@class="BIB1wf EIehLd fHE6De"]/c-wiz/div/c-wiz[1]/c-wiz/c-wiz/div[@class="JmvMcb"]/div[@class="mLdNec"]/div[not(contains(., "MXN")) and not(contains(., "hace")) and not(contains(., "título"))]/span[@class="RcZtZb"]').text.strip()
                    if job_type == '':
                        job_type = 'Not mentioned'
                except Exception:
                    job_type = 'Not mentioned'
                try:
                    published = self.driver.find_element(By.XPATH, '//*[@id="Sva75c"]/div[@class="A8mJGd NDuZHe"]/div[@class="LrPjRb"]/div/div[@class="BIB1wf EIehLd fHE6De"]/c-wiz/div/c-wiz[1]/c-wiz/c-wiz/div[@class="JmvMcb"]/div[@class="mLdNec"]/div[(contains(., "hace"))]/span[@class="RcZtZb"]').text.strip()
                except Exception:
                    published = 'Weeks ago.'
                try:
                    short_description = self.driver.find_element(By.XPATH, '//*[@id="Sva75c"]/div[2]/div[2]/div/div[2]/c-wiz/div/c-wiz[1]/c-wiz/c-wiz/div[6]/div/span[1]').text.strip()
                    try:
                        long_description = self.driver.find_element(By.XPATH, '//*[@id="Sva75c"]/div[2]/div[2]/div/div[2]/c-wiz/div/c-wiz[1]/c-wiz/c-wiz/div[6]/div/span[3]').text.strip()
                        description = short_description + long_description
                    except Exception:
                        description = short_description
                except Exception as e:
                    description = f'No description - Error: {e}'
                try:
                    links_container = self.driver.find_element(By.XPATH, '//*[@id="Sva75c"]/div[2]/div[2]/div/div[2]/c-wiz/div/c-wiz[1]/c-wiz/c-wiz/div[4]')
                    job_urls = []
                    links = links_container.find_elements(By.TAG_NAME, 'a')
                    for link in links:
                        href = link.get_attribute('href')
                        job_urls.append(href)
                except Exception as e:
                    job_urls = f'Error: {e}'
                self.jobs[title] = {
                    'Company': company,
                    'Place': place,
                    'Salary': salary,
                    'Type': job_type,
                    'Published': published,
                    'URLs': job_urls,
                    'Description': description
                }

    async def get_all_jobs(self):
        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector()) as session:
            for skill in self.skills:
                query = skill
                if self.place:
                    query += f" {self.place}"
                if self.job_type:
                    query += f" {self.job_type}"
                encoded_query = urllib.parse.quote(query)
                url = f'https://www.google.com/search?&udm=8&q={encoded_query}&jbr=sep:0'
                print(url)
                await self.get_info(session, url, skill)
                await asyncio.sleep(2)  # Delay between skills to reduce request frequency

    def save_to_json(self, output_file='jobs.json'):
        with open(output_file, 'w', encoding='utf-8') as json_file:
            json.dump(self.jobs, json_file, indent=4, ensure_ascii=False)

    def return_dict(self):
        # Returns the collected jobs as a formatted JSON string
        return json.dumps(self.jobs, indent=4, ensure_ascii=False)


async def scraper_main(skills, place, job_type):
    scraper = JobScraper(skills, place, job_type)
    try:
        await scraper.get_all_jobs()
    finally:
        scraper.driver.quit()  # Close the browser even if scraping fails
    return scraper.jobs


# ------------------ T E S T I N G ------------------
'''
if __name__ == "__main__":
    inicio = time.time()
    print('Loading...')
    # User parameters [skills - place - type]:
    qskills = ['carpintero', 'sql', 'trabajo en equipo', 'c++', 'ensamblador', 'html', 'vendedor', 'react', 'comisiones', 'docker']
    qplace = 'CDMX'  # City for the job search (example: CDMX); if left empty, the scraper shows trending jobs.
    qtype = 'Tiempo completo'  # Type of job ('Medio Tiempo' or 'Tiempo completo'); if left empty, the scraper shows trending jobs.
    asyncio.run(scraper_main(skills=qskills, place=qplace, job_type=qtype))
    fin = time.time()
    print(f"Complete: {fin - inicio} seconds")

SUMMARY:
- Asynchronous scraping: the queries run inside an asyncio event loop through a shared aiohttp session; in this version get_all_jobs awaits each skill's query sequentially (with a short delay) rather than fanning them out with asyncio.gather.
- Job limit: the loop in get_info scrapes a maximum of 3 jobs per skill.
- Skill processing: get_all_jobs handles one skill at a time, building a search URL and awaiting the request for each skill individually.
- Slight refactoring: get_info takes a skill parameter to keep track of which skill the jobs are associated with.
- This ensures the scraper processes each skill individually and caps the number of job postings scraped at 3 per skill.
- The scraper returns a dictionary with all the results.
'''
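
# The summary above notes that get_all_jobs awaits each skill sequentially instead of
# using asyncio.gather. The block below is a hedged sketch, not part of the original
# script (the names gather_jobs and fetch are illustrative), of how the per-skill
# requests could be scheduled together with asyncio.gather. Because JobScraper shares
# a single Selenium driver, a semaphore keeps page interaction serialized even when
# the coroutines are scheduled concurrently.
'''
async def gather_jobs(scraper, session):
    semaphore = asyncio.Semaphore(1)  # one shared driver -> one page interaction at a time

    async def fetch(skill):
        query = skill
        if scraper.place:
            query += f" {scraper.place}"
        if scraper.job_type:
            query += f" {scraper.job_type}"
        url = f'https://www.google.com/search?&udm=8&q={urllib.parse.quote(query)}&jbr=sep:0'
        async with semaphore:
            await scraper.get_info(session, url, skill)

    await asyncio.gather(*(fetch(skill) for skill in scraper.skills))
'''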