From d000a81eb3e2ff2c01989ec8553b767a58c76d40 Mon Sep 17 00:00:00 2001
From: Cullen Watson
Date: Sun, 9 Jun 2024 17:45:38 -0500
Subject: [PATCH] Salary parse (#163)

---
 pyproject.toml                            |  2 +-
 src/jobspy/__init__.py                    | 36 +++++++++++++---
 src/jobspy/scrapers/glassdoor/__init__.py | 10 ++---
 src/jobspy/scrapers/indeed/__init__.py    |  4 +-
 src/jobspy/scrapers/linkedin/__init__.py  | 55 ++++++++++-------------
 src/jobspy/scrapers/utils.py              | 52 ++++++++++++++++++++++
 6 files changed, 114 insertions(+), 45 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 52b0772..353d39a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.56"
+version = "1.1.57"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton ", "Cullen Watson "]
 homepage = "https://github.com/Bunsly/JobSpy"
diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py
index f0623a3..e2956f5 100644
--- a/src/jobspy/__init__.py
+++ b/src/jobspy/__init__.py
@@ -5,7 +5,7 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
 from .jobs import JobType, Location
-from .scrapers.utils import logger, set_logger_level
+from .scrapers.utils import logger, set_logger_level, extract_salary
 from .scrapers.indeed import IndeedScraper
 from .scrapers.ziprecruiter import ZipRecruiterScraper
 from .scrapers.glassdoor import GlassdoorScraper
@@ -118,6 +118,21 @@ def worker(site):
             site_value, scraped_data = future.result()
             site_to_jobs_dict[site_value] = scraped_data
 
+    def convert_to_annual(job_data: dict):
+        if job_data["interval"] == "hourly":
+            job_data["min_amount"] *= 2080
+            job_data["max_amount"] *= 2080
+        if job_data["interval"] == "monthly":
+            job_data["min_amount"] *= 12
+            job_data["max_amount"] *= 12
+        if job_data["interval"] == "weekly":
+            job_data["min_amount"] *= 52
+            job_data["max_amount"] *= 52
+        if job_data["interval"] == "daily":
+            job_data["min_amount"] *= 260
+            job_data["max_amount"] *= 260
+        job_data["interval"] = "yearly"
+
     jobs_dfs: list[pd.DataFrame] = []
 
     for site, job_response in site_to_jobs_dict.items():
@@ -150,11 +165,22 @@ def worker(site):
                 job_data["min_amount"] = compensation_obj.get("min_amount")
                 job_data["max_amount"] = compensation_obj.get("max_amount")
                 job_data["currency"] = compensation_obj.get("currency", "USD")
+                if (
+                    job_data["interval"]
+                    and job_data["interval"] != "yearly"
+                    and job_data["min_amount"]
+                    and job_data["max_amount"]
+                ):
+                    convert_to_annual(job_data)
+
             else:
-                job_data["interval"] = None
-                job_data["min_amount"] = None
-                job_data["max_amount"] = None
-                job_data["currency"] = None
+                if country_enum == Country.USA:
+                    (
+                        job_data["interval"],
+                        job_data["min_amount"],
+                        job_data["max_amount"],
+                        job_data["currency"],
+                    ) = extract_salary(job_data["description"])
 
             job_df = pd.DataFrame([job_data])
             jobs_dfs.append(job_df)
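The hunk above folds every non-yearly pay interval into an annual figure using fixed multipliers: 2080 work hours, 260 work days, 52 weeks, and 12 months per year. A minimal standalone sketch of the same arithmetic for sanity-checking; the sample job_data dict is hypothetical:

```python
# Interval-to-annual multipliers matching the patch:
# 2080 = 40 h/week * 52 weeks, 260 = 52 weeks * 5 working days.
MULTIPLIERS = {"hourly": 2080, "daily": 260, "weekly": 52, "monthly": 12}

def convert_to_annual(job_data: dict) -> dict:
    factor = MULTIPLIERS.get(job_data["interval"])
    if factor:  # leave already-yearly (or unrecognized) intervals untouched
        job_data["min_amount"] *= factor
        job_data["max_amount"] *= factor
    job_data["interval"] = "yearly"
    return job_data

# Hypothetical row: $25-$40/hour becomes $52,000-$83,200/year.
print(convert_to_annual({"interval": "hourly", "min_amount": 25, "max_amount": 40}))
```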
diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py
index b0dd733..459087b 100644
--- a/src/jobspy/scrapers/glassdoor/__init__.py
+++ b/src/jobspy/scrapers/glassdoor/__init__.py
@@ -69,7 +69,7 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         if location_type is None:
             logger.error("Glassdoor: location not parsed")
             return JobResponse(jobs=[])
-        all_jobs: list[JobPost] = []
+        job_list: list[JobPost] = []
         cursor = None
 
         range_start = 1 + (scraper_input.offset // self.jobs_per_page)
@@ -81,14 +81,14 @@
                 jobs, cursor = self._fetch_jobs_page(
                     scraper_input, location_id, location_type, page, cursor
                 )
-                all_jobs.extend(jobs)
-                if not jobs or len(all_jobs) >= scraper_input.results_wanted:
-                    all_jobs = all_jobs[: scraper_input.results_wanted]
+                job_list.extend(jobs)
+                if not jobs or len(job_list) >= scraper_input.results_wanted:
+                    job_list = job_list[: scraper_input.results_wanted]
                     break
             except Exception as e:
                 logger.error(f"Glassdoor: {str(e)}")
                 break
-        return JobResponse(jobs=all_jobs)
+        return JobResponse(jobs=job_list)
 
     def _fetch_jobs_page(
         self,
diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py
index b5d6cd6..17fb565 100644
--- a/src/jobspy/scrapers/indeed/__init__.py
+++ b/src/jobspy/scrapers/indeed/__init__.py
@@ -297,8 +297,8 @@ def _get_compensation(job: dict) -> Compensation | None:
             max_range = comp["range"].get("max")
         return Compensation(
             interval=interval,
-            min_amount=round(min_range, 2) if min_range is not None else None,
-            max_amount=round(max_range, 2) if max_range is not None else None,
+            min_amount=int(min_range) if min_range is not None else None,
+            max_amount=int(max_range) if max_range is not None else None,
             currency=job["compensation"]["currencyCode"],
         )
diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py
index a4f2fd1..3db5557 100644
--- a/src/jobspy/scrapers/linkedin/__init__.py
+++ b/src/jobspy/scrapers/linkedin/__init__.py
@@ -69,7 +69,7 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
         self.scraper_input = scraper_input
         job_list: list[JobPost] = []
-        seen_urls = set()
+        seen_ids = set()
         page = scraper_input.offset // 10 * 10 if scraper_input.offset else 0
         request_count = 0
         seconds_old = (
@@ -133,25 +133,24 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
                 return JobResponse(jobs=job_list)
 
             for job_card in job_cards:
-                job_url = None
                 href_tag = job_card.find("a", class_="base-card__full-link")
                 if href_tag and "href" in href_tag.attrs:
                     href = href_tag.attrs["href"].split("?")[0]
                     job_id = href.split("-")[-1]
-                    job_url = f"{self.base_url}/jobs/view/{job_id}"
-
-                if job_url in seen_urls:
-                    continue
-                seen_urls.add(job_url)
-                try:
-                    fetch_desc = scraper_input.linkedin_fetch_description
-                    job_post = self._process_job(job_card, job_url, fetch_desc)
-                    if job_post:
-                        job_list.append(job_post)
-                    if not continue_search():
-                        break
-                except Exception as e:
-                    raise LinkedInException(str(e))
+
+                    if job_id in seen_ids:
+                        continue
+                    seen_ids.add(job_id)
+
+                    try:
+                        fetch_desc = scraper_input.linkedin_fetch_description
+                        job_post = self._process_job(job_card, job_id, fetch_desc)
+                        if job_post:
+                            job_list.append(job_post)
+                        if not continue_search():
+                            break
+                    except Exception as e:
+                        raise LinkedInException(str(e))
 
             if continue_search():
                 time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
@@ -161,7 +160,7 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         return JobResponse(jobs=job_list)
 
     def _process_job(
-        self, job_card: Tag, job_url: str, full_descr: bool
+        self, job_card: Tag, job_id: str, full_descr: bool
     ) -> Optional[JobPost]:
         salary_tag = job_card.find("span", class_="job-search-card__salary-info")
 
@@ -208,16 +207,16 @@
             date_posted = None
         job_details = {}
         if full_descr:
-            job_details = self._get_job_details(job_url)
+            job_details = self._get_job_details(job_id)
 
         return JobPost(
-            id=self._get_id(job_url),
+            id=job_id,
             title=title,
             company_name=company,
             company_url=company_url,
             location=location,
             date_posted=date_posted,
-            job_url=job_url,
+            job_url=f"{self.base_url}/jobs/view/{job_id}",
             compensation=compensation,
             job_type=job_details.get("job_type"),
             description=job_details.get("description"),
@@ -227,24 +226,16 @@
             job_function=job_details.get("job_function"),
         )
 
-    def _get_id(self, url: str):
-        """
-        Extracts the job id from the job url
-        :param url:
-        :return: str
-        """
-        if not url:
-            return None
-        return url.split("/")[-1]
-
-    def _get_job_details(self, job_page_url: str) -> dict:
+    def _get_job_details(self, job_id: str) -> dict:
         """
         Retrieves job description and other job details by going to the job page url
        :param job_page_url:
         :return: dict
         """
         try:
-            response = self.session.get(job_page_url, timeout=5)
+            response = self.session.get(
+                f"{self.base_url}/jobs-guest/jobs/api/jobPosting/{job_id}", timeout=5
+            )
             response.raise_for_status()
         except:
             return {}
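With these hunks, LinkedIn results are deduplicated by the numeric job id instead of the reconstructed URL, and `_get_job_details` queries the `jobs-guest/jobs/api/jobPosting/{job_id}` endpoint directly. A short sketch of the id extraction and dedup; the sample hrefs are made up:

```python
# Sketch of the id-based dedup the LinkedIn hunks switch to: the id is the
# trailing token of the card's canonical href, so it stays stable across
# tracking-parameter variants of the same posting.
def job_id_from_href(href: str) -> str:
    return href.split("?")[0].split("-")[-1]

seen_ids: set[str] = set()
for href in (
    "https://www.linkedin.com/jobs/view/data-engineer-3923456789?refId=abc",
    "https://www.linkedin.com/jobs/view/data-engineer-3923456789?refId=xyz",
):
    job_id = job_id_from_href(href)
    if job_id in seen_ids:
        continue  # same posting reached through a different tracking URL
    seen_ids.add(job_id)

print(seen_ids)  # {'3923456789'}
```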
job_url=f"{self.base_url}/jobs/view/{job_id}", compensation=compensation, job_type=job_details.get("job_type"), description=job_details.get("description"), @@ -227,24 +226,16 @@ def _process_job( job_function=job_details.get("job_function"), ) - def _get_id(self, url: str): - """ - Extracts the job id from the job url - :param url: - :return: str - """ - if not url: - return None - return url.split("/")[-1] - - def _get_job_details(self, job_page_url: str) -> dict: + def _get_job_details(self, job_id: str) -> dict: """ Retrieves job description and other job details by going to the job page url :param job_page_url: :return: dict """ try: - response = self.session.get(job_page_url, timeout=5) + response = self.session.get( + f"{self.base_url}/jobs-guest/jobs/api/jobPosting/{job_id}", timeout=5 + ) response.raise_for_status() except: return {} diff --git a/src/jobspy/scrapers/utils.py b/src/jobspy/scrapers/utils.py index 8d448be..16607b1 100644 --- a/src/jobspy/scrapers/utils.py +++ b/src/jobspy/scrapers/utils.py @@ -185,3 +185,55 @@ def remove_attributes(tag): for attr in list(tag.attrs): del tag[attr] return tag + + +def extract_salary( + salary_str, + lower_limit=1000, + upper_limit=700000, + hourly_threshold=350, + monthly_threshold=30000, +): + if not salary_str: + return None, None, None, None + + min_max_pattern = r"\$(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)\s*[-—–]\s*(?:\$)?(\d+(?:,\d+)?(?:\.\d+)?)([kK]?)" + + def to_int(s): + return int(float(s.replace(",", ""))) + + def convert_hourly_to_annual(hourly_wage): + return hourly_wage * 2080 + + def convert_monthly_to_annual(monthly_wage): + return monthly_wage * 12 + + match = re.search(min_max_pattern, salary_str) + + if match: + min_salary = to_int(match.group(1)) + max_salary = to_int(match.group(3)) + # Handle 'k' suffix for min and max salaries independently + if "k" in match.group(2).lower() or "k" in match.group(4).lower(): + min_salary *= 1000 + max_salary *= 1000 + + # Convert to annual if less than the hourly threshold + if min_salary < hourly_threshold: + min_salary = convert_hourly_to_annual(min_salary) + if max_salary < hourly_threshold: + max_salary = convert_hourly_to_annual(max_salary) + + elif min_salary < monthly_threshold: + min_salary = convert_monthly_to_annual(min_salary) + if max_salary < monthly_threshold: + max_salary = convert_monthly_to_annual(max_salary) + + # Ensure salary range is within specified limits + if ( + lower_limit <= min_salary <= upper_limit + and lower_limit <= max_salary <= upper_limit + and min_salary < max_salary + ): + return "yearly", min_salary, max_salary, "USD" + return None, None, None, None