From ccb0c1766030aceeb8ed63520bb41320e3fc7054 Mon Sep 17 00:00:00 2001
From: Cullen Watson
Date: Sun, 9 Jun 2024 16:21:01 -0500
Subject: [PATCH] enh: ziprecruiter full description (#162)

---
 pyproject.toml                               |  2 +-
 src/jobspy/scrapers/linkedin/__init__.py     |  8 +---
 src/jobspy/scrapers/utils.py                 |  7 ++++
 src/jobspy/scrapers/ziprecruiter/__init__.py | 41 +++++++++++++++++++-
 4 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index ed98ca6..52b0772 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.55"
+version = "1.1.56"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton ", "Cullen Watson "]
 homepage = "https://github.com/Bunsly/JobSpy"
diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py
index 8154291..a4f2fd1 100644
--- a/src/jobspy/scrapers/linkedin/__init__.py
+++ b/src/jobspy/scrapers/linkedin/__init__.py
@@ -19,7 +19,7 @@
 
 from .. import Scraper, ScraperInput, Site
 from ..exceptions import LinkedInException
-from ..utils import create_session
+from ..utils import create_session, remove_attributes
 from ...jobs import (
     JobPost,
     Location,
@@ -257,12 +257,6 @@ def _get_job_details(self, job_page_url: str) -> dict:
         )
         description = None
         if div_content is not None:
-
-            def remove_attributes(tag):
-                for attr in list(tag.attrs):
-                    del tag[attr]
-                return tag
-
             div_content = remove_attributes(div_content)
             description = div_content.prettify(formatter="html")
             if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
diff --git a/src/jobspy/scrapers/utils.py b/src/jobspy/scrapers/utils.py
index 294d20c..8d448be 100644
--- a/src/jobspy/scrapers/utils.py
+++ b/src/jobspy/scrapers/utils.py
@@ -93,6 +93,7 @@ def execute_request(self, *args, **kwargs):
         else:
             self.proxies = {}
         response = tls_client.Session.execute_request(self, *args, **kwargs)
+        response.ok = response.status_code in range(200, 400)
         return response
 
 
@@ -178,3 +179,9 @@ def currency_parser(cur_str):
         num = float(cur_str)
 
     return np.round(num, 2)
+
+
+def remove_attributes(tag):
+    for attr in list(tag.attrs):
+        del tag[attr]
+    return tag
diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/src/jobspy/scrapers/ziprecruiter/__init__.py
index 7bf51bf..19fa9dd 100644
--- a/src/jobspy/scrapers/ziprecruiter/__init__.py
+++ b/src/jobspy/scrapers/ziprecruiter/__init__.py
@@ -7,19 +7,24 @@
 
 from __future__ import annotations
 
+import json
 import math
+import re
 import time
 from datetime import datetime
 from typing import Optional, Tuple, Any
 from concurrent.futures import ThreadPoolExecutor
 
+from bs4 import BeautifulSoup
+
 from .. import Scraper, ScraperInput, Site
 from ..utils import (
     logger,
     extract_emails_from_text,
     create_session,
     markdown_converter,
+    remove_attributes,
 )
 from ...jobs import (
     JobPost,
@@ -151,6 +156,8 @@ def _process_job(self, job: dict) -> JobPost | None:
         comp_min = int(job["compensation_min"]) if "compensation_min" in job else None
         comp_max = int(job["compensation_max"]) if "compensation_max" in job else None
         comp_currency = job.get("compensation_currency")
+        description_full, job_url_direct = self._get_descr(job_url)
+
         return JobPost(
             id=str(job["listing_key"]),
             title=title,
@@ -165,10 +172,42 @@ def _process_job(self, job: dict) -> JobPost | None:
             ),
             date_posted=date_posted,
             job_url=job_url,
-            description=description,
+            description=description_full if description_full else description,
             emails=extract_emails_from_text(description) if description else None,
+            job_url_direct=job_url_direct,
         )
 
+    def _get_descr(self, job_url):
+        res = self.session.get(job_url, headers=self.headers, allow_redirects=True)
+        description_full = job_url_direct = None
+        if res.ok:
+            soup = BeautifulSoup(res.text, "html.parser")
+            job_descr_div = soup.find("div", class_="job_description")
+            company_descr_section = soup.find("section", class_="company_description")
+            job_description_clean = (
+                remove_attributes(job_descr_div).prettify(formatter="html")
+                if job_descr_div
+                else ""
+            )
+            company_description_clean = (
+                remove_attributes(company_descr_section).prettify(formatter="html")
+                if company_descr_section
+                else ""
+            )
+            description_full = job_description_clean + company_description_clean
+            script_tag = soup.find("script", type="application/json")
+            if script_tag:
+                job_json = json.loads(script_tag.string)
+                job_url_val = job_json["model"]["saveJobURL"]
+                m = re.search(r"job_url=(.+)", job_url_val)
+                if m:
+                    job_url_direct = m.group(1)
+
+        if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
+            description_full = markdown_converter(description_full)
+
+        return description_full, job_url_direct
+
     def _get_cookies(self):
         data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
         url = f"{self.api_url}/jobs-app/event"
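
Notes on the change (illustrative sketches from review, not part of the diff):

The nested remove_attributes helper from the LinkedIn scraper now lives in
src/jobspy/scrapers/utils.py so the ZipRecruiter scraper can reuse it. It strips
every attribute from the tag it is given (not from its children) before
prettify(formatter="html") serializes the cleaned markup. A minimal sketch of
that behavior, assuming the src/ layout installs as the jobspy package; the
sample HTML is invented:

    from bs4 import BeautifulSoup

    from jobspy.scrapers.utils import remove_attributes

    html = '<div class="job_description" style="color: red"><p>Own the roadmap.</p></div>'
    soup = BeautifulSoup(html, "html.parser")
    div = soup.find("div", class_="job_description")
    # remove_attributes deletes class/style in place and returns the same tag,
    # so the prettified output carries no attributes on the div.
    print(remove_attributes(div).prettify(formatter="html"))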
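The session returned by create_session wraps tls_client, whose response object
does not appear to populate a requests-style ok flag, so execute_request now
derives one; _get_descr checks res.ok before parsing the page. A throwaway
sketch of the rule the patch uses (is_ok is hypothetical, for illustration only):

    # 2xx and 3xx responses count as ok, everything else does not.
    def is_ok(status_code: int) -> bool:
        return status_code in range(200, 400)

    assert is_ok(200) and is_ok(302)
    assert not is_ok(404) and not is_ok(500)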
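_get_descr fetches the public job page, concatenates the cleaned job_description
div and company_description section into the full description, and pulls the
direct apply link out of the page's embedded application/json state, where
model.saveJobURL is expected to carry a job_url= query parameter. A sketch of
just that extraction step, using an invented payload in place of the script
tag's contents:

    import json
    import re

    # Invented stand-in for the <script type="application/json"> payload.
    script_text = json.dumps(
        {"model": {"saveJobURL": "/jobs-app/save?job_url=https://careers.example.com/postings/123"}}
    )
    job_json = json.loads(script_text)
    m = re.search(r"job_url=(.+)", job_json["model"]["saveJobURL"])
    job_url_direct = m.group(1) if m else None
    print(job_url_direct)  # https://careers.example.com/postings/123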