diff --git a/README.md b/README.md index 6543aea..3277b12 100644 --- a/README.md +++ b/README.md @@ -13,9 +13,6 @@ work with us.* - Aggregates the job postings in a Pandas DataFrame - Proxies support -[Video Guide for JobSpy](https://www.youtube.com/watch?v=RuP1HrAZnxs&pp=ygUgam9icyBzY3JhcGVyIGJvdCBsaW5rZWRpbiBpbmRlZWQ%3D) - -Updated for release v1.1.3 - ![jobspy](https://github.com/cullenwatson/JobSpy/assets/78247585/ec7ef355-05f6-4fd3-8161-a817e31c5c57) ### Installation @@ -46,7 +43,7 @@ jobs = scrape_jobs( ) print(f"Found {len(jobs)} jobs") print(jobs.head()) -jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False) # to_xlsx +jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False) # to_excel ``` ### Output diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py index 4ad1f74..f0623a3 100644 --- a/src/jobspy/__init__.py +++ b/src/jobspy/__init__.py @@ -182,6 +182,7 @@ def worker(site): "max_amount", "currency", "is_remote", + "job_function", "emails", "description", "company_url", diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py index 61816c5..08c5ad2 100644 --- a/src/jobspy/jobs/__init__.py +++ b/src/jobspy/jobs/__init__.py @@ -254,6 +254,9 @@ class JobPost(BaseModel): logo_photo_url: str | None = None banner_photo_url: str | None = None + # linkedin only atm + job_function: str | None = None + class JobResponse(BaseModel): jobs: list[JobPost] = [] diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py index 03ec37f..8154291 100644 --- a/src/jobspy/scrapers/linkedin/__init__.py +++ b/src/jobspy/scrapers/linkedin/__init__.py @@ -224,6 +224,7 @@ def _process_job( job_url_direct=job_details.get("job_url_direct"), emails=extract_emails_from_text(job_details.get("description")), logo_photo_url=job_details.get("logo_photo_url"), + job_function=job_details.get("job_function"), ) def _get_id(self, url: str): @@ -247,7 +248,7 @@ def _get_job_details(self, job_page_url: str) -> dict: response.raise_for_status() except: return {} - if response.url == "https://www.linkedin.com/signup": + if "linkedin.com/signup" in response.url: return {} soup = BeautifulSoup(response.text, "html.parser") @@ -266,6 +267,18 @@ def remove_attributes(tag): description = div_content.prettify(formatter="html") if self.scraper_input.description_format == DescriptionFormat.MARKDOWN: description = markdown_converter(description) + + h3_tag = soup.find( + "h3", text=lambda text: text and "Job function" in text.strip() + ) + + job_function = None + if h3_tag: + job_function_span = h3_tag.find_next( + "span", class_="description__job-criteria-text" + ) + if job_function_span: + job_function = job_function_span.text.strip() return { "description": description, "job_type": self._parse_job_type(soup), @@ -273,6 +286,7 @@ def remove_attributes(tag): "logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get( "data-delayed-url" ), + "job_function": job_function, } def _get_location(self, metadata_card: Optional[Tag]) -> Location: