Skip to content

Commit

Permalink
enh(li): job function (#160)
Browse files Browse the repository at this point in the history
  • Loading branch information
cullenwatson committed May 28, 2024
1 parent 6439f71 commit 89a3ee2
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 5 deletions.
5 changes: 1 addition & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,6 @@ work with us.*
- Aggregates the job postings in a Pandas DataFrame
- Proxies support

[Video Guide for JobSpy](https://www.youtube.com/watch?v=RuP1HrAZnxs&pp=ygUgam9icyBzY3JhcGVyIGJvdCBsaW5rZWRpbiBpbmRlZWQ%3D) -
Updated for release v1.1.3

![jobspy](https://github.com/cullenwatson/JobSpy/assets/78247585/ec7ef355-05f6-4fd3-8161-a817e31c5c57)

### Installation
Expand Down Expand Up @@ -46,7 +43,7 @@ jobs = scrape_jobs(
)
print(f"Found {len(jobs)} jobs")
print(jobs.head())
jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False) # to_xlsx
jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False) # to_excel
```

### Output
Expand Down
1 change: 1 addition & 0 deletions src/jobspy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ def worker(site):
"max_amount",
"currency",
"is_remote",
"job_function",
"emails",
"description",
"company_url",
Expand Down
3 changes: 3 additions & 0 deletions src/jobspy/jobs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,9 @@ class JobPost(BaseModel):
logo_photo_url: str | None = None
banner_photo_url: str | None = None

# Currently only populated by the LinkedIn scraper; None for other sites.
job_function: str | None = None


class JobResponse(BaseModel):
    """Top-level result of a scrape run: the collection of job postings found.

    Pydantic model; ``jobs`` defaults to an empty list when no postings
    were scraped. (Pydantic deep-copies field defaults per instance, so the
    mutable ``[]`` default is safe here, unlike on a plain Python class.)
    """

    # All JobPost records accumulated across the scraped site(s).
    jobs: list[JobPost] = []
16 changes: 15 additions & 1 deletion src/jobspy/scrapers/linkedin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,7 @@ def _process_job(
job_url_direct=job_details.get("job_url_direct"),
emails=extract_emails_from_text(job_details.get("description")),
logo_photo_url=job_details.get("logo_photo_url"),
job_function=job_details.get("job_function"),
)

def _get_id(self, url: str):
Expand All @@ -247,7 +248,7 @@ def _get_job_details(self, job_page_url: str) -> dict:
response.raise_for_status()
except:
return {}
if response.url == "https://www.linkedin.com/signup":
if "linkedin.com/signup" in response.url:
return {}

soup = BeautifulSoup(response.text, "html.parser")
Expand All @@ -266,13 +267,26 @@ def remove_attributes(tag):
description = div_content.prettify(formatter="html")
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description)

h3_tag = soup.find(
"h3", text=lambda text: text and "Job function" in text.strip()
)

job_function = None
if h3_tag:
job_function_span = h3_tag.find_next(
"span", class_="description__job-criteria-text"
)
if job_function_span:
job_function = job_function_span.text.strip()
return {
"description": description,
"job_type": self._parse_job_type(soup),
"job_url_direct": self._parse_job_url_direct(soup),
"logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
"data-delayed-url"
),
"job_function": job_function,
}

def _get_location(self, metadata_card: Optional[Tag]) -> Location:
Expand Down

0 comments on commit 89a3ee2

Please sign in to comment.