Skip to content

Commit

Permalink
fix linkedin bug & add linkedin company url (#67)
Browse files Browse the repository at this point in the history
  • Loading branch information
cullenwatson committed Nov 8, 2023
1 parent a2c8fe0 commit cc9e786
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 37 deletions.
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ zip_recruiter Software Developer TEKsystems Phoenix

```plaintext
Required
├── site_type (List[enum]): linkedin, zip_recruiter, indeed
├── site_type (List[enum]): linkedin, zip_recruiter, indeed, glassdoor
└── search_term (str)
Optional
├── location (str)
Expand Down Expand Up @@ -107,21 +107,22 @@ The following exceptions may be raised when using JobSpy:
* `LinkedInException`
* `IndeedException`
* `ZipRecruiterException`
* `GlassdoorException`

## Supported Countries for Job Searching

### **LinkedIn**

LinkedIn searches globally & uses only the `location` parameter.
LinkedIn searches globally & uses only the `location` parameter. You can fetch at most 1000 jobs from the LinkedIn endpoint we're using.

### **ZipRecruiter**

ZipRecruiter searches for jobs in **US/Canada** & uses only the `location` parameter.

### **Indeed**
### **Indeed / Glassdoor**

Indeed & Glassdoor support most countries, but the `country_indeed` parameter is required. Additionally, use the `location`
parameter to narrow down the location, e.g. city & state if necessary.
parameter to narrow down the location, e.g. city & state if necessary.

You can specify the following countries when searching on Indeed (use the exact name, * indicates support for Glassdoor):

Expand All @@ -145,6 +146,7 @@ You can specify the following countries when searching on Indeed (use the exact
| Venezuela | Vietnam | | |


A single Glassdoor search can return at most 900 jobs from the endpoint we're using.
## Frequently Asked Questions

---
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.25"
version = "1.1.26"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy"
Expand Down
1 change: 1 addition & 0 deletions src/jobspy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ def worker(site):
"site",
"title",
"company",
"company_url",
"location",
"job_type",
"date_posted",
Expand Down
2 changes: 2 additions & 0 deletions src/jobspy/jobs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,8 @@ class JobPost(BaseModel):
location: Optional[Location]

description: str | None = None
company_url: str | None = None

job_type: list[JobType] | None = None
compensation: Compensation | None = None
date_posted: date | None = None
Expand Down
71 changes: 39 additions & 32 deletions src/jobspy/scrapers/linkedin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@
import requests
import time
from requests.exceptions import ProxyError
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
from bs4.element import Tag
from threading import Lock
from urllib.parse import urlparse, urlunparse

from .. import Scraper, ScraperInput, Site
from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type
Expand Down Expand Up @@ -66,12 +66,10 @@ def job_type_code(job_type_enum):
if scraper_input.job_type
else None,
"pageNum": 0,
page: page + scraper_input.offset,
"start": page + scraper_input.offset,
"f_AL": "true" if scraper_input.easy_apply else None,
}

params = {k: v for k, v in params.items() if v is not None}

params = {k: v for k, v in params.items() if v is not None}
retries = 0
while retries < self.MAX_RETRIES:
Expand All @@ -88,7 +86,7 @@ def job_type_code(job_type_enum):
break
except requests.HTTPError as e:
if hasattr(e, "response") and e.response is not None:
if e.response.status_code == 429:
if e.response.status_code in (429, 502):
time.sleep(self.DELAY)
retries += 1
continue
Expand All @@ -110,32 +108,27 @@ def job_type_code(job_type_enum):

soup = BeautifulSoup(response.text, "html.parser")

with ThreadPoolExecutor(max_workers=5) as executor:
futures = []
for job_card in soup.find_all("div", class_="base-search-card"):
job_url = None
href_tag = job_card.find("a", class_="base-card__full-link")
if href_tag and "href" in href_tag.attrs:
href = href_tag.attrs["href"].split("?")[0]
job_id = href.split("-")[-1]
job_url = f"{self.url}/jobs/view/{job_id}"

with url_lock:
if job_url in seen_urls:
continue
seen_urls.add(job_url)

futures.append(executor.submit(self.process_job, job_card, job_url))

for future in as_completed(futures):
try:
job_post = future.result()
if job_post:
job_list.append(job_post)
except Exception as e:
raise LinkedInException(
"Exception occurred while processing jobs"
)
for job_card in soup.find_all("div", class_="base-search-card"):
job_url = None
href_tag = job_card.find("a", class_="base-card__full-link")
if href_tag and "href" in href_tag.attrs:
href = href_tag.attrs["href"].split("?")[0]
job_id = href.split("-")[-1]
job_url = f"{self.url}/jobs/view/{job_id}"

with url_lock:
if job_url in seen_urls:
continue
seen_urls.add(job_url)

# Call process_job directly without threading
try:
job_post = self.process_job(job_card, job_url)
if job_post:
job_list.append(job_post)
except Exception as e:
raise LinkedInException("Exception occurred while processing jobs")

page += 25

job_list = job_list[: scraper_input.results_wanted]
Expand All @@ -147,6 +140,11 @@ def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]:

company_tag = job_card.find("h4", class_="base-search-card__subtitle")
company_a_tag = company_tag.find("a") if company_tag else None
company_url = (
urlunparse(urlparse(company_a_tag.get("href"))._replace(query=""))
if company_a_tag and company_a_tag.has_attr("href")
else ""
)
company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A"

metadata_card = job_card.find("div", class_="base-search-card__metadata")
Expand All @@ -168,11 +166,13 @@ def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]:
benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None

description, job_type = self.get_job_description(job_url)
# description, job_type = None, []

return JobPost(
title=title,
description=description,
company_name=company,
company_url=company_url,
location=location,
date_posted=date_posted,
job_url=job_url,
Expand All @@ -193,8 +193,15 @@ def get_job_description(
try:
response = requests.get(job_page_url, timeout=5, proxies=self.proxy)
response.raise_for_status()
except requests.HTTPError as e:
if hasattr(e, "response") and e.response is not None:
if e.response.status_code in (429, 502):
time.sleep(self.DELAY)
return None, None
except Exception as e:
return None, None
if response.url == "https://www.linkedin.com/signup":
return None, None

soup = BeautifulSoup(response.text, "html.parser")
div_content = soup.find(
Expand Down Expand Up @@ -230,7 +237,7 @@ def get_job_type(
employment_type = employment_type.lower()
employment_type = employment_type.replace("-", "")

return [get_enum_from_job_type(employment_type)]
return [get_enum_from_job_type(employment_type)] if employment_type else []

return description, get_job_type(soup)

Expand Down

0 comments on commit cc9e786

Please sign in to comment.