enh: full description param (#85)
cullenwatson committed Jan 23, 2024
1 parent 2ec3b04 commit 5b3627b
Showing 8 changed files with 115 additions and 50 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -67,6 +67,7 @@ Optional
 ├── job_type (enum): fulltime, parttime, internship, contract
 ├── proxy (str): in format 'http://user:pass@host:port' or [https, socks]
 ├── is_remote (bool)
+├── full_description (bool): fetches full description for Indeed / LinkedIn (much slower)
 ├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type'
 ├── easy_apply (bool): filters for jobs that are hosted on LinkedIn
 ├── country_indeed (enum): filters the country on Indeed (see below for correct spelling)
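
The new full_description flag slots into the existing scrape_jobs call. A minimal usage sketch follows; the site names, search term, and result handling are illustrative assumptions, not part of this commit:

from jobspy import scrape_jobs

# full_description=True requests each job's page individually on
# Indeed / LinkedIn, so runs are noticeably slower.
jobs = scrape_jobs(
    site_name=["indeed", "linkedin"],
    search_term="software engineer",
    results_wanted=10,
    full_description=True,
)
print(jobs.head())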
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.34"
+version = "1.1.35"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"
2 changes: 2 additions & 0 deletions src/jobspy/__init__.py
@@ -40,6 +40,7 @@ def scrape_jobs(
     country_indeed: str = "usa",
     hyperlinks: bool = False,
     proxy: Optional[str] = None,
+    full_description: Optional[bool] = False,
     offset: Optional[int] = 0,
 ) -> pd.DataFrame:
     """
@@ -74,6 +75,7 @@ def get_enum_from_value(value_str):
         is_remote=is_remote,
         job_type=job_type,
         easy_apply=easy_apply,
+        full_description=full_description,
         results_wanted=results_wanted,
         offset=offset,
     )
1 change: 1 addition & 0 deletions src/jobspy/scrapers/__init__.py
@@ -19,6 +19,7 @@ class ScraperInput(BaseModel):
     is_remote: bool = False
     job_type: Optional[JobType] = None
     easy_apply: bool = None  # linkedin
+    full_description: bool = False
     offset: int = 0

     results_wanted: int = 15
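
Each scraper then receives the flag through this model. A small construction sketch; site_type and search_term are field names assumed from the surrounding project rather than shown in this hunk:

from jobspy.scrapers import ScraperInput, Site

scraper_input = ScraperInput(
    site_type=[Site.LINKEDIN],      # assumed field, per the README's 'site_type'
    search_term="data engineer",    # assumed field
    full_description=True,          # opt in to the slower per-job description fetch
)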
129 changes: 95 additions & 34 deletions src/jobspy/scrapers/glassdoor/__init__.py
@@ -5,8 +5,12 @@
 This module contains routines to scrape Glassdoor.
 """
 import json
-from typing import Optional
+from typing import Optional, Any
+import requests
+from bs4 import BeautifulSoup
 from datetime import datetime, timedelta
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from ..utils import count_urgent_words, extract_emails_from_text

 from .. import Scraper, ScraperInput, Site
 from ..exceptions import GlassdoorException
Expand Down Expand Up @@ -66,50 +70,70 @@ def fetch_jobs_page(
jobs_data = res_json["data"]["jobListings"]["jobListings"]

jobs = []
for i, job in enumerate(jobs_data):
job_url = res_json["data"]["jobListings"]["jobListingSeoLinks"][
"linkItems"
][i]["url"]
if job_url in self.seen_urls:
continue
self.seen_urls.add(job_url)
job = job["jobview"]
title = job["job"]["jobTitleText"]
company_name = job["header"]["employerNameFromSearch"]
location_name = job["header"].get("locationName", "")
location_type = job["header"].get("locationType", "")
age_in_days = job["header"].get("ageInDays")
is_remote, location = False, None
date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days else None

if location_type == "S":
is_remote = True
else:
location = self.parse_location(location_name)

compensation = self.parse_compensation(job["header"])

job = JobPost(
title=title,
company_name=company_name,
date_posted=date_posted,
job_url=job_url,
location=location,
compensation=compensation,
is_remote=is_remote
)
jobs.append(job)
with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
future_to_job_data = {executor.submit(self.process_job, job): job for job in jobs_data}
for future in as_completed(future_to_job_data):
job_data = future_to_job_data[future]
try:
job_post = future.result()
if job_post:
jobs.append(job_post)
except Exception as exc:
raise GlassdoorException(f'Glassdoor generated an exception: {exc}')

return jobs, self.get_cursor_for_page(
res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
)

+    def process_job(self, job_data):
+        """Processes a single job and fetches its description."""
+        job_id = job_data["jobview"]["job"]["listingId"]
+        job_url = f'{self.url}/job-listing/?jl={job_id}'
+        if job_url in self.seen_urls:
+            return None
+        self.seen_urls.add(job_url)
+        job = job_data["jobview"]
+        title = job["job"]["jobTitleText"]
+        company_name = job["header"]["employerNameFromSearch"]
+        location_name = job["header"].get("locationName", "")
+        location_type = job["header"].get("locationType", "")
+        age_in_days = job["header"].get("ageInDays")
+        is_remote, location = False, None
+        date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days else None
+
+        if location_type == "S":
+            is_remote = True
+        else:
+            location = self.parse_location(location_name)
+
+        compensation = self.parse_compensation(job["header"])
+
+        try:
+            description = self.fetch_job_description(job_id)
+        except Exception:
+            description = None
+
+        job_post = JobPost(
+            title=title,
+            company_name=company_name,
+            date_posted=date_posted,
+            job_url=job_url,
+            location=location,
+            compensation=compensation,
+            is_remote=is_remote,
+            description=description,
+            emails=extract_emails_from_text(description) if description else None,
+            num_urgent_words=count_urgent_words(description) if description else None,
+        )
+        return job_post
+
     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
         Scrapes Glassdoor for jobs with scraper_input criteria.
         :param scraper_input: Information about job search criteria.
         :return: JobResponse containing a list of jobs.
         """
+        scraper_input.results_wanted = min(900, scraper_input.results_wanted)
         self.country = scraper_input.country
         self.url = self.country.get_url()

@@ -143,6 +167,43 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:

         return JobResponse(jobs=all_jobs)

+    def fetch_job_description(self, job_id):
+        """Fetches the job description for a single job ID."""
+        url = f"{self.url}/graph"
+        body = [
+            {
+                "operationName": "JobDetailQuery",
+                "variables": {
+                    "jl": job_id,
+                    "queryString": "q",
+                    "pageTypeEnum": "SERP"
+                },
+                "query": """
+                query JobDetailQuery($jl: Long!, $queryString: String, $pageTypeEnum: PageTypeEnum) {
+                    jobview: jobView(
+                        listingId: $jl
+                        contextHolder: {queryString: $queryString, pageTypeEnum: $pageTypeEnum}
+                    ) {
+                        job {
+                            description
+                            __typename
+                        }
+                        __typename
+                    }
+                }
+                """
+            }
+        ]
+        response = requests.post(url, json=body, headers=GlassdoorScraper.headers())
+        if response.status_code != 200:
+            return None
+        data = response.json()[0]
+        desc = data['data']['jobview']['job']['description']
+        soup = BeautifulSoup(desc, 'html.parser')
+        description = soup.get_text(separator='\n')
+
+        return description
+
     @staticmethod
     def parse_compensation(data: dict) -> Optional[Compensation]:
         pay_period = data.get("payPeriod")
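
The fetch_jobs_page rewrite above swaps a sequential loop for a submit/as_completed fan-out keyed by input item. A self-contained sketch of the same pattern, with generic names rather than project code:

from concurrent.futures import ThreadPoolExecutor, as_completed

def process(item: int) -> int:
    return item * 2  # stand-in for per-job work such as fetching a description

items = [1, 2, 3]
results = []
with ThreadPoolExecutor(max_workers=len(items)) as executor:
    # Key each future by the item that produced it, mirroring the scraper.
    future_to_item = {executor.submit(process, item): item for item in items}
    for future in as_completed(future_to_item):
        result = future.result()  # re-raises any exception from the worker
        if result:
            results.append(result)

One trade-off carried over from the scraper: a single failed worker raises and abandons the whole page of results.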
7 changes: 4 additions & 3 deletions src/jobspy/scrapers/indeed/__init__.py
@@ -78,7 +78,7 @@ def scrape_page(
         if sc_values:
             params["sc"] = "0kf:" + "".join(sc_values) + ";"
         try:
-            session = create_session(self.proxy, is_tls=True)
+            session = create_session(self.proxy)
             response = session.get(
                 f"{self.url}/jobs",
                 headers=self.get_headers(),
@@ -140,7 +140,8 @@ def process_job(job) -> JobPost | None:
            date_posted = datetime.fromtimestamp(timestamp_seconds)
            date_posted = date_posted.strftime("%Y-%m-%d")

-           description = self.get_description(job_url)
+           description = self.get_description(job_url) if scraper_input.full_description else None
+
            with io.StringIO(job["snippet"]) as f:
                soup_io = BeautifulSoup(f, "html.parser")
                li_elements = soup_io.find_all("li")
@@ -246,7 +247,7 @@ def get_description(self, job_page_url: str) -> str | None:
             return None

         soup = BeautifulSoup(job_description, "html.parser")
-        text_content = " ".join(soup.get_text(separator=" ").split()).strip()
+        text_content = "\n".join(soup.stripped_strings)

         return text_content

21 changes: 10 additions & 11 deletions src/jobspy/scrapers/linkedin/__init__.py
@@ -111,7 +111,7 @@ def job_type_code(job_type_enum):

             # Call process_job directly without threading
             try:
-                job_post = self.process_job(job_card, job_url)
+                job_post = self.process_job(job_card, job_url, scraper_input.full_description)
                 if job_post:
                     job_list.append(job_post)
             except Exception as e:
@@ -123,7 +123,7 @@ def job_type_code(job_type_enum):
         job_list = job_list[: scraper_input.results_wanted]
         return JobResponse(jobs=job_list)

-    def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]:
+    def process_job(self, job_card: Tag, job_url: str, full_descr: bool) -> Optional[JobPost]:
         salary_tag = job_card.find('span', class_='job-search-card__salary-info')

         compensation = None
@@ -160,7 +160,7 @@ def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]:
             if metadata_card
             else None
         )
-        date_posted = None
+        date_posted = description = job_type = None
         if datetime_tag and "datetime" in datetime_tag.attrs:
             datetime_str = datetime_tag["datetime"]
             try:
@@ -169,9 +169,8 @@ def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]:
                 date_posted = None
         benefits_tag = job_card.find("span", class_="result-benefits__text")
         benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None
-
-        # removed to speed up scraping
-        # description, job_type = self.get_job_description(job_url)
+        if full_descr:
+            description, job_type = self.get_job_description(job_url)

         return JobPost(
             title=title,
@@ -182,10 +181,10 @@ def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]:
             job_url=job_url,
             compensation=compensation,
             benefits=benefits,
-            # job_type=job_type,
-            # description=description,
-            # emails=extract_emails_from_text(description) if description else None,
-            # num_urgent_words=count_urgent_words(description) if description else None,
+            job_type=job_type,
+            description=description,
+            emails=extract_emails_from_text(description) if description else None,
+            num_urgent_words=count_urgent_words(description) if description else None,
         )

     def get_job_description(
@@ -214,7 +213,7 @@ def get_job_description(

         description = None
         if div_content:
-            description = " ".join(div_content.get_text().split()).strip()
+            description = "\n".join(line.strip() for line in div_content.get_text(separator="\n").splitlines() if line.strip())

         def get_job_type(
             soup_job_type: BeautifulSoup,
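
count_urgent_words and extract_emails_from_text come from ..utils and are not shown in this diff. Plausible shapes consistent with how they are called above, purely illustrative; the real helpers may differ:

import re

def extract_emails_from_text(text: str) -> list[str] | None:
    # Simple address pattern; returns None when nothing matches.
    emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
    return emails or None

def count_urgent_words(text: str) -> int:
    # Hypothetical keyword list; the project's actual list is not shown here.
    urgent_words = ("urgent", "immediate", "asap")
    lowered = text.lower()
    return sum(lowered.count(word) for word in urgent_words)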
2 changes: 1 addition & 1 deletion src/jobspy/scrapers/ziprecruiter/__init__.py
@@ -109,7 +109,7 @@ def process_job(job: dict) -> JobPost:

            description = BeautifulSoup(
                job.get("job_description", "").strip(), "html.parser"
-           ).get_text()
+           ).get_text(separator="\n")

            company = job["hiring_company"].get("name") if "hiring_company" in job else None
            country_value = "usa" if job.get("job_country") == "US" else "canada"
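
Several hunks above (Glassdoor, Indeed, LinkedIn, ZipRecruiter) switch description extraction from space-joined text to newline separators, preserving list structure in scraped descriptions. A quick before/after illustration:

from bs4 import BeautifulSoup

html = "<ul><li>Remote</li><li>Full-time</li></ul>"
soup = BeautifulSoup(html, "html.parser")

print(" ".join(soup.get_text(separator=" ").split()))  # old style: 'Remote Full-time'
print("\n".join(soup.stripped_strings))                # new style: one item per line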
