Skip to content

Commit

Permalink
Fix Indeed exceptions on parsing description
Browse files Browse the repository at this point in the history
  • Loading branch information
cullenwatson authored Oct 18, 2023
1 parent 5e71866 commit f2cc74b
Show file tree
Hide file tree
Showing 8 changed files with 79 additions and 48 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,9 @@ JobPost
│ ├── city (str)
│ ├── state (str)
├── description (str)
├── job_type (enum): fulltime, parttime, internship, contract
├── job_type (str): fulltime, parttime, internship, contract
├── compensation (object)
│ ├── interval (enum): yearly, monthly, weekly, daily, hourly
│ ├── interval (str): yearly, monthly, weekly, daily, hourly
│ ├── min_amount (int)
│ ├── max_amount (int)
│ └── currency (enum)
Expand Down
7 changes: 4 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.13"
version = "1.1.14"
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
homepage = "https://github.com/cullenwatson/JobSpy"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy"
readme = "README.md"

packages = [
Expand All @@ -16,6 +16,7 @@ requests = "^2.31.0"
tls-client = "^0.2.1"
beautifulsoup4 = "^4.12.2"
pandas = "^2.1.0"
numpy = "1.24.2"
pydantic = "^2.3.0"


Expand Down
7 changes: 3 additions & 4 deletions src/jobspy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,13 +84,12 @@ def scrape_site(site: Site) -> Tuple[str, JobResponse]:
except (LinkedInException, IndeedException, ZipRecruiterException) as lie:
raise lie
except Exception as e:
# unhandled exceptions
if site == Site.LINKEDIN:
raise LinkedInException()
raise LinkedInException(str(e))
if site == Site.INDEED:
raise IndeedException()
raise IndeedException(str(e))
if site == Site.ZIP_RECRUITER:
raise ZipRecruiterException()
raise ZipRecruiterException(str(e))
else:
raise e
return site.value, scraped_data
Expand Down
10 changes: 8 additions & 2 deletions src/jobspy/jobs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,16 @@ class JobType(Enum):
"повназайнятість",
"toànthờigian",
)
PART_TIME = ("parttime", "teilzeit")
PART_TIME = ("parttime", "teilzeit", "částečnýúvazek", "deltid")
CONTRACT = ("contract", "contractor")
TEMPORARY = ("temporary",)
INTERNSHIP = ("internship", "prácticas", "ojt(onthejobtraining)", "praktikum")
INTERNSHIP = (
"internship",
"prácticas",
"ojt(onthejobtraining)",
"praktikum",
"praktik",
)

PER_DIEM = ("perdiem",)
NIGHTS = ("nights",)
Expand Down
9 changes: 6 additions & 3 deletions src/jobspy/scrapers/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,15 @@


class LinkedInException(Exception):
    """Raised when the LinkedIn scraper hits an unrecoverable error."""

    def __init__(self, message=None):
        # Any falsy message (None or "") falls back to the generic text.
        detail = message or "An error occurred with LinkedIn"
        super().__init__(detail)


class IndeedException(Exception):
    """Raised when the Indeed scraper hits an unrecoverable error."""

    def __init__(self, message=None):
        # Any falsy message (None or "") falls back to the generic text.
        detail = message or "An error occurred with Indeed"
        super().__init__(detail)


class ZipRecruiterException(Exception):
    """Raised when the ZipRecruiter scraper hits an unrecoverable error."""

    def __init__(self, message=None):
        # Any falsy message (None or "") falls back to the generic text.
        detail = message or "An error occurred with ZipRecruiter"
        super().__init__(detail)
66 changes: 42 additions & 24 deletions src/jobspy/scrapers/indeed/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,12 @@
from concurrent.futures import ThreadPoolExecutor, Future

from ..exceptions import IndeedException
from ..utils import count_urgent_words, extract_emails_from_text, create_session
from ..utils import (
count_urgent_words,
extract_emails_from_text,
create_session,
get_enum_from_job_type,
)
from ...jobs import (
JobPost,
Compensation,
Expand Down Expand Up @@ -162,10 +167,10 @@ def process_job(job) -> JobPost | None:
)
return job_post

jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
with ThreadPoolExecutor(max_workers=1) as executor:
job_results: list[Future] = [
executor.submit(process_job, job)
for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
executor.submit(process_job, job) for job in jobs
]

job_list = [result.result() for result in job_results if result.result()]
Expand Down Expand Up @@ -230,13 +235,37 @@ def get_description(self, job_page_url: str) -> str | None:
if response.status_code not in range(200, 400):
return None

raw_description = response.json()["body"]["jobInfoWrapperModel"][
"jobInfoModel"
]["sanitizedJobDescription"]
with io.StringIO(raw_description) as f:
soup = BeautifulSoup(f, "html.parser")
text_content = " ".join(soup.get_text().split()).strip()
return text_content
soup = BeautifulSoup(response.text, "html.parser")
script_tag = soup.find(
"script", text=lambda x: x and "window._initialData" in x
)

if not script_tag:
return None

script_code = script_tag.string
match = re.search(r"window\._initialData\s*=\s*({.*?})\s*;", script_code, re.S)

if not match:
return None

json_string = match.group(1)
data = json.loads(json_string)
try:
job_description = data["jobInfoWrapperModel"]["jobInfoModel"][
"sanitizedJobDescription"
]
except (KeyError, TypeError, IndexError):
return None

soup = BeautifulSoup(
job_description, "html.parser"
)
text_content = " ".join(
soup.get_text(separator=" ").split()
).strip()

return text_content

@staticmethod
def get_job_type(job: dict) -> list[JobType] | None:
Expand All @@ -252,22 +281,11 @@ def get_job_type(job: dict) -> list[JobType] | None:
label = taxonomy["attributes"][i].get("label")
if label:
job_type_str = label.replace("-", "").replace(" ", "").lower()
job_types.append(
IndeedScraper.get_enum_from_job_type(job_type_str)
)
job_type = get_enum_from_job_type(job_type_str)
if job_type:
job_types.append(job_type)
return job_types

@staticmethod
def get_enum_from_job_type(job_type_str):
"""
Given a string, returns the corresponding JobType enum member if a match is found.
for job_type in JobType:
"""
for job_type in JobType:
if job_type_str in job_type.value:
return job_type
return None

@staticmethod
def parse_jobs(soup: BeautifulSoup) -> dict:
"""
Expand Down
12 changes: 2 additions & 10 deletions src/jobspy/scrapers/linkedin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,14 @@

import requests
import time
import re
from requests.exceptions import ProxyError
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
from bs4.element import Tag
from threading import Lock

from .. import Scraper, ScraperInput, Site
from ..utils import count_urgent_words, extract_emails_from_text
from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type
from ..exceptions import LinkedInException
from ...jobs import (
JobPost,
Expand Down Expand Up @@ -237,17 +236,10 @@ def get_job_type(
employment_type = employment_type.lower()
employment_type = employment_type.replace("-", "")

return LinkedInScraper.get_enum_from_value(employment_type)
return [get_enum_from_job_type(employment_type)]

return description, get_job_type(soup)

@staticmethod
def get_enum_from_value(value_str):
    """
    Return a single-element list containing the JobType whose alias
    tuple includes *value_str*, or None when nothing matches.
    """
    matched = next(
        (job_type for job_type in JobType if value_str in job_type.value),
        None,
    )
    return [matched] if matched is not None else None

def get_location(self, metadata_card: Optional[Tag]) -> Location:
"""
Extracts the location data from the job metadata card.
Expand Down
12 changes: 12 additions & 0 deletions src/jobspy/scrapers/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
import tls_client
from ..jobs import JobType


def count_urgent_words(description: str) -> int:
Expand Down Expand Up @@ -42,3 +43,14 @@ def create_session(proxy: str | None = None):
# }

return session


def get_enum_from_job_type(job_type_str: str) -> JobType | None:
    """
    Return the JobType enum member whose alias tuple contains
    *job_type_str*, or None when no member matches.

    Returns on the first hit instead of scanning the whole enum;
    alias strings are assumed unique across members (TODO confirm),
    so first match and last match are the same member.
    """
    for job_type in JobType:
        if job_type_str in job_type.value:
            return job_type
    return None

0 comments on commit f2cc74b

Please sign in to comment.