Skip to content

Commit

Permalink
enh: indeed more fields (#126)
Browse files Browse the repository at this point in the history
  • Loading branch information
cullenwatson committed Mar 9, 2024
1 parent a4f6851 commit 0a669e9
Show file tree
Hide file tree
Showing 9 changed files with 267 additions and 288 deletions.
29 changes: 22 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ Updated for release v1.1.3
### Installation

```
pip install python-jobspy
pip install -U python-jobspy
```

_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_
Expand Down Expand Up @@ -64,8 +64,8 @@ Required
├── site_type (List[enum]): linkedin, zip_recruiter, indeed, glassdoor
└── search_term (str)
Optional
├── location (int)
├── distance (int): in miles
├── location (str)
├── distance (int): in miles, default 50
├── job_type (enum): fulltime, parttime, internship, contract
├── proxy (str): in format 'http://user:pass@host:port'
├── is_remote (bool)
Expand All @@ -76,7 +76,7 @@ Optional
├── description_format (enum): markdown, html (format type of the job descriptions)
├── country_indeed (enum): filters the country on Indeed (see below for correct spelling)
├── offset (num): starts the search from an offset (e.g. 25 will start the search from the 25th result)
├── hours_old (int): filters jobs by the number of hours since the job was posted (all but LinkedIn rounds up to next day)
├── hours_old (int): filters jobs by the number of hours since the job was posted (ZipRecruiter and Glassdoor round up to the next day; if you use this on Indeed, the job_type and is_remote filters are not applied)
```

### JobPost Schema
Expand All @@ -100,15 +100,26 @@ JobPost
│ └── currency (enum)
└── date_posted (date)
└── emails (str)
└── num_urgent_words (int)
└── is_remote (bool)
Indeed specific
├── company_country (str)
└── company_addresses (str)
└── company_industry (str)
└── company_num_employees (str)
└── company_revenue (str)
└── company_description (str)
└── ceo_name (str)
└── ceo_photo_url (str)
└── logo_photo_url (str)
└── banner_photo_url (str)
```

## Supported Countries for Job Searching

### **LinkedIn**

LinkedIn searches globally & uses only the `location` parameter. You can only fetch 1000 jobs max from the LinkedIn endpoint we're using
LinkedIn searches globally & uses only the `location` parameter. You can only fetch 1000 jobs max from the LinkedIn endpoint we are using

### **ZipRecruiter**

Expand Down Expand Up @@ -141,7 +152,11 @@ You can specify the following countries when searching on Indeed (use the exact
| Venezuela | Vietnam* | | |


Glassdoor can only fetch 900 jobs from the endpoint we're using on a given search.
## Notes
* Indeed is currently the best scraper, with no rate limiting.
* Glassdoor can only fetch 900 jobs from the endpoint we're using on a given search.
* LinkedIn is the most restrictive and usually rate limits around the 10th page.
* ZipRecruiter is okay but has a 5-second delay between pages to avoid rate limiting.
## Frequently Asked Questions

---
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.47"
version = "1.1.48"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy"
Expand Down
23 changes: 18 additions & 5 deletions src/jobspy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from concurrent.futures import ThreadPoolExecutor, as_completed

from .jobs import JobType, Location
from .scrapers.utils import logger
from .scrapers.indeed import IndeedScraper
from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.glassdoor import GlassdoorScraper
Expand All @@ -20,7 +21,7 @@ def scrape_jobs(
site_name: str | list[str] | Site | list[Site] | None = None,
search_term: str | None = None,
location: str | None = None,
distance: int | None = None,
distance: int | None = 50,
is_remote: bool = False,
job_type: str | None = None,
easy_apply: bool | None = None,
Expand Down Expand Up @@ -92,6 +93,8 @@ def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxy=proxy)
scraped_data: JobResponse = scraper.scrape(scraper_input)
site_name = 'ZipRecruiter' if site.value.capitalize() == 'Zip_recruiter' else site.value.capitalize()
logger.info(f"{site_name} finished scraping")
return site.value, scraped_data

site_to_jobs_dict = {}
Expand Down Expand Up @@ -160,11 +163,11 @@ def worker(site):

# Desired column order
desired_order = [
"job_url_hyper" if hyperlinks else "job_url",
"site",
"job_url_hyper" if hyperlinks else "job_url",
"job_url_direct",
"title",
"company",
"company_url",
"location",
"job_type",
"date_posted",
Expand All @@ -173,10 +176,20 @@ def worker(site):
"max_amount",
"currency",
"is_remote",
"num_urgent_words",
"benefits",
"emails",
"description",

"company_url",
"company_url_direct",
"company_addresses",
"company_industry",
"company_num_employees",
"company_revenue",
"company_description",
"logo_photo_url",
"banner_photo_url",
"ceo_name",
"ceo_photo_url",
]

# Step 3: Ensure all desired columns are present, adding missing ones as empty
Expand Down
35 changes: 25 additions & 10 deletions src/jobspy/jobs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ class JobType(Enum):
class Country(Enum):
"""
Gets the subdomain for Indeed and Glassdoor.
The second item in the tuple is the subdomain for Indeed
The second item in the tuple is the subdomain (and API country code if there's a ':' separator) for Indeed
The third item in the tuple is the subdomain (and tld if there's a ':' separator) for Glassdoor
"""

Expand Down Expand Up @@ -118,8 +118,8 @@ class Country(Enum):
TURKEY = ("turkey", "tr")
UKRAINE = ("ukraine", "ua")
UNITEDARABEMIRATES = ("united arab emirates", "ae")
UK = ("uk,united kingdom", "uk", "co.uk")
USA = ("usa,us,united states", "www", "com")
UK = ("uk,united kingdom", "uk:gb", "co.uk")
USA = ("usa,us,united states", "www:us", "com")
URUGUAY = ("uruguay", "uy")
VENEZUELA = ("venezuela", "ve")
VIETNAM = ("vietnam", "vn", "com")
Expand All @@ -132,7 +132,10 @@ class Country(Enum):

@property
def indeed_domain_value(self):
return self.value[1]
subdomain, _, api_country_code = self.value[1].partition(":")
if subdomain and api_country_code:
return subdomain, api_country_code.upper()
return self.value[1], self.value[1].upper()

@property
def glassdoor_domain_value(self):
Expand Down Expand Up @@ -163,7 +166,7 @@ def from_string(cls, country_str: str):


class Location(BaseModel):
country: Country | None = None
country: Country | str | None = None
city: Optional[str] = None
state: Optional[str] = None

Expand All @@ -173,7 +176,9 @@ def display_location(self) -> str:
location_parts.append(self.city)
if self.state:
location_parts.append(self.state)
if self.country and self.country not in (Country.US_CANADA, Country.WORLDWIDE):
if isinstance(self.country, str):
location_parts.append(self.country)
elif self.country and self.country not in (Country.US_CANADA, Country.WORLDWIDE):
country_name = self.country.value[0]
if "," in country_name:
country_name = country_name.split(",")[0]
Expand Down Expand Up @@ -217,21 +222,31 @@ class DescriptionFormat(Enum):

class JobPost(BaseModel):
title: str
company_name: str
company_name: str | None
job_url: str
job_url_direct: str | None = None
location: Optional[Location]

description: str | None = None
company_url: str | None = None
company_url_direct: str | None = None

job_type: list[JobType] | None = None
compensation: Compensation | None = None
date_posted: date | None = None
benefits: str | None = None
emails: list[str] | None = None
num_urgent_words: int | None = None
is_remote: bool | None = None
# company_industry: str | None = None

# indeed specific
company_addresses: str | None = None
company_industry: str | None = None
company_num_employees: str | None = None
company_revenue: str | None = None
company_description: str | None = None
ceo_name: str | None = None
ceo_photo_url: str | None = None
logo_photo_url: str | None = None
banner_photo_url: str | None = None


class JobResponse(BaseModel):
Expand Down
3 changes: 1 addition & 2 deletions src/jobspy/scrapers/glassdoor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from typing import Optional
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
from ..utils import count_urgent_words, extract_emails_from_text
from ..utils import extract_emails_from_text

from .. import Scraper, ScraperInput, Site
from ..exceptions import GlassdoorException
Expand Down Expand Up @@ -188,7 +188,6 @@ def _process_job(self, job_data):
is_remote=is_remote,
description=description,
emails=extract_emails_from_text(description) if description else None,
num_urgent_words=count_urgent_words(description) if description else None,
)

def _fetch_job_description(self, job_id):
Expand Down
Loading

0 comments on commit 0a669e9

Please sign in to comment.