enh: indeed more fields (#126)

Bunsly · Mar 9, 2024 · 0a669e9 · 0a669e9
1 parent a4f6851
commit 0a669e9
Show file tree

Hide file tree

Showing 9 changed files with 267 additions and 288 deletions.
diff --git a/README.md b/README.md
@@ -21,7 +21,7 @@ Updated for release v1.1.3
 ### Installation
 
 ```
-pip install python-jobspy
+pip install -U python-jobspy
 ```
 
 _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_
@@ -64,8 +64,8 @@ Required
 ├── site_type (List[enum]): linkedin, zip_recruiter, indeed, glassdoor
 └── search_term (str)
 Optional
-├── location (int)
-├── distance (int): in miles
+├── location (str)
+├── distance (int): in miles, default 50
 ├── job_type (enum): fulltime, parttime, internship, contract
 ├── proxy (str): in format 'http://user:pass@host:port'
 ├── is_remote (bool)
@@ -76,7 +76,7 @@ Optional
 ├── description_format (enum): markdown, html (format type of the job descriptions)
 ├── country_indeed (enum): filters the country on Indeed (see below for correct spelling)
 ├── offset (num): starts the search from an offset (e.g. 25 will start the search from the 25th result)
-├── hours_old (int): filters jobs by the number of hours since the job was posted (all but LinkedIn rounds up to next day)
+├── hours_old (int): filters jobs by the number of hours since the job was posted (ZipRecruiter and Glassdoor round up to the next day; on Indeed, using this option disables filtering by job_type and is_remote)
 ```
 
 ### JobPost Schema
@@ -100,15 +100,26 @@ JobPost
 │   └── currency (enum)
 └── date_posted (date)
 └── emails (str)
-└── num_urgent_words (int)
 └── is_remote (bool)
+
+Indeed specific
+├── company_country (str)
+├── company_addresses (str)
+├── company_industry (str)
+├── company_num_employees (str)
+├── company_revenue (str)
+├── company_description (str)
+├── ceo_name (str)
+├── ceo_photo_url (str)
+├── logo_photo_url (str)
+└── banner_photo_url (str)
 ```
 
 ## Supported Countries for Job Searching
 
 ### **LinkedIn**
 
-LinkedIn searches globally & uses only the `location` parameter. You can only fetch 1000 jobs max from the LinkedIn endpoint we're using
+LinkedIn searches globally & uses only the `location` parameter. You can only fetch 1000 jobs max from the LinkedIn endpoint we are using
 
 ### **ZipRecruiter**
 
@@ -141,7 +152,11 @@ You can specify the following countries when searching on Indeed (use the exact
 | Venezuela            | Vietnam*     |            |                |
 
 
-Glassdoor can only fetch 900 jobs from the endpoint we're using on a given search.
+## Notes
+* Indeed is currently the best scraper, with no rate limiting.
+* Glassdoor can only fetch 900 jobs from the endpoint we're using on a given search.
+* LinkedIn is the most restrictive and usually starts rate limiting around the 10th page.
+* ZipRecruiter is okay but has a 5-second delay between pages to avoid rate limiting.
 ## Frequently Asked Questions
 
 ---

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.47"
+version = "1.1.48"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"

diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py
@@ -3,6 +3,7 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
 from .jobs import JobType, Location
+from .scrapers.utils import logger
 from .scrapers.indeed import IndeedScraper
 from .scrapers.ziprecruiter import ZipRecruiterScraper
 from .scrapers.glassdoor import GlassdoorScraper
@@ -20,7 +21,7 @@ def scrape_jobs(
     site_name: str | list[str] | Site | list[Site] | None = None,
     search_term: str | None = None,
     location: str | None = None,
-    distance: int | None = None,
+    distance: int | None = 50,
     is_remote: bool = False,
     job_type: str | None = None,
     easy_apply: bool | None = None,
@@ -92,6 +93,8 @@ def scrape_site(site: Site) -> Tuple[str, JobResponse]:
         scraper_class = SCRAPER_MAPPING[site]
         scraper = scraper_class(proxy=proxy)
         scraped_data: JobResponse = scraper.scrape(scraper_input)
+        site_name = 'ZipRecruiter' if site.value.capitalize() == 'Zip_recruiter' else site.value.capitalize()
+        logger.info(f"{site_name} finished scraping")
         return site.value, scraped_data
 
     site_to_jobs_dict = {}
@@ -160,11 +163,11 @@ def worker(site):
 
         # Desired column order
         desired_order = [
-            "job_url_hyper" if hyperlinks else "job_url",
             "site",
+            "job_url_hyper" if hyperlinks else "job_url",
+            "job_url_direct",
             "title",
             "company",
-            "company_url",
             "location",
             "job_type",
             "date_posted",
@@ -173,10 +176,20 @@ def worker(site):
             "max_amount",
             "currency",
             "is_remote",
-            "num_urgent_words",
-            "benefits",
             "emails",
             "description",
+
+            "company_url",
+            "company_url_direct",
+            "company_addresses",
+            "company_industry",
+            "company_num_employees",
+            "company_revenue",
+            "company_description",
+            "logo_photo_url",
+            "banner_photo_url",
+            "ceo_name",
+            "ceo_photo_url",
         ]
 
         # Step 3: Ensure all desired columns are present, adding missing ones as empty

diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py
@@ -57,7 +57,7 @@ class JobType(Enum):
 class Country(Enum):
     """
     Gets the subdomain for Indeed and Glassdoor.
-    The second item in the tuple is the subdomain for Indeed
+    The second item in the tuple is the subdomain (and API country code if there's a ':' separator) for Indeed
     The third item in the tuple is the subdomain (and tld if there's a ':' separator) for Glassdoor
     """
 
@@ -118,8 +118,8 @@ class Country(Enum):
     TURKEY = ("turkey", "tr")
     UKRAINE = ("ukraine", "ua")
     UNITEDARABEMIRATES = ("united arab emirates", "ae")
-    UK = ("uk,united kingdom", "uk", "co.uk")
-    USA = ("usa,us,united states", "www", "com")
+    UK = ("uk,united kingdom", "uk:gb", "co.uk")
+    USA = ("usa,us,united states", "www:us", "com")
     URUGUAY = ("uruguay", "uy")
     VENEZUELA = ("venezuela", "ve")
     VIETNAM = ("vietnam", "vn", "com")
@@ -132,7 +132,10 @@ class Country(Enum):
 
     @property
     def indeed_domain_value(self):
-        return self.value[1]
+        subdomain, _, api_country_code = self.value[1].partition(":")
+        if subdomain and api_country_code:
+            return subdomain, api_country_code.upper()
+        return self.value[1], self.value[1].upper()
 
     @property
     def glassdoor_domain_value(self):
@@ -163,7 +166,7 @@ def from_string(cls, country_str: str):
 
 
 class Location(BaseModel):
-    country: Country | None = None
+    country: Country | str | None = None
     city: Optional[str] = None
     state: Optional[str] = None
 
@@ -173,7 +176,9 @@ def display_location(self) -> str:
             location_parts.append(self.city)
         if self.state:
             location_parts.append(self.state)
-        if self.country and self.country not in (Country.US_CANADA, Country.WORLDWIDE):
+        if isinstance(self.country, str):
+            location_parts.append(self.country)
+        elif self.country and self.country not in (Country.US_CANADA, Country.WORLDWIDE):
             country_name = self.country.value[0]
             if "," in country_name:
                 country_name = country_name.split(",")[0]
@@ -217,21 +222,31 @@ class DescriptionFormat(Enum):
 
 class JobPost(BaseModel):
     title: str
-    company_name: str
+    company_name: str | None
     job_url: str
+    job_url_direct: str | None = None
     location: Optional[Location]
 
     description: str | None = None
     company_url: str | None = None
+    company_url_direct: str | None = None
 
     job_type: list[JobType] | None = None
     compensation: Compensation | None = None
     date_posted: date | None = None
-    benefits: str | None = None
     emails: list[str] | None = None
-    num_urgent_words: int | None = None
     is_remote: bool | None = None
-    # company_industry: str | None = None
+
+    # indeed specific
+    company_addresses: str | None = None
+    company_industry: str | None = None
+    company_num_employees: str | None = None
+    company_revenue: str | None = None
+    company_description: str | None = None
+    ceo_name: str | None = None
+    ceo_photo_url: str | None = None
+    logo_photo_url: str | None = None
+    banner_photo_url: str | None = None
 
 
 class JobResponse(BaseModel):

diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py
@@ -11,7 +11,7 @@
 from typing import Optional
 from datetime import datetime, timedelta
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from ..utils import count_urgent_words, extract_emails_from_text
+from ..utils import extract_emails_from_text
 
 from .. import Scraper, ScraperInput, Site
 from ..exceptions import GlassdoorException
@@ -188,7 +188,6 @@ def _process_job(self, job_data):
             is_remote=is_remote,
             description=description,
             emails=extract_emails_from_text(description) if description else None,
-            num_urgent_words=count_urgent_words(description) if description else None,
         )
 
     def _fetch_job_description(self, job_id):