Skip to content

Commit

Permalink
enh: proxies (#157)
Browse files Browse the repository at this point in the history
* enh: proxies

* enh: proxies
  • Loading branch information
cullenwatson committed May 25, 2024
1 parent cd29f79 commit 5cb7ffe
Show file tree
Hide file tree
Showing 12 changed files with 149 additions and 354 deletions.
16 changes: 10 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ work with us.*

- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor**, & **ZipRecruiter** simultaneously
- Aggregates the job postings in a Pandas DataFrame
- Proxy support
- Proxies support

[Video Guide for JobSpy](https://www.youtube.com/watch?v=RuP1HrAZnxs&pp=ygUgam9icyBzY3JhcGVyIGJvdCBsaW5rZWRpbiBpbmRlZWQ%3D) -
Updated for release v1.1.3
Expand Down Expand Up @@ -39,7 +39,10 @@ jobs = scrape_jobs(
results_wanted=20,
hours_old=72, # (only Linkedin/Indeed is hour specific, others round up to days old)
country_indeed='USA', # only needed for indeed / glassdoor

# linkedin_fetch_description=True # get full description and direct job url for linkedin (slower)
    # proxies=["user:pass@proxy.example.com:9000", "localhost"],  # use your own proxy credentials

)
print(f"Found {len(jobs)} jobs")
print(jobs.head())
Expand Down Expand Up @@ -76,8 +79,9 @@ Optional
├── job_type (str):
| fulltime, parttime, internship, contract
├── proxy (str):
| in format 'http://user:pass@host:port'
├── proxies (list):
| in format ['user:pass@host:port', 'localhost']
| each job board will round robin through the proxies
├── is_remote (bool)
Expand Down Expand Up @@ -201,7 +205,7 @@ You can specify the following countries when searching on Indeed (use the exact
## Notes
* Indeed is the best scraper currently with no rate limiting.
* All the job board endpoints are capped at around 1000 jobs on a given search.
* LinkedIn is the most restrictive and usually rate limits around the 10th page.
* LinkedIn is the most restrictive and usually rate limits around the 10th page with a single IP, so proxies are essentially a must.

## Frequently Asked Questions

Expand All @@ -216,7 +220,7 @@ persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues).
**Q: Received a response code 429?**
**A:** This indicates that you have been blocked by the job board site for sending too many requests. All of the job board sites are aggressive with blocking. We recommend:

- Waiting some time between scrapes (site-dependent).
- Trying a VPN or proxy to change your IP address.
- Wait some time between scrapes (site-dependent).
- Try using the proxies param to change your IP address.

---
30 changes: 0 additions & 30 deletions examples/JobSpy_AllSites.py

This file was deleted.

167 changes: 0 additions & 167 deletions examples/JobSpy_Demo.ipynb

This file was deleted.

78 changes: 0 additions & 78 deletions examples/JobSpy_LongScrape.py

This file was deleted.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.53"
version = "1.1.54"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy"
Expand Down
4 changes: 2 additions & 2 deletions src/jobspy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def scrape_jobs(
results_wanted: int = 15,
country_indeed: str = "usa",
hyperlinks: bool = False,
proxy: str | None = None,
proxies: list[str] | str | None = None,
description_format: str = "markdown",
linkedin_fetch_description: bool | None = False,
linkedin_company_ids: list[int] | None = None,
Expand Down Expand Up @@ -96,7 +96,7 @@ def get_site_type():

def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxy=proxy)
scraper = scraper_class(proxies=proxies)
scraped_data: JobResponse = scraper.scrape(scraper_input)
cap_name = site.value.capitalize()
site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
Expand Down
4 changes: 2 additions & 2 deletions src/jobspy/scrapers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ class ScraperInput(BaseModel):


class Scraper(ABC):
def __init__(self, site: Site, proxy: list[str] | None = None):
def __init__(self, site: Site, proxies: list[str] | None = None):
self.proxies = proxies
self.site = site
self.proxy = (lambda p: {"http": p, "https": p} if p else None)(proxy)

@abstractmethod
def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
7 changes: 3 additions & 4 deletions src/jobspy/scrapers/glassdoor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,12 @@


class GlassdoorScraper(Scraper):
def __init__(self, proxy: Optional[str] = None):
def __init__(self, proxies: list[str] | str | None = None):
"""
Initializes GlassdoorScraper with the Glassdoor job search url
"""
site = Site(Site.GLASSDOOR)
super().__init__(site, proxy=proxy)
super().__init__(site, proxies=proxies)

self.base_url = None
self.country = None
Expand All @@ -59,7 +59,7 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
self.base_url = self.scraper_input.country.get_glassdoor_url()

self.session = create_session(self.proxy, is_tls=True, has_retry=True)
self.session = create_session(proxies=self.proxies, is_tls=True, has_retry=True)
token = self._get_csrf_token()
self.headers["gd-csrf-token"] = token if token else self.fallback_token

Expand Down Expand Up @@ -245,7 +245,6 @@ def _get_location(self, location: str, is_remote: bool) -> (int, str):
if not location or is_remote:
return "11047", "STATE" # remote options
url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
session = create_session(self.proxy, has_retry=True)
res = self.session.get(url, headers=self.headers)
if res.status_code != 200:
if res.status_code == 429:
Expand Down
Loading

0 comments on commit 5cb7ffe

Please sign in to comment.