Proxy support (#44)
* add proxy support

* return as data frame
cullenwatson committed Sep 7, 2023
1 parent a37e7f2 commit 59f7390
Showing 10 changed files with 369 additions and 322 deletions.
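In short, after this commit `scrape_jobs` accepts an optional `proxy` argument and returns a plain pandas DataFrame rather than a `ScrapeResults` tuple. A minimal sketch of the new call, based on the README diff below (the proxy URL is a placeholder, not a working credential):

```python
from jobspy import scrape_jobs
import pandas as pd

# scrape_jobs now returns the jobs DataFrame directly (no .jobs / .errors attributes)
jobs: pd.DataFrame = scrape_jobs(
    site_name=["indeed", "linkedin", "zip_recruiter"],
    search_term="software engineer",
    location="Dallas, TX",
    results_wanted=10,
    country_indeed="USA",  # only needed for indeed
    # optional proxy; socks5://, http://, and https:// URLs all appear in the examples
    proxy="socks5://user:pass@proxy-host:port",
)

print(jobs.head())
```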
35 changes: 20 additions & 15 deletions JobSpy_Demo.ipynb
@@ -33,15 +33,20 @@
"outputs": [],
"source": [
"# example 1 (no hyperlinks, USA)\n",
"result = scrape_jobs(\n",
"jobs = scrape_jobs(\n",
" site_name=[\"linkedin\", \"zip_recruiter\"],\n",
" location='san francisco',\n",
" search_term=\"engineer\",\n",
" results_wanted=5, \n",
" results_wanted=5,\n",
"\n",
" # use if you want to use a proxy\n",
" # proxy=\"socks5://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001\",\n",
" # proxy=\"http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001\",\n",
" # proxy=\"https://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001\",\n",
"\n",
")\n",
"\n",
"display(result.jobs)\n",
"display(result.errors)"
"display(jobs)"
]
},
{
@@ -52,7 +57,7 @@
"outputs": [],
"source": [
"# example 2 - remote USA & hyperlinks\n",
"result = scrape_jobs(\n",
"jobs = scrape_jobs(\n",
" site_name=[\"linkedin\", \"zip_recruiter\", \"indeed\"],\n",
" # location='san francisco',\n",
" search_term=\"software engineer\",\n",
@@ -71,11 +76,10 @@
"outputs": [],
"source": [
"# use if hyperlinks=True\n",
"html = result.jobs.to_html(escape=False)\n",
"html = jobs.to_html(escape=False)\n",
"# change max-width: 200px to show more or less of the content\n",
"truncate_width = f'<style>.dataframe td {{ max-width: 200px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }}</style>{html}'\n",
"display(HTML(truncate_width))\n",
"display(result.errors)"
"display(HTML(truncate_width))"
]
},
{
@@ -86,13 +90,16 @@
"outputs": [],
"source": [
"# example 3 - with hyperlinks, international - linkedin (no zip_recruiter)\n",
"result = scrape_jobs(\n",
"jobs = scrape_jobs(\n",
" site_name=[\"linkedin\"],\n",
" location='berlin',\n",
" search_term=\"engineer\",\n",
" hyperlinks=True,\n",
" results_wanted=5,\n",
" easy_apply=True\n",
"\n",
"\n",
"\n",
")"
]
},
@@ -104,11 +111,10 @@
"outputs": [],
"source": [
"# use if hyperlinks=True\n",
"html = result.jobs.to_html(escape=False)\n",
"html = jobs.to_html(escape=False)\n",
"# change max-width: 200px to show more or less of the content\n",
"truncate_width = f'<style>.dataframe td {{ max-width: 200px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }}</style>{html}'\n",
"display(HTML(truncate_width))\n",
"display(result.errors)"
"display(HTML(truncate_width))"
]
},
{
@@ -136,11 +142,10 @@
"outputs": [],
"source": [
"# use if hyperlinks=True\n",
"html = result.jobs.to_html(escape=False)\n",
"html = jobs.to_html(escape=False)\n",
"# change max-width: 200px to show more or less of the content\n",
"truncate_width = f'<style>.dataframe td {{ max-width: 200px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }}</style>{html}'\n",
"display(HTML(truncate_width))\n",
"display(result.errors)"
"display(HTML(truncate_width))"
]
}
],
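Outside the notebook's JSON escaping, the hyperlink-rendering helper used in the cells above boils down to something like this sketch:

```python
from IPython.display import display, HTML

# when scrape_jobs(..., hyperlinks=True) is used, job_url_hyper holds <a> tags;
# escape=False keeps them clickable in the notebook output
html = jobs.to_html(escape=False)
# change max-width: 200px to show more or less of each cell
truncate_width = (
    f"<style>.dataframe td {{ max-width: 200px; overflow: hidden; "
    f"text-overflow: ellipsis; white-space: nowrap; }}</style>{html}"
)
display(HTML(truncate_width))
```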
15 changes: 10 additions & 5 deletions README.md
@@ -26,13 +26,18 @@ pip install python-jobspy
from jobspy import scrape_jobs
import pandas as pd

result: pd.DataFrame = scrape_jobs(
jobs: pd.DataFrame = scrape_jobs(
site_name=["indeed", "linkedin", "zip_recruiter"],
search_term="software engineer",
location="Dallas, TX",
results_wanted=10,

country_indeed='USA' # only needed for indeed

# use if you want to use a proxy
# proxy="socks5://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
# proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
# proxy="https://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
)

pd.set_option('display.max_columns', None)
@@ -41,12 +46,12 @@ pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc

#1 output
print(result.jobs)
print(result.errors)
print(jobs)
print(errors)

#2 display in Jupyter Notebook
#display(result.jobs)
#display(result.errors)
#display(jobs)
#display(errors)

#3 output to .csv
#result.jobs.to_csv('result.jobs.csv', index=False)
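The README hunk is cut off before the `.csv` example; presumably it is renamed the same way as the other output lines, along these lines:

```python
# assumed follow-through of the rename from `result.jobs` to `jobs`
jobs.to_csv("jobs.csv", index=False)
```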
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.2"
version = "1.1.3"
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
readme = "README.md"
155 changes: 78 additions & 77 deletions src/jobspy/__init__.py
@@ -1,13 +1,19 @@
import pandas as pd
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple, NamedTuple, Dict
from typing import List, Tuple, NamedTuple, Dict, Optional
import traceback

from .jobs import JobType, Location
from .scrapers.indeed import IndeedScraper
from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.linkedin import LinkedInScraper
from .scrapers import ScraperInput, Site, JobResponse, Country
from .scrapers.exceptions import (
LinkedInException,
IndeedException,
ZipRecruiterException,
)

SCRAPER_MAPPING = {
Site.LINKEDIN: LinkedInScraper,
@@ -16,11 +22,6 @@
}


class ScrapeResults(NamedTuple):
jobs: pd.DataFrame
errors: pd.DataFrame


def _map_str_to_site(site_name: str) -> Site:
return Site[site_name.upper()]

@@ -35,17 +36,21 @@ def scrape_jobs(
easy_apply: bool = False, # linkedin
results_wanted: int = 15,
country_indeed: str = "usa",
hyperlinks: bool = False
) -> ScrapeResults:
hyperlinks: bool = False,
proxy: Optional[str] = None,
) -> pd.DataFrame:
"""
Asynchronously scrapes job data from multiple job sites.
Simultaneously scrapes job data from multiple job sites.
    :return: pandas DataFrame containing job data
"""

if type(site_name) == str:
site_type = [_map_str_to_site(site_name)]
else: #: if type(site_name) == list
site_type = [_map_str_to_site(site) if type(site) == str else site_name for site in site_name]
site_type = [
_map_str_to_site(site) if type(site) == str else site_name
for site in site_name
]

country_enum = Country.from_string(country_indeed)

@@ -62,99 +67,95 @@
)

def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxy=proxy)

try:
scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class()
scraped_data: JobResponse = scraper.scrape(scraper_input)
except (LinkedInException, IndeedException, ZipRecruiterException) as lie:
raise lie
except Exception as e:
scraped_data = JobResponse(jobs=[], error=str(e), success=False)
# unhandled exceptions
if site == Site.LINKEDIN:
raise LinkedInException()
if site == Site.INDEED:
raise IndeedException()
if site == Site.ZIP_RECRUITER:
raise ZipRecruiterException()
else:
raise e
return site.value, scraped_data

results, errors = {}, {}
site_to_jobs_dict = {}

def worker(site):
site_value, scraped_data = scrape_site(site)
return site_value, scraped_data

with ThreadPoolExecutor() as executor:
future_to_site = {executor.submit(worker, site): site for site in scraper_input.site_type}
future_to_site = {
executor.submit(worker, site): site for site in scraper_input.site_type
}

for future in concurrent.futures.as_completed(future_to_site):
site_value, scraped_data = future.result()
results[site_value] = scraped_data
if scraped_data.error:
errors[site_value] = scraped_data.error
site_to_jobs_dict[site_value] = scraped_data

dfs = []
jobs_dfs: List[pd.DataFrame] = []

for site, job_response in results.items():
for site, job_response in site_to_jobs_dict.items():
for job in job_response.jobs:
data = job.dict()
data["job_url_hyper"] = f'<a href="{data["job_url"]}">{data["job_url"]}</a>'
data["site"] = site
data["company"] = data["company_name"]
if data["job_type"]:
job_data = job.dict()
job_data[
"job_url_hyper"
] = f'<a href="{job_data["job_url"]}">{job_data["job_url"]}</a>'
job_data["site"] = site
job_data["company"] = job_data["company_name"]
if job_data["job_type"]:
# Take the first value from the job type tuple
data["job_type"] = data["job_type"].value[0]
job_data["job_type"] = job_data["job_type"].value[0]
else:
data["job_type"] = None
job_data["job_type"] = None

data["location"] = Location(**data["location"]).display_location()
job_data["location"] = Location(**job_data["location"]).display_location()

compensation_obj = data.get("compensation")
compensation_obj = job_data.get("compensation")
if compensation_obj and isinstance(compensation_obj, dict):
data["interval"] = (
job_data["interval"] = (
compensation_obj.get("interval").value
if compensation_obj.get("interval")
else None
)
data["min_amount"] = compensation_obj.get("min_amount")
data["max_amount"] = compensation_obj.get("max_amount")
data["currency"] = compensation_obj.get("currency", "USD")
job_data["min_amount"] = compensation_obj.get("min_amount")
job_data["max_amount"] = compensation_obj.get("max_amount")
job_data["currency"] = compensation_obj.get("currency", "USD")
else:
data["interval"] = None
data["min_amount"] = None
data["max_amount"] = None
data["currency"] = None

job_df = pd.DataFrame([data])
dfs.append(job_df)

errors_list = [(key, value) for key, value in errors.items()]
errors_df = pd.DataFrame(errors_list, columns=["Site", "Error"])

if dfs:
df = pd.concat(dfs, ignore_index=True)
if hyperlinks:
desired_order = [
"site",
"title",
"company",
"location",
"job_type",
"interval",
"min_amount",
"max_amount",
"currency",
"job_url_hyper",
"description",
]
else:
desired_order = [
"site",
"title",
"company",
"location",
"job_type",
"interval",
"min_amount",
"max_amount",
"currency",
"job_url",
"description",
]
df = df[desired_order]
job_data["interval"] = None
job_data["min_amount"] = None
job_data["max_amount"] = None
job_data["currency"] = None

job_df = pd.DataFrame([job_data])
jobs_dfs.append(job_df)

if jobs_dfs:
jobs_df = pd.concat(jobs_dfs, ignore_index=True)
desired_order: List[str] = [
"site",
"title",
"company",
"location",
"date_posted",
"job_type",
"interval",
"min_amount",
"max_amount",
"currency",
"job_url_hyper" if hyperlinks else "job_url",
"description",
]
jobs_formatted_df = jobs_df[desired_order]
else:
df = pd.DataFrame()
jobs_formatted_df = pd.DataFrame()

return ScrapeResults(jobs=df, errors=errors_df)
return jobs_formatted_df
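Since per-site errors are no longer collected into an `errors` DataFrame, a failing site now surfaces as one of the scraper exceptions imported at the top of this file. A minimal handling sketch, assuming the exceptions are importable from `jobspy.scrapers.exceptions` as in the diff:

```python
from jobspy import scrape_jobs
from jobspy.scrapers.exceptions import (
    LinkedInException,
    IndeedException,
    ZipRecruiterException,
)

try:
    jobs = scrape_jobs(
        site_name=["linkedin", "indeed"],
        search_term="engineer",
        location="san francisco",
    )
except (LinkedInException, IndeedException, ZipRecruiterException) as e:
    # a failure on any one site now aborts the whole call instead of
    # being reported in a separate errors DataFrame
    print(f"Scraping failed: {e}")
```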
18 changes: 0 additions & 18 deletions src/jobspy/jobs/__init__.py
@@ -189,22 +189,4 @@ class JobPost(BaseModel):


class JobResponse(BaseModel):
success: bool
error: str = None

total_results: Optional[int] = None

jobs: list[JobPost] = []

returned_results: int = None

@validator("returned_results", pre=True, always=True)
def set_returned_results(cls, v, values):
jobs_list = values.get("jobs")

if v is None:
if jobs_list is not None:
return len(jobs_list)
else:
return 0
return v
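With those fields gone, `JobResponse` is presumably reduced to little more than the job list; a sketch of the trimmed model (inferred from the removed lines, not shown in full by the diff):

```python
from pydantic import BaseModel

class JobResponse(BaseModel):
    # JobPost is defined earlier in src/jobspy/jobs/__init__.py
    jobs: list[JobPost] = []
```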