Proxy support (#44)
* add proxy support

* return as data frame
cullenwatson committed Sep 7, 2023
1 parent a37e7f2 commit 59f7390
Showing 10 changed files with 369 additions and 322 deletions.
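In short, after this commit `scrape_jobs` accepts an optional `proxy` argument and returns a plain pandas DataFrame rather than a `ScrapeResults` tuple. A minimal sketch of the new call, based on the README diff below (the proxy URL is a placeholder, not a working credential):

```python
from jobspy import scrape_jobs
import pandas as pd

# scrape_jobs now returns the jobs DataFrame directly (no .jobs / .errors attributes)
jobs: pd.DataFrame = scrape_jobs(
    site_name=["indeed", "linkedin", "zip_recruiter"],
    search_term="software engineer",
    location="Dallas, TX",
    results_wanted=10,
    country_indeed="USA",  # only needed for indeed
    # optional proxy; socks5://, http://, and https:// URLs all appear in the examples
    proxy="socks5://user:pass@proxy-host:port",
)

print(jobs.head())
```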
35 changes: 20 additions & 15 deletions JobSpy_Demo.ipynb
@@ -33,15 +33,20 @@
"outputs": [],
"source": [
"# example 1 (no hyperlinks, USA)\n",
"result = scrape_jobs(\n",
"jobs = scrape_jobs(\n",
" site_name=[\"linkedin\", \"zip_recruiter\"],\n",
" location='san francisco',\n",
" search_term=\"engineer\",\n",
" results_wanted=5, \n",
" results_wanted=5,\n",
"\n",
" # use if you want to use a proxy\n",
" # proxy=\"socks5://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001\",\n",
" # proxy=\"http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001\",\n",
" # proxy=\"https://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001\",\n",
"\n",
")\n",
"\n",
"display(result.jobs)\n",
"display(result.errors)"
"display(jobs)"
]
},
{
@@ -52,7 +57,7 @@
"outputs": [],
"source": [
"# example 2 - remote USA & hyperlinks\n",
"result = scrape_jobs(\n",
"jobs = scrape_jobs(\n",
" site_name=[\"linkedin\", \"zip_recruiter\", \"indeed\"],\n",
" # location='san francisco',\n",
" search_term=\"software engineer\",\n",
@@ -71,11 +76,10 @@
"outputs": [],
"source": [
"# use if hyperlinks=True\n",
"html = result.jobs.to_html(escape=False)\n",
"html = jobs.to_html(escape=False)\n",
"# change max-width: 200px to show more or less of the content\n",
"truncate_width = f'<style>.dataframe td {{ max-width: 200px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }}</style>{html}'\n",
"display(HTML(truncate_width))\n",
"display(result.errors)"
"display(HTML(truncate_width))"
]
},
{
@@ -86,13 +90,16 @@
"outputs": [],
"source": [
"# example 3 - with hyperlinks, international - linkedin (no zip_recruiter)\n",
"result = scrape_jobs(\n",
"jobs = scrape_jobs(\n",
" site_name=[\"linkedin\"],\n",
" location='berlin',\n",
" search_term=\"engineer\",\n",
" hyperlinks=True,\n",
" results_wanted=5,\n",
" easy_apply=True\n",
"\n",
"\n",
"\n",
")"
]
},
@@ -104,11 +111,10 @@
"outputs": [],
"source": [
"# use if hyperlinks=True\n",
"html = result.jobs.to_html(escape=False)\n",
"html = jobs.to_html(escape=False)\n",
"# change max-width: 200px to show more or less of the content\n",
"truncate_width = f'<style>.dataframe td {{ max-width: 200px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }}</style>{html}'\n",
"display(HTML(truncate_width))\n",
"display(result.errors)"
"display(HTML(truncate_width))"
]
},
{
@@ -136,11 +142,10 @@
"outputs": [],
"source": [
"# use if hyperlinks=True\n",
"html = result.jobs.to_html(escape=False)\n",
"html = jobs.to_html(escape=False)\n",
"# change max-width: 200px to show more or less of the content\n",
"truncate_width = f'<style>.dataframe td {{ max-width: 200px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }}</style>{html}'\n",
"display(HTML(truncate_width))\n",
"display(result.errors)"
"display(HTML(truncate_width))"
]
}
],
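Outside the notebook's JSON escaping, the hyperlink-rendering helper used in the cells above boils down to something like this sketch:

```python
from IPython.display import display, HTML

# when scrape_jobs(..., hyperlinks=True) is used, job_url_hyper holds <a> tags;
# escape=False keeps them clickable in the notebook output
html = jobs.to_html(escape=False)
# change max-width: 200px to show more or less of each cell
truncate_width = (
    f"<style>.dataframe td {{ max-width: 200px; overflow: hidden; "
    f"text-overflow: ellipsis; white-space: nowrap; }}</style>{html}"
)
display(HTML(truncate_width))
```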
15 changes: 10 additions & 5 deletions README.md
@@ -26,13 +26,18 @@ pip install python-jobspy
from jobspy import scrape_jobs
import pandas as pd

result: pd.DataFrame = scrape_jobs(
jobs: pd.DataFrame = scrape_jobs(
site_name=["indeed", "linkedin", "zip_recruiter"],
search_term="software engineer",
location="Dallas, TX",
results_wanted=10,

country_indeed='USA' # only needed for indeed

# use if you want to use a proxy
# proxy="socks5://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
# proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
# proxy="https://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
)

pd.set_option('display.max_columns', None)
@@ -41,12 +46,12 @@ pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc

#1 output
print(result.jobs)
print(result.errors)
print(jobs)
print(errors)

#2 display in Jupyter Notebook
#display(result.jobs)
#display(result.errors)
#display(jobs)
#display(errors)

#3 output to .csv
#result.jobs.to_csv('result.jobs.csv', index=False)
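The README hunk is cut off before the `.csv` example; presumably it is renamed the same way as the other output lines, along these lines:

```python
# assumed follow-through of the rename from `result.jobs` to `jobs`
jobs.to_csv("jobs.csv", index=False)
```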
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.2"
version = "1.1.3"
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
readme = "README.md"
155 changes: 78 additions & 77 deletions src/jobspy/__init__.py
@@ -1,13 +1,19 @@
import pandas as pd
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple, NamedTuple, Dict
from typing import List, Tuple, NamedTuple, Dict, Optional
import traceback

from .jobs import JobType, Location
from .scrapers.indeed import IndeedScraper
from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.linkedin import LinkedInScraper
from .scrapers import ScraperInput, Site, JobResponse, Country
from .scrapers.exceptions import (
LinkedInException,
IndeedException,
ZipRecruiterException,
)

SCRAPER_MAPPING = {
Site.LINKEDIN: LinkedInScraper,
@@ -16,11 +22,6 @@
}


class ScrapeResults(NamedTuple):
jobs: pd.DataFrame
errors: pd.DataFrame


def _map_str_to_site(site_name: str) -> Site:
return Site[site_name.upper()]

@@ -35,17 +36,21 @@ def scrape_jobs(
easy_apply: bool = False, # linkedin
results_wanted: int = 15,
country_indeed: str = "usa",
hyperlinks: bool = False
) -> ScrapeResults:
hyperlinks: bool = False,
proxy: Optional[str] = None,
) -> pd.DataFrame:
"""
Asynchronously scrapes job data from multiple job sites.
Simultaneously scrapes job data from multiple job sites.
    :return: pandas DataFrame containing job data
"""

if type(site_name) == str:
site_type = [_map_str_to_site(site_name)]
else: #: if type(site_name) == list
site_type = [_map_str_to_site(site) if type(site) == str else site_name for site in site_name]
site_type = [
_map_str_to_site(site) if type(site) == str else site_name
for site in site_name
]

country_enum = Country.from_string(country_indeed)

@@ -62,99 +67,95 @@
)

def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxy=proxy)

try:
scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class()
scraped_data: JobResponse = scraper.scrape(scraper_input)
except (LinkedInException, IndeedException, ZipRecruiterException) as lie:
raise lie
except Exception as e:
scraped_data = JobResponse(jobs=[], error=str(e), success=False)
# unhandled exceptions
if site == Site.LINKEDIN:
raise LinkedInException()
if site == Site.INDEED:
raise IndeedException()
if site == Site.ZIP_RECRUITER:
raise ZipRecruiterException()
else:
raise e
return site.value, scraped_data

results, errors = {}, {}
site_to_jobs_dict = {}

def worker(site):
site_value, scraped_data = scrape_site(site)
return site_value, scraped_data

with ThreadPoolExecutor() as executor:
future_to_site = {executor.submit(worker, site): site for site in scraper_input.site_type}
future_to_site = {
executor.submit(worker, site): site for site in scraper_input.site_type
}

for future in concurrent.futures.as_completed(future_to_site):
site_value, scraped_data = future.result()
results[site_value] = scraped_data
if scraped_data.error:
errors[site_value] = scraped_data.error
site_to_jobs_dict[site_value] = scraped_data

dfs = []
jobs_dfs: List[pd.DataFrame] = []

for site, job_response in results.items():
for site, job_response in site_to_jobs_dict.items():
for job in job_response.jobs:
data = job.dict()
data["job_url_hyper"] = f'<a href="{data["job_url"]}">{data["job_url"]}</a>'
data["site"] = site
data["company"] = data["company_name"]
if data["job_type"]:
job_data = job.dict()
job_data[
"job_url_hyper"
] = f'<a href="{job_data["job_url"]}">{job_data["job_url"]}</a>'
job_data["site"] = site
job_data["company"] = job_data["company_name"]
if job_data["job_type"]:
# Take the first value from the job type tuple
data["job_type"] = data["job_type"].value[0]
job_data["job_type"] = job_data["job_type"].value[0]
else:
data["job_type"] = None
job_data["job_type"] = None

data["location"] = Location(**data["location"]).display_location()
job_data["location"] = Location(**job_data["location"]).display_location()

compensation_obj = data.get("compensation")
compensation_obj = job_data.get("compensation")
if compensation_obj and isinstance(compensation_obj, dict):
data["interval"] = (
job_data["interval"] = (
compensation_obj.get("interval").value
if compensation_obj.get("interval")
else None
)
data["min_amount"] = compensation_obj.get("min_amount")
data["max_amount"] = compensation_obj.get("max_amount")
data["currency"] = compensation_obj.get("currency", "USD")
job_data["min_amount"] = compensation_obj.get("min_amount")
job_data["max_amount"] = compensation_obj.get("max_amount")
job_data["currency"] = compensation_obj.get("currency", "USD")
else:
data["interval"] = None
data["min_amount"] = None
data["max_amount"] = None
data["currency"] = None

job_df = pd.DataFrame([data])
dfs.append(job_df)

errors_list = [(key, value) for key, value in errors.items()]
errors_df = pd.DataFrame(errors_list, columns=["Site", "Error"])

if dfs:
df = pd.concat(dfs, ignore_index=True)
if hyperlinks:
desired_order = [
"site",
"title",
"company",
"location",
"job_type",
"interval",
"min_amount",
"max_amount",
"currency",
"job_url_hyper",
"description",
]
else:
desired_order = [
"site",
"title",
"company",
"location",
"job_type",
"interval",
"min_amount",
"max_amount",
"currency",
"job_url",
"description",
]
df = df[desired_order]
job_data["interval"] = None
job_data["min_amount"] = None
job_data["max_amount"] = None
job_data["currency"] = None

job_df = pd.DataFrame([job_data])
jobs_dfs.append(job_df)

if jobs_dfs:
jobs_df = pd.concat(jobs_dfs, ignore_index=True)
desired_order: List[str] = [
"site",
"title",
"company",
"location",
"date_posted",
"job_type",
"interval",
"min_amount",
"max_amount",
"currency",
"job_url_hyper" if hyperlinks else "job_url",
"description",
]
jobs_formatted_df = jobs_df[desired_order]
else:
df = pd.DataFrame()
jobs_formatted_df = pd.DataFrame()

return ScrapeResults(jobs=df, errors=errors_df)
return jobs_formatted_df
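Since per-site errors are no longer collected into an `errors` DataFrame, a failing site now surfaces as one of the scraper exceptions imported at the top of this file. A minimal handling sketch, assuming the exceptions are importable from `jobspy.scrapers.exceptions` as in the diff:

```python
from jobspy import scrape_jobs
from jobspy.scrapers.exceptions import (
    LinkedInException,
    IndeedException,
    ZipRecruiterException,
)

try:
    jobs = scrape_jobs(
        site_name=["linkedin", "indeed"],
        search_term="engineer",
        location="san francisco",
    )
except (LinkedInException, IndeedException, ZipRecruiterException) as e:
    # a failure on any one site now aborts the whole call instead of
    # being reported in a separate errors DataFrame
    print(f"Scraping failed: {e}")
```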
18 changes: 0 additions & 18 deletions src/jobspy/jobs/__init__.py
@@ -189,22 +189,4 @@ class JobPost(BaseModel):


class JobResponse(BaseModel):
success: bool
error: str = None

total_results: Optional[int] = None

jobs: list[JobPost] = []

returned_results: int = None

@validator("returned_results", pre=True, always=True)
def set_returned_results(cls, v, values):
jobs_list = values.get("jobs")

if v is None:
if jobs_list is not None:
return len(jobs_list)
else:
return 0
return v
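With those fields gone, `JobResponse` is presumably reduced to little more than the job list; a sketch of the trimmed model (inferred from the removed lines, not shown in full by the diff):

```python
from pydantic import BaseModel

class JobResponse(BaseModel):
    # JobPost is defined earlier in src/jobspy/jobs/__init__.py
    jobs: list[JobPost] = []
```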