Commit
Add czech to Indeed (#72)
augustogunsch committed Dec 2, 2023
1 parent 6587e46 commit 33d442b
Showing 6 changed files with 53 additions and 60 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.29"
version = "1.1.30"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy"
32 changes: 23 additions & 9 deletions src/jobspy/jobs/__init__.py
@@ -55,18 +55,24 @@ class JobType(Enum):


class Country(Enum):
ARGENTINA = ("argentina", "com.ar")
"""
Gets the subdomain for Indeed and Glassdoor.
The second item in the tuple is the subdomain for Indeed.
The third item in the tuple is the subdomain (and TLD, if there's a ':' separator) for Glassdoor.
"""

ARGENTINA = ("argentina", "ar", "com.ar")
AUSTRALIA = ("australia", "au", "com.au")
AUSTRIA = ("austria", "at", "at")
BAHRAIN = ("bahrain", "bh")
BELGIUM = ("belgium", "be", "nl:be")
BELGIUM = ("belgium", "be", "fr:be")
BRAZIL = ("brazil", "br", "com.br")
CANADA = ("canada", "ca", "ca")
CHILE = ("chile", "cl")
CHINA = ("china", "cn")
COLOMBIA = ("colombia", "co")
COSTARICA = ("costa rica", "cr")
CZECHREPUBLIC = ("czech republic", "cz")
CZECHREPUBLIC = ("czech republic,czechia", "cz")
DENMARK = ("denmark", "dk")
ECUADOR = ("ecuador", "ec")
EGYPT = ("egypt", "eg")
@@ -112,8 +118,8 @@ class Country(Enum):
TURKEY = ("turkey", "tr")
UKRAINE = ("ukraine", "ua")
UNITEDARABEMIRATES = ("united arab emirates", "ae")
UK = ("uk", "uk", "co.uk")
USA = ("usa", "www", "com")
UK = ("uk,united kingdom", "uk", "co.uk")
USA = ("usa,us,united states", "www", "com")
URUGUAY = ("uruguay", "uy")
VENEZUELA = ("venezuela", "ve")
VIETNAM = ("vietnam", "vn")
@@ -147,7 +153,8 @@ def from_string(cls, country_str: str):
"""Convert a string to the corresponding Country enum."""
country_str = country_str.strip().lower()
for country in cls:
if country.value[0] == country_str:
country_names = country.value[0].split(',')
if country_str in country_names:
return country
valid_countries = [country.value for country in cls]
raise ValueError(
@@ -167,10 +174,13 @@ def display_location(self) -> str:
if self.state:
location_parts.append(self.state)
if self.country and self.country not in (Country.US_CANADA, Country.WORLDWIDE):
if self.country.value[0] in ("usa", "uk"):
location_parts.append(self.country.value[0].upper())
country_name = self.country.value[0]
if "," in country_name:
country_name = country_name.split(",")[0]
if country_name in ("usa", "uk"):
location_parts.append(country_name.upper())
else:
location_parts.append(self.country.value[0].title())
location_parts.append(country_name.title())
return ", ".join(location_parts)


@@ -181,6 +191,10 @@ class CompensationInterval(Enum):
DAILY = "daily"
HOURLY = "hourly"

@classmethod
def get_interval(cls, pay_period):
return cls[pay_period].value if pay_period in cls.__members__ else None
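
A brief usage sketch of the new helper: it looks members up by name and returns the value string, so Glassdoor's "ANNUAL" still needs the explicit special case seen later in this diff.

CompensationInterval.get_interval("DAILY")   # -> "daily"
CompensationInterval.get_interval("HOURLY")  # -> "hourly"
CompensationInterval.get_interval("ANNUAL")  # -> None (not a member name; special-cased in the Glassdoor scraper)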


class Compensation(BaseModel):
interval: Optional[CompensationInterval] = None
50 changes: 19 additions & 31 deletions src/jobspy/scrapers/glassdoor/__init__.py
@@ -4,25 +4,20 @@
This module contains routines to scrape Glassdoor.
"""
import math
import time
import re
import json
from datetime import datetime, date
from typing import Optional, Tuple, Any
from bs4 import BeautifulSoup
from typing import Optional, Any
from datetime import datetime, timedelta

from .. import Scraper, ScraperInput, Site
from ..exceptions import GlassdoorException
from ..utils import count_urgent_words, extract_emails_from_text, create_session
from ..utils import create_session
from ...jobs import (
JobPost,
Compensation,
CompensationInterval,
Location,
JobResponse,
JobType,
Country,
)


@@ -49,9 +44,6 @@ def fetch_jobs_page(
) -> (list[JobPost], str | None):
"""
Scrapes a page of Glassdoor for jobs with scraper_input criteria
:param scraper_input:
:return: jobs found on page
:return: cursor for next page
"""
try:
payload = self.add_payload(
@@ -86,8 +78,9 @@ def fetch_jobs_page(
company_name = job["header"]["employerNameFromSearch"]
location_name = job["header"].get("locationName", "")
location_type = job["header"].get("locationType", "")
is_remote = False
location = None
age_in_days = job["header"].get("ageInDays")
is_remote, location = False, None
date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days else None

if location_type == "S":
is_remote = True
@@ -99,10 +92,11 @@ def fetch_jobs_page(
job = JobPost(
title=title,
company_name=company_name,
date_posted=date_posted,
job_url=job_url,
location=location,
compensation=compensation,
is_remote=is_remote,
is_remote=is_remote
)
jobs.append(job)

@@ -161,15 +155,8 @@ def parse_compensation(data: dict) -> Optional[Compensation]:
interval = None
if pay_period == "ANNUAL":
interval = CompensationInterval.YEARLY
elif pay_period == "MONTHLY":
interval = CompensationInterval.MONTHLY
elif pay_period == "WEEKLY":
interval = CompensationInterval.WEEKLY
elif pay_period == "DAILY":
interval = CompensationInterval.DAILY
elif pay_period == "HOURLY":
interval = CompensationInterval.HOURLY

elif pay_period:
interval = CompensationInterval.get_interval(pay_period)
min_amount = int(adjusted_pay.get("p10") // 1)
max_amount = int(adjusted_pay.get("p90") // 1)

@@ -180,12 +167,6 @@ def parse_compensation(data: dict) -> Optional[Compensation]:
currency=currency,
)

def get_job_type_enum(self, job_type_str: str) -> list[JobType] | None:
for job_type in JobType:
if job_type_str in job_type.value:
return [job_type]
return None

def get_location(self, location: str, is_remote: bool) -> (int, str):
if not location or is_remote:
return "11047", "STATE" # remote options
@@ -243,10 +224,17 @@ def add_payload(
payload["variables"]["filterParams"].append(
{"filterKey": "jobType", "values": filter_value}
)

return json.dumps([payload])

def parse_location(self, location_name: str) -> Location:
@staticmethod
def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
for job_type in JobType:
if job_type_str in job_type.value:
return [job_type]
return None
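
For illustration, assuming the Glassdoor scraper class here is named GlassdoorScraper and that JobType members hold tuples of alias strings (e.g. "fulltime"), the now-static helper can be called without an instance:

GlassdoorScraper.get_job_type_enum("fulltime")    # -> [JobType.FULL_TIME], if "fulltime" is among that member's aliases
GlassdoorScraper.get_job_type_enum("not-a-type")  # -> None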

@staticmethod
def parse_location(location_name: str) -> Location:
if not location_name or location_name == "Remote":
return None
city, _, state = location_name.partition(", ")
4 changes: 3 additions & 1 deletion src/jobspy/scrapers/indeed/__init__.py
@@ -64,6 +64,7 @@ def scrape_page(
"l": scraper_input.location,
"filter": 0,
"start": scraper_input.offset + page * 10,
"sort": "date"
}
if scraper_input.distance:
params["radius"] = scraper_input.distance
@@ -150,6 +151,7 @@ def process_job(job) -> JobPost | None:
title=job["normTitle"],
description=description,
company_name=job["company"],
company_url=self.url + job["companyOverviewLink"] if "companyOverviewLink" in job else None,
location=Location(
city=job.get("jobLocationCity"),
state=job.get("jobLocationState"),
@@ -305,7 +307,7 @@ def find_mosaic_script() -> Tag | None:
raise IndeedException("Could not find mosaic provider job cards data")
else:
raise IndeedException(
"Could not find a script tag containing mosaic provider data"
"Could not find any results for the search"
)

@staticmethod
6 changes: 0 additions & 6 deletions src/jobspy/scrapers/utils.py
@@ -39,12 +39,6 @@ def create_session(proxy: dict | None = None, is_tls: bool = True):
random_tls_extension_order=True,
)
session.proxies = proxy
# TODO multiple proxies
# if self.proxies:
# session.proxies = {
# "http": random.choice(self.proxies),
# "https": random.choice(self.proxies),
# }
else:
session = requests.Session()
session.allow_redirects = True
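
A short usage sketch of create_session after the multi-proxy TODO was dropped; the proxy URL is a placeholder.

proxy = {
    "http": "http://user:pass@10.0.0.1:8080",   # placeholder proxy
    "https": "http://user:pass@10.0.0.1:8080",
}
tls_session = create_session(proxy, is_tls=True)     # tls_client session with the proxy attached
plain_session = create_session(proxy, is_tls=False)   # requests.Session, redirects allowed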
19 changes: 7 additions & 12 deletions src/jobspy/scrapers/ziprecruiter/__init__.py
@@ -44,12 +44,12 @@ def find_jobs_in_page(
if continue_token:
params["continue"] = continue_token
try:
session = create_session(self.proxy, is_tls=False)
session = create_session(self.proxy, is_tls=True)
response = session.get(
f"https://api.ziprecruiter.com/jobs-app/jobs",
headers=self.headers(),
params=self.add_params(scraper_input),
timeout=10,
timeout_seconds=10,
)
if response.status_code != 200:
raise ZipRecruiterException(
@@ -195,17 +195,12 @@ def add_params(scraper_input) -> dict[str, str | Any]:
@staticmethod
def headers() -> dict:
"""
Returns headers needed for requests
Returns headers needed for ZipRecruiter API requests
:return: dict - Dictionary containing headers
"""
return {
"Host": "api.ziprecruiter.com",
"Cookie": "ziprecruiter_browser=018188e0-045b-4ad7-aa50-627a6c3d43aa; ziprecruiter_session=5259b2219bf95b6d2299a1417424bc2edc9f4b38; SplitSV=2016-10-19%3AU2FsdGVkX19f9%2Bx70knxc%2FeR3xXR8lWoTcYfq5QjmLU%3D%0A; __cf_bm=qXim3DtLPbOL83GIp.ddQEOFVFTc1OBGPckiHYxcz3o-1698521532-0-AfUOCkgCZyVbiW1ziUwyefCfzNrJJTTKPYnif1FZGQkT60dMowmSU/Y/lP+WiygkFPW/KbYJmyc+MQSkkad5YygYaARflaRj51abnD+SyF9V; zglobalid=68d49bd5-0326-428e-aba8-8a04b64bc67c.af2d99ff7c03.653d61bb; ziprecruiter_browser=018188e0-045b-4ad7-aa50-627a6c3d43aa; ziprecruiter_session=5259b2219bf95b6d2299a1417424bc2edc9f4b38",
"accept": "*/*",
"x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc",
"x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0",
"x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006",
"user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)",
"authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==",
"accept-language": "en-US,en;q=0.9",
'Host': 'api.ziprecruiter.com',
'accept': '*/*',
'authorization': 'Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==',
'Cookie': '__cf_bm=DZ7eJOw6lka.Bwy5jLeDqWanaZ8BJlVAwaXrmcbYnxM-1701505132-0-AfGaVIfTA2kJlmleK14o722vbVwpZ+4UxFznsWv+guvzXSpD9KVEy/+pNzvEZUx88yaEShJwGt3/EVjhHirX/ASustKxg47V/aXRd2XIO2QN; zglobalid=61f94830-1990-4130-b222-d9d0e09c7825.57da9ea9581c.656ae86b; ziprecruiter_browser=018188e0-045b-4ad7-aa50-627a6c3d43aa; ziprecruiter_session=5259b2219bf95b6d2299a1417424bc2edc9f4b38; zva=100000000%3Bvid%3AZWroa0x_F1KEeGeU'
}
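
Tying this file's two changes together, a hedged sketch of the request as now issued: create_session(is_tls=True) returns a tls_client-backed session whose get() accepts timeout_seconds (as used above) rather than requests' timeout argument, paired with the slimmed-down header set. The scraper class name and the search parameter names are assumptions.

session = create_session(proxy=None, is_tls=True)
response = session.get(
    "https://api.ziprecruiter.com/jobs-app/jobs",
    headers=ZipRecruiterScraper.headers(),                        # class name assumed
    params={"search": "python developer", "location": "Prague"},  # param names assumed
    timeout_seconds=10,                                           # tls_client takes timeout_seconds, not timeout
)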
