Skip to content

Commit

Permalink
enh: ziprecruiter full description (#162)
Browse files Browse the repository at this point in the history
  • Loading branch information
cullenwatson committed Jun 9, 2024
1 parent df33961 commit ccb0c17
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 9 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.55"
version = "1.1.56"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy"
Expand Down
8 changes: 1 addition & 7 deletions src/jobspy/scrapers/linkedin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

from .. import Scraper, ScraperInput, Site
from ..exceptions import LinkedInException
from ..utils import create_session
from ..utils import create_session, remove_attributes
from ...jobs import (
JobPost,
Location,
Expand Down Expand Up @@ -257,12 +257,6 @@ def _get_job_details(self, job_page_url: str) -> dict:
)
description = None
if div_content is not None:

def remove_attributes(tag):
for attr in list(tag.attrs):
del tag[attr]
return tag

div_content = remove_attributes(div_content)
description = div_content.prettify(formatter="html")
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
Expand Down
7 changes: 7 additions & 0 deletions src/jobspy/scrapers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ def execute_request(self, *args, **kwargs):
else:
self.proxies = {}
response = tls_client.Session.execute_request(self, *args, **kwargs)
response.ok = response.status_code in range(200, 400)
return response


Expand Down Expand Up @@ -178,3 +179,9 @@ def currency_parser(cur_str):
num = float(cur_str)

return np.round(num, 2)


def remove_attributes(tag):
    """
    Strip every attribute from ``tag`` in place and hand the same tag back.

    Only the attributes listed in ``tag.attrs`` are removed; this function
    does not walk into any nested elements.

    :param tag: object exposing an ``attrs`` mapping and item deletion
        (presumably a BeautifulSoup tag - verify against callers)
    :return: the very same ``tag`` object, now without attributes
    """
    # Snapshot the names up front so deletions never happen while the live
    # attribute mapping is being iterated.
    attribute_names = tuple(tag.attrs)
    for name in attribute_names:
        del tag[name]
    return tag
41 changes: 40 additions & 1 deletion src/jobspy/scrapers/ziprecruiter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,24 @@

from __future__ import annotations

import json
import math
import re
import time
from datetime import datetime
from typing import Optional, Tuple, Any

from concurrent.futures import ThreadPoolExecutor

from bs4 import BeautifulSoup

from .. import Scraper, ScraperInput, Site
from ..utils import (
logger,
extract_emails_from_text,
create_session,
markdown_converter,
remove_attributes,
)
from ...jobs import (
JobPost,
Expand Down Expand Up @@ -151,6 +156,8 @@ def _process_job(self, job: dict) -> JobPost | None:
comp_min = int(job["compensation_min"]) if "compensation_min" in job else None
comp_max = int(job["compensation_max"]) if "compensation_max" in job else None
comp_currency = job.get("compensation_currency")
description_full, job_url_direct = self._get_descr(job_url)

return JobPost(
id=str(job["listing_key"]),
title=title,
Expand All @@ -165,10 +172,42 @@ def _process_job(self, job: dict) -> JobPost | None:
),
date_posted=date_posted,
job_url=job_url,
description=description,
description=description_full if description_full else description,
emails=extract_emails_from_text(description) if description else None,
job_url_direct=job_url_direct,
)

def _get_descr(self, job_url):
    """
    Fetch a job's page and pull out its full description and direct URL.

    The description is the cleaned HTML of the ``job_description`` div
    followed by the ``company_description`` section (each contributes an
    empty string when absent), converted to markdown when the scraper input
    asks for it. The direct URL is read from the ``job_url=`` part of
    ``model.saveJobURL`` in the page's ``application/json`` script tag.

    :param job_url: URL of the job page to request
    :return: tuple ``(description_full, job_url_direct)``; both are None when
        the response is not OK, and ``job_url_direct`` is None whenever the
        embedded JSON is missing, malformed or lacks the expected keys
    """
    res = self.session.get(job_url, headers=self.headers, allow_redirects=True)
    description_full = job_url_direct = None
    if not res.ok:
        # Nothing to parse; returning None lets the caller fall back to the
        # short description it already has.
        return description_full, job_url_direct

    soup = BeautifulSoup(res.text, "html.parser")
    job_descr_div = soup.find("div", class_="job_description")
    company_descr_section = soup.find("section", class_="company_description")
    job_description_clean = (
        remove_attributes(job_descr_div).prettify(formatter="html")
        if job_descr_div
        else ""
    )
    company_description_clean = (
        remove_attributes(company_descr_section).prettify(formatter="html")
        if company_descr_section
        else ""
    )
    description_full = job_description_clean + company_description_clean

    script_tag = soup.find("script", type="application/json")
    if script_tag:
        # The direct URL is optional: an empty script tag, invalid JSON or an
        # unexpected payload shape must not abort processing of the job.
        try:
            job_json = json.loads(script_tag.string)
            job_url_val = job_json["model"]["saveJobURL"]
            m = re.search(r"job_url=(.+)", job_url_val)
        except (TypeError, ValueError, KeyError):
            m = None
        if m:
            job_url_direct = m.group(1)

    if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
        description_full = markdown_converter(description_full)

    return description_full, job_url_direct

def _get_cookies(self):
data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
url = f"{self.api_url}/jobs-app/event"
Expand Down

0 comments on commit ccb0c17

Please sign in to comment.