From a4f6851c32f22a82cfe7a09bf0cedbd36b084119 Mon Sep 17 00:00:00 2001 From: gigaSec <105108954+giga-sec@users.noreply.github.com> Date: Tue, 5 Mar 2024 07:35:57 +0800 Subject: [PATCH] Fix GlassDoor Country Vietnam(#122) --- README.md | 11 +--------- poetry.lock | 40 +++++++++++++++--------------------- pyproject.toml | 4 ++-- src/jobspy/jobs/__init__.py | 2 +- src/jobspy/scrapers/utils.py | 19 +++++++---------- 5 files changed, 28 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index dce67ea..a3ca73f 100644 --- a/README.md +++ b/README.md @@ -104,15 +104,6 @@ JobPost └── is_remote (bool) ``` -### Exceptions - -The following exceptions may be raised when using JobSpy: - -* `LinkedInException` -* `IndeedException` -* `ZipRecruiterException` -* `GlassdoorException` - ## Supported Countries for Job Searching ### **LinkedIn** @@ -147,7 +138,7 @@ You can specify the following countries when searching on Indeed (use the exact | South Korea | Spain* | Sweden | Switzerland* | | Taiwan | Thailand | Turkey | Ukraine | | United Arab Emirates | UK* | USA* | Uruguay | -| Venezuela | Vietnam | | | +| Venezuela | Vietnam* | | | Glassdoor can only fetch 900 jobs from the endpoint we're using on a given search. diff --git a/poetry.lock b/poetry.lock index 20eb44d..c129a67 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. [[package]] name = "annotated-types" @@ -524,17 +524,6 @@ files = [ {file = "fqdn-1.5.1.tar.gz", hash = "sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f"}, ] -[[package]] -name = "html2text" -version = "2020.1.16" -description = "Turn HTML into equivalent Markdown-structured text." -optional = false -python-versions = ">=3.5" -files = [ - {file = "html2text-2020.1.16-py3-none-any.whl", hash = "sha256:c7c629882da0cf377d66f073329ccf34a12ed2adf0169b9285ae4e63ef54c82b"}, - {file = "html2text-2020.1.16.tar.gz", hash = "sha256:e296318e16b059ddb97f7a8a1d6a5c1d7af4544049a01e261731d2d5cc277bbb"}, -] - [[package]] name = "idna" version = "3.4" @@ -1037,6 +1026,21 @@ files = [ {file = "jupyterlab_widgets-3.0.8.tar.gz", hash = "sha256:d428ab97b8d87cc7c54cbf37644d6e0f0e662f23876e05fa460a73ec3257252a"}, ] +[[package]] +name = "markdownify" +version = "0.11.6" +description = "Convert HTML to markdown." +optional = false +python-versions = "*" +files = [ + {file = "markdownify-0.11.6-py3-none-any.whl", hash = "sha256:ba35fe289d5e9073bcd7d2cad629278fe25f1a93741fcdc0bfb4f009076d8324"}, + {file = "markdownify-0.11.6.tar.gz", hash = "sha256:009b240e0c9f4c8eaf1d085625dcd4011e12f0f8cec55dedf9ea6f7655e49bfe"}, +] + +[package.dependencies] +beautifulsoup4 = ">=4.9,<5" +six = ">=1.15,<2" + [[package]] name = "markupsafe" version = "2.1.3" @@ -1064,16 +1068,6 @@ files = [ {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"}, @@ -2456,4 +2450,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "eea3694820df164179cdd8312d382eb5b29d6317c4d34c586e8866c69aaee9e9" +content-hash = "ba7f7cc9b6833a4a6271981f90610395639dd8b9b3db1370cbd1149d70cc9632" diff --git a/pyproject.toml b/pyproject.toml index 42dcf96..e5b40a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "python-jobspy" -version = "1.1.46" +version = "1.1.47" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/JobSpy" @@ -17,8 +17,8 @@ beautifulsoup4 = "^4.12.2" pandas = "^2.1.0" NUMPY = "1.24.2" pydantic = "^2.3.0" -html2text = "^2020.1.16" tls-client = "^1.0.1" +markdownify = "^0.11.6" [tool.poetry.group.dev.dependencies] diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py index 1d86a47..3fdb272 100644 --- a/src/jobspy/jobs/__init__.py +++ b/src/jobspy/jobs/__init__.py @@ -122,7 +122,7 @@ class Country(Enum): USA = ("usa,us,united states", "www", "com") URUGUAY = ("uruguay", "uy") VENEZUELA = ("venezuela", "ve") - VIETNAM = ("vietnam", "vn") + VIETNAM = ("vietnam", "vn", "com") # internal for ziprecruiter US_CANADA = ("usa/ca", "www") diff --git a/src/jobspy/scrapers/utils.py b/src/jobspy/scrapers/utils.py index 7cff0fd..943c1c0 100644 --- a/src/jobspy/scrapers/utils.py +++ b/src/jobspy/scrapers/utils.py @@ -1,15 +1,14 @@ -import re import logging -import numpy as np +import re -import html2text -import tls_client +import numpy as np import requests +import tls_client +from markdownify import markdownify as md from requests.adapters import HTTPAdapter, Retry from ..jobs import JobType -text_maker = html2text.HTML2Text() logger = logging.getLogger("JobSpy") logger.propagate = False if not logger.handlers: @@ -36,13 +35,9 @@ def count_urgent_words(description: str) -> int: def markdown_converter(description_html: str): if description_html is None: - return "" - text_maker.ignore_links = False - try: - markdown = text_maker.handle(description_html) - return markdown.strip() - except AssertionError as e: - return "" + return None + markdown = md(description_html) + return markdown.strip() def extract_emails_from_text(text: str) -> list[str] | None: