From 7dba21bc5c40f47d9a58638360de25c52063e389 Mon Sep 17 00:00:00 2001 From: ben Date: Sat, 4 Nov 2023 18:18:55 +0300 Subject: [PATCH] Add JS Rendering of content. --- README.md | 7 +++++++ src/dtos.py | 2 ++ src/flaresolverr_service.py | 8 ++++++++ 3 files changed, 17 insertions(+) diff --git a/README.md b/README.md index 845ab77cb6..ccb3c2f884 100644 --- a/README.md +++ b/README.md @@ -153,6 +153,8 @@ session. When you no longer need to use a session you should make sure to close | cookies | Optional. Will be used by the headless browser. Eg: `"cookies": [{"name": "cookie1", "value": "value1"}, {"name": "cookie2", "value": "value2"}]`. | | returnOnlyCookies | Optional, default false. Only returns the cookies. Response data, headers and other parts of the response are removed. | | proxy | Optional, default disabled. Eg: `"proxy": {"url": "http://127.0.0.1:8888"}`. You must include the proxy schema in the URL: `http://`, `socks4://` or `socks5://`. Authorization (username/password) is not supported. (When the `session` parameter is set, the proxy is ignored; a session specific proxy can be set in `sessions.create`.) | +| xpath | Optional, default disabled. XPath selector of the JS-rendered content to wait for before the page HTML is returned. | +| xpathWaitTimeout | Optional, default 60000. Max time in milliseconds to wait for the XPath selector to match. | > **Warning** > If you want to use Cloudflare clearance cookie in your scripts, make sure you use the FlareSolverr User-Agent too. If they don't match you will see the challenge. @@ -224,6 +226,11 @@ This is the same as `request.get` but it takes one more param: |-----------|--------------------------------------------------------------------------| | postData | Must be a string with `application/x-www-form-urlencoded`. Eg: `a=b&c=d` | +## JS Rendering +If you want to get the HTML after JS rendering, set the `xpath` parameter in the request. The XPath selector should match the content you want to get. 
+You can also set the `xpathWaitTimeout` parameter (in milliseconds) to control how long the browser will wait for the content. + + ## Environment variables | Name | Default | Notes | diff --git a/src/dtos.py b/src/dtos.py index 1e9aace71b..64c32c8b1f 100644 --- a/src/dtos.py +++ b/src/dtos.py @@ -36,6 +36,8 @@ class V1RequestBase(object): session_ttl_minutes: int = None headers: list = None # deprecated v2.0.0, not used userAgent: str = None # deprecated v2.0.0, not used + xpath: str = None + xpathWaitTimeout: int = None # V1Request url: str = None diff --git a/src/flaresolverr_service.py b/src/flaresolverr_service.py index acc920b11d..e57fbda99d 100644 --- a/src/flaresolverr_service.py +++ b/src/flaresolverr_service.py @@ -121,6 +121,8 @@ def _controller_v1_handler(req: V1RequestBase) -> V1ResponseBase: # set default values if req.maxTimeout is None or req.maxTimeout < 1: req.maxTimeout = 60000 + if req.xpathWaitTimeout is None or req.xpathWaitTimeout < 1: + req.xpathWaitTimeout = 60000 # execute the command res: V1ResponseBase @@ -396,6 +398,12 @@ def _evil_logic(req: V1RequestBase, driver: WebDriver, method: str) -> Challenge if not req.returnOnlyCookies: challenge_res.headers = {} # todo: fix, selenium not provides this info + if req.xpath: + try: + WebDriverWait(driver, req.xpathWaitTimeout / 1000)\ + .until(presence_of_element_located((By.XPATH, req.xpath))) + except TimeoutException: + raise Exception(f'JS render timeout. Specified selector was not found in time.') challenge_res.response = driver.page_source res.result = challenge_res