Skip to content

Commit

Permalink
enh: improve timeout and availability check for fmt_http
Browse files Browse the repository at this point in the history
  • Loading branch information
paulmueller committed Nov 4, 2023
1 parent 6f92ad2 commit 74c1eb7
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 8 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
0.55.4
- enh: improve timeout and availability check for fmt_http
0.55.3
- enh: migrate to purely requests-based HTTP file for fmt_http
0.55.2
Expand Down
38 changes: 30 additions & 8 deletions dclab/rtdc_dataset/fmt_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import re
import socket
from urllib.parse import urlparse
import warnings

import numpy as np

Expand Down Expand Up @@ -41,7 +42,8 @@ def get(self, *args, **kwargs):
requests.exceptions.ReadTimeout,
requests.exceptions.ConnectTimeout,
requests.urllib3.exceptions.ConnectionError,
requests.urllib3.exceptions.ReadTimeoutError):
requests.urllib3.exceptions.ReadTimeoutError) as e:
warnings.warn(f"Encountered {e} for {args} {kwargs}")
continue
else:
break
Expand Down Expand Up @@ -253,7 +255,7 @@ class HTTPBasin(Basin):
basin_type = "remote"

def __init__(self, *args, **kwargs):
self._available_verified = False
self._available_verified = None
super(HTTPBasin, self).__init__(*args, **kwargs)

def load_dataset(self, location, **kwargs):
Expand All @@ -271,9 +273,16 @@ def is_available(self):
Caching policy: Once this method returns True, it will always
return True.
"""
if not self._available_verified:
self._available_verified = (
REQUESTS_AVAILABLE and is_url_available(self.location))
if not REQUESTS_AVAILABLE:
# don't even bother
self._available_verified = False
if self._available_verified is None:
avail, reason = is_url_available(self.location)
if reason in ["forbidden", "not found"]:
# we cannot access the URL in the near future
self._available_verified = False
elif avail:
self._available_verified = True
return self._available_verified


Expand All @@ -284,26 +293,39 @@ def is_url_available(url: str):
----------
url: str
full URL to the object
Returns
-------
available: bool
whether the URL is available
reason: str
reason for the URL not being available if `available` is False
"""
avail = False
reason = "none"
if is_http_url(url):
urlp = urlparse(url)
# default to https if no scheme or port is specified
port = urlp.port or (80 if urlp.scheme == "http" else 443)
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.settimeout(1)
# Try to connect to the host
try:
s.connect((urlp.netloc, port))
except (socket.gaierror, OSError):
pass
reason = "no connection"
else:
# Try to access the url
try:
req = requests.get(url, stream=True)
ses = ResoluteRequestsSession()
req = ses.get(url, stream=True, timeout=1)
avail = req.ok
if not avail:
reason = req.reason.lower()
except OSError:
reason = "oserror"
pass
return avail
return avail, reason


@functools.lru_cache()
Expand Down
1 change: 1 addition & 0 deletions dclab/rtdc_dataset/fmt_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ def is_s3_object_available(url: str,
# default to https if no scheme or port is specified
port = urlp.port or (80 if urlp.scheme == "http" else 443)
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.settimeout(1)
# Try to connect to the host
try:
s.connect((urlp.netloc, port))
Expand Down

0 comments on commit 74c1eb7

Please sign in to comment.