From 195d731aed46f7f5fe18e38fd9b655e14fb1df7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaros=C5=82aw=20Cellary?= Date: Thu, 30 Oct 2025 09:32:17 +0100 Subject: [PATCH 1/4] Rate limit fixes --- setup.py | 2 +- tap_github/client.py | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 0c350267..c4e19738 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.11', + version='2.0.12', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', diff --git a/tap_github/client.py b/tap_github/client.py index 932d1b86..b0e88e2d 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -16,7 +16,11 @@ # Set default timeout of 300 seconds REQUEST_TIMEOUT = 300 +# How many total seconds to retry when getting rate limit error from API +RATE_LIMIT_RETRY_MAX_TIME = 600 + PAGINATION_EXCEED_MSG = 'In order to keep the API fast for everyone, pagination is limited for this resource.' +RATE_LIMIT_EXCEED_MSG = 'API rate limit exceeded' class GithubException(Exception): pass @@ -54,6 +58,9 @@ class ConflictError(GithubException): class RateLimitExceeded(GithubException): pass +class RateLimitSleepExceeded(GithubException): + pass + class TooManyRequests(GithubException): pass @@ -111,6 +118,11 @@ def raise_for_error(resp, source, stream, client, should_skip_404): except JSONDecodeError: response_json = {} + if error_code == 403 and RATE_LIMIT_EXCEED_MSG in response_json.get('message', ''): + message = f"HTTP-error-code: 403, Error: {response_json.get('message', '')}. " + LOGGER.warning(message) + raise RateLimitExceeded() from None + if error_code == 404 and should_skip_404: # Add not accessible stream into list. 
client.not_accessible_repos.add(stream) @@ -150,13 +162,18 @@ def rate_throttling(response, max_sleep_seconds, min_remain_rate_limit): """ For rate limit errors, get the remaining time before retrying and calculate the time to sleep before making a new request. """ + if "Retry-After" in response.headers: + # handles the secondary rate limit + seconds_to_sleep = int(response.headers['Retry-After']) + LOGGER.info("Retry-After header found in response. Tap will retry the data collection after %s seconds.", seconds_to_sleep) + time.sleep(seconds_to_sleep) if 'X-RateLimit-Remaining' in response.headers: if int(response.headers['X-RateLimit-Remaining']) <= min_remain_rate_limit: - seconds_to_sleep = calculate_seconds(int(response.headers['X-RateLimit-Reset'])) + seconds_to_sleep = calculate_seconds(int(response.headers['X-RateLimit-Reset']) + 15) if seconds_to_sleep > max_sleep_seconds: message = "API rate limit exceeded, please try after {} seconds.".format(seconds_to_sleep) - raise RateLimitExceeded(message) from None + raise RateLimitSleepExceeded(message) from None LOGGER.info("API rate limit exceeded. Tap will retry the data collection after %s seconds.", seconds_to_sleep) time.sleep(seconds_to_sleep) @@ -206,6 +223,7 @@ def set_auth_in_session(self): @backoff.on_exception(backoff.expo, (requests.Timeout, requests.ConnectionError, Server5xxError, TooManyRequests), max_tries=5, factor=2) @backoff.on_exception(backoff.expo, (BadCredentialsException, ), max_tries=3, factor=2) + @backoff.on_exception(backoff.constant, (RateLimitExceeded, ), jitter=None, interval=60, max_time=RATE_LIMIT_RETRY_MAX_TIME) def authed_get_single_page(self, source, url, headers={}, stream="", should_skip_404 = True): """ Call rest API and return the response in case of status code 200. 
From c557810606a23e2fa7aacccb1f233255a46428d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaros=C5=82aw=20Cellary?= Date: Thu, 30 Oct 2025 09:41:55 +0100 Subject: [PATCH 2/4] style fix --- tap_github/client.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index b0e88e2d..c3c6639e 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -118,8 +118,10 @@ def raise_for_error(resp, source, stream, client, should_skip_404): except JSONDecodeError: response_json = {} - if error_code == 403 and RATE_LIMIT_EXCEED_MSG in response_json.get('message', ''): - message = f"HTTP-error-code: 403, Error: {response_json.get('message', '')}. " + response_message = response_json.get('message', '') + + if error_code == 403 and RATE_LIMIT_EXCEED_MSG in response_message: + message = f"HTTP-error-code: 403, Error: {response_message}" LOGGER.warning(message) raise RateLimitExceeded() from None @@ -134,8 +136,8 @@ def raise_for_error(resp, source, stream, client, should_skip_404): # Don't raise a NotFoundException return None - if error_code == 422 and PAGINATION_EXCEED_MSG in response_json.get('message', ''): - message = f"HTTP-error-code: 422, Error: {response_json.get('message', '')}. " \ + if error_code == 422 and PAGINATION_EXCEED_MSG in response_message: + message = f"HTTP-error-code: 422, Error: {response_message}. " \ f"Please refer '{response_json.get('documentation_url')}' for more details." \ "This is a known issue when the results exceed 40k and the last page is not full" \ " (it will trim the results to get only the available by the API)." 
@@ -223,7 +225,7 @@ def set_auth_in_session(self): @backoff.on_exception(backoff.expo, (requests.Timeout, requests.ConnectionError, Server5xxError, TooManyRequests), max_tries=5, factor=2) @backoff.on_exception(backoff.expo, (BadCredentialsException, ), max_tries=3, factor=2) - @backoff.on_exception(backoff.constant, (RateLimitExceeded, ), jitter=None, interval=60, max_time=RATE_LIMIT_RETRY_MAX_TIME) + @backoff.on_exception(backoff.constant, (RateLimitExceeded, ), interval=60, jitter=None, max_time=RATE_LIMIT_RETRY_MAX_TIME) def authed_get_single_page(self, source, url, headers={}, stream="", should_skip_404 = True): """ Call rest API and return the response in case of status code 200. From 5fd917727a1f2def5cb02ec26a792a36356af88c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaros=C5=82aw=20Cellary?= Date: Thu, 30 Oct 2025 09:48:30 +0100 Subject: [PATCH 3/4] fixed limit --- tap_github/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tap_github/client.py b/tap_github/client.py index c3c6639e..94c9cff7 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -16,8 +16,8 @@ # Set default timeout of 300 seconds REQUEST_TIMEOUT = 300 -# How many total seconds to retry when getting rate limit error from API -RATE_LIMIT_RETRY_MAX_TIME = 600 +# How many total seconds to retry when getting rate limit error from API. The limit resets every hour. +RATE_LIMIT_RETRY_MAX_TIME = 3600 PAGINATION_EXCEED_MSG = 'In order to keep the API fast for everyone, pagination is limited for this resource.' 
RATE_LIMIT_EXCEED_MSG = 'API rate limit exceeded' From 46fa9aa711299881ca6527bf7dd84416d0dd5a2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaros=C5=82aw=20Cellary?= Date: Thu, 30 Oct 2025 11:25:46 +0100 Subject: [PATCH 4/4] added comments to exceptions --- tap_github/client.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tap_github/client.py b/tap_github/client.py index 94c9cff7..e182462f 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -55,12 +55,15 @@ class MovedPermanentlyError(GithubException): class ConflictError(GithubException): pass +# Raised when the GitHub API returns a 403 whose message contains 'API rate limit exceeded' class RateLimitExceeded(GithubException): pass +# Raised when we're expected to sleep for longer than the max_sleep_seconds limit class RateLimitSleepExceeded(GithubException): pass +# Raised when a 429 is received from the GitHub API class TooManyRequests(GithubException): pass