# Patch summary (free-form text before the first "diff --git" header; it is not part of any hunk):
#   * config.sample.json: adds an "extract_archived" option, shown with the value "false".
#   * setup.py: bumps the package version from 2.0.13 to 2.0.14.
#   * tap_github/client.py: adds ArchivedRepositoryError, parses the 'extract_archived' config value
#     into a boolean in __init__, adds check_repo_archived() and calls it from verify_access_for_repo(),
#     and applies the same archived check to each repository inside get_all_repos().
# NOTE(review): line breaks and indentation were restored from a whitespace-collapsed copy; hunk line
# counts match the @@ headers, but exact indentation/blank context lines should be confirmed against
# the target files before applying.
diff --git a/config.sample.json b/config.sample.json
index 61df370..4a9d46e 100644
--- a/config.sample.json
+++ b/config.sample.json
@@ -3,5 +3,6 @@
   "repository": "singer-io/target-stitch",
   "start_date": "2021-01-01T00:00:00Z",
   "request_timeout": 300,
-  "base_url": "https://api.github.com"
+  "base_url": "https://api.github.com",
+  "extract_archived": "false"
 }
diff --git a/setup.py b/setup.py
index 2cbde14..37115bd 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 from setuptools import setup, find_packages
 
 setup(name='tap-github',
-      version='2.0.13',
+      version='2.0.14',
       description='Singer.io tap for extracting data from the GitHub API',
       author='Stitch',
       url='http://singer.io',
diff --git a/tap_github/client.py b/tap_github/client.py
index e182462..18bf5e8 100644
--- a/tap_github/client.py
+++ b/tap_github/client.py
@@ -67,6 +67,10 @@ class RateLimitSleepExceeded(GithubException):
 class TooManyRequests(GithubException):
     pass
 
+# Thrown when repository is archived and extract_archived is not enabled
+class ArchivedRepositoryError(GithubException):
+    pass
+
 
 ERROR_CODE_EXCEPTION_MAPPING = {
     301: {
@@ -200,6 +204,9 @@ def __init__(self, config):
         self.set_auth_in_session()
         self.not_accessible_repos = set()
         self.max_per_page = self.config.get('max_per_page', DEFAULT_MAX_PER_PAGE)
+        # Convert string 'true'/'false' to boolean, default to False
+        extract_archived_value = str(self.config.get('extract_archived', 'false')).lower()
+        self.extract_archived = extract_archived_value == 'true'
 
     def get_request_timeout(self):
         """
@@ -282,9 +289,31 @@ def verify_repo_access(self, url_for_repo, repo):
             message = "HTTP-error-code: 404, Error: Please check the repository name \'{}\' or you do not have sufficient permissions to access this repository.".format(repo)
             raise NotFoundException(message) from None
 
+    def check_repo_archived(self, repo):
+        """
+        Check if a repository is archived and raise an error if extract_archived is not enabled.
+
+        Args:
+            repo: Repository in 'org/repo' format
+
+        Raises:
+            ArchivedRepositoryError: If repo is archived and extract_archived config is not true
+        """
+        url = "{}/repos/{}".format(self.base_url, repo)
+        response = self.authed_get_single_page("checking repository archived status", url, should_skip_404=False)
+        repo_info = response.json()
+
+        if repo_info.get('archived', False):
+            if not self.extract_archived:
+                message = "Repository '{}' is archived. To extract data from archived repositories, " \
+                          "set 'extract_archived' to 'true' in the config.".format(repo)
+                raise ArchivedRepositoryError(message)
+            LOGGER.warning("Repository '%s' is archived. Proceeding with extraction as 'extract_archived' is enabled.", repo)
+
     def verify_access_for_repo(self):
         """
         For all the repositories mentioned in the config, check the access for each repos.
+        Also checks if repositories are archived and fails if extract_archived is not enabled.
         """
         repositories, org = self.extract_repos_from_config() # pylint: disable=unused-variable
 
@@ -296,6 +325,9 @@ def verify_access_for_repo(self):
             # Verifying for Repo access
             self.verify_repo_access(url_for_repo, repo)
 
+            # Check if repository is archived
+            self.check_repo_archived(repo)
+
     def extract_orgs_from_config(self):
         """
         Extracts all organizations from the config
@@ -383,6 +415,14 @@ def get_all_repos(self, organizations: list):
                             repo
                         )
 
+                        # Check if repository is archived (info already available in response)
+                        if repo.get('archived', False):
+                            if not self.extract_archived:
+                                message = "Repository '{}' is archived. To extract data from archived repositories, " \
+                                          "set 'extract_archived' to 'true' in the config.".format(repo_full_name)
+                                raise ArchivedRepositoryError(message)
+                            LOGGER.warning("Repository '%s' is archived. Proceeding with extraction as 'extract_archived' is enabled.", repo_full_name)
+
                         repos.append(repo_full_name)
             except NotFoundException:
                 # Throwing user-friendly error message as it checks token access