From 440a41f67ce17a4c630f8769efb8421adefc00ce Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Mon, 30 Jun 2025 16:00:22 -0500 Subject: [PATCH 1/6] hit api method Signed-off-by: Isaac Milarsky --- codejson_index_generator/parsers.py | 64 +++++++++++++++++++++++++---- main.py | 2 +- pyproject.toml | 2 +- 3 files changed, 59 insertions(+), 9 deletions(-) diff --git a/codejson_index_generator/parsers.py b/codejson_index_generator/parsers.py index 7fc9a58..1d60ed4 100644 --- a/codejson_index_generator/parsers.py +++ b/codejson_index_generator/parsers.py @@ -1,27 +1,77 @@ import json +from json.decoder import JSONDecodeError import base64 import argparse import os +import requests +from time import sleep, mktime, gmtime, time, localtime from typing import Dict, Optional -from github import Github, Repository, GithubException, Organization + +RETRIES = 5 + + +def hit_endpoint(url,token,method='GET'): + headers = {"Authorization": f"bearer {token}"} + + attempts = 0 + while attempts < RETRIES: + + response = requests.request(method, url, headers=headers,timeout=10) + + try: + if response.status_code == 200: + response_json = json.loads(response.text) + break + elif response.status_code in (403,429): + #rate limit was triggered. + wait_until = int(response.headers.get("x-ratelimit-reset")) + wait_in_seconds = int( + mktime(gmtime(wait_until)) - + mktime(gmtime(time())) + ) + wait_until_time = localtime(wait_until) + + print(f"Ran into rate limit sleeping for {self.name}!") + print( + f"sleeping until {wait_until_time.tm_hour}:{wait_until_time.tm_min} ({wait_in_seconds} seconds)" + ) + sleep(wait_in_seconds) + + response_json = {} + attempts += 1 + + if attempts >= REQUEST_RETRIES: + raise ConnectionError( + f"Rate limit was reached and couldn't be rectified after {attempts} tries" + ) + else: + raise ConnectionError("Rate limit error!") + except JSONDecodeError: + response_json = {} + attempts += 1 + + return response_json + + + + class IndexGenerator: - def __init__(self, agency: str, verison: str, token: Optional[str] = None,): - self.github = Github(token) if token else Github() + def __init__(self, agency: str, version: str, token: Optional[str] = None,): - # user can change agency and version depending on paramters + # user can change agency and version depending on parameters self.index = { "agency": agency, - "version": verison, + "version": version, "measurementType": { "method": "projects" }, "releases": [] } - def get_code_json(self, repo: Repository) -> Optional[Dict]: + def get_code_json(self, repo: str) -> Optional[Dict]: try: content = repo.get_contents("code.json", ref = repo.default_branch) except GithubException as e: @@ -35,7 +85,7 @@ def get_code_json(self, repo: Repository) -> Optional[Dict]: print(f"JSON Error: {str(e)}") return None - def save_code_json(self, repo: Repository, output_path: str) -> Optional[str]: + def save_code_json(self, repo: str, output_path: str) -> Optional[str]: res = self.get_code_json(repo) diff --git a/main.py b/main.py index d4209a4..402181c 100644 --- a/main.py +++ b/main.py @@ -40,7 +40,7 @@ def main(): try: indexGen = IndexGenerator( agency = args.agency, - verison = args.version, + version = args.version, token = github_key ) diff --git a/pyproject.toml b/pyproject.toml index 96c1ade..d9eaee8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ repository = "https://github.com/DSACMS/codejson-index-generator" [tool.poetry.dependencies] python = "^3.13" -pygithub = ">=1.59,<2.0" +requests = "^2.32.4" [build-system] From b7d8320c8089c77e788f81ce882f81f291144bd1 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Wed, 2 Jul 2025 13:27:32 -0500 Subject: [PATCH 2/6] handle github repos Signed-off-by: Isaac Milarsky --- codejson_index_generator/parsers.py | 57 ++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 9 deletions(-) diff --git a/codejson_index_generator/parsers.py b/codejson_index_generator/parsers.py index 1d60ed4..16a62c2 100644 --- a/codejson_index_generator/parsers.py +++ b/codejson_index_generator/parsers.py @@ -4,6 +4,7 @@ import argparse import os import requests +import re from time import sleep, mktime, gmtime, time, localtime from typing import Dict, Optional @@ -54,7 +55,32 @@ def hit_endpoint(url,token,method='GET'): return response_json +def get_repo_owner_and_name(repo_http_url): + """ Gets the owner and repo from a url. + Args: + url: Github url + + Returns: + Tuple of owner and repo. Or a tuple of None and None if the url is invalid. + """ + + # Regular expression to parse a GitHub URL into two groups + # The first group contains the owner of the github repo extracted from the url + # The second group contains the name of the github repo extracted from the url + # 'But what is a regular expression?' ----> https://docs.python.org/3/howto/regex.html + regex = r"https?:\/\/github\.com\/([A-Za-z0-9 \- _]+)\/([A-Za-z0-9 \- _ \.]+)(.git)?\/?$" + result = re.search(regex, repo_http_url) + + if not result: + return None, None + + capturing_groups = result.groups() + + owner = capturing_groups[0] + repo = capturing_groups[1] + + return owner, repo @@ -71,19 +97,29 @@ def __init__(self, agency: str, version: str, token: Optional[str] = None,): "releases": [] } - def get_code_json(self, repo: str) -> Optional[Dict]: + self.token = token + + def get_code_json_github(self,repo : str) -> Optional[Dict]: try: - content = repo.get_contents("code.json", ref = repo.default_branch) - except GithubException as e: + owner,name = get_repo_owner_and_name(repo) + code_json_endpoint = f"https://api.github.com/repos/{owner}/{name}/contents/code.json" + content_dict = hit_endpoint(code_json_endpoint,self.token)#repo.get_contents("code.json", ref = repo.default_branch) + except Exception as e: print(f"GitHub Error: {e.data.get('message', 'No message available')}") return None try: - decoded_content = base64.b64decode(content.content) + decoded_content = base64.b64decode(content_dict['content']) return json.loads(decoded_content) except (json.JSONDecodeError, ValueError) as e: print(f"JSON Error: {str(e)}") return None + + def get_code_json(self, repo: str) -> Optional[Dict]: + if 'github' in repo: + return self.get_code_json_github(repo) + else: + return None def save_code_json(self, repo: str, output_path: str) -> Optional[str]: @@ -107,16 +143,19 @@ def update_index(self, index: Dict, code_json: Dict, org_name: str, repo_name: s index['releases'].append(baseline) - def get_org_repos(self, org_name: str) -> list[Organization]: + def get_org_repos(self, org_name: str) -> list[Dict]: try: - org = self.github.get_organization(org_name) + org_endpoint = f"https://api.github.com/orgs/{org_name}/repos" print(f"\nProcessing organization: {org_name}") - total_repos = org.public_repos + repo_list = hit_endpoint(org_endpoint,self.token) + + + total_repos = len(repo_list) print(f"Found {total_repos} public repositories") - return total_repos - except GithubException as e: + return repo_list + except Exception as e: raise e def save_organization_files(self, org_name: str, codeJSONPath) -> None: From aa34e08216f35efee1432c3b5b1a8461e08afd38 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Wed, 2 Jul 2025 19:53:43 -0500 Subject: [PATCH 3/6] progress Signed-off-by: Isaac Milarsky --- codejson_index_generator/parsers.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/codejson_index_generator/parsers.py b/codejson_index_generator/parsers.py index 16a62c2..2142cf0 100644 --- a/codejson_index_generator/parsers.py +++ b/codejson_index_generator/parsers.py @@ -5,6 +5,7 @@ import os import requests import re +import subprocess from time import sleep, mktime, gmtime, time, localtime from typing import Dict, Optional @@ -114,12 +115,15 @@ def get_code_json_github(self,repo : str) -> Optional[Dict]: except (json.JSONDecodeError, ValueError) as e: print(f"JSON Error: {str(e)}") return None + + def get_code_json_other(self,repo: str) -> Optional[Dict]: + return None def get_code_json(self, repo: str) -> Optional[Dict]: if 'github' in repo: return self.get_code_json_github(repo) else: - return None + return self.get_code_json_other(repo) def save_code_json(self, repo: str, output_path: str) -> Optional[str]: From 74bb3da604aabc59a2aa037621101bd8717d76b8 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Thu, 10 Jul 2025 12:05:02 -0500 Subject: [PATCH 4/6] add Bitbucket, github and gitlab support. Add support for github and gitlab orgs Signed-off-by: Isaac Milarsky --- codejson_index_generator/parsers.py | 124 ++++++++++++++++++++++------ main.py | 2 +- pyproject.toml | 1 + 3 files changed, 99 insertions(+), 28 deletions(-) diff --git a/codejson_index_generator/parsers.py b/codejson_index_generator/parsers.py index 2142cf0..c0ee96b 100644 --- a/codejson_index_generator/parsers.py +++ b/codejson_index_generator/parsers.py @@ -70,7 +70,13 @@ def get_repo_owner_and_name(repo_http_url): # The first group contains the owner of the github repo extracted from the url # The second group contains the name of the github repo extracted from the url # 'But what is a regular expression?' ----> https://docs.python.org/3/howto/regex.html - regex = r"https?:\/\/github\.com\/([A-Za-z0-9 \- _]+)\/([A-Za-z0-9 \- _ \.]+)(.git)?\/?$" + if 'github' in repo_http_url: + regex = r"https?:\/\/github\.com\/([A-Za-z0-9 \- _]+)\/([A-Za-z0-9 \- _ \.]+)(.git)?\/?$" + elif 'gitlab' in repo_http_url: + regex = r"https?:\/\/gitlab\.com\/([A-Za-z0-9 \- _]+)\/([A-Za-z0-9 \- _ \.]+)(.git)?\/?$" + elif 'bitbucket' in repo_http_url: + regex = r"https?:\/\/bitbucket\.org\/([A-Za-z0-9 \- _]+)\/([A-Za-z0-9 \- _ \.]+)(.git)?\/?$" + result = re.search(regex, repo_http_url) if not result: @@ -86,7 +92,7 @@ def get_repo_owner_and_name(repo_http_url): class IndexGenerator: - def __init__(self, agency: str, version: str, token: Optional[str] = None,): + def __init__(self, agency: str, version: str, token: Optional[str] = None, bitbucket_user: Optional[str] = None, bitbucket_password: Optional[str] = None, gitlab_token: Optional[str] = None): # user can change agency and version depending on parameters self.index = { @@ -99,6 +105,9 @@ def __init__(self, agency: str, version: str, token: Optional[str] = None,): } self.token = token + self.gitlab_token = gitlab_token + self.bitbucket_user = bitbucket_user + self.bitbucket_password = bitbucket_password def get_code_json_github(self,repo : str) -> Optional[Dict]: try: @@ -116,14 +125,45 @@ def get_code_json_github(self,repo : str) -> Optional[Dict]: print(f"JSON Error: {str(e)}") return None - def get_code_json_other(self,repo: str) -> Optional[Dict]: - return None + def get_code_json_gitlab(self,repo: str) -> Optional[Dict]: + try: + owner,name = get_repo_owner_and_name(repo) + code_json_endpoint = f"https://gitlab.com/api/v4/projects/{owner}%2F{name}/repository/files/code.json?ref=HEAD" + content_dict = hit_endpoint(code_json_endpoint,self.gitlab_token) + except Exception as e: + print("Problem querying the Gitlab API") + return None + + try: + decoded_content = base64.b64decode(content_dict['content']) + return json.loads(decoded_content) + except (json.JSONDecodeError, ValueError) as e: + print(f"JSON Error {e}") + return None + + def get_code_json_bitbucket(self,repo: str) -> Optional[Dict]: + try: + owner, name = get_repo_owner_and_name(repo) + code_json_endpoint = f"https://bitbucket.org/{owner}/{name}/raw/HEAD/code.json" + session = requests.Session() + session.auth = (self.bitbucket_user,self.bitbucket_password) + + auth = session.post('http://bitbucket.org') + response_dict = session.get(code_json_endpoint) + except Exception as e: + print(f"Exception when querying bitbucket.org: {e}") + + return json.loads(response_dict.text) def get_code_json(self, repo: str) -> Optional[Dict]: if 'github' in repo: return self.get_code_json_github(repo) + elif 'gitlab' in repo: + return self.get_code_json_gitlab(repo) + elif 'bitbucket' in repo: + return self.get_code_json_bitbucket(repo) else: - return self.get_code_json_other(repo) + return None def save_code_json(self, repo: str, output_path: str) -> Optional[str]: @@ -147,7 +187,7 @@ def update_index(self, index: Dict, code_json: Dict, org_name: str, repo_name: s index['releases'].append(baseline) - def get_org_repos(self, org_name: str) -> list[Dict]: + def get_github_org_repos(self, org_name: str) -> list[Dict]: try: org_endpoint = f"https://api.github.com/orgs/{org_name}/repos" print(f"\nProcessing organization: {org_name}") @@ -162,34 +202,64 @@ def get_org_repos(self, org_name: str) -> list[Dict]: except Exception as e: raise e - def save_organization_files(self, org_name: str, codeJSONPath) -> None: - raise NotImplementedError + def _enumerate_repo_orgs(self,org_name,repo_name, url, total_repos, codeJSONPath=None): + print(f"\nChecking {repo_name} [{id}/{total_repos}]") + + if not codeJSONPath: + code_json = self.get_code_json(url) + else: + repoPath = os.path.join(codeJSONPath, (repo_name + '.json')) + code_json = self.save_code_json(url,repoPath) + + if code_json and add_to_index: + print(f"✅ Found code.json in {repo_name}") + self.update_index(self.index, code_json, org_name, repo_name) + elif not code_json: + print(f"❌ No code.json found in {repo_name}") - def process_organization(self, org_name: str, add_to_index=True, codeJSONPath=None) -> None: + def process_github_org_files(self, org_name: str, add_to_index=True, codeJSONPath=None) -> None: try: - org = self.github.get_organization(org_name) - total_repos = self.get_org_repos(org_name) + orgs = self.get_github_org_repos(org_name) + total_repos = len(orgs) - for id, repo in enumerate(org.get_repos(type='public'), 1): - print(f"\nChecking {repo.name} [{id}/{total_repos}]") - - if not codeJSONPath: - code_json = self.get_code_json(repo) - else: - repoPath = os.path.join(codeJSONPath, (repo.name + '.json')) - code_json = self.save_code_json(repo,repoPath) - - if code_json and add_to_index: - print(f"✅ Found code.json in {repo.name}") - self.update_index(self.index, code_json, org_name, repo.name) - elif not code_json: - print(f"❌ No code.json found in {repo.name}") + for id, repo in enumerate(orgs, 1): + self._enumerate_repo_orgs( + org_name,repo['name'],repo['svn_url'],total_repos,codeJSONPath=codeJSONPath + ) - except GithubException as e: + except Exception as e: + print(f"Error processing organization {org_name}: {str(e)}") + + def get_gitlab_org_repos(self, org_name: str) -> list[Dict]: + try: + url_encoded_org_name = org_name.replace("/","%2F") + org_endpoint = f"https://gitlab.com/api/v4/groups/{url_encoded_org_name}/projects" + + repo_list = hit_endpoint(org_endpoint,self.gitlab_token) + + total_repos = len(repo_list) + print(f"Found {total_repos} public repositories") + + return total_repos + except Exception as e: + print(f"Ran into Exception when querying Gitlab Repos in group {org_name}: {e}") + return None + + def process_gitlab_org_files(self, org_name: str, add_to_index=True, codeJSONPath=None) -> None: + try: + orgs = self.get_gitlab_org_repos(org_name) + total_repos = len(orgs) + + for id, repo in enumerate(orgs, 1): + self._enumerate_repo_orgs( + org_name,repo['name'],repo['web_url'],total_repos,codeJSONPath=codeJSONPath + ) + + except Exception as e: print(f"Error processing organization {org_name}: {str(e)}") def save_index(self, output_path: str) -> None: - # sorts index by organizaiton then by name + # sorts index by organization then by name self.index['releases'].sort(key=lambda x: (x.get('organization', ''), x.get('name', ''))) with open(output_path, 'w') as f: diff --git a/main.py b/main.py index 402181c..2c91c58 100644 --- a/main.py +++ b/main.py @@ -46,7 +46,7 @@ def main(): for org in args.orgs.split(","): org = org.strip() - indexGen.process_organization(org) + indexGen.process_github_org_files(org) indexGen.save_index(args.output) print(f"\nIndexing complete. Results saved to {args.output}") diff --git a/pyproject.toml b/pyproject.toml index d9eaee8..c27bd14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ repository = "https://github.com/DSACMS/codejson-index-generator" [tool.poetry.dependencies] python = "^3.13" requests = "^2.32.4" +llnl-scraper = "^0.15.0" [build-system] From 6198b11aa49fe3da698d3c4438771ac5bc987b9e Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Thu, 10 Jul 2025 13:43:10 -0500 Subject: [PATCH 5/6] fix bugs Signed-off-by: Isaac Milarsky --- codejson_index_generator/parsers.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/codejson_index_generator/parsers.py b/codejson_index_generator/parsers.py index c0ee96b..5e045ef 100644 --- a/codejson_index_generator/parsers.py +++ b/codejson_index_generator/parsers.py @@ -48,6 +48,7 @@ def hit_endpoint(url,token,method='GET'): f"Rate limit was reached and couldn't be rectified after {attempts} tries" ) else: + print(response.status_code) raise ConnectionError("Rate limit error!") except JSONDecodeError: response_json = {} @@ -115,7 +116,7 @@ def get_code_json_github(self,repo : str) -> Optional[Dict]: code_json_endpoint = f"https://api.github.com/repos/{owner}/{name}/contents/code.json" content_dict = hit_endpoint(code_json_endpoint,self.token)#repo.get_contents("code.json", ref = repo.default_branch) except Exception as e: - print(f"GitHub Error: {e.data.get('message', 'No message available')}") + print(f"GitHub Error: {e}") return None try: @@ -202,7 +203,7 @@ def get_github_org_repos(self, org_name: str) -> list[Dict]: except Exception as e: raise e - def _enumerate_repo_orgs(self,org_name,repo_name, url, total_repos, codeJSONPath=None): + def _enumerate_repo_orgs(self,id,org_name,repo_name, url, total_repos, codeJSONPath=None,add_to_index=True): print(f"\nChecking {repo_name} [{id}/{total_repos}]") if not codeJSONPath: @@ -218,17 +219,16 @@ def _enumerate_repo_orgs(self,org_name,repo_name, url, total_repos, codeJSONPath print(f"❌ No code.json found in {repo_name}") def process_github_org_files(self, org_name: str, add_to_index=True, codeJSONPath=None) -> None: - try: - orgs = self.get_github_org_repos(org_name) - total_repos = len(orgs) - - for id, repo in enumerate(orgs, 1): + orgs = self.get_github_org_repos(org_name) + total_repos = len(orgs) + + for id, repo in enumerate(orgs, 1): + try: self._enumerate_repo_orgs( - org_name,repo['name'],repo['svn_url'],total_repos,codeJSONPath=codeJSONPath + id,org_name,repo['name'],repo['svn_url'],total_repos,codeJSONPath=codeJSONPath,add_to_index=add_to_index ) - - except Exception as e: - print(f"Error processing organization {org_name}: {str(e)}") + except Exception as e: + print(e) def get_gitlab_org_repos(self, org_name: str) -> list[Dict]: try: @@ -252,7 +252,7 @@ def process_gitlab_org_files(self, org_name: str, add_to_index=True, codeJSONPat for id, repo in enumerate(orgs, 1): self._enumerate_repo_orgs( - org_name,repo['name'],repo['web_url'],total_repos,codeJSONPath=codeJSONPath + id,org_name,repo['name'],repo['web_url'],total_repos,codeJSONPath=codeJSONPath,add_to_index=add_to_index ) except Exception as e: From 805111dd46711570bf423154d3301b91f0865900 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Thu, 24 Jul 2025 09:21:22 -0500 Subject: [PATCH 6/6] PR Edits Signed-off-by: Isaac Milarsky --- codejson_index_generator/parsers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/codejson_index_generator/parsers.py b/codejson_index_generator/parsers.py index 5e045ef..3d69876 100644 --- a/codejson_index_generator/parsers.py +++ b/codejson_index_generator/parsers.py @@ -1,11 +1,9 @@ import json from json.decoder import JSONDecodeError import base64 -import argparse import os import requests import re -import subprocess from time import sleep, mktime, gmtime, time, localtime from typing import Dict, Optional @@ -43,7 +41,7 @@ def hit_endpoint(url,token,method='GET'): response_json = {} attempts += 1 - if attempts >= REQUEST_RETRIES: + if attempts >= RETRIES: raise ConnectionError( f"Rate limit was reached and couldn't be rectified after {attempts} tries" )