From a52ba7a56878dc85ae0bed8138a2037442e8126f Mon Sep 17 00:00:00 2001 From: Gang Li Date: Wed, 27 Mar 2024 17:35:27 +0800 Subject: [PATCH] Use wildcard for paths in maven CF invalidating * And ignore the indexing(index.html) CF invalidating per cost consideration * And change the root work dir for npm uploading --- charon/cache.py | 61 ++++++++++++++++++++++++--------------- charon/pkgs/indexing.py | 13 +++++---- charon/pkgs/maven.py | 47 +++++++++++++++++++++--------- charon/pkgs/npm.py | 20 +++++++------ charon/pkgs/pkg_utils.py | 22 +++++++++----- tests/test_cf_reindex.py | 9 +++--- tests/test_cfclient.py | 10 +++---- tests/test_maven_index.py | 3 +- tests/test_npm_index.py | 9 ++---- 9 files changed, 118 insertions(+), 76 deletions(-) diff --git a/charon/cache.py b/charon/cache.py index a5daf5fe..0112d5fe 100644 --- a/charon/cache.py +++ b/charon/cache.py @@ -59,7 +59,10 @@ def __get_endpoint(self, extra_conf) -> str: logger.debug("[CloudFront] No user-specified endpoint url is used.") return endpoint_url - def invalidate_paths(self, distr_id: str, paths: List[str]) -> Dict[str, str]: + def invalidate_paths( + self, distr_id: str, paths: List[str], + batch_size: int = 15 + ) -> List[Dict[str, str]]: """Send a invalidating requests for the paths in distribution to CloudFront. This will invalidate the paths in the distribution to enforce the refreshment from backend S3 bucket for these paths. For details see: @@ -67,30 +70,42 @@ def invalidate_paths(self, distr_id: str, paths: List[str]) -> Dict[str, str]: * The distr_id is the id for the distribution. This id can be get through get_dist_id_by_domain(domain) function * Can specify the invalidating paths through paths param. + * Batch size is the number of paths to be invalidated in one request. + Because paths contains wildcard(*), so the default value is 15 which + is the maximum number in official doc: + https://docs.aws.amazon.com/AmazonCloudFront/latest/DeveloperGuide/Invalidation.html#InvalidationLimits """ - caller_ref = str(uuid.uuid4()) logger.debug("[CloudFront] Creating invalidation for paths: %s", paths) - try: - response = self.__client.create_invalidation( - DistributionId=distr_id, - InvalidationBatch={ - 'CallerReference': caller_ref, - 'Paths': { - 'Quantity': len(paths), - 'Items': paths + real_paths = [paths] + # Split paths into batches by batch_size + if batch_size: + real_paths = [paths[i:i + batch_size] for i in range(0, len(paths), batch_size)] + results = [] + for batch_paths in real_paths: + caller_ref = str(uuid.uuid4()) + try: + response = self.__client.create_invalidation( + DistributionId=distr_id, + InvalidationBatch={ + 'CallerReference': caller_ref, + 'Paths': { + 'Quantity': len(batch_paths), + 'Items': batch_paths + } } - } - ) - if response: - invalidation = response.get('Invalidation', {}) - return { - 'Id': invalidation.get('Id', None), - 'Status': invalidation.get('Status', None) - } - except Exception as err: - logger.error( - "[CloudFront] Error occurred while creating invalidation, error: %s", err - ) + ) + if response: + invalidation = response.get('Invalidation', {}) + results.append({ + 'Id': invalidation.get('Id', None), + 'Status': invalidation.get('Status', None) + }) + except Exception as err: + logger.error( + "[CloudFront] Error occurred while creating invalidation" + " for paths %s, error: %s", batch_paths, err + ) + return results def check_invalidation(self, distr_id: str, invalidation_id: str) -> dict: try: @@ -115,7 +130,7 @@ def get_dist_id_by_domain(self, domain: str) -> str: """Get distribution id by a domain name. The id can be used to send invalidating request through #invalidate_paths function * Domain are Ronda domains, like "maven.repository.redhat.com" - or "npm.repository.redhat.com" + or "npm.registry.redhat.com" """ try: response = self.__client.list_distributions() diff --git a/charon/pkgs/indexing.py b/charon/pkgs/indexing.py index bd9192ab..ee42a83c 100644 --- a/charon/pkgs/indexing.py +++ b/charon/pkgs/indexing.py @@ -15,8 +15,8 @@ """ from charon.config import get_template from charon.storage import S3Client -from charon.cache import CFClient -from charon.pkgs.pkg_utils import invalidate_cf_paths +# from charon.cache import CFClient +# from charon.pkgs.pkg_utils import invalidate_cf_paths from charon.constants import (INDEX_HTML_TEMPLATE, NPM_INDEX_HTML_TEMPLATE, PACKAGE_TYPE_MAVEN, PACKAGE_TYPE_NPM, PROD_INFO_SUFFIX) from charon.utils.files import digest_content @@ -265,7 +265,7 @@ def re_index( path: str, package_type: str, aws_profile: str = None, - cf_enable: bool = False, + # cf_enable: bool = False, dry_run: bool = False ): """Refresh the index.html for the specified folder in the bucket. @@ -312,9 +312,10 @@ def re_index( index_path, index_content, (bucket_name, real_prefix), "text/html", digest_content(index_content) ) - if cf_enable: - cf_client = CFClient(aws_profile=aws_profile) - invalidate_cf_paths(cf_client, bucket, [index_path]) + # We will not invalidate index.html per cost consideration + # if cf_enable: + # cf_client = CFClient(aws_profile=aws_profile) + # invalidate_cf_paths(cf_client, bucket, [index_path]) else: logger.warning( "The path %s does not contain any contents in bucket %s. " diff --git a/charon/pkgs/maven.py b/charon/pkgs/maven.py index 257bb2b9..c7413c80 100644 --- a/charon/pkgs/maven.py +++ b/charon/pkgs/maven.py @@ -58,7 +58,9 @@ def __get_mvn_template(kind: str, default: str) -> str: META_TEMPLATE = __get_mvn_template("maven-metadata.xml.j2", MAVEN_METADATA_TEMPLATE) ARCH_TEMPLATE = __get_mvn_template("archetype-catalog.xml.j2", ARCHETYPE_CATALOG_TEMPLATE) -STANDARD_GENERATED_IGNORES = ["maven-metadata.xml", "archetype-catalog.xml"] +MAVEN_METADATA_FILE = "maven-metadata.xml" +MAVEN_ARCH_FILE = "archetype-catalog.xml" +STANDARD_GENERATED_IGNORES = [MAVEN_METADATA_FILE, MAVEN_ARCH_FILE] class MavenMetadata(object): @@ -219,7 +221,7 @@ def gen_meta_file(group_id, artifact_id: str, versions: list, root="/", digest=T ).generate_meta_file_content() g_path = "/".join(group_id.split(".")) meta_files = [] - final_meta_path = os.path.join(root, g_path, artifact_id, "maven-metadata.xml") + final_meta_path = os.path.join(root, g_path, artifact_id, MAVEN_METADATA_FILE) try: overwrite_file(final_meta_path, content) meta_files.append(final_meta_path) @@ -374,7 +376,7 @@ def handle_maven_uploading( cf_invalidate_paths.extend(meta_files.get(META_FILE_GEN_KEY, [])) # 8. Determine refreshment of archetype-catalog.xml - if os.path.exists(os.path.join(top_level, "archetype-catalog.xml")): + if os.path.exists(os.path.join(top_level, MAVEN_ARCH_FILE)): logger.info("Start generating archetype-catalog.xml for bucket %s", bucket_name) upload_archetype_file = _generate_upload_archetype_catalog( s3=s3_client, bucket=bucket_name, @@ -451,15 +453,16 @@ def handle_maven_uploading( ) failed_metas.extend(_failed_metas) logger.info("Index files updating done\n") - # Add index files to Cf invalidate paths - if cf_enable: - cf_invalidate_paths.extend(created_indexes) + # We will not invalidate the index files per cost consideration + # if cf_enable: + # cf_invalidate_paths.extend(created_indexes) else: logger.info("Bypass indexing") # Finally do the CF invalidating for metadata files if cf_enable and len(cf_invalidate_paths) > 0: cf_client = CFClient(aws_profile=aws_profile) + cf_invalidate_paths = __wildcard_metadata_paths(cf_invalidate_paths) invalidate_cf_paths(cf_client, bucket, cf_invalidate_paths, top_level) upload_post_process(failed_files, failed_metas, prod_key, bucket_name) @@ -578,7 +581,7 @@ def handle_maven_del( cf_invalidate_paths.extend(all_meta_files) # 7. Determine refreshment of archetype-catalog.xml - if os.path.exists(os.path.join(top_level, "archetype-catalog.xml")): + if os.path.exists(os.path.join(top_level, MAVEN_ARCH_FILE)): logger.info("Start generating archetype-catalog.xml") archetype_action = _generate_rollback_archetype_catalog( s3=s3_client, bucket=bucket_name, @@ -630,13 +633,15 @@ def handle_maven_del( if len(_failed_index_files) > 0: failed_metas.extend(_failed_index_files) logger.info("Index files updating done.\n") - if cf_enable: - cf_invalidate_paths.extend(created_indexes) + # We will not invalidate the index files per cost consideration + # if cf_enable: + # cf_invalidate_paths.extend(created_indexes) else: logger.info("Bypassing indexing") if cf_enable and len(cf_invalidate_paths): cf_client = CFClient(aws_profile=aws_profile) + cf_invalidate_paths = __wildcard_metadata_paths(cf_invalidate_paths) invalidate_cf_paths(cf_client, bucket, cf_invalidate_paths, top_level) rollback_post_process(failed_files, failed_metas, prod_key, bucket_name) @@ -1017,15 +1022,15 @@ def _generate_metadatas( "No poms found in s3 bucket %s for GA path %s", bucket, path ) meta_files_deletion = meta_files.get(META_FILE_DEL_KEY, []) - meta_files_deletion.append(os.path.join(path, "maven-metadata.xml")) - meta_files_deletion.extend(__hash_decorate_metadata(path, "maven-metadata.xml")) + meta_files_deletion.append(os.path.join(path, MAVEN_METADATA_FILE)) + meta_files_deletion.extend(__hash_decorate_metadata(path, MAVEN_METADATA_FILE)) meta_files[META_FILE_DEL_KEY] = meta_files_deletion else: logger.warning("An error happened when scanning remote " "artifacts under GA path %s", path) meta_failed_path = meta_files.get(META_FILE_FAILED, []) - meta_failed_path.append(os.path.join(path, "maven-metadata.xml")) - meta_failed_path.extend(__hash_decorate_metadata(path, "maven-metadata.xml")) + meta_failed_path.append(os.path.join(path, MAVEN_METADATA_FILE)) + meta_failed_path.extend(__hash_decorate_metadata(path, MAVEN_METADATA_FILE)) meta_files[META_FILE_FAILED] = meta_failed_path else: logger.debug( @@ -1090,6 +1095,22 @@ def __get_suffix(package_type: str, conf: CharonConfig) -> List[str]: return [] +def __wildcard_metadata_paths(paths: List[str]) -> List[str]: + new_paths = [] + for path in paths: + if path.endswith(MAVEN_METADATA_FILE)\ + or path.endswith(MAVEN_ARCH_FILE): + new_paths.append(path[:-len(".xml")] + ".*") + elif path.endswith(".md5")\ + or path.endswith(".sha1")\ + or path.endswith(".sha128")\ + or path.endswith(".sha256"): + continue + else: + new_paths.append(path) + return new_paths + + class VersionCompareKey: 'Used as key function for version sorting' def __init__(self, obj): diff --git a/charon/pkgs/npm.py b/charon/pkgs/npm.py index ad418ff1..894998a6 100644 --- a/charon/pkgs/npm.py +++ b/charon/pkgs/npm.py @@ -105,6 +105,8 @@ def handle_npm_uploading( client = S3Client(aws_profile=aws_profile, dry_run=dry_run) generated_signs = [] + succeeded = True + root_dir = mkdtemp(prefix=f"npm-charon-{product}-", dir=dir_) for bucket in buckets: # prepare cf invalidate files cf_invalidate_paths = [] @@ -113,7 +115,7 @@ def handle_npm_uploading( prefix = remove_prefix(bucket[2], "/") registry = bucket[3] target_dir, valid_paths, package_metadata = _scan_metadata_paths_from_archive( - tarball_path, registry, prod=product, dir__=dir_ + tarball_path, registry, prod=product, dir__=root_dir ) if not os.path.isdir(target_dir): logger.error("Error: the extracted target_dir path %s does not exist.", target_dir) @@ -129,8 +131,6 @@ def handle_npm_uploading( ) logger.info("Files uploading done\n") - succeeded = True - if not manifest_bucket_name: logger.warning( 'Warning: No manifest bucket is provided, will ignore the process of manifest ' @@ -235,8 +235,9 @@ def handle_npm_uploading( ) failed_metas.extend(_failed_metas) logger.info("Index files updating done\n") - if cf_enable: - cf_invalidate_paths.extend(created_indexes) + # We will not invalidate the index files per cost consideration + # if cf_enable: + # cf_invalidate_paths.extend(created_indexes) else: logger.info("Bypass indexing\n") @@ -248,7 +249,7 @@ def handle_npm_uploading( upload_post_process(failed_files, failed_metas, product, bucket_name) succeeded = succeeded and len(failed_files) == 0 and len(failed_metas) == 0 - return (target_dir, succeeded) + return (root_dir, succeeded) def handle_npm_del( @@ -360,9 +361,10 @@ def handle_npm_del( ) failed_metas.extend(_failed_index_files) logger.info("Index files updating done.\n") - if cf_enable and len(created_indexes): - logger.debug("Add index files to cf invalidate list: %s", created_indexes) - cf_invalidate_paths.extend(created_indexes) + # We will not invalidate the index files per cost consideration + # if cf_enable and len(created_indexes): + # logger.debug("Add index files to cf invalidate list: %s", created_indexes) + # cf_invalidate_paths.extend(created_indexes) else: logger.info("Bypassing indexing\n") diff --git a/charon/pkgs/pkg_utils.py b/charon/pkgs/pkg_utils.py index a206f697..00fa293a 100644 --- a/charon/pkgs/pkg_utils.py +++ b/charon/pkgs/pkg_utils.py @@ -87,11 +87,17 @@ def invalidate_cf_paths( logger.debug("Invalidating paths: %s", final_paths) if not domain: domain = cf_client.get_domain_by_bucket(bucket_name) - distr_id = cf_client.get_dist_id_by_domain(domain) - if distr_id: - result = cf_client.invalidate_paths(distr_id, final_paths) - if result: - logger.info( - "The CF invalidating request for metadata/indexing is sent, " - "request id %s, status is %s", result['Id'], result['Status'] - ) + if domain: + distr_id = cf_client.get_dist_id_by_domain(domain) + if distr_id: + result = cf_client.invalidate_paths(distr_id, final_paths) + if result: + logger.info( + "The CF invalidating request for metadata/indexing is sent, " + "request status as below:\n %s", result + ) + else: + logger.error( + "CF invalidating will not be performed because domain not found for" + " bucket %s. ", bucket_name + ) diff --git a/tests/test_cf_reindex.py b/tests/test_cf_reindex.py index c8fc400a..42a6dbab 100644 --- a/tests/test_cf_reindex.py +++ b/tests/test_cf_reindex.py @@ -22,10 +22,12 @@ from tests.constants import INPUTS from moto import mock_aws import os +import pytest @mock_aws class CFReIndexTest(CFBasedTest): + @pytest.mark.skip(reason="Indexing CF invalidation is abandoned") def test_cf_maven_after_reindex(self): response = self.mock_cf.list_invalidations(DistributionId=self.test_dist_id) self.assertIsNotNone(response) @@ -41,8 +43,7 @@ def test_cf_maven_after_reindex(self): re_index( (TEST_BUCKET, TEST_BUCKET, "ga", "", "maven.repository.redhat.com"), - "org/apache/httpcomponents/httpclient/", "maven", - cf_enable=True + "org/apache/httpcomponents/httpclient/", "maven" ) response = self.mock_cf.list_invalidations(DistributionId=self.test_dist_id) @@ -51,6 +52,7 @@ def test_cf_maven_after_reindex(self): self.assertEqual(1, len(items)) self.assertEqual('completed', str.lower(items[0].get('Status'))) + @pytest.mark.skip(reason="Indexing CF invalidation is abandoned") def test_cf_npm_after_reindex(self): response = self.mock_cf.list_invalidations(DistributionId=self.test_dist_id) self.assertIsNotNone(response) @@ -66,8 +68,7 @@ def test_cf_npm_after_reindex(self): re_index( (TEST_BUCKET, TEST_BUCKET, "", "", "npm.registry.redhat.com"), - "@babel/", "npm", - cf_enable=True + "@babel/", "npm" ) response = self.mock_cf.list_invalidations(DistributionId=self.test_dist_id) diff --git a/tests/test_cfclient.py b/tests/test_cfclient.py index 610c454b..455af65c 100644 --- a/tests/test_cfclient.py +++ b/tests/test_cfclient.py @@ -48,10 +48,10 @@ def test_get_distribution_id(self): def test_invalidate_paths(self): dist_id = self.cf_client.get_dist_id_by_domain("maven.repository.redhat.com") result = self.cf_client.invalidate_paths(dist_id, ["/*"]) - self.assertIsNotNone(result['Id']) - self.assertEqual('completed', str.lower(result['Status'])) + self.assertTrue(result[0]['Id']) + self.assertEqual('completed', str.lower(result[0]['Status'])) status = self.cf_client.invalidate_paths("noexists_id", ["/*"]) - self.assertIsNone(status) + self.assertFalse(status) @pytest.mark.skip(reason=""" Because current moto 5.0.3 has not implemented the get_invalidation(), @@ -60,6 +60,6 @@ def test_invalidate_paths(self): def test_check_invalidation(self): dist_id = self.cf_client.get_dist_id_by_domain("maven.repository.redhat.com") result = self.cf_client.invalidate_paths(dist_id, ["/*"]) - invalidation = self.cf_client.check_invalidation(dist_id, result['Id']) + invalidation = self.cf_client.check_invalidation(dist_id, result[0]['Id']) self.assertIsNotNone(invalidation['Id']) - self.assertEqual('completed', str.lower(result['Status'])) + self.assertEqual('completed', str.lower(result[0]['Status'])) diff --git a/tests/test_maven_index.py b/tests/test_maven_index.py index 445b8e92..a137fb11 100644 --- a/tests/test_maven_index.py +++ b/tests/test_maven_index.py @@ -179,8 +179,7 @@ def test_re_index(self): ) re_index( (TEST_BUCKET, TEST_BUCKET, "", "", None), - commons_client_root, "maven", - cf_enable=True + commons_client_root, "maven" ) indedx_obj = test_bucket.Object(COMMONS_CLIENT_INDEX) index_content = str(indedx_obj.get()["Body"].read(), "utf-8") diff --git a/tests/test_npm_index.py b/tests/test_npm_index.py index f6745c3c..129f5278 100644 --- a/tests/test_npm_index.py +++ b/tests/test_npm_index.py @@ -224,8 +224,7 @@ def test_re_index(self): ) re_index( (TEST_BUCKET, TEST_BUCKET, prefix, "", None), - "@babel/", "npm", - cf_enable=True + "@babel/", "npm" ) index_obj = test_bucket.Object(prefixed_namespace_babel_index) index_content = str(index_obj.get()["Body"].read(), "utf-8") @@ -255,8 +254,7 @@ def test_re_index(self): ) re_index( (TEST_BUCKET, TEST_BUCKET, prefix, "", None), - "/", "npm", - cf_enable=True + "/", "npm" ) index_obj = test_bucket.Object(prefixed_root_index) index_content = str(index_obj.get()["Body"].read(), "utf-8") @@ -287,8 +285,7 @@ def test_re_index(self): ) re_index( (TEST_BUCKET, TEST_BUCKET, prefix, "", None), - metadata_path, "npm", - cf_enable=True + metadata_path, "npm" ) objs = list(test_bucket.objects.all()) actual_files = [obj.key for obj in objs]