From e7cf9ef2319220434f56a5d7a2571335cc72acbf Mon Sep 17 00:00:00 2001
From: Gang Li
Date: Fri, 9 Feb 2024 18:02:40 +0800
Subject: [PATCH] Add command for re-indexing a folder

---
 .gitignore                |   3 +
 README.md                 |  10 ++++
 charon/cmd/__init__.py    |   2 +
 charon/cmd/cmd_index.py   | 120 ++++++++++++++++++++++++++++++++++++++
 charon/pkgs/indexing.py   |  77 +++++++++++++++++++++---
 charon/storage.py         |  93 +++++++++++++++++++++++++++--
 tests/test_maven_index.py |  82 +++++++++++++++++++++++++-
 tests/test_npm_index.py   | 106 +++++++++++++++++++++++++++++++++
 tests/test_s3client.py    |  73 +++++++++++++++++++++++
 9 files changed, 553 insertions(+), 13 deletions(-)
 create mode 100644 charon/cmd/cmd_index.py

diff --git a/.gitignore b/.gitignore
index 8ca90496..b32671f8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,3 +16,6 @@ package/
 # Unit test
 __pytest_reports
 htmlcov
+
+# Generated by local runs
+*.log
diff --git a/README.md b/README.md
index cdd1608d..887166c0 100644
--- a/README.md
+++ b/README.md
@@ -96,3 +96,13 @@ This command will delete some paths from repo in S3.
   but not delete the artifacts themselves.
 * During or after the paths' deletion, regenerate the metadata files
   and index files for both types.
+
+### charon-index: refresh the index.html for the specified path
+
+```bash
+usage: charon index $PATH [-t, --target] [-D, --debug] [-q, --quiet] [-n, --dryrun]
+```
+
+This command will refresh the index.html for the specified path.
+
+* Note that if the path is an NPM metadata path (one that contains a package.json), the refresh will not take effect, because this type of folder serves the package.json instead of the index.html in HTTP requests.
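For a concrete feel of the new command, here is a hypothetical invocation; the path and the target name `ga` are made-up examples, not part of this patch:

```bash
# Refresh index.html for one folder of the "ga" target (names are illustrative)
charon index org/apache/httpcomponents/httpclient/ -t ga

# Same refresh via the -n/--dryrun flag added in cmd_index.py below:
# the new index is computed but nothing is uploaded
charon index org/apache/httpcomponents/httpclient/ -t ga -n -D
```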
diff --git a/charon/cmd/__init__.py b/charon/cmd/__init__.py
index a9834e1a..9a3084d0 100644
--- a/charon/cmd/__init__.py
+++ b/charon/cmd/__init__.py
@@ -16,6 +16,7 @@
 from click import group
 from charon.cmd.cmd_upload import upload
 from charon.cmd.cmd_delete import delete
+from charon.cmd.cmd_index import index


 @group()
@@ -29,3 +30,4 @@ def cli():
 # init group command
 cli.add_command(upload)
 cli.add_command(delete)
+cli.add_command(index)
diff --git a/charon/cmd/cmd_index.py b/charon/cmd/cmd_index.py
new file mode 100644
index 00000000..281ed876
--- /dev/null
+++ b/charon/cmd/cmd_index.py
@@ -0,0 +1,120 @@
+"""
+Copyright (C) 2022 Red Hat, Inc. (https://github.com/Commonjava/charon)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from charon.config import get_config
+from charon.cmd.internal import _decide_mode
+from charon.pkgs.indexing import re_index
+from charon.constants import PACKAGE_TYPE_MAVEN, PACKAGE_TYPE_NPM
+from click import command, option, argument
+
+import traceback
+import logging
+import os
+import sys
+
+logger = logging.getLogger(__name__)
+
+
+@argument(
+    "path",
+    type=str,
+)
+@option(
+    "--target",
+    "-t",
+    help="""
+    The target to do the index refreshing, which will decide
+    which s3 bucket and which root path the index files will
+    be refreshed in.
+    """,
+    required=True
+)
+@option(
+    "--debug",
+    "-D",
+    help="Debug mode, will print all debug logs for problem tracking.",
+    is_flag=True,
+    default=False
+)
+@option(
+    "--quiet",
+    "-q",
+    help="Quiet mode, will shrink most of the logs except warnings and errors.",
+    is_flag=True,
+    default=False
+)
+@option("--dryrun", "-n", is_flag=True, default=False)
+@command()
+def index(
+    path: str,
+    target: str,
+    debug: bool = False,
+    quiet: bool = False,
+    dryrun: bool = False
+):
+    """This command will re-generate the index.html files for the
+    specified path.
+    """
+    _decide_mode(
+        "index-{}".format(target), path.replace("/", "_"),
+        is_quiet=quiet, is_debug=debug
+    )
+    try:
+        conf = get_config()
+        if not conf:
+            sys.exit(1)
+
+        aws_profile = os.getenv("AWS_PROFILE") or conf.get_aws_profile()
+        if not aws_profile:
+            logger.error("No AWS profile specified!")
+            sys.exit(1)
+
+        tgt = conf.get_target(target)
+        if not tgt:
+            # log is recorded in get_target
+            sys.exit(1)
+
+        aws_bucket = None
+        prefix = None
+        for b in tgt:
+            aws_bucket = b.get('bucket')
+            prefix = b.get('prefix', '')
+
+        if not aws_bucket:
+            logger.error("No bucket specified!")
+            sys.exit(1)
+
+        package_type = None
+        if "maven" in aws_bucket:
+            logger.info(
+                "The target is a maven repository. Will refresh the index as the maven package type"
+            )
+            package_type = PACKAGE_TYPE_MAVEN
+        elif "npm" in aws_bucket:
+            package_type = PACKAGE_TYPE_NPM
+            logger.info(
+                "The target is an npm repository. Will refresh the index as the npm package type"
+            )
+        else:
+            logger.error(
+                "The target is not supported. Only maven and npm targets are supported."
+            )
+            sys.exit(1)
+
+        re_index(aws_bucket, prefix, path, package_type, aws_profile, dryrun)
+    except Exception:
+        print(traceback.format_exc())
+        sys.exit(2)  # distinguish between exception and bad config or bad state
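As a sketch of the target resolution above: `get_target` is expected to yield a list of dicts carrying a `bucket` and an optional `prefix`, and the loop keeps only the last entry, so one target maps to a single bucket here. A minimal, self-contained illustration with hypothetical values:

```python
# Shaped like what conf.get_target("ga") must return (values are invented)
target_entries = [{"bucket": "prod-maven-ga", "prefix": "ga"}]

aws_bucket, prefix = None, None
for b in target_entries:
    aws_bucket = b.get("bucket")   # a later entry would overwrite earlier ones
    prefix = b.get("prefix", "")

print(aws_bucket, prefix)  # -> prod-maven-ga ga
```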
diff --git a/charon/pkgs/indexing.py b/charon/pkgs/indexing.py
index f478e0a5..b342c071 100644
--- a/charon/pkgs/indexing.py
+++ b/charon/pkgs/indexing.py
@@ -17,6 +17,7 @@
 from charon.storage import S3Client
 from charon.constants import (INDEX_HTML_TEMPLATE, NPM_INDEX_HTML_TEMPLATE,
                               PACKAGE_TYPE_MAVEN, PACKAGE_TYPE_NPM, PROD_INFO_SUFFIX)
+from charon.utils.files import digest_content
 from jinja2 import Template
 import os
 import logging
@@ -149,6 +150,17 @@ def __generate_index_html(


 def __to_html(package_type: str, contents: List[str], folder: str, top_level: str) -> str:
+    html_content = __to_html_content(package_type, contents, folder)
+    html_path = os.path.join(top_level, folder, "index.html")
+    if folder == "/":
+        html_path = os.path.join(top_level, "index.html")
+    os.makedirs(os.path.dirname(html_path), exist_ok=True)
+    with open(html_path, 'w', encoding='utf-8') as html:
+        html.write(html_content)
+    return html_path
+
+
+def __to_html_content(package_type: str, contents: List[str], folder: str) -> str:
     items = []
     if folder != "/":
         items.append("../")
@@ -160,13 +172,7 @@ def __to_html(package_type: str, contents: List[str], folder: str, top_level: st
     items.extend(contents)
     items = __sort_index_items(items)
     index = IndexedHTML(title=folder, header=folder, items=items)
-    html_path = os.path.join(top_level, folder, "index.html")
-    if folder == "/":
-        html_path = os.path.join(top_level, "index.html")
-    os.makedirs(os.path.dirname(html_path), exist_ok=True)
-    with open(html_path, 'w', encoding='utf-8') as html:
-        html.write(index.generate_index_file_content(package_type))
-    return html_path
+    return index.generate_index_file_content(package_type)


 def __sort_index_items(items):
@@ -250,3 +256,60 @@ def __compare(self, other) -> int:
             return -1
         else:
             return 0
+
+
+def re_index(
+    bucket: str,
+    prefix: str,
+    path: str,
+    package_type: str,
+    aws_profile: str = None,
+    dry_run: bool = False
+):
+    """Refresh the index.html for the specified folder in the bucket.
+    """
+    s3_client = S3Client(aws_profile=aws_profile, dry_run=dry_run)
+    s3_folder = os.path.join(prefix, path)
+    if path.strip() == "" or path.strip() == "/":
+        s3_folder = prefix
+    items: List[str] = s3_client.list_folder_content(bucket, s3_folder)
+    contents = [i for i in items if not i.endswith(PROD_INFO_SUFFIX)]
+    if PACKAGE_TYPE_NPM == package_type:
+        if any("package.json" in c for c in contents):
+            logger.warning(
+                "The path %s contains an NPM package.json, which works as "
+                "the package metadata for indexing. This indexing is ignored.",
+                path
+            )
+            return
+
+    if contents:
+        real_contents = []
+        if prefix and prefix.strip() != "":
+            for c in contents:
+                if c.strip() != "":
+                    if c.startswith(prefix):
+                        real_c = remove_prefix(c, prefix)
+                        real_c = remove_prefix(real_c, "/")
+                        real_contents.append(real_c)
+                    else:
+                        real_contents.append(c)
+        else:
+            real_contents = contents
+        logger.debug(real_contents)
+        index_content = __to_html_content(package_type, real_contents, path)
+        if not dry_run:
+            index_path = os.path.join(path, "index.html")
+            if path == "/":
+                index_path = "index.html"
+            s3_client.simple_delete_file(index_path, (bucket, prefix))
+            s3_client.simple_upload_file(
+                index_path, index_content, (bucket, prefix),
+                "text/html", digest_content(index_content)
+            )
+    else:
+        logger.warning(
+            "The path %s does not contain any contents in bucket %s. "
+            "Will not do any re-indexing",
+            path, bucket
+        )
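A minimal sketch of the key-relativizing step inside `re_index` above, assuming `remove_prefix` (imported from `charon.utils.strings` in the tests below) strips a leading substring when present; the sample keys are invented:

```python
def remove_prefix(s: str, prefix: str) -> str:
    # assumed behaviour of charon.utils.strings.remove_prefix
    return s[len(prefix):] if s.startswith(prefix) else s

prefix = "ga"
contents = ["ga/org/foo/maven-metadata.xml", "ga/org/foo/1.0/"]

# mirror the loop in re_index: drop the bucket prefix, then any leading slash
real_contents = [
    remove_prefix(remove_prefix(c, prefix), "/") for c in contents if c.strip()
]
print(real_contents)  # -> ['org/foo/maven-metadata.xml', 'org/foo/1.0/']
```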
diff --git a/charon/storage.py b/charon/storage.py
index 45963c42..f07b1c50 100644
--- a/charon/storage.py
+++ b/charon/storage.py
@@ -530,10 +530,11 @@ def delete_files(
         self, file_paths: List[str], target: Tuple[str, str],
         product: Optional[str], root="/"
     ) -> List[str]:
-        """ Deletes a list of files to s3 bucket. * Use the cut down file path as s3 key. The cut
-        down way is move root from the file path if it starts with root. Example: if file_path is
-        /tmp/maven-repo/org/apache/.... and root is /tmp/maven-repo Then the key will be
-        org/apache/.....
+        """ Deletes a list of files from the s3 bucket.
+        * Use the cut-down file path as the s3 key. The cut-down
+        way is to remove root from the file path if it starts with root.
+        Example: if file_path is /tmp/maven-repo/org/apache/.... and
+        root is /tmp/maven-repo, then the key will be org/apache/.....
         * The removing will happen with conditions of product checking. First the deletion
         will remove the product from the file metadata "rh-products". After the metadata
         removing, if there still are extra products left in that metadata, the file will not
@@ -637,6 +638,90 @@ async def path_delete_handler(

         return failed_files

+    def simple_delete_file(
+        self, file_path: str, target: Tuple[str, str]
+    ):
+        """ Deletes a file in the s3 bucket, regardless of any extra
+        information like product and version info.
+        * Warning: this will directly delete the file even if it
+        carries product info, so please be careful when using it.
+        If you want to delete product artifact files, please use
+        delete_files instead.
+        """
+        bucket = target[0]
+        prefix = target[1]
+        bucket_obj = self.__get_bucket(bucket)
+        path_key = os.path.join(prefix, file_path)
+        file_object = bucket_obj.Object(path_key)
+        existed = False
+        try:
+            existed = self.__file_exists(file_object)
+            if existed:
+                bucket_obj.delete_objects(Delete={"Objects": [{"Key": path_key}]})
+            else:
+                logger.warning(
+                    'File %s does not exist in S3 bucket %s, will ignore its deletion',
+                    file_path, bucket
+                )
+        except (ClientError, HTTPClientError) as e:
+            logger.error(
+                "Error: file deletion failed due to error: %s", e
+            )
+
+    def simple_upload_file(
+        self, file_path: str, file_content: str,
+        target: Tuple[str, str],
+        mime_type: str = None,
+        check_sum_sha1: str = None
+    ):
+        """ Uploads a file to the s3 bucket, regardless of any extra
+        information like product and version info.
+        * Warning: this will directly upload the file without attaching
+        any product info, so please be careful when using it.
+        If you want to upload product artifact files, please use
+        upload_files instead.
+        """
+        bucket = target[0]
+        prefix = target[1]
+        bucket_obj = self.__get_bucket(bucket)
+        path_key = os.path.join(prefix, file_path)
+        file_object = bucket_obj.Object(path_key)
+        logger.debug(
+            'Uploading %s to bucket %s', path_key, bucket
+        )
+        existed = False
+        try:
+            existed = self.__file_exists(file_object)
+        except (ClientError, HTTPClientError) as e:
+            logger.error(
+                "Error: file existence check failed due to error: %s", e
+            )
+            return
+
+        content_type = mime_type
+        if not content_type:
+            content_type = DEFAULT_MIME_TYPE
+        if not existed:
+            f_meta = {}
+            if check_sum_sha1 and check_sum_sha1.strip() != "":
+                f_meta[CHECKSUM_META_KEY] = check_sum_sha1
+            try:
+                if not self.__dry_run:
+                    file_object.put(
+                        Body=file_content,
+                        Metadata=f_meta,
+                        ContentType=content_type
+                    )
+                    logger.debug('Uploaded %s to bucket %s', file_path, bucket)
+            except (ClientError, HTTPClientError) as e:
+                logger.error(
+                    "ERROR: file %s not uploaded to bucket %s due to error: %s",
+                    file_path, bucket, e
+                )
+        else:
+            raise FileExistsError(
+                "Error: file {} already exists, upload is forbidden.".format(path_key)
+            )
+
     def delete_manifest(self, product_key: str, target: str, manifest_bucket_name: str):
         if not manifest_bucket_name:
             logger.warning(
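A hedged usage sketch of the two new S3Client methods, mirroring how `re_index` drives them; the bucket, prefix, and profile names are placeholders, and `digest_content` is the helper imported in indexing.py above:

```python
from charon.storage import S3Client
from charon.utils.files import digest_content

client = S3Client(aws_profile="my-profile", dry_run=False)  # hypothetical profile
target = ("my-bucket", "ga")  # hypothetical (bucket, prefix) pair
html = "<html><body>regenerated index</body></html>"

# Delete the old object first: simple_upload_file raises
# FileExistsError when the key already exists.
client.simple_delete_file("org/foo/index.html", target)
client.simple_upload_file(
    "org/foo/index.html", html, target,
    "text/html", digest_content(html)
)
```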
diff --git a/tests/test_maven_index.py b/tests/test_maven_index.py
index d5647ecd..7468310d 100644
--- a/tests/test_maven_index.py
+++ b/tests/test_maven_index.py
@@ -15,6 +15,7 @@
 """
 from charon.constants import PROD_INFO_SUFFIX
 from charon.pkgs.maven import handle_maven_uploading, handle_maven_del
+from charon.pkgs.indexing import re_index
 from charon.storage import CHECKSUM_META_KEY
 from charon.utils.strings import remove_prefix
 from tests.base import LONG_TEST_PREFIX, SHORT_TEST_PREFIX, PackageBaseTest
@@ -45,8 +46,6 @@ def test_uploading_index(self):

         objs = list(test_bucket.objects.all())
         actual_files = [obj.key for obj in objs]

-        self.assertEqual(41, len(actual_files))
-
         for f in COMMONS_LOGGING_INDEXES:
             self.assertIn(f, actual_files)

@@ -127,6 +126,85 @@ def test_overlap_upload_index(self):
         self.assertNotIn("../", index_content)
         self.assertNotIn(PROD_INFO_SUFFIX, index_content)

+    def test_re_index(self):
+        test_zip = os.path.join(INPUTS, "commons-client-4.5.6.zip")
+        product = "commons-client-4.5.6"
+        handle_maven_uploading(
+            test_zip, product,
+            buckets=[('', TEST_BUCKET, '', '')],
+            dir_=self.tempdir
+        )
+
+        test_bucket = self.mock_s3.Bucket(TEST_BUCKET)
+        objs = list(test_bucket.objects.all())
+        actual_files = [obj.key for obj in objs]
+
+        for f in COMMONS_CLIENT_456_INDEXES:
+            self.assertIn(f, actual_files)
+
+        self.check_content(objs, [product])
+
+        index_obj = test_bucket.Object(COMMONS_CLIENT_INDEX)
+        index_content = str(index_obj.get()["Body"].read(), "utf-8")
+        self.assertIn('../', index_content)
+        self.assertIn('4.5.6/', index_content)
+        self.assertIn(
+            ''
+            'maven-metadata.xml',
+            index_content
+        )
+        self.assertIn(
+            ''
+            'maven-metadata.xml.md5',
+            index_content
+        )
+        self.assertIn(
+            ''
+            'maven-metadata.xml.sha1',
+            index_content
+        )
+        self.assertIn(
+            ''
+            'maven-metadata.xml.sha256',
+            index_content
+        )
+        self.assertNotIn("4.5.7/", index_content)
+
+        # insert a new file in commons-client
+        commons_client_root = "org/apache/httpcomponents/httpclient/"
+        commons_client_457_test = commons_client_root + "4.5.7/httpclient-4.5.7.txt"
+        self.mock_s3.Bucket(TEST_BUCKET).put_object(
+            Key=commons_client_457_test,
+            Body="Just a test content"
+        )
+        re_index(TEST_BUCKET, "", commons_client_root, "maven")
+        index_obj = test_bucket.Object(COMMONS_CLIENT_INDEX)
+        index_content = str(index_obj.get()["Body"].read(), "utf-8")
+        self.assertIn('../', index_content)
+        self.assertIn('4.5.6/', index_content)
+        self.assertIn(
+            ''
+            'maven-metadata.xml',
+            index_content
+        )
+        self.assertIn(
+            ''
+            'maven-metadata.xml.md5',
+            index_content
+        )
+        self.assertIn(
+            ''
+            'maven-metadata.xml.sha1',
+            index_content
+        )
+        self.assertIn(
+            ''
+            'maven-metadata.xml.sha256',
+            index_content
+        )
+        self.assertIn("4.5.7/", index_content)
+        self.assertNotIn(PROD_INFO_SUFFIX, index_content)
+
     def test_upload_index_with_short_prefix(self):
         self.__test_upload_index_with_prefix(SHORT_TEST_PREFIX)
diff --git a/tests/test_npm_index.py b/tests/test_npm_index.py
index fa0ebc3a..02dc64e0 100644
--- a/tests/test_npm_index.py
+++ b/tests/test_npm_index.py
@@ -15,6 +15,7 @@
 """
 from charon.constants import PROD_INFO_SUFFIX, DEFAULT_REGISTRY
 from charon.pkgs.npm import handle_npm_uploading, handle_npm_del
+from charon.pkgs.indexing import re_index
 from charon.storage import CHECKSUM_META_KEY
 from tests.base import LONG_TEST_PREFIX, SHORT_TEST_PREFIX, PackageBaseTest
 from tests.commons import (
@@ -182,3 +183,108 @@ def __prepare_content(self, prefix: str = None):
             buckets=[('', TEST_BUCKET, prefix, DEFAULT_REGISTRY)],
             dir_=self.tempdir
         )
+
+    def test_re_index(self):
+        test_tgz = os.path.join(INPUTS, "code-frame-7.14.5.tgz")
+        product_7_14_5 = "code-frame-7.14.5"
+        prefix = SHORT_TEST_PREFIX
+
+        handle_npm_uploading(
+            test_tgz, product_7_14_5,
+            buckets=[('', TEST_BUCKET, prefix, DEFAULT_REGISTRY)],
+            dir_=self.tempdir,
+        )
+
+        test_bucket = self.mock_s3.Bucket(TEST_BUCKET)
+        objs = list(test_bucket.objects.all())
+        actual_files = [obj.key for obj in objs]
+
+        prefixed_7158_indexes = [
+            os.path.join(prefix, f) for f in CODE_FRAME_7_15_8_INDEXES
+        ]
+        prefixed_namespace_babel_index = os.path.join(prefix, NAMESPACE_BABEL_INDEX)
+        prefixed_root_index = os.path.join(prefix, COMMONS_ROOT_INDEX)
+
+        for assert_file in prefixed_7158_indexes:
+            self.assertNotIn(assert_file, actual_files)
+
+        # test package path
+        index_obj = test_bucket.Object(prefixed_namespace_babel_index)
+        index_content = str(index_obj.get()["Body"].read(), "utf-8")
+        self.assertIn('code-frame/',
+                      index_content)
+        test_file_path = os.path.join(prefix, "@babel/test/test-file.txt")
+        self.assertNotIn(
+            ''
+            'test/test-file.txt', index_content
+        )
+        # Add an entry and re-index the package path
+        test_bucket.put_object(
+            Key=test_file_path, Body="test content"
+        )
+        re_index(TEST_BUCKET, prefix, "@babel/", "npm")
+        index_obj = test_bucket.Object(prefixed_namespace_babel_index)
+        index_content = str(index_obj.get()["Body"].read(), "utf-8")
+        self.assertIn(
+            'code-frame/', index_content
+        )
+        self.assertIn(
+            'test/', index_content
+        )
+        self.assertIn(
+            '../', index_content
+        )
+        self.assertNotIn(PROD_INFO_SUFFIX, index_content)
+
+        # test root path
+        index_obj = test_bucket.Object(prefixed_root_index)
+        index_content = str(index_obj.get()["Body"].read(), "utf-8")
+        self.assertIn('@babel/', index_content)
+        test_file_path = os.path.join(prefix, "test/test-file.txt")
+        self.assertNotIn(
+            ''
+            'test/test-file.txt', index_content
+        )
+        # Add an entry and re-index the root path
+        test_bucket.put_object(
+            Key=test_file_path, Body="test content"
+        )
+        re_index(TEST_BUCKET, prefix, "/", "npm")
+        index_obj = test_bucket.Object(prefixed_root_index)
+        index_content = str(index_obj.get()["Body"].read(), "utf-8")
+        self.assertIn('@babel/', index_content)
+        self.assertIn(
+            ''
+            'test/', index_content
+        )
+        self.assertNotIn('../', index_content)
+        self.assertNotIn(PROD_INFO_SUFFIX, index_content)
+
+        # test metadata path
+        metadata_path = "@babel/code-frame/"
+        objs = list(test_bucket.objects.all())
+        actual_files = [obj.key for obj in objs]
+        self.assertIn(
+            os.path.join(prefix, metadata_path, "package.json"),
+            actual_files
+        )
+        self.assertNotIn(
+            os.path.join(prefix, metadata_path, "index.html"),
+            actual_files
+        )
+        # Add an entry and re-index the metadata path; index.html should
+        # still not be generated because package.json is present there
+        test_file_path = os.path.join(prefix, metadata_path, "test/test-file.txt")
+        test_bucket.put_object(
+            Key=test_file_path, Body="test content"
+        )
+        re_index(TEST_BUCKET, prefix, metadata_path, "npm")
+        objs = list(test_bucket.objects.all())
+        actual_files = [obj.key for obj in objs]
+        self.assertIn(
+            os.path.join(prefix, metadata_path, "package.json"),
+            actual_files
+        )
+        self.assertNotIn(
+            os.path.join(prefix, metadata_path, "index.html"),
+            actual_files
+        )
diff --git a/tests/test_s3client.py b/tests/test_s3client.py
index 1c78db2b..b33e68d7 100644
--- a/tests/test_s3client.py
+++ b/tests/test_s3client.py
@@ -390,6 +390,79 @@ def test_exists_override_failing(self):
         file_obj = bucket.Object(path)
         self.assertEqual(sha1, file_obj.metadata[CHECKSUM_META_KEY])

+    def test_simple_upload_file(self):
+        (temp_root, _, all_files) = self.__prepare_files()
+        for file_path in all_files:
+            file_key = file_path[len(temp_root) + 1:]
+            with open(file_path, "rb") as f:
+                file_content = f.read()
+            sha1 = read_sha1(file_path)
+            self.s3_client.simple_upload_file(
+                file_path=file_key,
+                file_content=file_content,
+                check_sum_sha1=sha1,
+                target=(MY_BUCKET, '')
+            )
+        bucket = self.mock_s3.Bucket(MY_BUCKET)
+
+        objects = list(bucket.objects.all())
+        self.assertEqual(len(all_files), len(objects))
+        file_path = all_files[0]
+        file_key = file_path[len(temp_root) + 1:]
+        with open(file_path, "rb") as f:
+            file_content = f.read()
+        sha1 = read_sha1(file_path)
+        obj = bucket.Object(file_key)
+        self.assertEqual(sha1, obj.metadata[CHECKSUM_META_KEY])
+        self.assertEqual(file_key, obj.key)
+        self.assertEqual(
+            str(file_content, sys.getdefaultencoding()),
+            str(obj.get()["Body"].read(), sys.getdefaultencoding())
+        )
+
+        # uploading to an existing key must raise
+        self.assertRaises(
+            FileExistsError,
+            self.s3_client.simple_upload_file,
+            file_path=file_key,
+            file_content="file_content",
+            check_sum_sha1=sha1,
+            target=(MY_BUCKET, '')
+        )
+
+        shutil.rmtree(temp_root)
+
+    def test_simple_delete_file(self):
+        # prepare files
+        (temp_root, _, all_files) = self.__prepare_files()
+        for file_path in all_files:
+            file_key = file_path[len(temp_root) + 1:]
+            with open(file_path, "rb") as f:
+                file_content = f.read()
+            sha1 = read_sha1(file_path)
+            self.s3_client.simple_upload_file(
+                file_path=file_key,
+                file_content=file_content,
+                check_sum_sha1=sha1,
+                target=(MY_BUCKET, '')
+            )
+        bucket = self.mock_s3.Bucket(MY_BUCKET)
+
+        objects = list(bucket.objects.all())
+        self.assertEqual(len(all_files), len(objects))
+
+        # test file deletion
+        file_key = all_files[0][len(temp_root) + 1:]
+        objects = list(bucket.objects.all())
+        self.assertIn(file_key, [o.key for o in objects])
+        self.s3_client.simple_delete_file(
+            file_path=file_key,
+            target=(MY_BUCKET, "")
+        )
+
+        objects = list(bucket.objects.all())
+        self.assertEqual(len(all_files) - 1, len(objects))
+        self.assertNotIn(file_key, [o.key for o in objects])
+
+        shutil.rmtree(temp_root)
+
     def __prepare_files(self):
         test_zip = zipfile.ZipFile(
             os.path.join(INPUTS, "commons-lang3.zip")
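To exercise the new code paths locally, an invocation along these lines should work, assuming the repo's usual pytest setup (the `-k` expression is illustrative):

```bash
python -m pytest tests/test_s3client.py tests/test_maven_index.py \
    tests/test_npm_index.py -k "simple or re_index" -v
```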