diff --git a/charon.spec b/charon.spec index 9c7210fa..3fb320db 100644 --- a/charon.spec +++ b/charon.spec @@ -80,6 +80,11 @@ export LANG=en_US.UTF-8 LANGUAGE=en_US.en LC_ALL=en_US.UTF-8 %changelog +* Tue May 7 2024 Gang Li +- 1.3.1 release +- Add checksum refresh command: refresh checksum files for maven artifacts +- Refactor the CF invalidating commands into cf sub command + * Fri Apr 12 2024 Gang Li - 1.3.0 release - Add validate command: validate the checksum for maven artifacts diff --git a/charon/cmd/__init__.py b/charon/cmd/__init__.py index b2cafd05..16a0129d 100644 --- a/charon/cmd/__init__.py +++ b/charon/cmd/__init__.py @@ -17,8 +17,8 @@ from charon.cmd.cmd_upload import upload from charon.cmd.cmd_delete import delete from charon.cmd.cmd_index import index -from charon.cmd.cmd_checksum import checksum_validate -from charon.cmd.cmd_cache import cf_invalidate, cf_check +from charon.cmd.cmd_checksum import init_checksum, checksum +from charon.cmd.cmd_cache import init_cf, cf @group() @@ -33,6 +33,11 @@ def cli(): cli.add_command(upload) cli.add_command(delete) cli.add_command(index) -cli.add_command(checksum_validate) -cli.add_command(cf_invalidate) -cli.add_command(cf_check) + +# init cf command +init_cf() +cli.add_command(cf) + +# init checksum command +init_checksum() +cli.add_command(checksum) diff --git a/charon/cmd/cmd_cache.py b/charon/cmd/cmd_cache.py index 95aae658..65cf87fb 100644 --- a/charon/cmd/cmd_cache.py +++ b/charon/cmd/cmd_cache.py @@ -18,7 +18,7 @@ from charon.cmd.internal import _decide_mode, _get_buckets from charon.cache import CFClient from charon.pkgs.pkg_utils import invalidate_cf_paths -from click import command, option, argument +from click import command, option, argument, group from typing import List, Tuple import traceback @@ -54,7 +54,7 @@ "-f", "path_file", help=""" - The file which contain the paths to be invalidated in CF. Pahts in this file follow the + The file which contain the paths to be invalidated in CF. 
Paths in this file follow the format of CF defining too, and each path should be in a single line. """ ) @@ -75,7 +75,7 @@ default=False ) @command() -def cf_invalidate( +def invalidate( target: str, paths: List[str], path_file: str, @@ -161,7 +161,7 @@ def cf_invalidate( default=False ) @command() -def cf_check( +def check( invalidation_id: str, target: str, quiet: bool = False, @@ -214,3 +214,15 @@ def _init_cmd(target: str) -> Tuple[List[Tuple[str, str, str, str, str]], str]: sys.exit(1) return (_get_buckets([target], conf), aws_profile) + + +@group() +def cf(): + """cf commands are responsible for the CloudFront cache operations in + products operated by Charon + """ + + +def init_cf(): + cf.add_command(invalidate) + cf.add_command(check) diff --git a/charon/cmd/cmd_checksum.py b/charon/cmd/cmd_checksum.py index 1591df77..e628d825 100644 --- a/charon/cmd/cmd_checksum.py +++ b/charon/cmd/cmd_checksum.py @@ -13,12 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. """ -from typing import List +from typing import List, Tuple from charon.config import get_config -from charon.pkgs.checksum_http import handle_checksum_validation_http +from charon.pkgs.checksum_http import ( + handle_checksum_validation_http, refresh_checksum +) from charon.cmd.internal import _decide_mode -from click import command, option, argument +from click import command, option, argument, group import traceback import logging @@ -99,7 +101,7 @@ required=True ) @command() -def checksum_validate( +def validate( path: str, target: str, includes: List[str], @@ -118,22 +120,11 @@ def checksum_validate( be recorded. 
""" _decide_mode( - "checksum-{}".format(target), path.replace("/", "_"), + "checksum-validate-{}".format(target), path.replace("/", "_"), is_quiet=quiet, is_debug=debug ) try: - conf = get_config() - if not conf: - sys.exit(1) - - aws_bucket = "" - root_path = "" - t = conf.get_target(target) - if not t: - sys.exit(1) - for b in t: - aws_bucket = b.get('bucket') - prefix = b.get('prefix', '') + (aws_bucket, prefix) = _init_cmd(target) # NOTE: This is a liitle hacky, which constrain the configuration of # of target should define the bucket to contain "prod-maven" @@ -153,3 +144,129 @@ def checksum_validate( except Exception: print(traceback.format_exc()) sys.exit(2) + + +@option( + "--debug", + "-D", + "debug", + help="Debug mode, will print all debug logs for problem tracking.", + is_flag=True, + default=False +) +@option( + "--quiet", + "-q", + "quiet", + help="Quiet mode, will shrink most of the logs except warning and errors.", + is_flag=True, + default=False +) +@option( + "--path", + "-p", + "paths", + help=""" + The paths of artifact files to do checksum refreshing. + """, + multiple=True +) +@option( + "--path-file", + "-f", + "path_file", + help=""" + The file which contain the paths of artifact files to do checksum refreshing. + Each path in this file should be in a single line. + """ +) +@option( + "--target", + "-t", + "target", + help=""" + The target to do the uploading, which will decide which s3 bucket + and what root path where all files will be uploaded to. + Can accept more than one target. + """, + required=True +) +@command() +def refresh( + target: str, + paths: List[str], + path_file: str, + quiet: bool = False, + debug: bool = False +): + """ + Refresh the checksum of the specified path for the target maven repository. + It will calculate the checksum files of the specified artifact and see if + unmatched, then regenerate the checksum files based on the artifact. + Default checksum files include .md5, .sha1. 
+ """ + _decide_mode( + "checksum-refresh-{}".format(target), "", + is_quiet=quiet, is_debug=debug, use_log_file=False + ) + if not paths and not path_file: + logger.error( + "No path specified, please specify at least one path " + "through --path or --path-file.") + sys.exit(1) + + work_paths = [] + if paths: + work_paths.extend(paths) + + conf = get_config() + aws_profile = os.getenv("AWS_PROFILE") or conf.get_aws_profile() + if not aws_profile: + logger.error("No AWS profile specified!") + sys.exit(1) + + if path_file: + with open(path_file, "r", encoding="utf-8") as f: + for line in f.readlines(): + work_paths.append(str(line).strip()) + try: + (aws_bucket, prefix) = _init_cmd(target) + + # NOTE: This is a liitle hacky, which constrain the configuration of + # of target should define the bucket to contain "prod-maven" + # or "stage-maven" to decide that the bucket is for maven repo + # in our defined aws env for production or stage + if "prod-maven" not in aws_bucket and "stage-maven" not in aws_bucket: + logger.error("The target %s is not a maven repository.", target) + sys.exit(1) + + refresh_checksum((aws_bucket, prefix), work_paths, aws_profile) + except Exception: + print(traceback.format_exc()) + sys.exit(2) + + +def _init_cmd(target: str) -> Tuple[str, str]: + conf = get_config() + if not conf: + sys.exit(1) + aws_bucket = "" + t = conf.get_target(target) + if not t: + sys.exit(1) + for b in t: + aws_bucket = b.get('bucket') + prefix = b.get('prefix', '') + return (aws_bucket, prefix) + + +@group() +def checksum(): + """checksum commands are responsible to operate checksum files + of maven products operated by Charon + """ + + +def init_checksum(): + checksum.add_command(validate) + checksum.add_command(refresh) diff --git a/charon/pkgs/checksum_http.py b/charon/pkgs/checksum_http.py index 515bf5a3..4654559b 100644 --- a/charon/pkgs/checksum_http.py +++ b/charon/pkgs/checksum_http.py @@ -13,7 +13,8 @@ See the License for the specific language governing 
permissions and limitations under the License. """ -from charon.utils.files import digest +from charon.utils.files import digest, HashType +from charon.storage import S3Client from typing import Tuple, List, Dict from html.parser import HTMLParser import tempfile @@ -36,9 +37,8 @@ def handle_checksum_validation_http( skips: List[str] = None ): """ Handle the checksum check for maven artifacts. - * target contains bucket name and prefix for the bucket, which will - be used to store artifacts with the prefix. See target definition - in Charon configuration for details. + * bucket contains store artifacts with the prefix. See target + definition in Charon configuration for details. * path is the root path where to start the validation in the bucket. * includes are the file suffixes which will decide the types of files to do the validation. @@ -266,3 +266,84 @@ def _decide_root_url(bucket: str) -> str: if bucket.strip().startswith("stage-maven"): return "https://maven.stage.repository.redhat.com" return None + + +def refresh_checksum( + target: Tuple[str, str], + paths: List[str], + aws_profile: str = None +): + """Refresh checksum for files in a given bucket. + * bucket contains store artifacts with the prefix. See target + definition in Charon configuration for details. + * paths are the exact files whose checksum files will be + refreshed with. + """ + bucket_name = target[0] + prefix = target[1] + s3_client = S3Client(aws_profile=aws_profile) + real_prefix = prefix if prefix.strip() != "/" else "" + filetype_filter = [".prodinfo", ".sha1", ".sha256", ".md5"] + for path in paths: + is_artifact = True + for filetype in filetype_filter: + if path.strip().endswith(filetype): + is_artifact = False + continue + if not is_artifact: + logger.info( + "%s is not an artifact file for maven products. 
Skipped.", + path + ) + continue + s3_path = os.path.join(real_prefix, path) + checksums = { + ".md5": HashType.MD5, + ".sha1": HashType.SHA1, + ".sha256": HashType.SHA256, + ".sha512": HashType.SHA512 + } + if s3_client.file_exists_in_bucket(bucket_name, s3_path): + temp_f = os.path.join(tempfile.gettempdir(), path) + folder = os.path.dirname(temp_f) + try: + if not os.path.exists(folder): + os.makedirs(folder) + s3_client.download_file(bucket_name, s3_path, temp_f) + existed_checksum_types = [] + for file_type in checksums: + s3_checksum_path = s3_path + file_type + if s3_client.file_exists_in_bucket(bucket_name, s3_checksum_path): + existed_checksum_types.append(file_type) + if existed_checksum_types: + for file_type in existed_checksum_types: + checksum_path = path + file_type + s3_checksum_path = s3_path + file_type + hash_type = checksums[file_type] + correct_checksum_c = digest(temp_f, hash_type) + original_checksum_c = s3_client.read_file_content( + bucket_name, s3_checksum_path + ) + if correct_checksum_c == original_checksum_c: + logger.info("Checksum %s matches, no need to refresh.", checksum_path) + else: + logger.info("Checksum %s does not match, refreshing...", checksum_path) + s3_client.simple_upload_file( + file_path=checksum_path, + file_content=correct_checksum_c, + target=(bucket_name, prefix), + mime_type="text/plain", + force=True + ) + else: + logger.warning( + "No valid checksum files exist for %s, Skipped." 
+ " Are you sure it is a valid maven artifact?", + path + ) + finally: + if folder and folder != tempfile.gettempdir() and os.path.exists(folder): + shutil.rmtree(folder) + logger.info("Checksums are refreshed for artifact %s", path) + else: + logger.warning("File %s does not exist in bucket %s", s3_path, bucket_name) diff --git a/charon/pkgs/pkg_utils.py b/charon/pkgs/pkg_utils.py index 9325f14b..9d57def1 100644 --- a/charon/pkgs/pkg_utils.py +++ b/charon/pkgs/pkg_utils.py @@ -117,7 +117,7 @@ def invalidate_cf_paths( non_completed[status] = ids logger.info( "The CF invalidating requests done, following requests " - "are not completed yet:\n %s\nPlease use cf-check command to " + "are not completed yet:\n %s\nPlease use 'cf check' command to " "check its details.", non_completed ) logger.debug( diff --git a/charon/storage.py b/charon/storage.py index 34ae1274..a34a079d 100644 --- a/charon/storage.py +++ b/charon/storage.py @@ -675,14 +675,15 @@ def simple_upload_file( self, file_path: str, file_content: str, target: Tuple[str, str], mime_type: str = None, - check_sum_sha1: str = None + check_sum_sha1: str = None, + force: bool = False ): """ Uploads file to s3 bucket, regardless of any extra information like product and version info. - * Warning: this will directly overwrite the files even if - it has lots of product info, so please be careful to use. - If you want to upload product artifact files, please use - upload_files + * Warning: If force is set True, it will directly overwrite + the files even if it has lots of product info, so please be + careful to use. 
If you want to upload product artifact files, + please use upload_files() """ bucket = target[0] prefix = target[1] @@ -705,7 +706,7 @@ def simple_upload_file( content_type = mime_type if not content_type: content_type = DEFAULT_MIME_TYPE - if not existed: + if not existed or force: f_meta = {} if check_sum_sha1 and check_sum_sha1.strip() != "": f_meta[CHECKSUM_META_KEY] = check_sum_sha1 @@ -723,7 +724,9 @@ file_path, bucket, e ) else: - raise FileExistsError("Error: file %s already exists, upload is forbiden.") + raise FileExistsError( + f"Error: file {path_key} already exists, upload is forbidden." + ) def delete_manifest(self, product_key: str, target: str, manifest_bucket_name: str): if not manifest_bucket_name: @@ -780,6 +783,10 @@ def read_file_content(self, bucket_name: str, key: str) -> str: file_object = bucket.Object(key) return str(file_object.get()['Body'].read(), 'utf-8') + def download_file(self, bucket_name: str, key: str, file_path: str): + bucket = self.__get_bucket(bucket_name) + bucket.download_file(key, file_path) + def list_folder_content(self, bucket_name: str, folder: str) -> List[str]: """List the content in folder in an s3 bucket. 
Note it's not recursive, which means the content only contains the items in that folder, but diff --git a/charon/utils/files.py b/charon/utils/files.py index f15f77c4..d811200b 100644 --- a/charon/utils/files.py +++ b/charon/utils/files.py @@ -27,6 +27,21 @@ class HashType(Enum): MD5 = 0 SHA1 = 1 SHA256 = 2 + SHA512 = 3 + + +def get_hash_type(type_str: str) -> HashType: + """Get hash type from string""" + if type_str.lower() == "md5": + return HashType.MD5 + elif type_str.lower() == "sha1": + return HashType.SHA1 + elif type_str.lower() == "sha256": + return HashType.SHA256 + elif type_str.lower() == "sha512": + return HashType.SHA512 + else: + raise ValueError("Unsupported hash type: {}".format(type_str)) def overwrite_file(file_path: str, content: str): @@ -45,7 +60,7 @@ def read_sha1(file: str) -> str: they are used for hashing, so we will directly calculate its sha1 hash through digesting. """ if os.path.isfile(file): - non_search_suffix = [".md5", ".sha1", ".sha256"] + non_search_suffix = [".md5", ".sha1", ".sha256", ".sha512"] _, suffix = os.path.splitext(file) if suffix not in non_search_suffix: sha1_file = file + ".sha1" @@ -89,6 +104,8 @@ def _hash_object(hash_type: HashType): hash_obj = hashlib.sha256() elif hash_type == HashType.MD5: hash_obj = hashlib.md5() + elif hash_type == HashType.SHA512: + hash_obj = hashlib.sha512() else: raise Exception("Error: Unknown hash type for digesting.") return hash_obj diff --git a/requirements.txt b/requirements.txt index cc669871..4f1b20ad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ setuptools-rust==1.7.0 -Jinja2==3.1.3 +Jinja2==3.1.4 boto3==1.28.46 botocore==1.31.46 click==8.1.7 diff --git a/setup.py b/setup.py index 98d6be4d..0c11c688 100755 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ """ from setuptools import setup, find_packages -version = "1.3.0" +version = "1.3.1" # f = open('README.md') # long_description = f.read().strip() diff --git a/tests/test_checksum_refresh.py 
b/tests/test_checksum_refresh.py new file mode 100644 index 00000000..c5e8dff3 --- /dev/null +++ b/tests/test_checksum_refresh.py @@ -0,0 +1,160 @@ +""" +Copyright (C) 2022 Red Hat, Inc. (https://github.com/Commonjava/charon) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +from charon.pkgs.checksum_http import refresh_checksum +from charon.utils.files import ( + digest, HashType, get_hash_type +) +from tests.constants import INPUTS +from tests.base import PackageBaseTest +from tests.commons import TEST_BUCKET_2 +from moto import mock_aws +from botocore.errorfactory import ClientError +from botocore.exceptions import HTTPClientError +import os +import tempfile +import shutil + +CHECKSUMS = { + ".md5": HashType.MD5, + ".sha1": HashType.SHA1, + ".sha256": HashType.SHA256, + ".sha512": HashType.SHA512 +} + + +@mock_aws +class ChecksumTest(PackageBaseTest): + def setUp(self): + super().setUp() + self.target_ = (TEST_BUCKET_2, '') + self.tmp_dir = tempfile.mkdtemp(prefix="charon-checksum-test-") + self.mock_s3.create_bucket(Bucket=TEST_BUCKET_2) + self.test_bucket_2 = self.mock_s3.Bucket(TEST_BUCKET_2) + + def tearDown(self): + buckets = [TEST_BUCKET_2] + self.cleanBuckets(buckets) + shutil.rmtree(self.tmp_dir) + super().tearDown() + + def test_checksum_refresh_not_matched(self): + fs = ["commons-client-4.5.6.zip", "commons-client-4.5.9.zip"] + checksum_types = [".md5", ".sha1", ".sha256"] + for f in fs: + self.__upload_file(f) + for hash_file_type in checksum_types: + 
self.__upload_content( + f"wrong {hash_file_type}", + self.__get_test_key(f"{f}{hash_file_type}") + ) + new_fs = [ + self.__get_test_key("commons-client-4.5.6.zip"), + self.__get_test_key("commons-client-4.5.9.zip") + ] + refresh_checksum(self.target_, new_fs) + for f_ in fs: + for hash_file_type in checksum_types: + f = self.__download_file(self.__get_test_key(f"{f_}{hash_file_type}")) + self.assertTrue(os.path.exists(f)) + original_digest = self.__get_digest(f_, CHECKSUMS[hash_file_type]) + with open(f, "rb") as f: + updated_checksum = str(f.read(), encoding="utf-8") + self.assertEqual(original_digest, updated_checksum) + for non_existed_hash_type in [".sha512"]: + self.assertFalse( + self.__check_file_exists( + self.__get_test_key(f"{f_}{non_existed_hash_type}") + ) + ) + + def test_checksum_refresh_already_matched(self): + f_ = "commons-client-4.5.6.zip" + self.__upload_file(f_) + checksum_types = ["md5", "sha1"] + for hash_file_type in checksum_types: + hash_f = f"{f_}.{hash_file_type}" + self.__upload_content( + self.__get_digest( + f_, + get_hash_type(hash_file_type) + ), + self.__get_test_key(hash_f) + ) + refresh_checksum(self.target_, [self.__get_test_key(f_)]) + f = self.__download_file(self.__get_test_key(f_)) + self.assertTrue(os.path.exists(f)) + for hash_file_type in checksum_types: + type_key = f".{hash_file_type}" + f = self.__download_file(self.__get_test_key(f"{f_}{type_key}")) + self.assertTrue(os.path.exists(f)) + original_digest = self.__get_digest(f_, CHECKSUMS[type_key]) + with open(f, "rb") as f: + updated_checksum = str(f.read(), encoding="utf-8") + self.assertEqual(original_digest, updated_checksum) + for non_existed_hash_type in [".sha256", ".sha512"]: + self.assertFalse( + self.__check_file_exists(self.__get_test_key(f"{f_}{non_existed_hash_type}")) + ) + + def test_checksum_refresh_missing(self): + f_ = "commons-client-4.5.6.zip" + self.__upload_file(f_) + key = self.__get_test_key(f_) + refresh_checksum(self.target_, [key]) + f = 
self.__download_file(key) + self.assertTrue(os.path.exists(f)) + for non_existed_hash_type in [".md5", ".sha1", ".sha256", ".sha512"]: + self.assertFalse( + self.__check_file_exists(self.__get_test_key(f"{f_}{non_existed_hash_type}")) + ) + + def __get_digest(self, file, hash_type): + real_file = os.path.join(INPUTS, file) + return digest(real_file, hash_type) + + def __get_test_key(self, file): + return os.path.join("test", file) + + def __upload_file(self, file): + real_file = os.path.join(INPUTS, file) + self.test_bucket_2.put_object( + Body=open(real_file, "rb"), Key=self.__get_test_key(file), + ContentEncoding="utf-8" + ) + + def __upload_content(self, content, key): + self.test_bucket_2.put_object( + Body=content, Key=key, + ContentEncoding="utf-8" + ) + + def __download_file(self, key) -> str: + file_path = os.path.join(self.tmp_dir, key) + folder_ = os.path.dirname(file_path) + if not os.path.exists(folder_): + os.makedirs(folder_) + self.test_bucket_2.download_file(key, file_path) + return file_path + + def __check_file_exists(self, key) -> bool: + try: + self.test_bucket_2.Object(key).load() + return True + except (ClientError, HTTPClientError) as e: + if isinstance(e, ClientError) and e.response["Error"]["Code"] == "404": + return False + else: + raise e diff --git a/tests/test_s3client.py b/tests/test_s3client.py index 48063daa..706c59c1 100644 --- a/tests/test_s3client.py +++ b/tests/test_s3client.py @@ -463,6 +463,34 @@ def test_simple_delete_file(self): shutil.rmtree(temp_root) + def test_download_file(self): + (temp_root, _, all_files) = self.__prepare_files() + for file_path in all_files: + file_key = file_path[len(temp_root) + 1:] + file_content = open(file_path, "rb").read() + sha1 = read_sha1(file_path) + self.s3_client.simple_upload_file( + file_path=file_key, + file_content=file_content, + check_sum_sha1=sha1, + target=(MY_BUCKET, '') + ) + + # test download file start + file_key = all_files[0][len(temp_root) + 1:] + file_path = 
os.path.join(temp_root, file_key) + original_sha1 = read_sha1(file_path) + os.remove(file_path) + self.assertFalse(os.path.exists(file_path)) + self.s3_client.download_file( + MY_BUCKET, file_key, file_path + ) + self.assertTrue(os.path.exists(file_path)) + download_sha1 = read_sha1(file_path) + self.assertEqual(original_sha1, download_sha1) + + shutil.rmtree(temp_root) + def __prepare_files(self): test_zip = zipfile.ZipFile( os.path.join(INPUTS, "commons-lang3.zip")