From d5f4a2f3f072953134ce9a5cb93daacf65132c29 Mon Sep 17 00:00:00 2001 From: Gang Li Date: Thu, 16 Dec 2021 20:27:43 +0800 Subject: [PATCH 1/2] Fix: add ignored dirs configuration Current ignore_patterns only address the file names, but not the whole directories. It's a little hard to use these regex patterns to decide if the whole path should be ignored by the pattern. This PR adds a new configuration "ignore_dirs", which means all files in these ignored_dirs will be ignored for uploading and removing, wherever these dirs are located (root or the sub folder) --- charon/config.py | 4 +++ charon/pkgs/maven.py | 39 ++++++++++++++++++++------- tests/base.py | 5 ++++ tests/input/commons-client-4.5.6.zip | Bin 15062 -> 18619 bytes 4 files changed, 39 insertions(+), 9 deletions(-) diff --git a/charon/config.py b/charon/config.py index c4324250..c14c0d6b 100644 --- a/charon/config.py +++ b/charon/config.py @@ -34,6 +34,7 @@ class CharonConfig(object): """ def __init__(self, data: Dict): self.__ignore_patterns: List[str] = data.get("ignore_patterns", None) + self.__ignore_dirs: List[str] = data.get("ignore_dirs", None) self.__aws_profile: str = data.get("aws_profile", None) self.__targets: Dict = data.get("targets", None) if not self.__targets or not isinstance(self.__targets, Dict): @@ -43,6 +44,9 @@ def __init__(self, data: Dict): def get_ignore_patterns(self) -> List[str]: return self.__ignore_patterns + def get_ignore_dirs(self) -> List[str]: + return self.__ignore_dirs + def get_aws_profile(self) -> str: return self.__aws_profile diff --git a/charon/pkgs/maven.py b/charon/pkgs/maven.py index 2ac68a4b..47d20144 100644 --- a/charon/pkgs/maven.py +++ b/charon/pkgs/maven.py @@ -20,7 +20,7 @@ from charon.utils.strings import remove_prefix from charon.storage import S3Client from charon.pkgs.pkg_utils import upload_post_process, rollback_post_process -from charon.config import get_template +from charon.config import get_template, get_config from charon.constants import 
(META_FILE_GEN_KEY, META_FILE_DEL_KEY, META_FILE_FAILED, MAVEN_METADATA_TEMPLATE, ARCHETYPE_CATALOG_TEMPLATE, ARCHETYPE_CATALOG_FILENAME, @@ -254,7 +254,7 @@ def __gen_digest_file(hash_file_path, meta_file_path: str, hashtype: HashType) - def handle_maven_uploading( repo: str, prod_key: str, - ignore_patterns=None, + ignore_patterns: List[str] = None, root="maven-repository", targets: List[Tuple[str, str, str]] = None, aws_profile=None, @@ -284,10 +284,11 @@ def handle_maven_uploading( # 2. scan for paths and filter out the ignored paths, # and also collect poms for later metadata generation + ignore_dirs = get_config().get_ignore_dirs() (top_level, valid_mvn_paths, valid_poms, - valid_dirs) = _scan_paths(tmp_root, ignore_patterns, root) + valid_dirs) = _scan_paths(tmp_root, ignore_patterns, ignore_dirs, root) # This prefix is a subdir under top-level directory in tarball # or root before real GAV dir structure @@ -447,10 +448,11 @@ def handle_maven_del( # 2. scan for paths and filter out the ignored paths, # and also collect poms for later metadata generation + ignore_dirs = get_config().get_ignore_dirs() (top_level, valid_mvn_paths, valid_poms, - valid_dirs) = _scan_paths(tmp_root, ignore_patterns, root) + valid_dirs) = _scan_paths(tmp_root, ignore_patterns, ignore_dirs, root) # 3. Delete all valid_paths from s3 logger.debug("Valid poms: %s", valid_poms) @@ -589,16 +591,30 @@ def _extract_tarball(repo: str, prefix="", dir__=None) -> str: sys.exit(1) -def _scan_paths(files_root: str, ignore_patterns: List[str], +def _scan_paths(files_root: str, ignore_patterns: List[str], ignore_dirs: List[str], root: str) -> Tuple[str, List[str], List[str], List[str]]: # 2. 
scan for paths and filter out the ignored paths, # and also collect poms for later metadata generation logger.info("Scan %s to collect files", files_root) top_level = root - valid_mvn_paths, non_mvn_paths, ignored_paths, valid_poms, valid_dirs = [], [], [], [], [] + valid_mvn_paths, non_mvn_paths, files_ignored, dirs_ignored, \ + valid_poms, valid_dirs = [], [], [], [], [], [] changed_dirs = set() top_found = False for root_dir, dirs, names in os.walk(files_root): + ignored_current_dir = False + if ignore_dirs: + for ignored in ignore_dirs: + ignored = ignored if ignored.endswith("/") else ignored + "/" + if ignored in root_dir: + logger.debug("Found ignored directory %s, " + "all files will be ignored in this dir", + root_dir) + ignored_current_dir = True + dirs_ignored.append(root_dir) + break + if ignored_current_dir: + continue for directory in dirs: changed_dirs.add(os.path.join(root_dir, directory)) if not top_found: @@ -615,7 +631,7 @@ def _scan_paths(files_root: str, ignore_patterns: List[str], # Let's wait to do the regex / pom examination until we # know we're inside a valid root directory. 
if _is_ignored(name, ignore_patterns): - ignored_paths.append(path) + files_ignored.append(path) continue valid_mvn_paths.append(path) @@ -644,10 +660,15 @@ def _scan_paths(files_root: str, ignore_patterns: List[str], valid_dirs.append(c) logger.info("Files scanning done.\n") + if ignore_dirs and len(ignore_dirs) > 0: + logger.info( + "Ignored all files in these directories: \n%s\n", + "\n".join(dirs_ignored) + ) if ignore_patterns and len(ignore_patterns) > 0: logger.info( "Ignored paths with ignore_patterns %s as below:\n%s\n", - ignore_patterns, "\n".join(ignored_paths) + ignore_patterns, "\n".join(files_ignored) ) return (top_level, valid_mvn_paths, valid_poms, valid_dirs) @@ -994,7 +1015,7 @@ def __hash_decorate_metadata(path: str, metadata: str) -> List[str]: def _is_ignored(filename: str, ignore_patterns: List[str]) -> bool: for ignored_name in STANDARD_GENERATED_IGNORES: if ignored_name in filename: - logger.info("Ignoring standard generated Maven path: %s", filename) + logger.debug("Ignoring standard generated Maven path: %s", filename) return True if ignore_patterns: diff --git a/tests/base.py b/tests/base.py index 39995023..00166fa7 100644 --- a/tests/base.py +++ b/tests/base.py @@ -43,6 +43,11 @@ def setUp(self): - ".*^(redhat).*" - ".*snapshot.*" +ignore_dirs: + - ".meta" + - ".index" + - ".nexus" + targets: ga: bucket: "charon-test" diff --git a/tests/input/commons-client-4.5.6.zip b/tests/input/commons-client-4.5.6.zip index 7ce27974ab2b78614e460bb6cdd691b40359bfc7..d083e95a27b3249d340032a703617a6ecb40c65e 100644 GIT binary patch delta 3141 zcmai0ZA?>V6u$SprG-MEg+hT|X+gnK+EP9W1vkusS!QI4K@$VA(J2^{Ar2uS3$XCR z`9~Mv7>ANAy7_BnAILtOY#GtWvTVjbi!*G?wrt5<%-j;=vOi=y=e?KPzPF$xZ$saE zp68tRea?MOZx1g_$yx0?6pLM=znwEvSrO&ivW2Ow9(e`^6*;<%=^7CYaXo$TPe`bP*R3M%u$hmRcW9jV^C zCqq)CjSF#H7_4A0oMVk{{qexyA#+FyUzGSfNr1kAffUf9sfZ>5c}<`f;)5D6h6OG1 zf}8|gihjB!4XdZMjeu=y05NL_3BpGQ`uU-VgVyj!rX*YLDw4!MhI)?;*%NmSqb-ml 
zSHL36&*Cua;k|Kw^~DTE;Vf&@Zk>O#C|&wDWtbg@SCrn>hu@Duw)I6X0?qj-l?&@m zmyoW{c0FCw`13KipEJ+yjKNv!H}dFwP5MW+3@vUyJ3jxFrB&uRESAsQ8e-0LgSw1I zdPjzjX}7ikPm7CW#9&QE?Xe+aBE)It(`8Hm3>lL^OQs@p8AXsGV{1Z20%M(xgKYEQs_{)hI}R|xO=ZO86av0g{9T+SZfBjq`X*S;sWMV3ba@(f{D#$d4E zOJ*O3yRK{a=u-X-_FxRQxG$q<_uU^mSm8LhUAGjX+BV4jEgsL2`<#R;p?gxx*&7~RCZb0nGA%{1;4-z zyUar-0~xRq@SEuZ0$pz+wnfecR<#jH{%BI6G+2aAx*6I-Rv)^kP*H_J;gJ4(s+3_m zfBN1%X_A5-3l{>aFU4d9!~{p==&r?q=(lSEYXsn34qP>Bts5}wdf=!4?og=CWGmNN zBZ(lzmIZ=qoWN}DTAv^Zx+_0Iibd-rGOBoSB5I*3Djw(92|n`Omx3z0!hzFi7FYDyP$$;7oNH_x+<`4~yq& zXJ~{KEb>8^SLHMNc1Vz{B9AaRVa-^1r=fjwnan`3(@t?1AYSZe>wr@NOq%b!W}LXN5vW(f;nNIX>GK?j83bAFaaz-!cV058Vw}u%WTsi--AcaM#P5G0u;xPl$b^7-LV;V4IDp?4XCI zC~EQ(>bI_6KiuGuBWs&b-z}x zb+CEh1$9PTsm`JKq$5H#ls{&+V*3)^2hD}qAt@x~;iR07pq|qIU!NyB delta 1207 zcmZ`&S!fes5Z?daCYzon&2E}BYqHo@lVWX}YLnQ|HnG;b+QbKmimg(QDqNoIH+Qt)9LD4pM zB;=;Z+(vP3BSD!^=&oCVO+TrV=}!t@17WE2Rj`@0`RsH%kH`|Z?kiVzMtY)W{fFz^ zSR^}Z)8!I;za0OeIR=BXV4Gjh<5;1(-cE1isPVyIF@KT5zO-bA!6Xw*=rRiY+V$XSE<@VI&8V0LK5lj?t^aJR zBOT2|OU946)NLIj`S8&zvlUSA=ZGG+EcYHj2ZBpdKOx;H3vfYIW3{N$GY$vd&x{;5 I%KGR20zZp1dH?_b From 64ce9b38be3c1ea086c9318b523f9f46cb2ab5db Mon Sep 17 00:00:00 2001 From: Gang Li Date: Tue, 4 Jan 2022 18:01:01 +0800 Subject: [PATCH 2/2] Change to use ignore_patterns to ignore dirs --- charon/config.py | 4 ---- charon/pkgs/maven.py | 27 +++++++++++++++------------ charon/storage.py | 9 ++------- charon/utils/strings.py | 7 +++++++ tests/base.py | 9 ++------- tests/commons.py | 5 +++++ tests/test_maven_index.py | 9 +++++++++ tests/test_maven_upload.py | 12 ++++++++---- tests/test_pkgs_dryrun.py | 2 +- 9 files changed, 49 insertions(+), 35 deletions(-) diff --git a/charon/config.py b/charon/config.py index c14c0d6b..c4324250 100644 --- a/charon/config.py +++ b/charon/config.py @@ -34,7 +34,6 @@ class CharonConfig(object): """ def __init__(self, data: Dict): self.__ignore_patterns: List[str] = data.get("ignore_patterns", None) - self.__ignore_dirs: List[str] = data.get("ignore_dirs", 
None) self.__aws_profile: str = data.get("aws_profile", None) self.__targets: Dict = data.get("targets", None) if not self.__targets or not isinstance(self.__targets, Dict): @@ -44,9 +43,6 @@ def __init__(self, data: Dict): def get_ignore_patterns(self) -> List[str]: return self.__ignore_patterns - def get_ignore_dirs(self) -> List[str]: - return self.__ignore_dirs - def get_aws_profile(self) -> str: return self.__aws_profile diff --git a/charon/pkgs/maven.py b/charon/pkgs/maven.py index 47d20144..5a138111 100644 --- a/charon/pkgs/maven.py +++ b/charon/pkgs/maven.py @@ -14,13 +14,14 @@ limitations under the License. """ from charon.utils.files import HashType +from charon.utils.strings import trail_path_with_root import charon.pkgs.indexing as indexing from charon.utils.files import overwrite_file, digest, write_manifest from charon.utils.archive import extract_zip_all from charon.utils.strings import remove_prefix from charon.storage import S3Client from charon.pkgs.pkg_utils import upload_post_process, rollback_post_process -from charon.config import get_template, get_config +from charon.config import get_template from charon.constants import (META_FILE_GEN_KEY, META_FILE_DEL_KEY, META_FILE_FAILED, MAVEN_METADATA_TEMPLATE, ARCHETYPE_CATALOG_TEMPLATE, ARCHETYPE_CATALOG_FILENAME, @@ -284,11 +285,11 @@ def handle_maven_uploading( # 2. scan for paths and filter out the ignored paths, # and also collect poms for later metadata generation - ignore_dirs = get_config().get_ignore_dirs() + # ignore_dirs = get_config().get_ignore_dirs() (top_level, valid_mvn_paths, valid_poms, - valid_dirs) = _scan_paths(tmp_root, ignore_patterns, ignore_dirs, root) + valid_dirs) = _scan_paths(tmp_root, ignore_patterns, root) # This prefix is a subdir under top-level directory in tarball # or root before real GAV dir structure @@ -448,11 +449,11 @@ def handle_maven_del( # 2. 
scan for paths and filter out the ignored paths, # and also collect poms for later metadata generation - ignore_dirs = get_config().get_ignore_dirs() + # ignore_dirs = get_config().get_ignore_dirs() (top_level, valid_mvn_paths, valid_poms, - valid_dirs) = _scan_paths(tmp_root, ignore_patterns, ignore_dirs, root) + valid_dirs) = _scan_paths(tmp_root, ignore_patterns, root) # 3. Delete all valid_paths from s3 logger.debug("Valid poms: %s", valid_poms) @@ -591,7 +592,7 @@ def _extract_tarball(repo: str, prefix="", dir__=None) -> str: sys.exit(1) -def _scan_paths(files_root: str, ignore_patterns: List[str], ignore_dirs: List[str], +def _scan_paths(files_root: str, ignore_patterns: List[str], root: str) -> Tuple[str, List[str], List[str], List[str]]: # 2. scan for paths and filter out the ignored paths, # and also collect poms for later metadata generation @@ -603,10 +604,12 @@ def _scan_paths(files_root: str, ignore_patterns: List[str], ignore_dirs: List[s top_found = False for root_dir, dirs, names in os.walk(files_root): ignored_current_dir = False - if ignore_dirs: - for ignored in ignore_dirs: - ignored = ignored if ignored.endswith("/") else ignored + "/" - if ignored in root_dir: + if ignore_patterns: + for ignored in ignore_patterns: + checking_dir = root_dir + if top_found and top_level in root_dir: + checking_dir = trail_path_with_root(root_dir, top_level) + if re.match(ignored, checking_dir): logger.debug("Found ignored directory %s, " "all files will be ignored in this dir", root_dir) @@ -660,12 +663,12 @@ def _scan_paths(files_root: str, ignore_patterns: List[str], ignore_dirs: List[s valid_dirs.append(c) logger.info("Files scanning done.\n") - if ignore_dirs and len(ignore_dirs) > 0: + if dirs_ignored and len(dirs_ignored) > 0: logger.info( "Ignored all files in these directories: \n%s\n", "\n".join(dirs_ignored) ) - if ignore_patterns and len(ignore_patterns) > 0: + if files_ignored and len(files_ignored) > 0: logger.info( "Ignored paths with 
ignore_patterns %s as below:\n%s\n", ignore_patterns, "\n".join(files_ignored) diff --git a/charon/storage.py b/charon/storage.py index 0cefed71..3cdbfd26 100644 --- a/charon/storage.py +++ b/charon/storage.py @@ -18,7 +18,7 @@ from boto3_type_annotations.s3.service_resource import Object from charon.utils.files import read_sha1 from charon.constants import PROD_INFO_SUFFIX, MANIFEST_SUFFIX - +from charon.utils.strings import trail_path_with_root from boto3 import session from botocore.errorfactory import ClientError from botocore.exceptions import HTTPClientError @@ -748,17 +748,12 @@ def __do_path_cut_and( path_handler: Callable[[str, str, int, int, List[str], asyncio.Semaphore], Awaitable[bool]], root="/" ) -> List[str]: - slash_root = root - if not root.endswith("/"): - slash_root = slash_root + "/" failed_paths = [] index = 1 file_paths_count = len(file_paths) tasks = [] for full_path in file_paths: - path = full_path - if path.startswith(slash_root): - path = path[len(slash_root):] + path = trail_path_with_root(full_path, root) tasks.append( asyncio.ensure_future( path_handler(full_path, path, index, file_paths_count, failed_paths) diff --git a/charon/utils/strings.py b/charon/utils/strings.py index a3f0ff59..f46ed6f8 100644 --- a/charon/utils/strings.py +++ b/charon/utils/strings.py @@ -2,3 +2,10 @@ def remove_prefix(string: str, prefix: str) -> str: if string and prefix and string.startswith(prefix): return string[len(prefix):] return string + + +def trail_path_with_root(path: str, root: str) -> str: + slash_root = root + if not slash_root.endswith("/"): + slash_root = slash_root + "/" + return remove_prefix(path, slash_root) diff --git a/tests/base.py b/tests/base.py index 00166fa7..98c8a630 100644 --- a/tests/base.py +++ b/tests/base.py @@ -40,13 +40,8 @@ def setUp(self): self.__prepare_template(config_base) default_config_content = """ ignore_patterns: - - ".*^(redhat).*" - - ".*snapshot.*" - -ignore_dirs: - - ".meta" - - ".index" - - ".nexus" + - 
.*^(redhat).* + - .*snapshot.* targets: ga: diff --git a/tests/commons.py b/tests/commons.py index 1fd2ead4..dca0e95b 100644 --- a/tests/commons.py +++ b/tests/commons.py @@ -85,6 +85,11 @@ "commons-logging/commons-logging/index.html", "commons-logging/commons-logging/1.2/index.html", ] +IGNORED_META_FOLDER_ITEMS = [ + ".index/org/foo/bar/foo-bar.pom", + ".nexus/org/foo/bar/foo-bar.pom", + ".meta/org/foo/bar/foo-bar.pom" +] COMMONS_CLIENT_INDEX = "org/apache/httpcomponents/httpclient/index.html" COMMONS_CLIENT_456_INDEX = "org/apache/httpcomponents/httpclient/4.5.6/index.html" COMMONS_LOGGING_INDEX = "commons-logging/commons-logging/index.html" diff --git a/tests/test_maven_index.py b/tests/test_maven_index.py index e55cfde6..4147ad32 100644 --- a/tests/test_maven_index.py +++ b/tests/test_maven_index.py @@ -26,6 +26,8 @@ from moto import mock_s3 import os +SHOULD_IGNORED = [r"^\.nexus/.*", r"^\.index/.*", r"^\.meta/.*"] + @mock_s3 class MavenFileIndexTest(PackageBaseTest): @@ -35,6 +37,7 @@ def test_uploading_index(self): product = "commons-client-4.5.6" handle_maven_uploading( test_zip, product, + SHOULD_IGNORED, targets=[(None, TEST_BUCKET, None)], dir_=self.tempdir ) @@ -79,6 +82,7 @@ def test_overlap_upload_index(self): product_456 = "commons-client-4.5.6" handle_maven_uploading( test_zip, product_456, + SHOULD_IGNORED, targets=[(None, TEST_BUCKET, None)], dir_=self.tempdir ) @@ -139,6 +143,7 @@ def __test_upload_index_with_prefix(self, prefix: str): product = "commons-client-4.5.6" handle_maven_uploading( test_zip, product, + SHOULD_IGNORED, targets=[(None, TEST_BUCKET, prefix)], dir_=self.tempdir ) @@ -191,6 +196,7 @@ def test_deletion_index(self): product_456 = "commons-client-4.5.6" handle_maven_del( test_zip, product_456, + SHOULD_IGNORED, targets=[(None, TEST_BUCKET, None)], dir_=self.tempdir ) @@ -262,6 +268,7 @@ def __test_deletion_index_with_prefix(self, prefix: str): product_456 = "commons-client-4.5.6" handle_maven_del( test_zip, product_456, + 
SHOULD_IGNORED, targets=[(None, TEST_BUCKET, prefix)], dir_=self.tempdir ) @@ -321,6 +328,7 @@ def __prepare_content(self, prefix=None): product_456 = "commons-client-4.5.6" handle_maven_uploading( test_zip, product_456, + SHOULD_IGNORED, targets=[(None, TEST_BUCKET, prefix)], dir_=self.tempdir ) @@ -329,6 +337,7 @@ def __prepare_content(self, prefix=None): product_459 = "commons-client-4.5.9" handle_maven_uploading( test_zip, product_459, + SHOULD_IGNORED, targets=[(None, TEST_BUCKET, prefix)], dir_=self.tempdir ) diff --git a/tests/test_maven_upload.py b/tests/test_maven_upload.py index c8424a10..830e7337 100644 --- a/tests/test_maven_upload.py +++ b/tests/test_maven_upload.py @@ -21,7 +21,7 @@ COMMONS_CLIENT_METAS, COMMONS_LOGGING_FILES, COMMONS_LOGGING_METAS, NON_MVN_FILES, ARCHETYPE_CATALOG, ARCHETYPE_CATALOG_FILES, COMMONS_CLIENT_456_MVN_NUM, COMMONS_CLIENT_MVN_NUM, - COMMONS_CLIENT_META_NUM + COMMONS_CLIENT_META_NUM, IGNORED_META_FOLDER_ITEMS ) from moto import mock_s3 import os @@ -62,7 +62,9 @@ def test_overlap_upload(self): actual_files = [obj.key for obj in objs] # need to double mvn num because of .prodinfo files self.assertEqual( - COMMONS_CLIENT_MVN_NUM * 2 + COMMONS_CLIENT_META_NUM, + COMMONS_CLIENT_MVN_NUM * 2 + + COMMONS_CLIENT_META_NUM + + len(IGNORED_META_FOLDER_ITEMS) * 2, len(actual_files) ) @@ -112,7 +114,7 @@ def test_ignore_upload(self): test_zip = os.path.join(os.getcwd(), "tests/input/commons-client-4.5.6.zip") product_456 = "commons-client-4.5.6" handle_maven_uploading( - test_zip, product_456, [".*.sha1"], + test_zip, product_456, [".*.sha1", r"^\.nexus/.*", r"^\.index/.*", r"^\.meta/.*"], targets=[(None, TEST_BUCKET, None)], dir_=self.tempdir, do_index=False ) @@ -151,7 +153,9 @@ def __test_prefix_upload(self, prefix: str): actual_files = [obj.key for obj in objs] # need to double mvn num because of .prodinfo files self.assertEqual( - COMMONS_CLIENT_456_MVN_NUM * 2 + COMMONS_CLIENT_META_NUM, + COMMONS_CLIENT_456_MVN_NUM * 2 + + 
COMMONS_CLIENT_META_NUM + + len(IGNORED_META_FOLDER_ITEMS) * 2, len(actual_files) ) diff --git a/tests/test_pkgs_dryrun.py b/tests/test_pkgs_dryrun.py index e75b0258..d8fe2a95 100644 --- a/tests/test_pkgs_dryrun.py +++ b/tests/test_pkgs_dryrun.py @@ -51,7 +51,7 @@ def test_maven_delete_dry_run(self): test_bucket = self.mock_s3.Bucket(TEST_BUCKET) objs = list(test_bucket.objects.all()) - self.assertEqual(50, len(objs)) + self.assertEqual(68, len(objs)) def test_npm_upload_dry_run(self): test_tgz = os.path.join(os.getcwd(), "tests/input/code-frame-7.14.5.tgz")