diff --git a/.2ms.yml b/.2ms.yml index 66352079..96927025 100644 --- a/.2ms.yml +++ b/.2ms.yml @@ -2871,3 +2871,119 @@ ignore-result: - f0544e7e9e25a6223cd10c37a445bfd4a4641337 # test data from defaultPlusAllCustomRules.json - f93dd5ab91efe7b70381afe3d4ebd1b26e628ac9 # test data from defaultPlusNonOverrideRules.json - ff81ccd6553feaf5fbbf573eac2a0042d05aaee4 # test data from defaultPlusNonOverrideRules.json +- 291db30c1fc925cf1db50da134b81bb53fcdfa9c # unit test from baseline_test.go +- 37053e96e52dcbe9d7a622cd3034cea95396a904 # unit test from detect_test.go +- 3d5c482ed8e7b2f642c4af0918dbb17bbad37fb5 # unit test from detect_test.go +- 54f91dd27b35bdce536ada4ac2d25e1830936eb3 # unit test from detect_test.go +- 58dad9441a96a0e2bde490df8012c310a754366d # unit test from detect_test.go +- 65e203cba5dc1dd46c0a05f7d7d29f13cd5a754b # unit test from detect_test.go +- 6deb6b4b6091d938e043751eff3c63f44f507047 # unit test from detect_test.go +- 742452318b80286d255e8f59b083130e0469361a # unit test from detect_test.go +- 763f9229cda90ec9d629db3d9510deec6106a50b # unit test from detect_test.go +- aa67b81207d129d2100805119835365106660dff # unit test from detect_test.go +- ad99aa7582689ab373979a458399b5ab74a48b28 # unit test from detect_test.go +- ae25f7e23754c15f9d7889ed0de737ec6691b6c1 # unit test from detect_test.go +- b8c48605e841ba5bc32988f59fb884475b8d5690 # unit test from detect_test.go +- c9266e05aba8dc5005cafc1d4a844ba52be7a33c # unit test from detect_test.go +- ca95e2536135c3d7ec3a7b1436519a008cea6ba5 # unit test from detect_test.go +- cf6b654c0b2f060d957409da90d86e6d349a50d5 # unit test from detect_test.go +- da1aaad0a6b7d539002003fd8f58c82cfaa72b15 # unit test from detect_test.go +- e2503154339a5267cbe90ed14a325b06c6ebd844 # unit test from detect_test.go +- e6db34f91797a0aa9cc7460c749577e7be7bc7ee # unit test from reader_test.go +- e8bb1f5f551f019754e029ad16dbd24ffad57655 # unit test from detect_test.go +- ebad3473b3803319f6b5587b77d0aaab458feb28 # unit test from 
detect_test.go +- f2d5884f42efeee708ece148627c2b95eefc316f # unit test from detect_test.go +- 0191ca76eb902ced14c15f96338aef02e0057c8b # unit test from detect_test.go +- 03ddf82cc545e43572460fa7327e5a29667f4d9f # unit test from detect_test.go +- 045a994e1030fff96dcb84b28e30d70b3ac95fd6 # unit test from detect_test.go +- 058d5ef7732f0bc4b72e9a9a6791bdd8b2cff6d9 # unit test from engine_test.go +- 064ea8914e82065aebbb6625d7eb8afab44474de # unit test from detect_test.go +- 08443422de367c9ac03d302b0f1105fd83237542 # unit test from e2e_test.go +- 084a5972aadeceed7774244c7cdf9dadab7cb65f # unit test from detect_test.go +- 08608fa8499ad6e42f04a4bfddd9f4f528a735f0 # unit test from detect_test.go +- 09c64bdb8a8293f680106f8573a6dc3b859e6107 # unit test from detect_test.go +- 0ba26d72ec2adcc03762aedf9adb41d72c843204 # unit test from e2e_test.go +- 0fc99fe8faa117e2d6c9dbbadd7edb6050866b61 # unit test from detect_test.go +- 120d2028133fe0a99e37e4406d6e39de19064acd # unit test from detect_test.go +- 13a25db3ecbb44fc4ebb8d7c743650b9f033b3cf # unit test from detect_test.go +- 1f491ddef575b4e99da0c0c30d1053292660e7f7 # unit test from detect_test.go +- 242966474821312cf4ba23cb189822bc9a197246 # unit test from detect_test.go +- 2b1cbfbf336cfbf5e461255bfe76d1e38a4a743b # unit test from e2e_test.go +- 2d62a78fe31d4c464b8c79fc585890f0483cfeec # unit test from baseline_test.go +- 2e47a67fd27d0fd4d75019e6e439c381cde961d0 # unit test from detect_test.go +- 2f16bf21f6d80c0d41e47cb0172468e43729dd66 # unit test from detect_test.go +- 31036836f52133fe54daa2e69ac12d9ba2b3b04a # unit test from e2e_test.go +- 316f88a3e44ddb85aabc0e948a3ea8a671b5edf5 # unit test from reader_test.go +- 3287a84c6e8fe79033feee27d9141e8a5124f825 # unit test from detect_test.go +- 34818a0709801a2869d996ede681f6c0fe7fea91 # unit test from e2e_test.go +- 350322ac0bdfb65e2a61786dbae5963f2bb9a527 # unit test from detect_test.go +- 3692e7882628467155bd939da7d56e5ee73d3e15 # unit test from detect_test.go +- 
39957ecf370b6f1132121973e44b921200462c4e # unit test from reader_test.go +- 3a6b1739d3f3398ab84e710ba73b487c1df23376 # unit test from detect_test.go +- 3f71840afabd8ac630599d13fd1c8b398c1f5549 # unit test from engine_test.go +- 40936165ebf8cbb38f2a7546f57fa0cd87006dc0 # unit test from detect_test.go +- 40b32a8039a1a10a99a577d7ef6f583be0507227 # unit test from e2e_test.go +- 42a7aa1b64619c519f8b01586c548bf64a9e8c2d # unit test from engine_test.go +- 435ecfdc46e31ef78ac0409ae0ddcf0527dfc791 # unit test from detect_test.go +- 4523e7286b5b3ba00190a208da8ed687e22563ff # unit test from detect_test.go +- 4efd98988b2d95698bd233dc2e24c0c4774fd13c # unit test from detect_test.go +- 5047fbb5feea035a81a4f72f1c6358593afa0911 # unit test from detect_test.go +- 506fc171430a769bc7e5ef6ebfc6de1a2b42f9d0 # unit test from detect_test.go +- 52d8d8bc3b4025ebcc4cd9fa9cafb1d2448eff07 # unit test from detect_test.go +- 5a8b1c9b53fbd7a22adc42f108613c13f56dc8d0 # unit test from detect_test.go +- 5c0e99a770f0b06094247b2d93022c25f2fbb84b # unit test from detect_test.go +- 644d8f390bcad42029ddb02ef2860fe1c0e2c854 # unit test from detect_test.go +- 6a022529429d2852fcfe03d4cd7109bf17e383c5 # unit test from e2e_test.go +- 6bfadced69c08247f1861c5dd85a994d0c5e696d # unit test from detect_test.go +- 6dfe345e835c8fe9b67415512e7768665bee014d # unit test from detect_test.go +- 7232818438eb1f2a6fe608f34f1c31e7673fa2c1 # unit test from engine_test.go +- 738ac32fba8b18f3d77c73124828ac5e50ecc103 # unit test from detect_test.go +- 837c6b7f1e9b6fe22fc5337bb001b80b6d0158c8 # unit test from engine_test.go +- 8414e74dc0cbef4a2f75d3c74e9c4c5406f39a4d # unit test from baseline_test.go +- 85aea11d741219bd83c4ee1907e7c8a0d65e3e54 # unit test from detect_test.go +- 87d545317c6064d2a733a3431f339d43c27ade7a # unit test from detect_test.go +- 88c83cc8c1d4793d16b98482682fbd39ff9430dd # unit test from detect_test.go +- 8a79dfcf5127c3eda963791799dd9bf700024305 # unit test from baseline_test.go +- 
8fe8eb66b9f59d95ea071664b43d17a7d86f4780 # unit test from detect_test.go +- 92564cf63061405552373a9017b49ae7de3d0d0e # unit test from detect_test.go +- 94d83bd98215b61a5477db138ac4605d8274075e # unit test from detect_test.go +- 94e3c8734f89bcef49a4fdbb9c7f4ce1eb6f0bab # unit test from engine_test.go +- 95602b44ee80fe9bbe9007e1a030c31cf4ac8f1a # unit test from detect_test.go +- 967d1117bf89447855642e1a13c50b161c8e746a # unit test from detect_test.go +- 970e4c538f13d14062157e54635c925d20a2b866 # unit test from e2e_test.go +- 973d837f7a0a53534ce79a098dfa29f7ebde938f # unit test from detect_test.go +- 97ce950e4f01b40ab80fc7fccc96c31688210edf # unit test from detect_test.go +- 99183dd451fe2b2c49cab3ae8de1ed06e6f0da2e # unit test from detect_test.go +- 9f8ef66d0619971d4124c9d0d87b680bec8eeb0d # unit test from detect_test.go +- a14018d7b39d3b4af8d1038d65deae1bcc196db3 # unit test from detect_test.go +- a6be8491e518a4551610780b8fe0bb662c7031d5 # unit test from detect_test.go +- ae423a7a77b65c64ee1f6cb684ae9b1e8045e55b # unit test from detect_test.go +- b072a72876b789b9750bdec283d03414eae1c16d # unit test from detect_test.go +- b45c94986a95512946a8f1260405afbb7a9bfb99 # unit test from engine_test.go +- bc1701a2a0fe0a077f64ecc0d76724efc4341ea8 # unit test from detect_test.go +- bcdf7068c28fab7bc4fd737c3197df180f2fb07b # unit test from detect_test.go +- c1e1deec29124eeab7be6a18a5babfbd0bb72059 # unit test from e2e_test.go +- c2350c25bccece41e4cf2295d107e6609f3e616a # unit test from detect_test.go +- cb0436f127d092d30aaab160d7fa8136462b959f # unit test from reader_test.go +- ce24444cc20bd1f602555d83bb1b237dc48892ac # unit test from detect_test.go +- ce35535feeceb177978f3f2292a9e45a4d59b94e # unit test from engine_test.go +- d586e56c65c55341bf52ec85a1c714e4447d29a1 # unit test from detect_test.go +- d6c1757c1515725af7a19f9c482a8c0611ae008f # unit test from detect_test.go +- dc42bae4cb2ef7cd40753c6468983ea9ac8c3b64 # unit test from engine_test.go +- 
dcdaac30b51248d43d84949e3df528d3fbf3f9cd # unit test from detect_test.go +- e89ddc535cf5d049e313fdcec347adbba3136e88 # unit test from detect_test.go +- e8a1871a16818bcfe448877955c3c631f0e1d033 # unit test from detect_test.go +- e9c06b68353c0301394dfa5afc432361b4dbd07a # unit test from detect_test.go +- ea0ef53c2528281124cdf3c54ad08c5d308829dc # unit test from engine_test.go +- f480fa35114d69065f5e8d921a99bf4b79aab633 # unit test from detect_test.go +- f8ad50fd98cae17bfe5ca72ba61ff3a5633af7c7 # unit test from detect_test.go +- f8df8140da50183a21805c2bde7d8628c9e67250 # unit test from detect_test.go +- fd8aec977582f220d0c416be5c3ad0a77a5e4e88 # unit test from detect_test.go +- 1fe065327471fe5245572e5406b3647d27887d58 # test/development data from generic_credential.go +- 41cca7cb54b14f9a6343f51a56dee951a3249a16 # test/development data from generic_credential.go +- 5d3c644b96a41c68a4bf3c6ecca6359557bb5c90 # test/development data from generic_credential.go +- 6c09a8e8d52d37152e38a4f64c96db16abd964ab # test/development data from generic_credential.go +- 7ccee5d40367960f69709da28dcf197c7c5c06f8 # test/development data from generic_credential.go +- ba40bd0cfa331c71899179dbb35b2a6eb452a482 # secret found in ruleids.go +- eeaf67842fc9be3123b8e2aea470608c2f362033 # test/development data from generic_credential.go +- 25960f13ce160dba4f08f210370fccdca6cb51eb # FP, id not credential diff --git a/.github/workflows/cx-one-scan.yaml b/.github/workflows/cx-one-scan.yaml index b270cc59..926337b8 100644 --- a/.github/workflows/cx-one-scan.yaml +++ b/.github/workflows/cx-one-scan.yaml @@ -6,7 +6,6 @@ on: push: branches: - master - - AST-75295-custom-rules schedule: - cron: '00 7 * * *' diff --git a/.github/workflows/pr-validation.yml b/.github/workflows/pr-validation.yml index bec1ac4c..34644dbf 100644 --- a/.github/workflows/pr-validation.yml +++ b/.github/workflows/pr-validation.yml @@ -2,9 +2,6 @@ name: PR Validation on: pull_request: - branches: - - master - - 
AST-75295-custom-rules merge_group: jobs: diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml index f3be1054..f731c2c4 100644 --- a/.github/workflows/security.yml +++ b/.github/workflows/security.yml @@ -4,11 +4,7 @@ on: push: branches: - master - - AST-75295-custom-rules pull_request: - branches: - - master - - AST-75295-custom-rules merge_group: schedule: - cron: "0 0 * * *" diff --git a/.github/workflows/trivy-vulnerability-scan.yaml b/.github/workflows/trivy-vulnerability-scan.yaml index 403d7903..ed702674 100644 --- a/.github/workflows/trivy-vulnerability-scan.yaml +++ b/.github/workflows/trivy-vulnerability-scan.yaml @@ -3,9 +3,6 @@ on: push: workflow_dispatch: pull_request: - branches: - - master - - AST-75295-custom-rules schedule: - cron: '5 6 * * *' # Runs every day at 06:05 UTC diff --git a/.github/workflows/validate-readme.yml b/.github/workflows/validate-readme.yml index f7148db7..59aaa376 100644 --- a/.github/workflows/validate-readme.yml +++ b/.github/workflows/validate-readme.yml @@ -2,9 +2,6 @@ name: Validate README on: pull_request: - branches: - - master - - AST-75295-custom-rules merge_group: jobs: diff --git a/README.md b/README.md index 626236cb..75a78340 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ Scan recent Git history instead: - Unified scanning for local directories, Git history, Slack, Discord, Confluence Cloud, and Paligo — each exposed as a dedicated subcommand. - Hundreds of tuned detection rules curated by Checkmarx on top of gitleaks, enriched with CVSS-based scoring in every finding. - Optional live secret validation (`--validate`) to confirm whether discovered credentials are still active. -- Flexible filtering and noise reduction: `--rule`, `--ignore-rule`, `--add-special-rule`, `--ignore-result`, `--regex`, `--allowed-values`, and `--max-target-megabytes`. 
+- Flexible filtering and noise reduction: `--rule`, `--ignore-rule`, `--add-special-rule`, `--ignore-result`, `--regex`, `--allowed-values`, `--max-target-megabytes`, `--max-findings`, `--max-rule-matches-per-fragment`, and `--max-secret-size`. - Rich reporting for developers and pipelines with JSON, YAML, and SARIF outputs, multiple `--report-path` destinations, and CI-aware exit handling via `--ignore-on-exit`. - Automation ready: configuration files, `2MS_*` environment variables, Docker images, and GitHub Actions templates. - Extensible plugin architecture — contributions for new data sources are welcome. @@ -244,15 +244,18 @@ Global flags work with every subcommand. Combine them with configuration files a ### Global Flags -| Flag | Type | Default | Description | -|------|------|---------|-------------| -| `--config` | string | | Path to a YAML or JSON configuration file. | -| `--log-level` | string | `info` | Logging level: `trace`, `debug`, `info`, `warn`, `error`, `fatal`, or `none`. | -| `--stdout-format` | string | `yaml` | `yaml`, `json`, or `sarif` output on stdout. | -| `--report-path` | string slice | | Write findings to one or more files; format is inferred from the extension. | -| `--ignore-on-exit` | enum | `none` | Control exit codes: `all`, `results`, `errors`, or `none`. | -| `--max-target-megabytes` | int | `0` | Skip files larger than the threshold (0 disables the check). | -| `--validate` | bool | `false` | Enrich results by verifying secrets when supported. | +| Flag | Type | Default | Description | +|-----------------------------------|--------------|---------|-----------------------------------------------------------------------------------------------------------------| +| `--config` | string | | Path to a YAML or JSON configuration file. | +| `--log-level` | string | `info` | Logging level: `trace`, `debug`, `info`, `warn`, `error`, `fatal`, or `none`. 
| +| `--stdout-format` | string | `yaml` | `yaml`, `json`, or `sarif` output on stdout. | +| `--report-path` | string slice | | Write findings to one or more files; format is inferred from the extension. | +| `--ignore-on-exit` | enum | `none` | Control exit codes: `all`, `results`, `errors`, or `none`. | +| `--max-target-megabytes` | int | `0` | Skip files larger than the threshold (0 disables the check). | +| `--max-findings` | int | `0` | Caps the total number of results. Scan stops early if limit is reached. Omit or set to 0 to disable. | +| `--max-rule-matches-per-fragment` | int | `0` | Caps the number of results per rule per fragment (e.g., file, chunked file, page). Omit or set to 0 to disable. | +| `--max-secret-size` | int | `0` | Secrets larger than this size (in bytes) will be ignored. Omit or set to 0 to disable this check. | +| `--validate` | bool | `false` | Enrich results by verifying secrets when supported. | ### Configuration Files & Environment Variables diff --git a/cmd/config.go b/cmd/config.go index 6bcd82fc..2cef07f2 100644 --- a/cmd/config.go +++ b/cmd/config.go @@ -132,6 +132,18 @@ func setupFlags(rootCmd *cobra.Command) { IntVar(&engineConfigVar.MaxTargetMegabytes, maxTargetMegabytesFlagName, 0, "files larger than this will be skipped.\nOmit or set to 0 to disable this check.") + rootCmd.PersistentFlags(). + Uint64Var(&engineConfigVar.MaxFindings, maxFindingsFlagName, 0, + "caps the total number of results. Scan stops early if limit is reached.\nOmit or set to 0 to disable this check.") + + rootCmd.PersistentFlags(). + Uint64Var(&engineConfigVar.MaxRuleMatchesPerFragment, maxRuleMatchesPerFragmentFlagName, 0, + "caps the number of results per rule per fragment (e.g., file, chunked file, page).\nOmit or set to 0 to disable this check.") + + rootCmd.PersistentFlags(). 
+ Uint64Var(&engineConfigVar.MaxSecretSize, maxSecretSizeFlagName, 0, + "secrets larger than this size (in bytes) will be ignored.\nOmit or set to 0 to disable this check.") + rootCmd.PersistentFlags(). BoolVar(&validateVar, validate, false, "trigger additional validation to check if discovered secrets are valid or invalid") diff --git a/cmd/main.go b/cmd/main.go index c764079b..fcf6e4ba 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -19,19 +19,22 @@ const ( outputFormatRegexpPattern = `^(ya?ml|json|sarif)$` configFileFlag = "config" - logLevelFlagName = "log-level" - reportPathFlagName = "report-path" - stdoutFormatFlagName = "stdout-format" - customRegexRuleFlagName = "regex" - ruleFlagName = "rule" - ignoreRuleFlagName = "ignore-rule" - ignoreFlagName = "ignore-result" - allowedValuesFlagName = "allowed-values" - specialRulesFlagName = "add-special-rule" - ignoreOnExitFlagName = "ignore-on-exit" - maxTargetMegabytesFlagName = "max-target-megabytes" - validate = "validate" - customRulesFileFlagName = "custom-rules-path" + logLevelFlagName = "log-level" + reportPathFlagName = "report-path" + stdoutFormatFlagName = "stdout-format" + customRegexRuleFlagName = "regex" + ruleFlagName = "rule" + ignoreRuleFlagName = "ignore-rule" + ignoreFlagName = "ignore-result" + allowedValuesFlagName = "allowed-values" + specialRulesFlagName = "add-special-rule" + ignoreOnExitFlagName = "ignore-on-exit" + maxTargetMegabytesFlagName = "max-target-megabytes" + maxFindingsFlagName = "max-findings" + maxRuleMatchesPerFragmentFlagName = "max-rule-matches-per-fragment" + maxSecretSizeFlagName = "max-secret-size" + validate = "validate" + customRulesFileFlagName = "custom-rules-path" ) var ( diff --git a/engine/constants/ruleids.go b/engine/constants/ruleids.go new file mode 100644 index 00000000..7d347b43 --- /dev/null +++ b/engine/constants/ruleids.go @@ -0,0 +1,4 @@ +package constants + +// GenericCredentialRuleID is the rule ID for generic credential detection. 
+const GenericCredentialRuleID = "01ab7659-d25a-4a1c-9f98-dee9d0cf2e70" //nolint:gosec // This is a rule ID, not a credential diff --git a/engine/detect/baseline.go b/engine/detect/baseline.go new file mode 100644 index 00000000..c6504aa5 --- /dev/null +++ b/engine/detect/baseline.go @@ -0,0 +1,82 @@ +package detect + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + + "github.com/zricethezav/gitleaks/v8/report" +) + +//nolint:gocyclo // TODO: refactor this function to reduce cyclomatic complexity +func IsNew(finding *report.Finding, redact uint, baseline []report.Finding) bool { + // Explicitly testing each property as it gives significantly better performance in comparison to cmp.Equal(). Drawback is that + // the code requires maintenance if/when the Finding struct changes + for i := range baseline { + b := &baseline[i] + if finding.RuleID == b.RuleID && + finding.Description == b.Description && + finding.StartLine == b.StartLine && + finding.EndLine == b.EndLine && + finding.StartColumn == b.StartColumn && + finding.EndColumn == b.EndColumn && + (redact > 0 || (finding.Match == b.Match && finding.Secret == b.Secret)) && + finding.File == b.File && + finding.Commit == b.Commit && + finding.Author == b.Author && + finding.Email == b.Email && + finding.Date == b.Date && + finding.Message == b.Message && + // Omit checking finding.Fingerprint - if the format of the fingerprint changes, the users will see unexpected behavior + finding.Entropy == b.Entropy { + return false + } + } + return true +} + +func LoadBaseline(baselinePath string) ([]report.Finding, error) { + bytes, err := os.ReadFile(baselinePath) + if err != nil { + return nil, fmt.Errorf("could not open %s", baselinePath) + } + + var previousFindings []report.Finding + err = json.Unmarshal(bytes, &previousFindings) + if err != nil { + return nil, fmt.Errorf("the format of the file %s is not supported", baselinePath) + } + + return previousFindings, nil +} + +func (d *Detector) 
AddBaseline(baselinePath, source string) error { + if baselinePath != "" { + absoluteSource, err := filepath.Abs(source) + if err != nil { + return err + } + + absoluteBaseline, err := filepath.Abs(baselinePath) + if err != nil { + return err + } + + relativeBaseline, err := filepath.Rel(absoluteSource, absoluteBaseline) + if err != nil { + return err + } + + baseline, err := LoadBaseline(baselinePath) + if err != nil { + return err + } + + d.baseline = baseline + baselinePath = relativeBaseline + } + + d.baselinePath = baselinePath + return nil +} diff --git a/engine/detect/detect.go b/engine/detect/detect.go new file mode 100644 index 00000000..c17fd289 --- /dev/null +++ b/engine/detect/detect.go @@ -0,0 +1,900 @@ +package detect + +import ( + "bufio" + "context" + "fmt" + "math" + "os" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/zricethezav/gitleaks/v8/config" + "github.com/zricethezav/gitleaks/v8/detect/codec" + "github.com/zricethezav/gitleaks/v8/logging" + "github.com/zricethezav/gitleaks/v8/regexp" + "github.com/zricethezav/gitleaks/v8/report" + "github.com/zricethezav/gitleaks/v8/sources" + + ahocorasick "github.com/BobuSumisu/aho-corasick" + "github.com/fatih/semgroup" + "github.com/rs/zerolog" + "golang.org/x/exp/maps" +) + +const ( + gitleaksAllowSignature = "gitleaks:allow" + // SlowWarningThreshold is the amount of time to wait before logging that a file is slow. + // This is useful for identifying problematic files and tuning the allowlist. + SlowWarningThreshold = 5 * time.Second +) + +var ( + newLineRegexp = regexp.MustCompile("\n") +) + +// Detector is the main detector struct +type Detector struct { // TODO refactor package to remove fields and logic that only apply to gitleaks + // Config is the configuration for the detector + Config config.Config + + // Redact is a flag to redact findings. 
This is exported + // so users using gitleaks as a library can set this flag + // without calling `detector.Start(cmd *cobra.Command)` + Redact uint + + // verbose is a flag to print findings + Verbose bool + + // MaxDecodeDepths limits how many recursive decoding passes are allowed + MaxDecodeDepth int + + // MaxArchiveDepth limits how deep the sources will explore nested archives + MaxArchiveDepth int + + // files larger than this will be skipped + MaxTargetMegaBytes int + + // caps the number of regex matches per rule per fragment + MaxRuleMatchesPerFragment uint64 + + // MaxSecretSize is the maximum allowed secret size in bytes. + // Secrets larger than this will be ignored. 0 means no limit. + MaxSecretSize uint64 + + // followSymlinks is a flag to enable scanning symlink files + FollowSymlinks bool + + // NoColor is a flag to disable color output + NoColor bool + + // IgnoreGitleaksAllow is a flag to ignore gitleaks:allow comments. + IgnoreGitleaksAllow bool + + // commitMutex is to prevent concurrent access to the + // commit map when adding commits + commitMutex *sync.Mutex + + // commitMap is used to keep track of commits that have been scanned. + // This is only used for logging purposes and git scans. + commitMap map[string]bool + + // findingMutex is to prevent concurrent access to the + // findings slice when adding findings. + findingMutex *sync.Mutex + + // findings is a slice of report.Findings. This is the result + // of the detector's scan which can then be used to generate a + // report. 
+ findings []report.Finding + + // prefilter is a ahocorasick struct used for doing efficient string + // matching given a set of words (keywords from the rules in the config) + prefilter ahocorasick.Trie + + // a list of known findings that should be ignored + baseline []report.Finding + + // path to baseline + baselinePath string + + // gitleaksIgnore + gitleaksIgnore map[string]struct{} + + // Sema (https://github.com/fatih/semgroup) controls the concurrency + Sema *semgroup.Group + + // report-related settings. + ReportPath string + Reporter report.Reporter + + TotalBytes atomic.Uint64 +} + +// Fragment is an alias for sources.Fragment for backwards compatibility +// +// Deprecated: This will be replaced with sources.Fragment in v9 +type Fragment sources.Fragment + +// NewDetector creates a new detector with the given config +func NewDetector(cfg *config.Config) *Detector { + return &Detector{ + commitMap: make(map[string]bool), + gitleaksIgnore: make(map[string]struct{}), + findingMutex: &sync.Mutex{}, + commitMutex: &sync.Mutex{}, + findings: make([]report.Finding, 0), + Config: *cfg, + prefilter: *ahocorasick.NewTrieBuilder().AddStrings(maps.Keys(cfg.Keywords)).Build(), + Sema: semgroup.NewGroup(context.Background(), 40), + } +} + +func (d *Detector) AddGitleaksIgnore(gitleaksIgnorePath string) error { + logging.Debug().Str("path", gitleaksIgnorePath).Msgf("found .gitleaksignore file") + file, err := os.Open(gitleaksIgnorePath) + if err != nil { + return err + } + defer func() { + // https://github.com/securego/gosec/issues/512 + if err := file.Close(); err != nil { + logging.Warn().Err(err).Msgf("Error closing .gitleaksignore file") + } + }() + + scanner := bufio.NewScanner(file) + replacer := strings.NewReplacer("\\", "/") + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + // Skip lines that start with a comment + if line == "" || strings.HasPrefix(line, "#") { + continue + } + + // Normalize the path. 
+ // TODO: Make this a breaking change in v9. + s := strings.Split(line, ":") + switch len(s) { + case 3: + // Global fingerprint. + // `file:rule-id:start-line` + s[0] = replacer.Replace(s[0]) + case 4: + // Commit fingerprint. + // `commit:file:rule-id:start-line` + s[1] = replacer.Replace(s[1]) + default: + logging.Warn().Str("fingerprint", line).Msg("Invalid .gitleaksignore entry") + } + d.gitleaksIgnore[strings.Join(s, ":")] = struct{}{} + } + return nil +} + +// DetectBytes scans the given bytes and returns a list of findings +func (d *Detector) DetectBytes(content []byte) []report.Finding { + return d.DetectString(string(content)) +} + +// DetectString scans the given string and returns a list of findings +func (d *Detector) DetectString(content string) []report.Finding { + return d.Detect(&Fragment{ + Raw: content, + }) +} + +// DetectSource scans the given source and returns a list of findings +func (d *Detector) DetectSource(ctx context.Context, source sources.Source) ([]report.Finding, error) { + err := source.Fragments(ctx, func(fragment sources.Fragment, err error) error { + logContext := logging.With() + + if fragment.FilePath != "" { + logContext = logContext.Str("path", fragment.FilePath) + } + + if len(fragment.CommitSHA) > 6 { + logContext = logContext.Str("commit", fragment.CommitSHA[:7]) + d.addCommit(fragment.CommitSHA) + } else if fragment.CommitSHA != "" { + logContext = logContext.Str("commit", fragment.CommitSHA) + d.addCommit(fragment.CommitSHA) + logger := logContext.Logger() + logger.Warn().Msg("commit SHAs should be >= 7 characters long") + } + + logger := logContext.Logger() + + if err != nil { + // Log the error and move on to the next fragment + logger.Error().Err(err).Send() + return nil + } + + // both the fragment's content and path should be empty for it to be + // considered empty at this point because of path based matches + if fragment.Raw == "" && fragment.FilePath == "" { + logger.Trace().Msg("skipping empty fragment") + 
return nil + } + + var timer *time.Timer + // Only start the timer in debug mode + if logger.GetLevel() <= zerolog.DebugLevel { + timer = time.AfterFunc(SlowWarningThreshold, func() { + logger.Debug().Msgf("Taking longer than %s to inspect fragment", SlowWarningThreshold.String()) + }) + } + + f := Fragment(fragment) + findings := d.Detect(&f) + for i := range findings { + d.AddFinding(&findings[i]) + } + + // Stop the timer if it was created + if timer != nil { + timer.Stop() + } + + return nil + }) + + if _, isGit := source.(*sources.Git); isGit { + logging.Info().Msgf("%d commits scanned.", len(d.commitMap)) + logging.Debug().Msg("Note: this number might be smaller than expected due to commits with no additions") + } + + return d.Findings(), err +} + +// Detect scans the given fragment and returns a list of findings +// +//nolint:gocyclo // TODO: refactor this function to reduce cyclomatic complexity +func (d *Detector) Detect(fragment *Fragment) []report.Finding { + if fragment.Bytes == nil { + d.TotalBytes.Add(uint64(len(fragment.Raw))) + } + d.TotalBytes.Add(uint64(len(fragment.Bytes))) + + var ( + findings []report.Finding + logger = func() zerolog.Logger { + l := logging.With().Str("path", fragment.FilePath) + if fragment.CommitSHA != "" { + l = l.Str("commit", fragment.CommitSHA) + } + return l.Logger() + }() + ) + + // check if filepath is allowed + if fragment.FilePath != "" { + // is the path our config or baseline file? + if fragment.FilePath == d.Config.Path || (d.baselinePath != "" && fragment.FilePath == d.baselinePath) { + logging.Trace().Msg("skipping file: matches config or baseline path") + return findings + } + } + // check if commit or filepath is allowed. 
+ if isAllowed, event := checkCommitOrPathAllowed(logger, fragment, d.Config.Allowlists); isAllowed { + event.Msg("skipping file: global allowlist") + return findings + } + + // setup variables to handle different decoding passes + currentRaw := fragment.Raw + encodedSegments := []*codec.EncodedSegment{} + currentDecodeDepth := 0 + decoder := codec.NewDecoder() + + for { + // build keyword map for prefiltering rules + keywords := make(map[string]bool) + normalizedRaw := strings.ToLower(currentRaw) + matches := d.prefilter.MatchString(normalizedRaw) + for _, m := range matches { + keywords[normalizedRaw[m.Pos():int(m.Pos())+len(m.Match())]] = true + } + + for ruleIdx := range d.Config.Rules { + rule := d.Config.Rules[ruleIdx] + if len(rule.Keywords) == 0 { + // if no keywords are associated with the rule always scan the + // fragment using the rule + findings = append(findings, d.detectRule(fragment, currentRaw, &rule, encodedSegments)...) + continue + } + + // check if keywords are in the fragment + for _, k := range rule.Keywords { + if _, ok := keywords[strings.ToLower(k)]; ok { + findings = append(findings, d.detectRule(fragment, currentRaw, &rule, encodedSegments)...) 
+ break + } + } + } + + // increment the depth by 1 as we start our decoding pass + currentDecodeDepth++ + + // stop the loop if we've hit our max decoding depth + if currentDecodeDepth > d.MaxDecodeDepth { + break + } + + // decode the currentRaw for the next pass + currentRaw, encodedSegments = decoder.Decode(currentRaw, encodedSegments) + + // stop the loop when there's nothing else to decode + if len(encodedSegments) == 0 { + break + } + } + + return filter(findings, d.Redact) +} + +// detectRule scans the given fragment for the given rule and returns a list of findings +// +//nolint:gocyclo,funlen // TODO: refactor this function to reduce cyclomatic complexity and statements +func (d *Detector) detectRule( + fragment *Fragment, currentRaw string, r *config.Rule, encodedSegments []*codec.EncodedSegment, +) []report.Finding { + var ( + findings []report.Finding + logger = func() zerolog.Logger { + l := logging.With().Str("rule-id", r.RuleID).Str("path", fragment.FilePath) + if fragment.CommitSHA != "" { + l = l.Str("commit", fragment.CommitSHA) + } + return l.Logger() + }() + ) + + if r.SkipReport && !fragment.InheritedFromFinding { + return findings + } + + // check if commit or file is allowed for this rule. 
+ if isAllowed, event := checkCommitOrPathAllowed(logger, fragment, r.Allowlists); isAllowed { + event.Msg("skipping file: rule allowlist") + return findings + } + + if r.Path != nil { + if r.Regex == nil && len(encodedSegments) == 0 { + // Path _only_ rule + if r.Path.MatchString(fragment.FilePath) || (fragment.WindowsFilePath != "" && r.Path.MatchString(fragment.WindowsFilePath)) { + finding := report.Finding{ + Commit: fragment.CommitSHA, + RuleID: r.RuleID, + Description: r.Description, + File: fragment.FilePath, + SymlinkFile: fragment.SymlinkFile, + Match: "file detected: " + fragment.FilePath, + Tags: r.Tags, + } + if fragment.CommitInfo != nil { + finding.Author = fragment.CommitInfo.AuthorName + finding.Date = fragment.CommitInfo.Date + finding.Email = fragment.CommitInfo.AuthorEmail + finding.Link = createScmLink(fragment.CommitInfo.Remote, &finding) + finding.Message = fragment.CommitInfo.Message + } + return append(findings, finding) + } + } else { + // if path is set _and_ a regex is set, then we need to check both + // so if the path does not match, then we should return early and not + // consider the regex + pathMatches := r.Path.MatchString(fragment.FilePath) + windowsPathMatches := fragment.WindowsFilePath != "" && r.Path.MatchString(fragment.WindowsFilePath) + if !pathMatches && !windowsPathMatches { + return findings + } + } + } + + // if path only rule, skip content checks + if r.Regex == nil { + return findings + } + + // if flag configured and raw data size bigger than the flag + if d.MaxTargetMegaBytes > 0 { + rawLength := len(currentRaw) / 1_000_000 + if rawLength > d.MaxTargetMegaBytes { + logger.Debug(). + Int("size", rawLength). + Int("max-size", d.MaxTargetMegaBytes). 
+ Msg("skipping fragment: size") + return findings + } + } + + matchLimit := -1 + if d.MaxRuleMatchesPerFragment > 0 && d.MaxRuleMatchesPerFragment <= uint64(math.MaxInt) { + matchLimit = int(d.MaxRuleMatchesPerFragment) + } + matches := r.Regex.FindAllStringIndex(currentRaw, matchLimit) + if len(matches) == 0 { + return findings + } + + // TODO profile this, probably should replace with something more efficient + newlineIndices := newLineRegexp.FindAllStringIndex(fragment.Raw, -1) + + // use currentRaw instead of fragment.Raw since this represents the current + // decoding pass on the text + for _, matchIndex := range matches { + // Extract secret from match + secret := strings.Trim(currentRaw[matchIndex[0]:matchIndex[1]], "\n") + + // For any meta data from decoding + var metaTags []string + currentLine := "" + + // Check if the decoded portions of the segment overlap with the match + // to see if its potentially a new match + if len(encodedSegments) > 0 { + segments := codec.SegmentsWithDecodedOverlap(encodedSegments, matchIndex[0], matchIndex[1]) + if len(segments) == 0 { + // This item has already been added to a finding + continue + } + + matchIndex = codec.AdjustMatchIndex(segments, matchIndex) + metaTags = append(metaTags, codec.Tags(segments)...) + currentLine = codec.CurrentLine(segments, currentRaw) + } else { + // Fixes: https://github.com/gitleaks/gitleaks/issues/1352 + // removes the incorrectly following line that was detected by regex expression '\n' + matchIndex[1] = matchIndex[0] + len(secret) + } + + // determine location of match. 
Note that the location + // in the finding will be the line/column numbers of the _match_ + // not the _secret_, which will be different if the secretGroup + // value is set for this rule + loc := location(newlineIndices, fragment.Raw, matchIndex) + + if matchIndex[1] > loc.endLineIndex { + loc.endLineIndex = matchIndex[1] + } + + finding := report.Finding{ + Commit: fragment.CommitSHA, + RuleID: r.RuleID, + Description: r.Description, + StartLine: fragment.StartLine + loc.startLine, + EndLine: fragment.StartLine + loc.endLine, + StartColumn: loc.startColumn, + EndColumn: loc.endColumn, + Line: fragment.Raw[loc.startLineIndex:loc.endLineIndex], + Match: secret, + Secret: secret, + File: fragment.FilePath, + SymlinkFile: fragment.SymlinkFile, + Tags: append(r.Tags, metaTags...), + } + if fragment.CommitInfo != nil { + finding.Author = fragment.CommitInfo.AuthorName + finding.Date = fragment.CommitInfo.Date + finding.Email = fragment.CommitInfo.AuthorEmail + finding.Link = createScmLink(fragment.CommitInfo.Remote, &finding) + finding.Message = fragment.CommitInfo.Message + } + if !d.IgnoreGitleaksAllow && strings.Contains(finding.Line, gitleaksAllowSignature) { + logger.Trace(). + Str("finding", finding.Secret). + Msg("skipping finding: 'gitleaks:allow' signature") + continue + } + + if currentLine == "" { + currentLine = finding.Line + } + + // Set the value of |secret|, if the pattern contains at least one capture group. + // (The first element is the full match, hence we check >= 2.) + groups := r.Regex.FindStringSubmatch(finding.Secret) + if len(groups) >= 2 { + if r.SecretGroup > 0 { + if len(groups) <= r.SecretGroup { + // Config validation should prevent this + continue + } + finding.Secret = groups[r.SecretGroup] + } else { + // If |secretGroup| is not set, we will use the first suitable capture group. 
+ for _, s := range groups[1:] { + if s != "" { + finding.Secret = s + break + } + } + } + } + + // check if secret size (in bytes) exceeds the maximum allowed size + if d.MaxSecretSize > 0 && uint64(len(finding.Secret)) > d.MaxSecretSize { + logger.Trace(). + Int("secret_size_bytes", len(finding.Secret)). + Uint64("max_secret_size_bytes", d.MaxSecretSize). + Str("rule_id", r.RuleID). + Msg("skipping finding: exceeds max secret size") + continue + } + + // check entropy + entropy := shannonEntropy(finding.Secret) + finding.Entropy = float32(entropy) + if r.Entropy != 0.0 { + // entropy is too low, skip this finding + if entropy <= r.Entropy { + logger.Trace(). + Str("finding", finding.Secret). + Float32("entropy", finding.Entropy). + Msg("skipping finding: low entropy") + continue + } + } + + // check if the result matches any of the global allowlists. + if isAllowed, event := checkFindingAllowed(logger, &finding, fragment, currentLine, d.Config.Allowlists); isAllowed { + event.Msg("skipping finding: global allowlist") + continue + } + + // check if the result matches any of the rule allowlists. 
+ if isAllowed, event := checkFindingAllowed(logger, &finding, fragment, currentLine, r.Allowlists); isAllowed { + event.Msg("skipping finding: rule allowlist") + continue + } + findings = append(findings, finding) + } + + // Handle required rules (multi-part rules) + if fragment.InheritedFromFinding || len(r.RequiredRules) == 0 { + return findings + } + + // Process required rules and create findings with auxiliary findings + return d.processRequiredRules(fragment, currentRaw, r, encodedSegments, findings, logger) +} + +// processRequiredRules handles the logic for multi-part rules with auxiliary findings +func (d *Detector) processRequiredRules( + fragment *Fragment, + currentRaw string, + r *config.Rule, + encodedSegments []*codec.EncodedSegment, + primaryFindings []report.Finding, + logger zerolog.Logger, +) []report.Finding { + if len(primaryFindings) == 0 { + logger.Debug().Msg("no primary findings to process for required rules") + return primaryFindings + } + + // Pre-collect all required rule findings once + allRequiredFindings := make(map[string][]report.Finding) + + for _, requiredRule := range r.RequiredRules { + rule, ok := d.Config.Rules[requiredRule.RuleID] + if !ok { + logger.Error().Str("rule-id", requiredRule.RuleID).Msg("required rule not found in config") + continue + } + + // Mark fragment as inherited to prevent infinite recursion + inheritedFragment := *fragment + inheritedFragment.InheritedFromFinding = true + + // Call detectRule once for each required rule + requiredFindings := d.detectRule(&inheritedFragment, currentRaw, &rule, encodedSegments) + allRequiredFindings[requiredRule.RuleID] = requiredFindings + + logger.Debug(). + Str("rule-id", requiredRule.RuleID). + Int("findings", len(requiredFindings)). 
+ Msg("collected required rule findings") + } + + var finalFindings []report.Finding + + // Now process each primary finding against the pre-collected required findings + for i := range primaryFindings { + primaryFinding := &primaryFindings[i] + var requiredFindings []*report.RequiredFinding + + for _, requiredRule := range r.RequiredRules { + foundRequiredFindings, exists := allRequiredFindings[requiredRule.RuleID] + if !exists { + continue // Rule wasn't found earlier, skip + } + + // Filter findings that are within proximity of the primary finding + for j := range foundRequiredFindings { + requiredFinding := &foundRequiredFindings[j] + if d.withinProximity(primaryFinding, requiredFinding, requiredRule) { + req := &report.RequiredFinding{ + RuleID: requiredFinding.RuleID, + StartLine: requiredFinding.StartLine, + EndLine: requiredFinding.EndLine, + StartColumn: requiredFinding.StartColumn, + EndColumn: requiredFinding.EndColumn, + Line: requiredFinding.Line, + Match: requiredFinding.Match, + Secret: requiredFinding.Secret, + } + requiredFindings = append(requiredFindings, req) + } + } + } + + // Check if we have at least one auxiliary finding for each required rule + if len(requiredFindings) > 0 && d.hasAllRequiredRules(requiredFindings, r.RequiredRules) { + // Create a finding with auxiliary findings + newFinding := *primaryFinding // Copy the primary finding + newFinding.AddRequiredFindings(requiredFindings) + finalFindings = append(finalFindings, newFinding) + + logger.Debug(). + Str("primary-rule", r.RuleID). + Int("primary-line", primaryFinding.StartLine). + Int("auxiliary-count", len(requiredFindings)). 
+ Msg("multi-part rule satisfied") + } + } + + return finalFindings +} + +// hasAllRequiredRules checks if we have at least one auxiliary finding for each required rule +func (d *Detector) hasAllRequiredRules(auxiliaryFindings []*report.RequiredFinding, requiredRules []*config.Required) bool { + foundRules := make(map[string]bool) + // AuxiliaryFinding + for _, aux := range auxiliaryFindings { + foundRules[aux.RuleID] = true + } + + for _, required := range requiredRules { + if !foundRules[required.RuleID] { + return false + } + } + + return true +} + +func (d *Detector) withinProximity(primary, required *report.Finding, requiredRule *config.Required) bool { + // If neither within_lines nor within_columns is set, findings just need to be in the same fragment + if requiredRule.WithinLines == nil && requiredRule.WithinColumns == nil { + return true + } + + // Check line proximity (vertical distance) + if requiredRule.WithinLines != nil { + lineDiff := abs(primary.StartLine - required.StartLine) + if lineDiff > *requiredRule.WithinLines { + return false + } + } + + // Check column proximity (horizontal distance) + if requiredRule.WithinColumns != nil { + // Use the start column of each finding for proximity calculation + colDiff := abs(primary.StartColumn - required.StartColumn) + if colDiff > *requiredRule.WithinColumns { + return false + } + } + + return true +} + +// abs returns the absolute value of an integer +func abs(x int) int { + if x < 0 { + return -x + } + return x +} + +// AddFinding synchronously adds a finding to the findings slice +func (d *Detector) AddFinding(finding *report.Finding) { + globalFingerprint := fmt.Sprintf("%s:%s:%d", finding.File, finding.RuleID, finding.StartLine) + if finding.Commit != "" { + finding.Fingerprint = fmt.Sprintf("%s:%s:%s:%d", finding.Commit, finding.File, finding.RuleID, finding.StartLine) + } else { + finding.Fingerprint = globalFingerprint + } + + // check if we should ignore this finding + logger := 
logging.With().Str("finding", finding.Secret).Logger() + if _, ok := d.gitleaksIgnore[globalFingerprint]; ok { + logger.Debug(). + Str("fingerprint", globalFingerprint). + Msg("skipping finding: global fingerprint") + return + } else if finding.Commit != "" { + // Awkward nested if because I'm not sure how to chain these two conditions. + if _, ok := d.gitleaksIgnore[finding.Fingerprint]; ok { + logger.Debug(). + Str("fingerprint", finding.Fingerprint). + Msgf("skipping finding: fingerprint") + return + } + } + + if d.baseline != nil && !IsNew(finding, d.Redact, d.baseline) { + logger.Debug(). + Str("fingerprint", finding.Fingerprint). + Msgf("skipping finding: baseline") + return + } + + d.findingMutex.Lock() + d.findings = append(d.findings, *finding) + if d.Verbose { + printFinding(finding, d.NoColor) + } + d.findingMutex.Unlock() +} + +// Findings returns the findings added to the detector +func (d *Detector) Findings() []report.Finding { + return d.findings +} + +// AddCommit synchronously adds a commit to the commit slice +func (d *Detector) addCommit(commit string) { + d.commitMutex.Lock() + d.commitMap[commit] = true + d.commitMutex.Unlock() +} + +// checkCommitOrPathAllowed evaluates |fragment| against all provided |allowlists|. +// +// If the match condition is "OR", only commit and path are checked. +// Otherwise, if regexes or stopwords are defined this will fail. +func checkCommitOrPathAllowed( + logger zerolog.Logger, + fragment *Fragment, + allowlists []*config.Allowlist, +) (bool, *zerolog.Event) { + if fragment.FilePath == "" && fragment.CommitSHA == "" { + return false, nil + } + + for _, a := range allowlists { + var ( + isAllowed bool + allowlistChecks []bool + commitAllowed, _ = a.CommitAllowed(fragment.CommitSHA) + pathAllowed = a.PathAllowed(fragment.FilePath) || (fragment.WindowsFilePath != "" && a.PathAllowed(fragment.WindowsFilePath)) + ) + // If the condition is "AND" we need to check all conditions. 
+ if a.MatchCondition == config.AllowlistMatchAnd { + if len(a.Commits) > 0 { + allowlistChecks = append(allowlistChecks, commitAllowed) + } + if len(a.Paths) > 0 { + allowlistChecks = append(allowlistChecks, pathAllowed) + } + // These will be checked later. + if len(a.Regexes) > 0 { + continue + } + if len(a.StopWords) > 0 { + continue + } + + isAllowed = allTrue(allowlistChecks) + } else { + isAllowed = commitAllowed || pathAllowed + } + if isAllowed { + event := logger.Trace().Str("condition", a.MatchCondition.String()) + if commitAllowed { + event.Bool("allowed-commit", commitAllowed) + } + if pathAllowed { + event.Bool("allowed-path", pathAllowed) + } + return true, event + } + } + return false, nil +} + +// checkFindingAllowed evaluates |finding| against all provided |allowlists|. +// +// If the match condition is "OR", only regex and stopwords are run. (Commit and path should be handled separately). +// Otherwise, all conditions are checked. +// +// TODO: The method signature is awkward. I can't think of a better way to log helpful info. +// +//nolint:gocyclo // TODO: refactor this function to reduce cyclomatic complexity +func checkFindingAllowed( + logger zerolog.Logger, + finding *report.Finding, + fragment *Fragment, + currentLine string, + allowlists []*config.Allowlist, +) (bool, *zerolog.Event) { + for _, a := range allowlists { + allowlistTarget := finding.Secret + switch a.RegexTarget { + case "match": + allowlistTarget = finding.Match + case "line": + allowlistTarget = currentLine + } + + var ( + checks []bool + isAllowed bool + commitAllowed bool + commit string + pathAllowed bool + regexAllowed = a.RegexAllowed(allowlistTarget) + containsStopword, word = a.ContainsStopWord(finding.Secret) + ) + // If the condition is "AND" we need to check all conditions. + if a.MatchCondition == config.AllowlistMatchAnd { + // Determine applicable checks. 
+ if len(a.Commits) > 0 { + commitAllowed, commit = a.CommitAllowed(fragment.CommitSHA) + checks = append(checks, commitAllowed) + } + if len(a.Paths) > 0 { + pathAllowed = a.PathAllowed(fragment.FilePath) || (fragment.WindowsFilePath != "" && a.PathAllowed(fragment.WindowsFilePath)) + checks = append(checks, pathAllowed) + } + if len(a.Regexes) > 0 { + checks = append(checks, regexAllowed) + } + if len(a.StopWords) > 0 { + checks = append(checks, containsStopword) + } + + isAllowed = allTrue(checks) + } else { + isAllowed = regexAllowed || containsStopword + } + + if isAllowed { + event := logger.Trace(). + Str("finding", finding.Secret). + Str("condition", a.MatchCondition.String()) + if commitAllowed { + event.Str("allowed-commit", commit) + } + if pathAllowed { + event.Bool("allowed-path", pathAllowed) + } + if regexAllowed { + event.Bool("allowed-regex", regexAllowed) + } + if containsStopword { + event.Str("allowed-stopword", word) + } + return true, event + } + } + return false, nil +} + +func allTrue(bools []bool) bool { + for _, check := range bools { + if !check { + return false + } + } + return true +} diff --git a/engine/detect/location.go b/engine/detect/location.go new file mode 100644 index 00000000..94a782ec --- /dev/null +++ b/engine/detect/location.go @@ -0,0 +1,89 @@ +package detect + +// Location represents a location in a file +type Location struct { + startLine int + endLine int + startColumn int + endColumn int + startLineIndex int + endLineIndex int +} + +func location(newlineIndices [][]int, raw string, matchIndex []int) Location { + var ( + prevNewLine int + location Location + lineSet bool + _lineNum int + ) + + start := matchIndex[0] + end := matchIndex[1] + + // default startLineIndex to 0 + location.startLineIndex = 0 + + // Fixes: https://github.com/zricethezav/gitleaks/issues/1037 + // When a fragment does NOT have any newlines, a default "newline" + // will be counted to make the subsequent location calculation logic work + // for 
fragments with no newlines. + if len(newlineIndices) == 0 { + newlineIndices = [][]int{ + {len(raw), len(raw) + 1}, + } + } + + // If the file doesn't end with a newline, add a virtual newline at the end + // to ensure secrets on the last line are properly detected. + // This fixes the issue where secrets on the last line without a trailing + // newline would not have their location properly set. + lastNewlineIndex := newlineIndices[len(newlineIndices)-1][0] + if lastNewlineIndex < len(raw) { + newlineIndices = append(newlineIndices, []int{len(raw), len(raw) + 1}) + } + + for lineNum, pair := range newlineIndices { + _lineNum = lineNum + newLineByteIndex := pair[0] + if prevNewLine <= start && start < newLineByteIndex { + lineSet = true + location.startLine = lineNum + location.endLine = lineNum + location.startColumn = (start - prevNewLine) + 1 // +1 because counting starts at 1 + location.startLineIndex = prevNewLine + location.endLineIndex = newLineByteIndex + } + if prevNewLine < end && end <= newLineByteIndex { + location.endLine = lineNum + location.endColumn = (end - prevNewLine) + location.endLineIndex = newLineByteIndex + } + + prevNewLine = pair[0] + } + + if !lineSet { + // if lines never get set then that means the secret is most likely + // on the last line of the diff output and the diff output does not have + // a newline + location.startColumn = (start - prevNewLine) + 1 // +1 because counting starts at 1 + location.endColumn = (end - prevNewLine) + location.startLine = _lineNum + 1 + location.endLine = _lineNum + 1 + + // search for new line byte index + i := 0 + for end+i < len(raw) { + if raw[end+i] == '\n' { + break + } + if raw[end+i] == '\r' { + break + } + i++ + } + location.endLineIndex = end + i + } + return location +} diff --git a/engine/detect/utils.go b/engine/detect/utils.go new file mode 100644 index 00000000..9d6e73f8 --- /dev/null +++ b/engine/detect/utils.go @@ -0,0 +1,270 @@ +package detect + +import ( + "fmt" + "math" + 
"path/filepath" + "strings" + + "github.com/checkmarx/2ms/v4/engine/constants" + "github.com/zricethezav/gitleaks/v8/cmd/scm" + "github.com/zricethezav/gitleaks/v8/logging" + "github.com/zricethezav/gitleaks/v8/report" + "github.com/zricethezav/gitleaks/v8/sources" + + "github.com/charmbracelet/lipgloss" +) + +var linkCleaner = strings.NewReplacer( + " ", "%20", + "%", "%25", +) + +//nolint:gocyclo,funlen // TODO: refactor this function to reduce cyclomatic complexity and statements +func createScmLink(remote *sources.RemoteInfo, finding *report.Finding) string { + if remote.Platform == scm.UnknownPlatform || + remote.Platform == scm.NoPlatform || + finding.Commit == "" { + return "" + } + + // Clean the path. + filePath, _, hasInnerPath := strings.Cut(finding.File, sources.InnerPathSeparator) + filePath = linkCleaner.Replace(filePath) + + switch remote.Platform { + case scm.GitHubPlatform: + link := fmt.Sprintf("%s/blob/%s/%s", remote.Url, finding.Commit, filePath) + if hasInnerPath { + return link + } + ext := strings.ToLower(filepath.Ext(filePath)) + if ext == ".ipynb" || ext == ".md" { + link += "?plain=1" + } + if finding.StartLine != 0 { + link += fmt.Sprintf("#L%d", finding.StartLine) + } + if finding.EndLine != finding.StartLine { + link += fmt.Sprintf("-L%d", finding.EndLine) + } + return link + case scm.GitLabPlatform: + link := fmt.Sprintf("%s/blob/%s/%s", remote.Url, finding.Commit, filePath) + if hasInnerPath { + return link + } + if finding.StartLine != 0 { + link += fmt.Sprintf("#L%d", finding.StartLine) + } + if finding.EndLine != finding.StartLine { + link += fmt.Sprintf("-%d", finding.EndLine) + } + return link + case scm.AzureDevOpsPlatform: + link := fmt.Sprintf("%s/commit/%s?path=/%s", remote.Url, finding.Commit, filePath) + // Add line information if applicable + if hasInnerPath { + return link + } + if finding.StartLine != 0 { + link += fmt.Sprintf("&line=%d", finding.StartLine) + } + if finding.EndLine != finding.StartLine { + link += 
fmt.Sprintf("&lineEnd=%d", finding.EndLine) + } + // This is a bit dirty, but Azure DevOps does not highlight the line when the lineStartColumn and lineEndColumn are not provided + link += "&lineStartColumn=1&lineEndColumn=10000000&type=2&lineStyle=plain&_a=files" + return link + case scm.GiteaPlatform: + link := fmt.Sprintf("%s/src/commit/%s/%s", remote.Url, finding.Commit, filePath) + if hasInnerPath { + return link + } + ext := strings.ToLower(filepath.Ext(filePath)) + if ext == ".ipynb" || ext == ".md" { + link += "?display=source" + } + if finding.StartLine != 0 { + link += fmt.Sprintf("#L%d", finding.StartLine) + } + if finding.EndLine != finding.StartLine { + link += fmt.Sprintf("-L%d", finding.EndLine) + } + return link + case scm.BitbucketPlatform: + link := fmt.Sprintf("%s/src/%s/%s", remote.Url, finding.Commit, filePath) + if hasInnerPath { + return link + } + if finding.StartLine != 0 { + link += fmt.Sprintf("#lines-%d", finding.StartLine) + } + if finding.EndLine != finding.StartLine { + link += fmt.Sprintf(":%d", finding.EndLine) + } + return link + default: + // This should never happen. + return "" + } +} + +// shannonEntropy calculates the entropy of data using the formula defined here: +// https://en.wiktionary.org/wiki/Shannon_entropy +// Another way to think about what this is doing is calculating the number of bits +// needed to on average encode the data. So, the higher the entropy, the more random the data, the +// more bits needed to encode that data. 
+func shannonEntropy(data string) (entropy float64) { + if data == "" { + return 0 + } + + charCounts := make(map[rune]int) + for _, char := range data { + charCounts[char]++ + } + + invLength := 1.0 / float64(len(data)) + for _, count := range charCounts { + freq := float64(count) * invLength + entropy -= freq * math.Log2(freq) + } + + return entropy +} + +// filter will dedupe, redact, and remove empty secret findings +func filter(findings []report.Finding, redact uint) []report.Finding { + var retFindings []report.Finding + for i := range findings { + f := &findings[i] + // Skip findings with empty secrets + if f.Secret == "" { + continue + } + include := true + if strings.Contains(strings.ToLower(f.RuleID), constants.GenericCredentialRuleID) { // generic rule ID + for j := range findings { + fPrime := &findings[j] + if f.StartLine == fPrime.StartLine && + f.Commit == fPrime.Commit && + f.RuleID != fPrime.RuleID && + strings.Contains(fPrime.Secret, f.Secret) && + !strings.Contains(strings.ToLower(fPrime.RuleID), constants.GenericCredentialRuleID) { + genericMatch := strings.ReplaceAll(f.Match, f.Secret, "REDACTED") + betterMatch := strings.ReplaceAll(fPrime.Match, fPrime.Secret, "REDACTED") + logging.Trace().Msgf("skipping %s finding (%s), %s rule takes precedence (%s)", f.RuleID, genericMatch, fPrime.RuleID, betterMatch) + include = false + break + } + } + } + + if redact > 0 { + f.Redact(redact) + } + if include { + retFindings = append(retFindings, *f) + } + } + return retFindings +} + +//nolint:funlen // TODO: refactor this function to reduce statements +func printFinding(f *report.Finding, noColor bool) { + // trim all whitespace and tabs + f.Line = strings.TrimSpace(f.Line) + f.Secret = strings.TrimSpace(f.Secret) + f.Match = strings.TrimSpace(f.Match) + + isFileMatch := strings.HasPrefix(f.Match, "file detected:") + skipColor := noColor + finding := "" + var secret lipgloss.Style + + // Matches from filenames do not have a |line| or |secret| + if 
!isFileMatch { + matchInLineIDX := strings.Index(f.Line, f.Match) + secretInMatchIdx := strings.Index(f.Match, f.Secret) + + skipColor = false + + if matchInLineIDX == -1 || noColor { + skipColor = true + matchInLineIDX = 0 + } + + start := f.Line[0:matchInLineIDX] + startMatchIdx := 0 + if matchInLineIDX > 20 { + startMatchIdx = matchInLineIDX - 20 + start = "..." + f.Line[startMatchIdx:matchInLineIDX] + } + + matchBeginning := lipgloss.NewStyle().SetString(f.Match[0:secretInMatchIdx]).Foreground(lipgloss.Color("#f5d445")) + secret = lipgloss.NewStyle().SetString(f.Secret). + Bold(true). + Italic(true). + Foreground(lipgloss.Color("#f05c07")) + matchEnd := lipgloss.NewStyle().SetString(f.Match[secretInMatchIdx+len(f.Secret):]).Foreground(lipgloss.Color("#f5d445")) + + lineEndIdx := matchInLineIDX + len(f.Match) + if len(f.Line)-1 <= lineEndIdx { + lineEndIdx = len(f.Line) + } + + lineEnd := f.Line[lineEndIdx:] + + if len(f.Secret) > 100 { + secret = lipgloss.NewStyle().SetString(f.Secret[0:100] + "..."). + Bold(true). + Italic(true). + Foreground(lipgloss.Color("#f05c07")) + } + if len(lineEnd) > 20 { + lineEnd = lineEnd[0:20] + "..." 
+ } + + finding = fmt.Sprintf("%s%s%s%s%s\n", strings.TrimPrefix(strings.TrimLeft(start, " "), "\n"), matchBeginning, secret, matchEnd, lineEnd) + } + + if skipColor || isFileMatch { + fmt.Printf("%-12s %s\n", "Finding:", f.Match) + fmt.Printf("%-12s %s\n", "Secret:", f.Secret) + } else { + fmt.Printf("%-12s %s", "Finding:", finding) + fmt.Printf("%-12s %s\n", "Secret:", secret) + } + + fmt.Printf("%-12s %s\n", "RuleID:", f.RuleID) + fmt.Printf("%-12s %f\n", "Entropy:", f.Entropy) + + if f.File == "" { + f.PrintRequiredFindings() + fmt.Println("") + return + } + if len(f.Tags) > 0 { + fmt.Printf("%-12s %s\n", "Tags:", f.Tags) + } + fmt.Printf("%-12s %s\n", "File:", f.File) + fmt.Printf("%-12s %d\n", "Line:", f.StartLine) + if f.Commit == "" { + fmt.Printf("%-12s %s\n", "Fingerprint:", f.Fingerprint) + f.PrintRequiredFindings() + fmt.Println("") + return + } + fmt.Printf("%-12s %s\n", "Commit:", f.Commit) + fmt.Printf("%-12s %s\n", "Author:", f.Author) + fmt.Printf("%-12s %s\n", "Email:", f.Email) + fmt.Printf("%-12s %s\n", "Date:", f.Date) + fmt.Printf("%-12s %s\n", "Fingerprint:", f.Fingerprint) + if f.Link != "" { + fmt.Printf("%-12s %s\n", "Link:", f.Link) + } + + f.PrintRequiredFindings() + fmt.Println("") +} diff --git a/engine/engine.go b/engine/engine.go index 5b5edd3f..36c23173 100644 --- a/engine/engine.go +++ b/engine/engine.go @@ -15,9 +15,11 @@ import ( "slices" "strings" "sync" + "sync/atomic" "text/tabwriter" "github.com/checkmarx/2ms/v4/engine/chunk" + "github.com/checkmarx/2ms/v4/engine/detect" "github.com/checkmarx/2ms/v4/engine/extra" "github.com/checkmarx/2ms/v4/engine/linecontent" "github.com/checkmarx/2ms/v4/engine/rules" @@ -34,8 +36,6 @@ import ( "github.com/sourcegraph/conc" "github.com/spf13/cobra" "github.com/zricethezav/gitleaks/v8/config" - "github.com/zricethezav/gitleaks/v8/detect" - "github.com/zricethezav/gitleaks/v8/logging" "github.com/zricethezav/gitleaks/v8/report" ) @@ -55,10 +55,13 @@ var ( ) type DetectorConfig struct { - 
SelectedRules []*ruledefine.Rule - CustomRegexPatterns []string - AdditionalIgnoreRules []string - MaxTargetMegabytes int + SelectedRules []*ruledefine.Rule + CustomRegexPatterns []string + AdditionalIgnoreRules []string + MaxTargetMegabytes int + MaxFindings uint64 // Total findings limit across entire scan + MaxRuleMatchesPerFragment uint64 // Regex matches limit per rule per fragment + MaxSecretSize uint64 // Maximum secret size in bytes (0 = no limit) } type Engine struct { @@ -88,6 +91,12 @@ type Engine struct { ScanConfig resources.ScanConfig wg conc.WaitGroup + + // Atomic counter to track findings across concurrent workers immediately + findingsCounter atomic.Uint64 + + // Ensures max findings warning is only logged once + maxFindingsWarnOnce sync.Once } type IEngine interface { @@ -118,7 +127,6 @@ type ctxKey string const ( customRegexRuleIdFormat = "custom-regex-%d" - CxFileEndMarker = ";cx-file-end" totalLinesKey ctxKey = "totalLines" linesInChunkKey ctxKey = "linesInChunk" ) @@ -128,7 +136,10 @@ type EngineConfig struct { IgnoreList []string SpecialList []string - MaxTargetMegabytes int + MaxTargetMegabytes int + MaxFindings uint64 // Total findings limit across entire scan + MaxRuleMatchesPerFragment uint64 // Regex matches limit per rule per fragment + MaxSecretSize uint64 // Maximum secret size in bytes (0 = no limit) IgnoredIds []string AllowedValues []string @@ -188,10 +199,13 @@ func initEngine(engineConfig *EngineConfig, opts ...EngineOption) (*Engine, erro engine := &Engine{ detectorConfig: DetectorConfig{ - SelectedRules: finalRules, - CustomRegexPatterns: engineConfig.CustomRegexPatterns, - AdditionalIgnoreRules: engineConfig.AdditionalIgnoreRules, - MaxTargetMegabytes: engineConfig.MaxTargetMegabytes, + SelectedRules: finalRules, + CustomRegexPatterns: engineConfig.CustomRegexPatterns, + AdditionalIgnoreRules: engineConfig.AdditionalIgnoreRules, + MaxTargetMegabytes: engineConfig.MaxTargetMegabytes, + MaxFindings: engineConfig.MaxFindings, + 
MaxRuleMatchesPerFragment: engineConfig.MaxRuleMatchesPerFragment, + MaxSecretSize: engineConfig.MaxSecretSize, }, validator: *validation.NewValidator(), @@ -240,8 +254,10 @@ func initEngine(engineConfig *EngineConfig, opts ...EngineOption) (*Engine, erro } // Create detector with final config - detector := detect.NewDetector(*cfg) + detector := detect.NewDetector(cfg) detector.MaxTargetMegaBytes = engineConfig.MaxTargetMegabytes + detector.MaxRuleMatchesPerFragment = engineConfig.MaxRuleMatchesPerFragment + detector.MaxSecretSize = engineConfig.MaxSecretSize engine.detector = detector return engine, nil @@ -356,19 +372,29 @@ func (e *Engine) detectSecrets( secrets chan *secrets.Secret, pluginName string, ) error { - fragment.Raw += CxFileEndMarker + "\n" - - values := e.detector.Detect(*fragment) + maxFindings := e.detectorConfig.MaxFindings + if maxFindings > 0 && e.findingsCounter.Load() >= maxFindings { + return nil + } - // Filter generic secrets if a better finding exists - filteredValues := filterGenericDuplicateFindings(values) + values := e.detector.Detect(fragment) - for _, value := range filteredValues { //nolint:gocritic // rangeValCopy: value is used immediately + for _, value := range values { //nolint:gocritic // rangeValCopy: value is used immediately secret, buildErr := buildSecret(ctx, item, value, pluginName) if buildErr != nil { return fmt.Errorf("failed to build secret: %w", buildErr) } if !isSecretIgnored(secret, e.ignoredIds, e.allowedValues, value.Line, value.Match, pluginName) { + // Atomically increment and check to avoid race condition + newCount := e.findingsCounter.Add(1) + if maxFindings > 0 && newCount > maxFindings { + e.maxFindingsWarnOnce.Do(func() { + log.Warn(). + Uint64("max_findings", maxFindings). + Msg("Maximum findings limit reached. 
Scan will stop early and report results up to this limit.") + }) + break + } secrets <- secret } else { log.Debug().Msgf("Secret %s was ignored", secret.ID) @@ -518,7 +544,6 @@ func buildSecret( return nil, fmt.Errorf("failed to get start and end lines for source %s: %w", item.GetSource(), err) } - value.Line = strings.TrimSuffix(value.Line, CxFileEndMarker) hasNewline := strings.HasPrefix(value.Line, "\n") if hasNewline { @@ -799,35 +824,6 @@ func (e *Engine) Wait() { e.wg.Wait() } -func filterGenericDuplicateFindings(findings []report.Finding) []report.Finding { - var retFindings []report.Finding - for i := range findings { - f := &findings[i] - include := true - if strings.Contains(strings.ToLower(f.RuleID), "01ab7659-d25a-4a1c-9f98-dee9d0cf2e70") { // generic rule ID - for j := range findings { - fPrime := &findings[j] - if f.StartLine == fPrime.StartLine && - f.Commit == fPrime.Commit && - f.RuleID != fPrime.RuleID && - strings.Contains(fPrime.Secret, f.Secret) && - !strings.Contains(strings.ToLower(fPrime.RuleID), "01ab7659-d25a-4a1c-9f98-dee9d0cf2e70") { - genericMatch := strings.ReplaceAll(f.Match, f.Secret, "REDACTED") - betterMatch := strings.ReplaceAll(fPrime.Match, fPrime.Secret, "REDACTED") - logging.Trace().Msgf("skipping %s finding (%s), %s rule takes precedence (%s)", f.RuleID, genericMatch, fPrime.RuleID, betterMatch) - include = false - break - } - } - } - - if include { - retFindings = append(retFindings, *f) - } - } - return retFindings -} - // isSecretFromConfluenceResourceIdentifier reports whether a regex match found in a line // actually belongs to Confluence Storage Format metadata (the `ri:` namespace) rather than // real user content. 
This lets us ignore false-positives that cannot be suppressed via the diff --git a/engine/engine_test.go b/engine/engine_test.go index 2f949aa7..d9744168 100644 --- a/engine/engine_test.go +++ b/engine/engine_test.go @@ -29,8 +29,9 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/zricethezav/gitleaks/v8/config" - "github.com/zricethezav/gitleaks/v8/detect" "github.com/zricethezav/gitleaks/v8/report" + + "github.com/checkmarx/2ms/v4/engine/detect" ) // Removed global fsPlugin to avoid test interference @@ -349,7 +350,7 @@ func TestDetectFile(t *testing.T) { cfg := newConfig() cfg.Rules = make(map[string]config.Rule) cfg.Keywords = make(map[string]struct{}) - detector := detect.NewDetector(*cfg) + detector := detect.NewDetector(cfg) detector.MaxTargetMegaBytes = tc.maxMegabytes engine := &Engine{ rules: nil, @@ -450,7 +451,7 @@ func TestDetectChunks(t *testing.T) { cfg := newConfig() cfg.Rules = make(map[string]config.Rule) cfg.Keywords = make(map[string]struct{}) - detector := detect.NewDetector(*cfg) + detector := detect.NewDetector(cfg) engine := &Engine{ rules: nil, @@ -1319,3 +1320,198 @@ func TestBuildSecret(t *testing.T) { assert.Equal(t, pageID, value) }) } + +func TestMaxRuleMatchesPerFragmentFlag(t *testing.T) { + // Content with multiple secrets that would match the same rule + multipleSecrets := ` +token1: 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 +token2: ghp_1234567890abcdefghijklmnopqrstuvwxyz +token3: ghp_abcdefghijklmnopqrstuvwxyz1234567890 +token4: ghp_9876543210zyxwvutsrqponmlkjihgfedcba +token5: ghp_aB3cD4eF5gH6iJ7kL8mN9oP0qR1sT2uV3wX4 +` + + testCases := []struct { + name string + limit uint64 + expectedCount int + }{ + { + name: "no limit - finds all matches", + limit: 0, + expectedCount: 5, + }, + { + name: "limit of 2 - finds only 2 matches", + limit: 2, + expectedCount: 2, + }, + { + name: "limit of 1 - finds only 1 match", + limit: 1, + expectedCount: 1, + }, + } + + for _, tc := range 
testCases { + t.Run(tc.name, func(t *testing.T) { + eng, err := initEngine(&EngineConfig{ + DetectorWorkerPoolSize: 1, + MaxRuleMatchesPerFragment: tc.limit, + }) + require.NoError(t, err) + defer eng.Shutdown() + + secretsChan := make(chan *secrets.Secret, 10) + fsPlugin := &plugins.FileSystemPlugin{} + err = eng.DetectFragment(item{content: &multipleSecrets}, secretsChan, fsPlugin.GetName()) + require.NoError(t, err) + close(secretsChan) + + count := 0 + for range secretsChan { + count++ + } + assert.Equal(t, tc.expectedCount, count) + }) + } +} + +func TestMaxSecretSizeFlag(t *testing.T) { + // Valid GitHub PAT format - 40 chars + secret := "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" + + testCases := []struct { + name string + limit uint64 + shouldFind bool + }{ + { + name: "no limit - finds secret", + limit: 0, + shouldFind: true, + }, + { + name: "limit larger than secret - finds secret", + limit: 200, + shouldFind: true, + }, + { + name: "limit smaller than secret - ignores secret", + limit: 10, + shouldFind: false, + }, + { + name: "limit exactly at secret size boundary - finds secret", + limit: 40, + shouldFind: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + eng, err := initEngine(&EngineConfig{ + DetectorWorkerPoolSize: 1, + MaxSecretSize: tc.limit, + }) + require.NoError(t, err) + defer eng.Shutdown() + + secretsChan := make(chan *secrets.Secret, 1) + fsPlugin := &plugins.FileSystemPlugin{} + err = eng.DetectFragment(item{content: &secret}, secretsChan, fsPlugin.GetName()) + require.NoError(t, err) + close(secretsChan) + + s := <-secretsChan + if tc.shouldFind { + assert.NotNil(t, s) + } else { + assert.Nil(t, s) + } + }) + } +} + +func TestMaxFindingsFlag(t *testing.T) { + // Content with multiple secrets in single fragment + multipleSecrets := ` +github_token: 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 +another_token: ghp_1234567890abcdefghijklmnopqrstuvwxyz +third_token: ghp_abcdefghijklmnopqrstuvwxyz1234567890 
+` + + testCases := []struct { + name string + limit uint64 + fragments []string + expectedCount int + }{ + { + name: "no limit - finds all secrets", + limit: 0, + fragments: []string{multipleSecrets}, + expectedCount: 3, + }, + { + name: "limit of 2 - stops after 2 findings", + limit: 2, + fragments: []string{multipleSecrets}, + expectedCount: 2, + }, + { + name: "limit of 1 - stops after 1 finding", + limit: 1, + fragments: []string{multipleSecrets}, + expectedCount: 1, + }, + { + name: "limit of 2 across 3 fragments", + limit: 2, + fragments: []string{ + "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", + "ghp_1234567890abcdefghijklmnopqrstuvwxyz", + "ghp_abcdefghijklmnopqrstuvwxyz1234567890", + }, + expectedCount: 2, + }, + { + name: "limit of 1 across 3 fragments", + limit: 1, + fragments: []string{ + "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", + "ghp_1234567890abcdefghijklmnopqrstuvwxyz", + "ghp_abcdefghijklmnopqrstuvwxyz1234567890", + }, + expectedCount: 1, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + eng, err := initEngine(&EngineConfig{ + DetectorWorkerPoolSize: 1, + MaxFindings: tc.limit, + }) + require.NoError(t, err) + defer eng.Shutdown() + + secretsChan := make(chan *secrets.Secret, 10) + fsPlugin := &plugins.FileSystemPlugin{} + + for _, fragment := range tc.fragments { + f := fragment // capture for closure + err = eng.DetectFragment(item{content: &f}, secretsChan, fsPlugin.GetName()) + require.NoError(t, err) + } + + close(secretsChan) + + count := 0 + for range secretsChan { + count++ + } + assert.Equal(t, tc.expectedCount, count) + }) + } +} diff --git a/engine/rules/ruledefine/freemius_test.go b/engine/rules/ruledefine/freemius_test.go index 8559eca4..c8ea73b7 100644 --- a/engine/rules/ruledefine/freemius_test.go +++ b/engine/rules/ruledefine/freemius_test.go @@ -4,8 +4,8 @@ import ( "fmt" "testing" + "github.com/checkmarx/2ms/v4/engine/detect" "github.com/stretchr/testify/assert" - 
"github.com/zricethezav/gitleaks/v8/detect" ) func TestFreemiusSecretKey(t *testing.T) { @@ -50,14 +50,14 @@ func TestFreemiusSecretKey(t *testing.T) { // validate true positives if any specified for path, truePositive := range tt.truePositivesWPaths { fragment := detect.Fragment{Raw: truePositive, FilePath: path} - findings := d.Detect(fragment) + findings := d.Detect(&fragment) assert.Equal(t, len(findings), 1, fmt.Sprintf("failed to detect true positive: %s", truePositive)) } // validate false positives if any specified for path, falsePositive := range tt.falsePositivesWPaths { fragment := detect.Fragment{Raw: falsePositive, FilePath: path} - findings := d.Detect(fragment) + findings := d.Detect(&fragment) assert.Equal(t, 0, len(findings), fmt.Sprintf("unexpectedly found false positive: %s", falsePositive)) } }) diff --git a/engine/rules/ruledefine/generic_credential.go b/engine/rules/ruledefine/generic_credential.go index cde74336..a038a53c 100644 --- a/engine/rules/ruledefine/generic_credential.go +++ b/engine/rules/ruledefine/generic_credential.go @@ -2,6 +2,8 @@ package ruledefine import ( "regexp" + + "github.com/checkmarx/2ms/v4/engine/constants" ) var genericCredentialRegex = generateSemiGenericRegexIncludingXml([]string{ @@ -18,7 +20,7 @@ var genericCredentialRegex = generateSemiGenericRegexIncludingXml([]string{ func GenericCredential() *Rule { return &Rule{ - RuleID: "01ab7659-d25a-4a1c-9f98-dee9d0cf2e70", + RuleID: constants.GenericCredentialRuleID, RuleName: "Generic-Api-Key", Description: "Detected a Generic API Key, potentially exposing access to various services and sensitive operations.", Regex: genericCredentialRegex, diff --git a/engine/rules/ruledefine/hashicorp_tf_password_test.go b/engine/rules/ruledefine/hashicorp_tf_password_test.go index db918fb2..e8db9269 100644 --- a/engine/rules/ruledefine/hashicorp_tf_password_test.go +++ b/engine/rules/ruledefine/hashicorp_tf_password_test.go @@ -4,8 +4,8 @@ import ( "fmt" "testing" + 
"github.com/checkmarx/2ms/v4/engine/detect" "github.com/stretchr/testify/assert" - "github.com/zricethezav/gitleaks/v8/detect" ) func TestHashicorpField(t *testing.T) { @@ -38,14 +38,14 @@ func TestHashicorpField(t *testing.T) { // validate true positives if any specified for path, truePositive := range tt.truePositivesWPaths { fragment := detect.Fragment{Raw: truePositive, FilePath: path} - findings := d.Detect(fragment) + findings := d.Detect(&fragment) assert.Equal(t, len(findings), 1, fmt.Sprintf("failed to detect true positive: %s", truePositive)) } // validate false positives if any specified for path, falsePositive := range tt.falsePositivesWPaths { fragment := detect.Fragment{Raw: falsePositive, FilePath: path} - findings := d.Detect(fragment) + findings := d.Detect(&fragment) assert.Equal(t, 0, len(findings), fmt.Sprintf("unexpectedly found false positive: %s", falsePositive)) } }) diff --git a/engine/rules/ruledefine/kubernetes_secret_test.go b/engine/rules/ruledefine/kubernetes_secret_test.go index 6657f7ac..7b813e40 100644 --- a/engine/rules/ruledefine/kubernetes_secret_test.go +++ b/engine/rules/ruledefine/kubernetes_secret_test.go @@ -4,8 +4,8 @@ import ( "fmt" "testing" + "github.com/checkmarx/2ms/v4/engine/detect" "github.com/stretchr/testify/assert" - "github.com/zricethezav/gitleaks/v8/detect" ) func TestKubernetesSecret(t *testing.T) { @@ -434,14 +434,14 @@ data: // validate true positives if any specified for path, truePositive := range tt.truePositivesWPaths { fragment := detect.Fragment{Raw: truePositive, FilePath: path} - findings := d.Detect(fragment) + findings := d.Detect(&fragment) assert.Equal(t, len(findings), 1, fmt.Sprintf("failed to detect true positive: %s", truePositive)) } // validate false positives if any specified for path, falsePositive := range tt.falsePositivesWPaths { fragment := detect.Fragment{Raw: falsePositive, FilePath: path} - findings := d.Detect(fragment) + findings := d.Detect(&fragment) assert.Equal(t, 0, 
len(findings), fmt.Sprintf("unexpectedly found false positive: %s", falsePositive)) } }) diff --git a/engine/rules/ruledefine/nuget_test.go b/engine/rules/ruledefine/nuget_test.go index f1284469..67604e2f 100644 --- a/engine/rules/ruledefine/nuget_test.go +++ b/engine/rules/ruledefine/nuget_test.go @@ -4,8 +4,8 @@ import ( "fmt" "testing" + "github.com/checkmarx/2ms/v4/engine/detect" "github.com/stretchr/testify/assert" - "github.com/zricethezav/gitleaks/v8/detect" ) func TestNugetConfigPassword(t *testing.T) { @@ -43,14 +43,14 @@ func TestNugetConfigPassword(t *testing.T) { // validate true positives if any specified for path, truePositive := range tt.truePositivesWPaths { fragment := detect.Fragment{Raw: truePositive, FilePath: path} - findings := d.Detect(fragment) + findings := d.Detect(&fragment) assert.Equal(t, len(findings), 1, fmt.Sprintf("failed to detect true positive: %s", truePositive)) } // validate false positives if any specified for path, falsePositive := range tt.falsePositivesWPaths { fragment := detect.Fragment{Raw: falsePositive, FilePath: path} - findings := d.Detect(fragment) + findings := d.Detect(&fragment) assert.Equal(t, 0, len(findings), fmt.Sprintf("unexpectedly found false positive: %s", falsePositive)) } }) diff --git a/engine/rules/ruledefine/utils_test.go b/engine/rules/ruledefine/utils_test.go index 32ccfed4..4e951b7b 100644 --- a/engine/rules/ruledefine/utils_test.go +++ b/engine/rules/ruledefine/utils_test.go @@ -3,9 +3,9 @@ package ruledefine import ( "strings" + "github.com/checkmarx/2ms/v4/engine/detect" "github.com/zricethezav/gitleaks/v8/cmd/generate/config/base" gitleaksrule "github.com/zricethezav/gitleaks/v8/config" - "github.com/zricethezav/gitleaks/v8/detect" "github.com/zricethezav/gitleaks/v8/logging" ) @@ -36,5 +36,5 @@ func createSingleRuleDetector(r *gitleaksrule.Rule) *detect.Detector { logging.Fatal().Err(err).Msg("invalid global allowlist") } } - return detect.NewDetector(cfg) + return detect.NewDetector(&cfg) } 
diff --git a/go.mod b/go.mod index f50228c0..1adc470f 100644 --- a/go.mod +++ b/go.mod @@ -8,8 +8,11 @@ replace ( ) require ( + github.com/BobuSumisu/aho-corasick v1.0.3 github.com/alitto/pond/v2 v2.5.0 github.com/bwmarrin/discordgo v0.27.1 + github.com/charmbracelet/lipgloss v0.7.1 + github.com/fatih/semgroup v1.2.0 github.com/gitleaks/go-gitdiff v0.9.1 github.com/google/uuid v1.6.0 github.com/h2non/filetype v1.1.3 @@ -25,6 +28,7 @@ require ( github.com/stretchr/testify v1.10.0 github.com/zricethezav/gitleaks/v8 v8.28.0 go.uber.org/mock v0.5.2 + golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa golang.org/x/net v0.47.0 golang.org/x/sync v0.18.0 golang.org/x/text v0.31.0 @@ -34,7 +38,6 @@ require ( require ( dario.cat/mergo v1.0.1 // indirect - github.com/BobuSumisu/aho-corasick v1.0.3 // indirect github.com/Masterminds/goutils v1.1.1 // indirect github.com/Masterminds/semver/v3 v3.3.0 // indirect github.com/Masterminds/sprig/v3 v3.3.0 // indirect @@ -44,10 +47,8 @@ require ( github.com/bodgit/plumbing v1.3.0 // indirect github.com/bodgit/sevenzip v1.6.1 // indirect github.com/bodgit/windows v1.0.1 // indirect - github.com/charmbracelet/lipgloss v0.7.1 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/dsnet/compress v0.0.2-0.20230904184137-39efe44ab707 // indirect - github.com/fatih/semgroup v1.2.0 // indirect github.com/fsnotify/fsnotify v1.8.0 // indirect github.com/go-ole/go-ole v1.2.6 // indirect github.com/go-viper/mapstructure/v2 v2.4.0 // indirect @@ -88,6 +89,5 @@ require ( go.uber.org/multierr v1.11.0 // indirect go4.org v0.0.0-20230225012048-214862532bf5 // indirect golang.org/x/crypto v0.45.0 // indirect - golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa // indirect golang.org/x/sys v0.38.0 // indirect ) diff --git a/pkg/testData/expectedReports/customRules/defaultPlusAllCustomRules.json b/pkg/testData/expectedReports/customRules/defaultPlusAllCustomRules.json index fa78e9c3..45c15d51 100644 --- 
a/pkg/testData/expectedReports/customRules/defaultPlusAllCustomRules.json +++ b/pkg/testData/expectedReports/customRules/defaultPlusAllCustomRules.json @@ -175,7 +175,7 @@ "endLine": 1, "lineContent": " Text_Example = eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJtb2NrU3ViMiIsIm5hbWUiOiJtb2NrTmFtZTIifQ.dummysignature2", "startColumn": 63, - "endColumn": 166, + "endColumn": 165, "value": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJtb2NrU3ViMiIsIm5hbWUiOiJtb2NrTmFtZTIifQ.dummysignature2", "validationStatus": "Unknown", "ruleDescription": "Uncovered a JSON Web Token, which may lead to unauthorized access to web applications and sensitive user data.", diff --git a/pkg/testData/expectedReports/customRules/defaultPlusNonOverrideRules.json b/pkg/testData/expectedReports/customRules/defaultPlusNonOverrideRules.json index 955379f7..6d2cc2ef 100644 --- a/pkg/testData/expectedReports/customRules/defaultPlusNonOverrideRules.json +++ b/pkg/testData/expectedReports/customRules/defaultPlusNonOverrideRules.json @@ -175,7 +175,7 @@ "endLine": 1, "lineContent": " Text_Example = eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJtb2NrU3ViMiIsIm5hbWUiOiJtb2NrTmFtZTIifQ.dummysignature2", "startColumn": 63, - "endColumn": 166, + "endColumn": 165, "value": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJtb2NrU3ViMiIsIm5hbWUiOiJtb2NrTmFtZTIifQ.dummysignature2", "validationStatus": "Unknown", "ruleDescription": "Uncovered a JSON Web Token, which may lead to unauthorized access to web applications and sensitive user data.", diff --git a/pkg/testData/expectedReports/customRules/onlyDefaultIgnoreCustomRules.json b/pkg/testData/expectedReports/customRules/onlyDefaultIgnoreCustomRules.json index 82d1c1db..f068b9e4 100644 --- a/pkg/testData/expectedReports/customRules/onlyDefaultIgnoreCustomRules.json +++ b/pkg/testData/expectedReports/customRules/onlyDefaultIgnoreCustomRules.json @@ -137,7 +137,7 @@ "endLine": 1, "lineContent": " Text_Example = 
eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJtb2NrU3ViMiIsIm5hbWUiOiJtb2NrTmFtZTIifQ.dummysignature2", "startColumn": 63, - "endColumn": 166, + "endColumn": 165, "value": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJtb2NrU3ViMiIsIm5hbWUiOiJtb2NrTmFtZTIifQ.dummysignature2", "validationStatus": "Unknown", "ruleDescription": "Uncovered a JSON Web Token, which may lead to unauthorized access to web applications and sensitive user data.", diff --git a/pkg/testData/expectedReports/expectedReport.json b/pkg/testData/expectedReports/expectedReport.json index dcf00d5c..1f1a7d59 100644 --- a/pkg/testData/expectedReports/expectedReport.json +++ b/pkg/testData/expectedReports/expectedReport.json @@ -113,7 +113,7 @@ "endLine": 1, "lineContent": " Text_Example = eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJtb2NrU3ViMiIsIm5hbWUiOiJtb2NrTmFtZTIifQ.dummysignature2", "startColumn": 63, - "endColumn": 166, + "endColumn": 165, "value": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJtb2NrU3ViMiIsIm5hbWUiOiJtb2NrTmFtZTIifQ.dummysignature2", "ruleDescription": "Uncovered a JSON Web Token, which may lead to unauthorized access to web applications and sensitive user data.", "extraDetails": { diff --git a/pkg/testData/expectedReports/expectedReportWithIgnoredRule.json b/pkg/testData/expectedReports/expectedReportWithIgnoredRule.json index cfb41643..3db4d9f3 100644 --- a/pkg/testData/expectedReports/expectedReportWithIgnoredRule.json +++ b/pkg/testData/expectedReports/expectedReportWithIgnoredRule.json @@ -59,7 +59,7 @@ "endLine": 1, "lineContent": " Text_Example = eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJtb2NrU3ViMiIsIm5hbWUiOiJtb2NrTmFtZTIifQ.dummysignature2", "startColumn": 63, - "endColumn": 166, + "endColumn": 165, "value": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJtb2NrU3ViMiIsIm5hbWUiOiJtb2NrTmFtZTIifQ.dummysignature2", "ruleDescription": "Uncovered a JSON Web Token, which may lead to unauthorized access to web applications and sensitive user data.", 
"extraDetails": { diff --git a/pkg/testData/expectedReports/expectedReportWithValidation.json b/pkg/testData/expectedReports/expectedReportWithValidation.json index 3b20fa59..1899af24 100644 --- a/pkg/testData/expectedReports/expectedReportWithValidation.json +++ b/pkg/testData/expectedReports/expectedReportWithValidation.json @@ -118,7 +118,7 @@ "endLine": 1, "lineContent": " Text_Example = eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJtb2NrU3ViMiIsIm5hbWUiOiJtb2NrTmFtZTIifQ.dummysignature2", "startColumn": 63, - "endColumn": 166, + "endColumn": 165, "value": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJtb2NrU3ViMiIsIm5hbWUiOiJtb2NrTmFtZTIifQ.dummysignature2", "validationStatus": "Unknown", "ruleDescription": "Uncovered a JSON Web Token, which may lead to unauthorized access to web applications and sensitive user data.", diff --git a/tests/e2e_test.go b/tests/e2e_test.go index 0bdc4ece..fa6fbc59 100644 --- a/tests/e2e_test.go +++ b/tests/e2e_test.go @@ -766,4 +766,114 @@ api_key: test-key-456` } } }) + + t.Run("--max-findings flag: caps total number of findings", func(t *testing.T) { + projectDir := t.TempDir() + + // Create multiple files with secrets to ensure we have more than the limit + for i := 1; i <= 5; i++ { + content := fmt.Sprintf("secret%d: ghp_%dabcdefghijklmnopqrstuvwxyz12345678", i, i) + err := os.WriteFile(path.Join(projectDir, fmt.Sprintf("secret%d.txt", i)), []byte(content), 0644) + require.NoError(t, err, "failed to create test file") + } + + // Run scan with --max-findings set to 2 + err = executable.run("filesystem", "--path", projectDir, "--max-findings", "2", "--ignore-on-exit", "results") + assert.NoError(t, err, "scan should succeed with max-findings flag") + + report, err := executable.getReport() + require.NoError(t, err, "failed to get report") + + totalSecrets := report.GetTotalSecretsFound() + t.Logf("Total secrets found with --max-findings=2: %d", totalSecrets) + assert.LessOrEqual(t, totalSecrets, 2, "should find at most 2 
secrets when --max-findings=2") + }) + + t.Run("--max-rule-matches-per-fragment flag: limits matches per rule per fragment", func(t *testing.T) { + projectDir := t.TempDir() + + // Create a single file with multiple secrets that match the same rule + content := `Multiple GitHub PATs in one file: +token1: ghp_1234567890abcdefghijklmnopqrstuvwxyz +token2: ghp_abcdefghijklmnopqrstuvwxyz1234567890 +token3: ghp_9876543210zyxwvutsrqponmlkjihgfedcba +token4: ghp_aB3cD4eF5gH6iJ7kL8mN9oP0qR1sT2uV3wX4 +token5: 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` + + err := os.WriteFile(path.Join(projectDir, "multi_secrets.txt"), []byte(content), 0644) + require.NoError(t, err, "failed to create test file") + + // Run scan with --max-rule-matches-per-fragment set to 2 + err = executable.run("filesystem", "--path", projectDir, "--max-rule-matches-per-fragment", "2", "--ignore-on-exit", "results") + assert.NoError(t, err, "scan should succeed with max-rule-matches-per-fragment flag") + + report, err := executable.getReport() + require.NoError(t, err, "failed to get report") + + totalSecrets := report.GetTotalSecretsFound() + t.Logf("Total secrets found with --max-rule-matches-per-fragment=2: %d", totalSecrets) + assert.LessOrEqual(t, totalSecrets, 2, "should find at most 2 secrets per rule per fragment") + }) + + t.Run("--max-secret-size flag: ignores secrets larger than specified size", func(t *testing.T) { + projectDir := t.TempDir() + + // Create a file with a normal-sized secret + normalSecret := "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" // 40 chars + err := os.WriteFile(path.Join(projectDir, "normal.txt"), []byte(normalSecret), 0644) + require.NoError(t, err, "failed to create test file") + + // Run scan with --max-secret-size set to a value smaller than the secret + err = executable.run("filesystem", "--path", projectDir, "--max-secret-size", "10", "--ignore-on-exit", "results") + assert.NoError(t, err, "scan should succeed with max-secret-size flag") + + report, err := 
executable.getReport() + require.NoError(t, err, "failed to get report") + + totalSecrets := report.GetTotalSecretsFound() + t.Logf("Total secrets found with --max-secret-size=10: %d", totalSecrets) + assert.Equal(t, 0, totalSecrets, "should find no secrets when max-secret-size is smaller than secret") + + // Run scan with --max-secret-size set to a value larger than the secret + err = executable.run("filesystem", "--path", projectDir, "--max-secret-size", "100", "--ignore-on-exit", "results") + assert.NoError(t, err, "scan should succeed with max-secret-size flag") + + report, err = executable.getReport() + require.NoError(t, err, "failed to get report") + + totalSecrets = report.GetTotalSecretsFound() + t.Logf("Total secrets found with --max-secret-size=100: %d", totalSecrets) + assert.GreaterOrEqual(t, totalSecrets, 1, "should find secrets when max-secret-size is larger than secret") + }) + + t.Run("Combined limit flags: multiple limit flags together", func(t *testing.T) { + projectDir := t.TempDir() + + // Create multiple files with multiple secrets each + for i := 1; i <= 5; i++ { + content := fmt.Sprintf(`File %d secrets: +token1: ghp_%d234567890abcdefghijklmnopqrstuvwxy +token2: ghp_%dabcdefghijklmnopqrstuvwxyz123456789`, i, i, i) + err := os.WriteFile(path.Join(projectDir, fmt.Sprintf("file%d.txt", i)), []byte(content), 0644) + require.NoError(t, err, "failed to create test file") + } + + // Run scan with multiple limit flags + err = executable.run("filesystem", + "--path", projectDir, + "--max-findings", "3", + "--max-rule-matches-per-fragment", "1", + "--max-secret-size", "100", + "--ignore-on-exit", "results") + assert.NoError(t, err, "scan should succeed with combined limit flags") + + report, err := executable.getReport() + require.NoError(t, err, "failed to get report") + + totalSecrets := report.GetTotalSecretsFound() + t.Logf("Total secrets found with combined limit flags: %d", totalSecrets) + // With max-rule-matches-per-fragment=1, we get at most 
1 per file (3 files) + // With max-findings=3, we get at most 3 total + assert.LessOrEqual(t, totalSecrets, 3, "should respect combined limit flags") + }) }