From 082402225deeb6c1080ec0284e8262703e07a739 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Fri, 11 Feb 2022 16:40:50 -0800 Subject: [PATCH] wip --- npm/http/server/server.go | 6 ++- .../policies/chain-management_linux.go | 5 +- .../policies/chain-management_linux_test.go | 17 +++++- npm/pkg/dataplane/policies/policy.go | 1 + npm/pkg/dataplane/policies/policy_linux.go | 14 +++++ .../dataplane/policies/policymanager_linux.go | 54 +++++++++++-------- .../policies/policymanager_linux_test.go | 28 ++++++++-- npm/pkg/dataplane/policies/testutils_linux.go | 8 +-- 8 files changed, 98 insertions(+), 35 deletions(-) diff --git a/npm/http/server/server.go b/npm/http/server/server.go index 6af0944477..6e6278914d 100644 --- a/npm/http/server/server.go +++ b/npm/http/server/server.go @@ -32,8 +32,10 @@ func NPMRestServerListenAndServe(config npmconfig.Config, npmEncoder json.Marsha rs.router.Handle(api.ClusterMetricsPath, metrics.GetHandler(metrics.ClusterMetrics)) } - if config.Toggles.EnableHTTPDebugAPI && npmEncoder != nil { - // ACN CLI debug handlerss + // TODO support the debug CLI for v2 + // the nil check is for fan-out npm + if config.Toggles.EnableHTTPDebugAPI && npmEncoder != nil && !config.Toggles.EnableV2NPM { + // ACN CLI debug handlers rs.router.Handle(api.NPMMgrPath, rs.npmCacheHandler(npmEncoder)).Methods(http.MethodGet) } diff --git a/npm/pkg/dataplane/policies/chain-management_linux.go b/npm/pkg/dataplane/policies/chain-management_linux.go index 07daea03a6..c70efc773b 100644 --- a/npm/pkg/dataplane/policies/chain-management_linux.go +++ b/npm/pkg/dataplane/policies/chain-management_linux.go @@ -139,10 +139,9 @@ func isBaseChain(chain string) bool { - delete all deprecated chains - delete old v2 policy chains 3. Add/reposition the jump from FORWARD chain to AZURE-NPM chain. - TODO: add this jump (if necessary) in the iptables-restore call - TODO: could use one grep call instead of one for getting the jump line num and one for getting deprecated chains and old v2 policy chains - - should use a grep pattern like so: | + TODO: could use one grep call instead of separate calls for getting jump line nums and for getting deprecated chains and old v2 policy chains + - would use a grep pattern like so: | */ func (pMgr *PolicyManager) bootup(_ []string) error { klog.Infof("booting up iptables Azure chains") diff --git a/npm/pkg/dataplane/policies/chain-management_linux_test.go b/npm/pkg/dataplane/policies/chain-management_linux_test.go index 796e5f754c..56d4430b04 100644 --- a/npm/pkg/dataplane/policies/chain-management_linux_test.go +++ b/npm/pkg/dataplane/policies/chain-management_linux_test.go @@ -371,6 +371,20 @@ func TestBootupLinux(t *testing.T) { calls: GetBootupTestCalls(), wantErr: false, }, + { + name: "success after restore failure (no NPM prior)", + calls: []testutils.TestCmd{ + {Cmd: []string{"iptables", "-w", "60", "-D", "FORWARD", "-j", "AZURE-NPM"}, ExitCode: 2}, // AZURE-NPM chain didn't exist + {Cmd: listAllCommandStrings, PipedToCommand: true}, + {Cmd: []string{"grep", "Chain AZURE-NPM"}, ExitCode: 1}, + fakeIPTablesRestoreFailureCommand, // e.g. xtables lock held by another app. Currently the stdout doesn't matter for retrying + fakeIPTablesRestoreCommand, + {Cmd: listLineNumbersCommandStrings, PipedToCommand: true}, + {Cmd: []string{"grep", "AZURE-NPM"}, ExitCode: 1}, + {Cmd: []string{"iptables", "-w", "60", "-I", "FORWARD", "-j", "AZURE-NPM", "-m", "conntrack", "--ctstate", "NEW"}}, + }, + wantErr: false, + }, { name: "success: v2 existed prior", calls: []testutils.TestCmd{ @@ -429,12 +443,13 @@ func TestBootupLinux(t *testing.T) { wantErr: true, }, { - name: "failure on restore (no NPM prior)", + name: "failure twice on restore (no NPM prior)", calls: []testutils.TestCmd{ {Cmd: []string{"iptables", "-w", "60", "-D", "FORWARD", "-j", "AZURE-NPM"}, ExitCode: 2}, // AZURE-NPM chain didn't exist {Cmd: listAllCommandStrings, PipedToCommand: true}, {Cmd: []string{"grep", "Chain AZURE-NPM"}, ExitCode: 1}, fakeIPTablesRestoreFailureCommand, + fakeIPTablesRestoreFailureCommand, }, wantErr: true, }, diff --git a/npm/pkg/dataplane/policies/policy.go b/npm/pkg/dataplane/policies/policy.go index 4650125e6e..9938f13f59 100644 --- a/npm/pkg/dataplane/policies/policy.go +++ b/npm/pkg/dataplane/policies/policy.go @@ -18,6 +18,7 @@ type NPMNetworkPolicy struct { PolicyKey string // PodSelectorIPSets holds all the IPSets generated from Pod Selector PodSelectorIPSets []*ipsets.TranslatedIPSet + // TODO change to slice of pointers // PodSelectorList holds target pod information to avoid duplicatoin in SrcList and DstList fields in ACLs PodSelectorList []SetInfo // RuleIPSets holds all IPSets generated from policy's rules diff --git a/npm/pkg/dataplane/policies/policy_linux.go b/npm/pkg/dataplane/policies/policy_linux.go index 094c465561..8f40f77fc2 100644 --- a/npm/pkg/dataplane/policies/policy_linux.go +++ b/npm/pkg/dataplane/policies/policy_linux.go @@ -15,6 +15,9 @@ type UniqueDirection bool const ( forIngress UniqueDirection = true forEgress UniqueDirection = false + + // 5-6 elements depending on Included boolean + maxLengthForMatchSetSpecs = 6 ) // returns two booleans indicating whether the network policy has ingress and egress respectively @@ -83,6 +86,17 @@ func (info SetInfo) comment() string { return "!" + name } +func (info SetInfo) matchSetSpecs(matchString string) []string { + specs := make([]string, 0, maxLengthForMatchSetSpecs) + specs = append(specs, util.IptablesModuleFlag, util.IptablesSetModuleFlag) + if !info.Included { + specs = append(specs, util.IptablesNotFlag) + } + hashedSetName := info.IPSet.GetHashedName() + specs = append(specs, util.IptablesMatchSetFlag, hashedSetName, matchString) + return specs +} + func (aclPolicy *ACLPolicy) comment() string { // cleanPeerList contains peers that aren't namedports var cleanPeerList []SetInfo diff --git a/npm/pkg/dataplane/policies/policymanager_linux.go b/npm/pkg/dataplane/policies/policymanager_linux.go index 2ebe978f55..0940178b64 100644 --- a/npm/pkg/dataplane/policies/policymanager_linux.go +++ b/npm/pkg/dataplane/policies/policymanager_linux.go @@ -12,14 +12,34 @@ import ( ) const ( - maxTryCount = 1 - unknownLineErrorPattern = "line (\\d+) failed" // TODO this could happen if syntax is off or AZURE-NPM-INGRESS doesn't exist for -A AZURE-NPM-INGRESS -j hash(NP1) ... - knownLineErrorPattern = "Error occurred at line: (\\d+)" - - chainSectionPrefix = "chain" // TODO are sections necessary for error handling? - maxLengthForMatchSetSpecs = 6 // 5-6 elements depending on Included boolean + maxTryCount = 2 + // the error message doesn't describe the error if this pattern is within the message + // this could happen if syntax is off or AZURE-NPM-INGRESS doesn't exist for -A AZURE-NPM-INGRESS -j hash(NP1) ... + unknownLineErrorPattern = "line (\\d+) failed" + // the error message describes the error if this pattern is within the message + knownLineErrorPattern = "Error occurred at line: (\\d+)" + + chainSectionPrefix = "chain" ) +/* +Error handling for iptables-restore: +Currently we retry on any error and will make two tries max. +The section IDs and line error patterns are pointless currently. Although we can eventually use them to skip a section with an error and salvage the rest of the file. + +Known errors that we should retry on: +- exit status 4 + - iptables: Resource temporarily unavailable. + - fork/exec /usr/sbin/iptables: resource temporarily unavailable + - Another app is currently holding the xtables lock; still 51s 0us time ahead to have a chance to grab the lock... + Another app is currently holding the xtables lock; still 41s 0us time ahead to have a chance to grab the lock... + Another app is currently holding the xtables lock; still 31s 0us time ahead to have a chance to grab the lock... + Another app is currently holding the xtables lock; still 21s 0us time ahead to have a chance to grab the lock... + Another app is currently holding the xtables lock; still 11s 0us time ahead to have a chance to grab the lock... + Another app is currently holding the xtables lock; still 1s 0us time ahead to have a chance to grab the lock... + Another app is currently holding the xtables lock. Stopped waiting after 60s. +*/ + func (pMgr *PolicyManager) addPolicy(networkPolicy *NPMNetworkPolicy, _ map[string]string) error { // 1. Add rules for the network policies and activate NPM (if necessary). chainsToCreate := chainNames([]*NPMNetworkPolicy{networkPolicy}) @@ -117,7 +137,7 @@ func (pMgr *PolicyManager) newCreatorWithChains(chainNames []string) *ioutil.Fil sectionID := joinWithDash(chainSectionPrefix, chainName) counters := "-" creator.AddLine(sectionID, nil, ":"+chainName, "-", counters) - // TODO remove sections?? + // TODO remove sections if we never use section-based error handling (e.g. remove the whole section) } return creator } @@ -154,10 +174,9 @@ func (pMgr *PolicyManager) deleteJumpRule(policy *NPMNetworkPolicy, direction Un specs = append([]string{baseChainName}, specs...) errCode, err := pMgr.runIPTablesCommand(util.IptablesDeletionFlag, specs...) - if err != nil && errCode != couldntLoadTargetErrorCode { - // TODO check rule doesn't exist error code instead because the chain should exist + if err != nil && errCode != doesNotExistErrorCode { errorString := fmt.Sprintf("failed to delete jump from %s chain to %s chain for policy %s with exit code %d", baseChainName, chainName, policy.PolicyKey, errCode) - log.Errorf(errorString+": %w", err) + log.Errorf("%s: %w", errorString, err) return npmerrors.SimpleErrorWrapper(errorString, err) } return nil @@ -264,13 +283,7 @@ func matchSetSpecsForNetworkPolicy(networkPolicy *NPMNetworkPolicy, matchType Ma specs := make([]string, 0, maxLengthForMatchSetSpecs*len(networkPolicy.PodSelectorList)) matchString := matchType.toIPTablesString() for _, setInfo := range networkPolicy.PodSelectorList { - // TODO consolidate this code with that in matchSetSpecsFromSetInfo - specs = append(specs, util.IptablesModuleFlag, util.IptablesSetModuleFlag) - if !setInfo.Included { - specs = append(specs, util.IptablesNotFlag) - } - hashedSetName := setInfo.IPSet.GetHashedName() - specs = append(specs, util.IptablesMatchSetFlag, hashedSetName, matchString) + specs = append(specs, setInfo.matchSetSpecs(matchString)...) } return specs } @@ -279,12 +292,7 @@ func matchSetSpecsFromSetInfo(setInfoList []SetInfo) []string { specs := make([]string, 0, maxLengthForMatchSetSpecs*len(setInfoList)) for _, setInfo := range setInfoList { matchString := setInfo.MatchType.toIPTablesString() - specs = append(specs, util.IptablesModuleFlag, util.IptablesSetModuleFlag) - if !setInfo.Included { - specs = append(specs, util.IptablesNotFlag) - } - hashedSetName := setInfo.IPSet.GetHashedName() - specs = append(specs, util.IptablesMatchSetFlag, hashedSetName, matchString) + specs = append(specs, setInfo.matchSetSpecs(matchString)...) } return specs } diff --git a/npm/pkg/dataplane/policies/policymanager_linux_test.go b/npm/pkg/dataplane/policies/policymanager_linux_test.go index 18db6159c4..12aa2eb791 100644 --- a/npm/pkg/dataplane/policies/policymanager_linux_test.go +++ b/npm/pkg/dataplane/policies/policymanager_linux_test.go @@ -317,7 +317,28 @@ func TestCreatorForRemovePolicies(t *testing.T) { dptestutils.AssertEqualLines(t, expectedLines, actualLines) } -// similar to TestRemovePolicy in policymanager.go except an error occurs +// similar to TestRemovePolicy in policymanager_test.go except an acceptable error occurs +func TestRemovePoliciesAcceptableError(t *testing.T) { + metrics.ReinitializeAll() + calls := []testutils.TestCmd{ + fakeIPTablesRestoreCommand, + // ignore exit code 1 + getFakeDeleteJumpCommandWithCode("AZURE-NPM-INGRESS", ingressEgressNetPolIngressJump, 1), + // ignore exit code 1 + getFakeDeleteJumpCommandWithCode("AZURE-NPM-EGRESS", ingressEgressNetPolEgressJump, 1), + fakeIPTablesRestoreCommand, + } + ioshim := common.NewMockIOShim(calls) + defer ioshim.VerifyCalls(t, calls) + pMgr := NewPolicyManager(ioshim, ipsetConfig) + require.NoError(t, pMgr.AddPolicy(bothDirectionsNetPol, epList)) + require.NoError(t, pMgr.RemovePolicy(bothDirectionsNetPol.PolicyKey, nil)) + _, ok := pMgr.GetPolicy(bothDirectionsNetPol.PolicyKey) + require.False(t, ok) + promVals{0, 1}.testPrometheusMetrics(t) +} + +// similar to TestRemovePolicy in policymanager_test.go except an error occurs func TestRemovePoliciesError(t *testing.T) { tests := []struct { name string @@ -330,13 +351,14 @@ func TestRemovePoliciesError(t *testing.T) { getFakeDeleteJumpCommand("AZURE-NPM-INGRESS", ingressEgressNetPolIngressJump), getFakeDeleteJumpCommand("AZURE-NPM-EGRESS", ingressEgressNetPolEgressJump), fakeIPTablesRestoreFailureCommand, + fakeIPTablesRestoreFailureCommand, }, }, { name: "error on delete for ingress", calls: []testutils.TestCmd{ fakeIPTablesRestoreCommand, - getFakeDeleteJumpCommandWithCode("AZURE-NPM-INGRESS", ingressEgressNetPolIngressJump, 1), // anything but 0 or 2 + getFakeDeleteJumpCommandWithCode("AZURE-NPM-INGRESS", ingressEgressNetPolIngressJump, 2), // anything but 0 or 1 }, }, { @@ -344,7 +366,7 @@ func TestRemovePoliciesError(t *testing.T) { calls: []testutils.TestCmd{ fakeIPTablesRestoreCommand, getFakeDeleteJumpCommand("AZURE-NPM-INGRESS", ingressEgressNetPolIngressJump), - getFakeDeleteJumpCommandWithCode("AZURE-NPM-EGRESS", ingressEgressNetPolEgressJump, 1), // anything but 0 or 2 + getFakeDeleteJumpCommandWithCode("AZURE-NPM-EGRESS", ingressEgressNetPolEgressJump, 2), // anything but 0 or 1 }, }, } diff --git a/npm/pkg/dataplane/policies/testutils_linux.go b/npm/pkg/dataplane/policies/testutils_linux.go index 74dfc9f474..59b167e0fd 100644 --- a/npm/pkg/dataplane/policies/testutils_linux.go +++ b/npm/pkg/dataplane/policies/testutils_linux.go @@ -20,7 +20,7 @@ func GetAddPolicyTestCalls(_ *NPMNetworkPolicy) []testutils.TestCmd { } func GetAddPolicyFailureTestCalls(_ *NPMNetworkPolicy) []testutils.TestCmd { - return []testutils.TestCmd{fakeIPTablesRestoreFailureCommand} + return []testutils.TestCmd{fakeIPTablesRestoreFailureCommand, fakeIPTablesRestoreFailureCommand} } func GetRemovePolicyTestCalls(policy *NPMNetworkPolicy) []testutils.TestCmd { @@ -44,8 +44,10 @@ func GetRemovePolicyTestCalls(policy *NPMNetworkPolicy) []testutils.TestCmd { // GetRemovePolicyFailureTestCalls fails on the restore func GetRemovePolicyFailureTestCalls(policy *NPMNetworkPolicy) []testutils.TestCmd { calls := GetRemovePolicyTestCalls(policy) - calls[len(calls)-1] = fakeIPTablesRestoreFailureCommand // replace the restore success with a failure - return calls + // replace the restore success with a failure + calls[len(calls)-1] = fakeIPTablesRestoreFailureCommand + // add another failure + return append(calls, fakeIPTablesRestoreFailureCommand) } func GetBootupTestCalls() []testutils.TestCmd {