diff --git a/npm/ipsm/ipsm.go b/npm/ipsm/ipsm.go index 112054037d..cf3446b62c 100644 --- a/npm/ipsm/ipsm.go +++ b/npm/ipsm/ipsm.go @@ -211,14 +211,11 @@ func (ipsMgr *IpsetManager) run(entry *ipsEntry) (int, error) { } func (ipsMgr *IpsetManager) createList(listName string) error { - prometheusTimer := metrics.StartNewTimer() if _, exists := ipsMgr.listMap[listName]; exists { return nil } - defer metrics.RecordIPSetExecTime(prometheusTimer) // record execution time regardless of failure - entry := &ipsEntry{ name: listName, operationFlag: util.IpsetCreationFlag, @@ -226,7 +223,9 @@ func (ipsMgr *IpsetManager) createList(listName string) error { spec: []string{util.IpsetSetListFlag}, } log.Logf("Creating List: %+v", entry) + timer := metrics.StartNewTimer() errCode, err := ipsMgr.run(entry) + metrics.RecordIPSetExecTime(timer) // record execution time regardless of failure if err != nil && errCode != 1 { metrics.SendErrorLogAndMetric(util.IpsmID, "Error: failed to create ipset list %s.", listName) return err @@ -619,9 +618,10 @@ func (ipsMgr *IpsetManager) DestroyNpmIpsets() error { _, err := ipsMgr.run(flushEntry) if err != nil { metrics.SendErrorLogAndMetric(util.IpsmID, "{DestroyNpmIpsets} Error: failed to flush ipset %s", ipsetName) + } else { + metrics.RemoveAllEntriesFromIPSet(ipsetName) } } - for _, ipsetName := range ipsetLists { deleteEntry := &ipsEntry{ operationFlag: util.IpsetDestroyFlag, @@ -631,8 +631,6 @@ func (ipsMgr *IpsetManager) DestroyNpmIpsets() error { if err != nil { destroyFailureCount++ metrics.SendErrorLogAndMetric(util.IpsmID, "{DestroyNpmIpsets} Error: failed to destroy ipset %s", ipsetName) - } else { - metrics.RemoveAllEntriesFromIPSet(ipsetName) } } @@ -644,9 +642,10 @@ func (ipsMgr *IpsetManager) DestroyNpmIpsets() error { } else { metrics.ResetNumIPSets() } - // NOTE: in v2, we reset ipset entries, but in v1 we only remove entries for ipsets we delete. - // So v2 may underestimate the number of entries if there are destroy failures, - // but v1 may miss removing entries if some sets are in the prometheus metric but not in the kernel. + // NOTE: in v2, we reset metrics blindly, regardless of errors + // So v2 would underestimate the number of ipsets/entries if there are destroy failures. + // In v1 we remove entries for ipsets we flush. + // We may miss removing entries if some sets are in the prometheus metric but not in the kernel. return nil } diff --git a/npm/iptm/iptm.go b/npm/iptm/iptm.go index 91e8e933d6..a487146a84 100644 --- a/npm/iptm/iptm.go +++ b/npm/iptm/iptm.go @@ -174,9 +174,6 @@ func (iptMgr *IptablesManager) UninitNpmChains() error { // Add adds a rule in iptables. func (iptMgr *IptablesManager) Add(entry *IptEntry) error { - prometheusTimer := metrics.StartNewTimer() - defer metrics.RecordACLRuleExecTime(prometheusTimer) // record execution time regardless of failure - log.Logf("Adding iptables entry: %+v.", entry) // Since there is a RETURN statement added to each DROP chain, we need to make sure @@ -186,7 +183,10 @@ func (iptMgr *IptablesManager) Add(entry *IptEntry) error { } else { iptMgr.OperationFlag = util.IptablesInsertionFlag } - if _, err := iptMgr.run(entry); err != nil { + timer := metrics.StartNewTimer() + _, err := iptMgr.run(entry) + metrics.RecordACLRuleExecTime(timer) // record execution time regardless of failure + if err != nil { metrics.SendErrorLogAndMetric(util.IptmID, "Error: failed to create iptables rules.") return err } diff --git a/npm/metrics/acl_rules.go b/npm/metrics/acl_rules.go index 318f96003e..4f9282e911 100644 --- a/npm/metrics/acl_rules.go +++ b/npm/metrics/acl_rules.go @@ -5,11 +5,21 @@ func IncNumACLRules() { numACLRules.Inc() } +// IncNumACLRulesBy increments the number of ACL rules by the amount. +func IncNumACLRulesBy(amount int) { + numACLRules.Add(float64(amount)) +} + // DecNumACLRules decrements the number of ACL rules. func DecNumACLRules() { numACLRules.Dec() } +// DecNumACLRulesBy decrements the number of ACL rules by the amount. +func DecNumACLRulesBy(amount int) { + numACLRules.Add(float64(-amount)) +} + // ResetNumACLRules sets the number of ACL rules to 0. func ResetNumACLRules() { numACLRules.Set(0) diff --git a/npm/metrics/acl_rules_test.go b/npm/metrics/acl_rules_test.go index 2995e6abca..f9e6edcdfb 100644 --- a/npm/metrics/acl_rules_test.go +++ b/npm/metrics/acl_rules_test.go @@ -3,8 +3,9 @@ package metrics import "testing" var ( - numRulesMetric = &basicMetric{ResetNumACLRules, IncNumACLRules, DecNumACLRules, GetNumACLRules} - ruleExecMetric = &recordingMetric{RecordACLRuleExecTime, GetACLRuleExecCount} + numRulesMetric = &basicMetric{ResetNumACLRules, IncNumACLRules, DecNumACLRules, GetNumACLRules} + numRulesAmountMetric = &amountMetric{basicMetric: numRulesMetric, incBy: IncNumACLRulesBy, decBy: DecNumACLRulesBy} + ruleExecMetric = &recordingMetric{RecordACLRuleExecTime, GetACLRuleExecCount} ) func TestRecordACLRuleExecTime(t *testing.T) { @@ -22,3 +23,11 @@ func TestDecNumACLRules(t *testing.T) { func TestResetNumACLRules(t *testing.T) { testResetMetric(t, numRulesMetric) } + +func TestIncNumACLRulesBy(t *testing.T) { + numRulesAmountMetric.testIncByMetric(t) +} + +func TestDecNumACLRulesBy(t *testing.T) { + numRulesAmountMetric.testDecByMetric(t) +} diff --git a/npm/metrics/prometheus_test.go b/npm/metrics/prometheus_test.go index d6dcdb0994..008e585580 100644 --- a/npm/metrics/prometheus_test.go +++ b/npm/metrics/prometheus_test.go @@ -23,6 +23,25 @@ type basicMetric struct { get func() (int, error) } +type amountMetric struct { + *basicMetric + incBy func(int) + decBy func(int) +} + +func (metric *amountMetric) testIncByMetric(t *testing.T) { + metric.reset() + metric.incBy(2) + assertMetricVal(t, metric.basicMetric, 2) +} + +func (metric *amountMetric) testDecByMetric(t *testing.T) { + metric.reset() + metric.incBy(5) + metric.decBy(2) + assertMetricVal(t, metric.basicMetric, 3) +} + type recordingMetric struct { record func(timer *Timer) getCount func() (int, error) diff --git a/npm/pkg/dataplane/ipsets/ipsetmanager.go b/npm/pkg/dataplane/ipsets/ipsetmanager.go index aef755da8d..847999c7a5 100644 --- a/npm/pkg/dataplane/ipsets/ipsetmanager.go +++ b/npm/pkg/dataplane/ipsets/ipsetmanager.go @@ -51,13 +51,15 @@ func NewIPSetManager(iMgrCfg *IPSetManagerCfg, ioShim *common.IOShim) *IPSetMana func (iMgr *IPSetManager) ResetIPSets() error { iMgr.Lock() defer iMgr.Unlock() + metrics.ResetNumIPSets() + metrics.ResetIPSetEntries() err := iMgr.resetIPSets() + iMgr.setMap = make(map[string]*IPSet) + iMgr.clearDirtyCache() if err != nil { metrics.SendErrorLogAndMetric(util.IpsmID, "error: failed to reset ipsetmanager: %s", err.Error()) return fmt.Errorf("error while resetting ipsetmanager: %w", err) } - // TODO update prometheus metrics here instead of in OS-specific functions (done in Linux right now) - // metrics.ResetNumIPSets() and metrics.ResetIPSetEntries() return nil } @@ -384,8 +386,6 @@ func (iMgr *IPSetManager) RemoveFromList(listMetadata *IPSetMetadata, setMetadat } func (iMgr *IPSetManager) ApplyIPSets() error { - prometheusTimer := metrics.StartNewTimer() - iMgr.Lock() defer iMgr.Unlock() @@ -393,13 +393,14 @@ func (iMgr *IPSetManager) ApplyIPSets() error { klog.Info("[IPSetManager] No IPSets to apply") return nil } - defer metrics.RecordIPSetExecTime(prometheusTimer) // record execution time regardless of failure klog.Infof("[IPSetManager] toAddUpdateCache %+v \n ", iMgr.toAddOrUpdateCache) klog.Infof("[IPSetManager] toDeleteCache %+v \n ", iMgr.toDeleteCache) iMgr.sanitizeDirtyCache() // Call the appropriate apply ipsets + prometheusTimer := metrics.StartNewTimer() + defer metrics.RecordIPSetExecTime(prometheusTimer) // record execution time regardless of failure err := iMgr.applyIPSets() if err != nil { metrics.SendErrorLogAndMetric(util.IpsmID, "error: failed to apply ipsets: %s", err.Error()) diff --git a/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go b/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go index c0f20de6e0..7e5aec6f41 100644 --- a/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go +++ b/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go @@ -4,7 +4,6 @@ import ( "fmt" "strings" - "github.com/Azure/azure-container-networking/npm/metrics" "github.com/Azure/azure-container-networking/npm/pkg/dataplane/parse" "github.com/Azure/azure-container-networking/npm/util" npmerrors "github.com/Azure/azure-container-networking/npm/util/errors" @@ -90,26 +89,20 @@ func (iMgr *IPSetManager) resetIPSets() error { grepCommand := iMgr.ioShim.Exec.Command(ioutil.Grep, azureNPMPrefix) azureIPSets, haveAzureIPSets, commandError := ioutil.PipeCommandToGrep(listCommand, grepCommand) if commandError != nil { - return npmerrors.SimpleErrorWrapper("failed to run ipset list for resetting IPSets", commandError) + return npmerrors.SimpleErrorWrapper("failed to run ipset list for resetting IPSets (prometheus metrics may be off now)", commandError) } if !haveAzureIPSets { - metrics.ResetNumIPSets() - metrics.ResetIPSetEntries() return nil } creator, originalNumAzureSets, destroyFailureCount := iMgr.fileCreatorForReset(azureIPSets) restoreError := creator.RunCommandWithFile(ipsetCommand, ipsetRestoreFlag) if restoreError != nil { - metrics.SetNumIPSets(originalNumAzureSets) - // NOTE: the num entries for sets may be incorrect if the restore fails + klog.Errorf( + "failed to restore ipsets (prometheus metrics may be off now). Had originalNumAzureSets %d and destroyFailureCount %d with err: %v", + originalNumAzureSets, destroyFailureCount, restoreError, + ) return npmerrors.SimpleErrorWrapper("failed to run ipset restore for resetting IPSets", restoreError) } - if metrics.NumIPSetsIsPositive() { - metrics.SetNumIPSets(*destroyFailureCount) - } else { - metrics.ResetNumIPSets() - } - metrics.ResetIPSetEntries() // NOTE: the num entries for sets that fail to flush may be incorrect after this return nil } diff --git a/npm/pkg/dataplane/ipsets/ipsetmanager_linux_test.go b/npm/pkg/dataplane/ipsets/ipsetmanager_linux_test.go index e13f43a712..c6e893bd81 100644 --- a/npm/pkg/dataplane/ipsets/ipsetmanager_linux_test.go +++ b/npm/pkg/dataplane/ipsets/ipsetmanager_linux_test.go @@ -9,7 +9,6 @@ import ( "github.com/Azure/azure-container-networking/common" "github.com/Azure/azure-container-networking/npm/metrics" - "github.com/Azure/azure-container-networking/npm/metrics/promutil" dptestutils "github.com/Azure/azure-container-networking/npm/pkg/dataplane/testutils" testutils "github.com/Azure/azure-container-networking/test/utils" "github.com/stretchr/testify/require" @@ -190,15 +189,10 @@ func TestDestroyNPMIPSetsCreatorErrorHandling(t *testing.T) { } func TestDestroyNPMIPSets(t *testing.T) { - numSetsToStart := 2 - numEntriesToStart := 5 - tests := []struct { - name string - calls []testutils.TestCmd - wantErr bool - expectedNumSets int - expectedNumEntries int + name string + calls []testutils.TestCmd + wantErr bool }{ { name: "success with no results from grep", @@ -206,9 +200,7 @@ func TestDestroyNPMIPSets(t *testing.T) { {Cmd: []string{"ipset", "list", "--name"}, PipedToCommand: true}, {Cmd: []string{"grep", "azure-npm-"}, ExitCode: 1}, }, - wantErr: false, - expectedNumSets: 0, - expectedNumEntries: 0, + wantErr: false, }, { name: "successfully delete sets", @@ -217,9 +209,7 @@ func TestDestroyNPMIPSets(t *testing.T) { {Cmd: []string{"grep", "azure-npm-"}, Stdout: resetIPSetsListOutputString}, fakeRestoreSuccessCommand, }, - wantErr: false, - expectedNumSets: 0, - expectedNumEntries: 0, + wantErr: false, }, { name: "grep error", @@ -227,9 +217,7 @@ func TestDestroyNPMIPSets(t *testing.T) { {Cmd: []string{"ipset", "list", "--name"}, HasStartError: true, PipedToCommand: true, ExitCode: 1}, {Cmd: []string{"grep", "azure-npm-"}}, }, - wantErr: true, - expectedNumSets: numSetsToStart, - expectedNumEntries: numEntriesToStart, + wantErr: true, }, { name: "restore error from max tries", @@ -240,9 +228,7 @@ func TestDestroyNPMIPSets(t *testing.T) { {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, }, - wantErr: true, - expectedNumSets: resetIPSetsNumGreppedSets, - expectedNumEntries: numEntriesToStart, + wantErr: true, }, { name: "successfully restore, but fail to flush/destroy 1 set since the set doesn't exist when flushing", @@ -256,9 +242,7 @@ func TestDestroyNPMIPSets(t *testing.T) { }, fakeRestoreSuccessCommand, }, - wantErr: false, - expectedNumSets: 0, - expectedNumEntries: 0, + wantErr: false, }, { name: "successfully restore, but fail to flush/destroy 1 set due to other flush error", @@ -272,9 +256,7 @@ func TestDestroyNPMIPSets(t *testing.T) { }, fakeRestoreSuccessCommand, }, - wantErr: false, - expectedNumSets: 1, - expectedNumEntries: 0, + wantErr: false, }, { name: "successfully restore, but fail to destroy 1 set since the set doesn't exist when destroying", @@ -288,9 +270,7 @@ func TestDestroyNPMIPSets(t *testing.T) { }, fakeRestoreSuccessCommand, }, - wantErr: false, - expectedNumSets: 0, - expectedNumEntries: 0, + wantErr: false, }, { name: "successfully restore, but fail to destroy 1 set due to other destroy error", @@ -304,46 +284,56 @@ func TestDestroyNPMIPSets(t *testing.T) { }, fakeRestoreSuccessCommand, }, - wantErr: false, - expectedNumSets: 1, - expectedNumEntries: 0, + wantErr: false, }, } - testSet := "set1" for _, tt := range tests { tt := tt t.Run(tt.name, func(t *testing.T) { ioshim := common.NewMockIOShim(tt.calls) defer ioshim.VerifyCalls(t, tt.calls) iMgr := NewIPSetManager(applyAlwaysCfg, ioshim) - metrics.SetNumIPSets(numSetsToStart) - metrics.ResetIPSetEntries() - for i := 0; i < numEntriesToStart; i++ { - metrics.AddEntryToIPSet(testSet) - } - err := iMgr.resetIPSets() if tt.wantErr { require.Error(t, err) } else { require.NoError(t, err) } - numSets, err := metrics.GetNumIPSets() - promutil.NotifyIfErrors(t, err) - require.Equal(t, tt.expectedNumSets, numSets, "got unexpected prometheus metric for num ipsets") - - numEntries, err := metrics.GetNumIPSetEntries() - promutil.NotifyIfErrors(t, err) - require.Equal(t, tt.expectedNumEntries, numEntries, "got unexpected prometheus metric for num ipset entries") - - numEntriesForSet, err := metrics.GetNumEntriesForIPSet(testSet) - promutil.NotifyIfErrors(t, err) - require.Equal(t, tt.expectedNumEntries, numEntriesForSet, "got unexpected prometheus metric for num entries for the test set") }) } } +// identical to TestResetIPSets in ipsetmanager_test.go except an error occurs +// makes sure that the cache and metrics are reset despite error +func TestResetIPSetsOnFailure(t *testing.T) { + metrics.ReinitializeAll() + calls := []testutils.TestCmd{ + {Cmd: []string{"ipset", "list", "--name"}, PipedToCommand: true, HasStartError: true}, + {Cmd: []string{"grep", "azure-npm-"}}, + } + ioShim := common.NewMockIOShim(calls) + defer ioShim.VerifyCalls(t, calls) + iMgr := NewIPSetManager(applyAlwaysCfg, ioShim) + + iMgr.CreateIPSets([]*IPSetMetadata{namespaceSet, keyLabelOfPodSet}) + + metrics.IncNumIPSets() + metrics.IncNumIPSets() + metrics.AddEntryToIPSet("test1") + metrics.AddEntryToIPSet("test1") + metrics.AddEntryToIPSet("test2") + + require.NoError(t, iMgr.ResetIPSets()) + + assertExpectedInfo(t, iMgr, &expectedInfo{ + mainCache: nil, + toAddUpdateCache: nil, + toDeleteCache: nil, + setsForKernel: nil, + }) +} + func TestApplyIPSetsSuccessWithoutSave(t *testing.T) { // no sets to add/update, so don't call ipset save calls := []testutils.TestCmd{{Cmd: ipsetRestoreStringSlice}} diff --git a/npm/pkg/dataplane/ipsets/ipsetmanager_test.go b/npm/pkg/dataplane/ipsets/ipsetmanager_test.go index 7e8a8afdc9..29d7470f1b 100644 --- a/npm/pkg/dataplane/ipsets/ipsetmanager_test.go +++ b/npm/pkg/dataplane/ipsets/ipsetmanager_test.go @@ -68,6 +68,32 @@ var ( list = NewIPSetMetadata("test-list1", KeyLabelOfNamespace) ) +// see ipsetmanager_linux_test.go for testing when an error occurs +func TestResetIPSets(t *testing.T) { + metrics.ReinitializeAll() + calls := GetResetTestCalls() + ioShim := common.NewMockIOShim(calls) + defer ioShim.VerifyCalls(t, calls) + iMgr := NewIPSetManager(applyAlwaysCfg, ioShim) + + iMgr.CreateIPSets([]*IPSetMetadata{namespaceSet, keyLabelOfPodSet}) + + metrics.IncNumIPSets() + metrics.IncNumIPSets() + metrics.AddEntryToIPSet("test1") + metrics.AddEntryToIPSet("test1") + metrics.AddEntryToIPSet("test2") + + require.NoError(t, iMgr.ResetIPSets()) + + assertExpectedInfo(t, iMgr, &expectedInfo{ + mainCache: nil, + toAddUpdateCache: nil, + toDeleteCache: nil, + setsForKernel: nil, + }) +} + func TestApplyIPSets(t *testing.T) { type args struct { toAddUpdateSets []*IPSetMetadata diff --git a/npm/pkg/dataplane/policies/chain-management_linux_test.go b/npm/pkg/dataplane/policies/chain-management_linux_test.go index 8ef090db4e..796e5f754c 100644 --- a/npm/pkg/dataplane/policies/chain-management_linux_test.go +++ b/npm/pkg/dataplane/policies/chain-management_linux_test.go @@ -7,6 +7,7 @@ import ( "testing" "github.com/Azure/azure-container-networking/common" + "github.com/Azure/azure-container-networking/npm/metrics" dptestutils "github.com/Azure/azure-container-networking/npm/pkg/dataplane/testutils" testutils "github.com/Azure/azure-container-networking/test/utils" "github.com/stretchr/testify/require" @@ -38,6 +39,27 @@ Chain AZURE-NPM-ACCEPT (1 references) ` ) +// similar to TestBootup in policymanager.go except an error occurs +func TestBootupFailure(t *testing.T) { + metrics.ReinitializeAll() + calls := []testutils.TestCmd{ + {Cmd: []string{"iptables", "-w", "60", "-D", "FORWARD", "-j", "AZURE-NPM"}, ExitCode: 2}, //nolint // AZURE-NPM chain didn't exist + {Cmd: listAllCommandStrings, PipedToCommand: true, HasStartError: true}, + {Cmd: []string{"grep", "Chain AZURE-NPM"}}, + } + ioshim := common.NewMockIOShim(calls) + defer ioshim.VerifyCalls(t, calls) + pMgr := NewPolicyManager(ioshim, ipsetConfig) + + metrics.IncNumACLRules() + metrics.IncNumACLRules() + + require.Error(t, pMgr.Bootup(nil)) + + // make sure that the metrics were reset + promVals{0, 0}.testPrometheusMetrics(t) +} + func TestStaleChainsForceLock(t *testing.T) { testChains := []string{} for i := 0; i < 100000; i++ { diff --git a/npm/pkg/dataplane/policies/policy.go b/npm/pkg/dataplane/policies/policy.go index e306f99314..8a1eff3956 100644 --- a/npm/pkg/dataplane/policies/policy.go +++ b/npm/pkg/dataplane/policies/policy.go @@ -36,6 +36,52 @@ func NewNPMNetworkPolicy(netPolName, netPolNamespace string) *NPMNetworkPolicy { } } +func (netPol *NPMNetworkPolicy) numACLRulesProducedInKernel() int { + numRules := 0 + hasIngress := false + hasEgress := false + for _, aclPolicy := range netPol.ACLs { + if aclPolicy.hasIngress() { + hasIngress = true + numRules++ + } + if aclPolicy.hasEgress() { + hasEgress = true + numRules++ + } + } + + // both Windows and Linux have an extra ACL rule for ingress and an extra rule for egress + if hasIngress { + numRules++ + } + if hasEgress { + numRules++ + } + return numRules +} + +func (netPol *NPMNetworkPolicy) String() string { + if netPol == nil { + klog.Infof("NPMNetworkPolicy is nil when trying to print string") + return "nil NPMNetworkPolicy" + } + itemStrings := make([]string, 0, len(netPol.ACLs)) + for _, item := range netPol.ACLs { + itemStrings = append(itemStrings, item.String()) + } + aclArrayString := strings.Join(itemStrings, "\n--\n") + + podSelectorIPSetString := translatedIPSetsToString(netPol.PodSelectorIPSets) + podSelectorListString := infoArrayToString(netPol.PodSelectorList) + format := `Name:%s Namespace:%s +PodSelectorIPSets: %s +PodSelectorList: %s +ACLs: +%s` + return fmt.Sprintf(format, netPol.Name, netPol.NameSpace, podSelectorIPSetString, podSelectorListString, aclArrayString) +} + // ACLPolicy equivalent to a single iptable rule in linux // or a single HNS rule in windows type ACLPolicy struct { @@ -148,27 +194,6 @@ func (aclPolicy *ACLPolicy) hasNamedPort() bool { return false } -func (netPol *NPMNetworkPolicy) String() string { - if netPol == nil { - klog.Infof("NPMNetworkPolicy is nil when trying to print string") - return "nil NPMNetworkPolicy" - } - itemStrings := make([]string, 0, len(netPol.ACLs)) - for _, item := range netPol.ACLs { - itemStrings = append(itemStrings, item.String()) - } - aclArrayString := strings.Join(itemStrings, "\n--\n") - - podSelectorIPSetString := translatedIPSetsToString(netPol.PodSelectorIPSets) - podSelectorListString := infoArrayToString(netPol.PodSelectorList) - format := `Name:%s Namespace:%s -PodSelectorIPSets: %s -PodSelectorList: %s -ACLs: -%s` - return fmt.Sprintf(format, netPol.Name, netPol.NameSpace, podSelectorIPSetString, podSelectorListString, aclArrayString) -} - func (aclPolicy *ACLPolicy) String() string { format := `Target:%s Direction:%s Protocol:%s Ports:%+v SrcList: %s diff --git a/npm/pkg/dataplane/policies/policy_linux.go b/npm/pkg/dataplane/policies/policy_linux.go index 4e8d271b85..094c465561 100644 --- a/npm/pkg/dataplane/policies/policy_linux.go +++ b/npm/pkg/dataplane/policies/policy_linux.go @@ -10,6 +10,13 @@ import ( "k8s.io/klog" ) +type UniqueDirection bool + +const ( + forIngress UniqueDirection = true + forEgress UniqueDirection = false +) + // returns two booleans indicating whether the network policy has ingress and egress respectively func (networkPolicy *NPMNetworkPolicy) hasIngressAndEgress() (hasIngress, hasEgress bool) { hasIngress = false @@ -34,13 +41,6 @@ func (networkPolicy *NPMNetworkPolicy) chainName(prefix string) string { return joinWithDash(prefix, policyHash) } -type UniqueDirection bool - -const ( - forIngress UniqueDirection = true - forEgress UniqueDirection = false -) - func (networkPolicy *NPMNetworkPolicy) commentForJumpToIngress() string { return networkPolicy.commentForJump(forIngress) } diff --git a/npm/pkg/dataplane/policies/policymanager.go b/npm/pkg/dataplane/policies/policymanager.go index 5fe6311cbe..2cf541dcdd 100644 --- a/npm/pkg/dataplane/policies/policymanager.go +++ b/npm/pkg/dataplane/policies/policymanager.go @@ -23,6 +23,11 @@ const ( IPPolicyMode PolicyManagerMode = "IP" reconcileTimeInMinutes = 5 + + // this number is based on the implementation in chain-management_linux.go + // it represents the number of rules unrelated to policies + // it's technically 3 off when there are no policies since we flush the AZURE-NPM chain then + numLinuxBaseACLRules = 11 ) type PolicyManagerCfg struct { @@ -62,10 +67,17 @@ func NewPolicyManager(ioShim *common.IOShim, cfg *PolicyManagerCfg) *PolicyManag } func (pMgr *PolicyManager) Bootup(epIDs []string) error { + metrics.ResetNumACLRules() if err := pMgr.bootup(epIDs); err != nil { + // NOTE: in Linux, Prometheus metrics may be off at this point since some ACL rules may have been applied successfully metrics.SendErrorLogAndMetric(util.IptmID, "error: failed to bootup policy manager: %s", err.Error()) return npmerrors.ErrorWrapper(npmerrors.BootupPolicyMgr, false, "failed to bootup policy manager", err) } + + if !util.IsWindowsDP() { + // update Prometheus metrics on success + metrics.IncNumACLRulesBy(numLinuxBaseACLRules) + } return nil } @@ -97,12 +109,12 @@ func (pMgr *PolicyManager) GetPolicy(policyKey string) (*NPMNetworkPolicy, bool) } func (pMgr *PolicyManager) AddPolicy(policy *NPMNetworkPolicy, endpointList map[string]string) error { - prometheusTimer := metrics.StartNewTimer() if len(policy.ACLs) == 0 { klog.Infof("[DataPlane] No ACLs in policy %s to apply", policy.PolicyKey) return nil } - defer metrics.RecordACLRuleExecTime(prometheusTimer) // record execution time regardless of failure + + // TODO move this validation and normalization to controller normalizePolicy(policy) if err := validatePolicy(policy); err != nil { msg := fmt.Sprintf("failed to validate policy: %s", err.Error()) @@ -111,13 +123,19 @@ func (pMgr *PolicyManager) AddPolicy(policy *NPMNetworkPolicy, endpointList map[ } // Call actual dataplane function to apply changes + timer := metrics.StartNewTimer() err := pMgr.addPolicy(policy, endpointList) + metrics.RecordACLRuleExecTime(timer) // record execution time regardless of failure if err != nil { + // NOTE: in Linux, Prometheus metrics may be off at this point since some ACL rules may have been applied successfully msg := fmt.Sprintf("failed to add policy: %s", err.Error()) metrics.SendErrorLogAndMetric(util.IptmID, "error: %s", msg) return npmerrors.Errorf(npmerrors.AddPolicy, false, msg) } + // update Prometheus metrics on success + metrics.IncNumACLRulesBy(policy.numACLRulesProducedInKernel()) + pMgr.policyMap.cache[policy.PolicyKey] = policy return nil } @@ -139,12 +157,17 @@ func (pMgr *PolicyManager) RemovePolicy(policyKey string, endpointList map[strin } // Call actual dataplane function to apply changes err := pMgr.removePolicy(policy, endpointList) + // currently we only have acl rule exec time for "adding" rules, so we skip recording here if err != nil { + // NOTE: in Linux, Prometheus metrics may be off at this point since some ACL rules may have been applied successfully msg := fmt.Sprintf("failed to remove policy: %s", err.Error()) metrics.SendErrorLogAndMetric(util.IptmID, "error: %s", msg) return npmerrors.Errorf(npmerrors.RemovePolicy, false, msg) } + // update Prometheus metrics on success + metrics.DecNumACLRulesBy(policy.numACLRulesProducedInKernel()) + delete(pMgr.policyMap.cache, policyKey) return nil } diff --git a/npm/pkg/dataplane/policies/policymanager_linux_test.go b/npm/pkg/dataplane/policies/policymanager_linux_test.go index 4c9ddf43d5..18db6159c4 100644 --- a/npm/pkg/dataplane/policies/policymanager_linux_test.go +++ b/npm/pkg/dataplane/policies/policymanager_linux_test.go @@ -6,6 +6,7 @@ import ( "testing" "github.com/Azure/azure-container-networking/common" + "github.com/Azure/azure-container-networking/npm/metrics" "github.com/Azure/azure-container-networking/npm/pkg/dataplane/ipsets" dptestutils "github.com/Azure/azure-container-networking/npm/pkg/dataplane/testutils" "github.com/Azure/azure-container-networking/npm/util" @@ -201,6 +202,20 @@ func TestChainNames(t *testing.T) { require.Equal(t, expectedName, bothDirectionsNetPol.egressChainName()) } +// similar to TestAddPolicy in policymanager.go except an error occurs +func TestAddPolicyFailure(t *testing.T) { + metrics.ReinitializeAll() + calls := GetAddPolicyFailureTestCalls(testNetPol) + ioshim := common.NewMockIOShim(calls) + defer ioshim.VerifyCalls(t, calls) + pMgr := NewPolicyManager(ioshim, ipsetConfig) + + require.Error(t, pMgr.AddPolicy(testNetPol, nil)) + _, ok := pMgr.GetPolicy(testNetPol.PolicyKey) + require.False(t, ok) + promVals{0, 1}.testPrometheusMetrics(t) +} + func TestCreatorForAddPolicies(t *testing.T) { calls := []testutils.TestCmd{fakeIPTablesRestoreCommand} ioshim := common.NewMockIOShim(calls) @@ -302,6 +317,7 @@ func TestCreatorForRemovePolicies(t *testing.T) { dptestutils.AssertEqualLines(t, expectedLines, actualLines) } +// similar to TestRemovePolicy in policymanager.go except an error occurs func TestRemovePoliciesError(t *testing.T) { tests := []struct { name string @@ -335,6 +351,7 @@ func TestRemovePoliciesError(t *testing.T) { for _, tt := range tests { tt := tt t.Run(tt.name, func(t *testing.T) { + metrics.ReinitializeAll() ioshim := common.NewMockIOShim(tt.calls) defer ioshim.VerifyCalls(t, tt.calls) pMgr := NewPolicyManager(ioshim, ipsetConfig) @@ -342,6 +359,8 @@ func TestRemovePoliciesError(t *testing.T) { require.NoError(t, err) err = pMgr.RemovePolicy(bothDirectionsNetPol.PolicyKey, nil) require.Error(t, err) + + promVals{6, 1}.testPrometheusMetrics(t) }) } } diff --git a/npm/pkg/dataplane/policies/policymanager_test.go b/npm/pkg/dataplane/policies/policymanager_test.go index 8f8ea31978..4d1460be9c 100644 --- a/npm/pkg/dataplane/policies/policymanager_test.go +++ b/npm/pkg/dataplane/policies/policymanager_test.go @@ -6,7 +6,9 @@ import ( "github.com/Azure/azure-container-networking/common" "github.com/Azure/azure-container-networking/npm/metrics" + "github.com/Azure/azure-container-networking/npm/metrics/promutil" "github.com/Azure/azure-container-networking/npm/pkg/dataplane/ipsets" + "github.com/Azure/azure-container-networking/npm/util" "github.com/stretchr/testify/require" ) @@ -17,6 +19,7 @@ var ( // below epList is no-op for linux epList = map[string]string{"10.0.0.1": "test123", "10.0.0.2": "test456"} + epIDs = []string{"test123", "test456"} testNSSet = ipsets.NewIPSetMetadata("test-ns-set", ipsets.Namespace) testKeyPodSet = ipsets.NewIPSetMetadata("test-keyPod-set", ipsets.KeyLabelOfPod) testNetPol = &NPMNetworkPolicy{ @@ -69,17 +72,67 @@ var ( } ) -func TestAddPolicy(t *testing.T) { - netpol := &NPMNetworkPolicy{} +type promVals struct { + numACLs int + execCount int +} - calls := GetAddPolicyTestCalls(netpol) +func (p promVals) testPrometheusMetrics(t *testing.T) { + numACLs, err := metrics.GetNumACLRules() + promutil.NotifyIfErrors(t, err) + require.Equal(t, p.numACLs, numACLs, "Prometheus didn't register correctly for num acls") + + execCount, err := metrics.GetACLRuleExecCount() + promutil.NotifyIfErrors(t, err) + require.Equal(t, p.execCount, execCount, "Prometheus didn't register correctly for acl rule exec count") +} + +// see chain-management_linux_test.go for testing when an error occurs +func TestBootup(t *testing.T) { + metrics.ReinitializeAll() + calls := GetBootupTestCalls() ioshim := common.NewMockIOShim(calls) defer ioshim.VerifyCalls(t, calls) pMgr := NewPolicyManager(ioshim, ipsetConfig) - require.NoError(t, pMgr.AddPolicy(netpol, epList)) + metrics.IncNumACLRules() + metrics.IncNumACLRules() + + require.NoError(t, pMgr.Bootup(epIDs)) + + expectedNumACLs := 11 + if util.IsWindowsDP() { + expectedNumACLs = 0 + } + promVals{expectedNumACLs, 0}.testPrometheusMetrics(t) +} + +// see policymanager_linux.go for testing when an error occurs +func TestAddPolicy(t *testing.T) { + metrics.ReinitializeAll() + calls := GetAddPolicyTestCalls(testNetPol) + ioshim := common.NewMockIOShim(calls) + defer ioshim.VerifyCalls(t, calls) + pMgr := NewPolicyManager(ioshim, ipsetConfig) require.NoError(t, pMgr.AddPolicy(testNetPol, epList)) + _, ok := pMgr.GetPolicy(testNetPol.PolicyKey) + require.True(t, ok) + numTestNetPolACLRulesProducedInKernel := 3 + if util.IsWindowsDP() { + numTestNetPolACLRulesProducedInKernel = 2 + } + promVals{numTestNetPolACLRulesProducedInKernel, 1}.testPrometheusMetrics(t) +} + +func TestAddEmptyPolicy(t *testing.T) { + metrics.ReinitializeAll() + ioshim := common.NewMockIOShim(nil) + pMgr := NewPolicyManager(ioshim, ipsetConfig) + require.NoError(t, pMgr.AddPolicy(&NPMNetworkPolicy{PolicyKey: "test"}, nil)) + _, ok := pMgr.GetPolicy(testNetPol.PolicyKey) + require.False(t, ok) + promVals{0, 0}.testPrometheusMetrics(t) } func TestGetPolicy(t *testing.T) { @@ -111,16 +164,25 @@ func TestGetPolicy(t *testing.T) { } func TestRemovePolicy(t *testing.T) { + metrics.ReinitializeAll() calls := append(GetAddPolicyTestCalls(testNetPol), GetRemovePolicyTestCalls(testNetPol)...) ioshim := common.NewMockIOShim(calls) defer ioshim.VerifyCalls(t, calls) pMgr := NewPolicyManager(ioshim, ipsetConfig) - require.NoError(t, pMgr.AddPolicy(testNetPol, epList)) + require.NoError(t, pMgr.RemovePolicy(testNetPol.PolicyKey, nil)) + _, ok := pMgr.GetPolicy(testNetPol.PolicyKey) + require.False(t, ok) + promVals{0, 1}.testPrometheusMetrics(t) +} +// see policymanager_linux.go for testing when an error occurs +func TestRemoveNonexistentPolicy(t *testing.T) { + metrics.ReinitializeAll() + ioshim := common.NewMockIOShim(nil) + pMgr := NewPolicyManager(ioshim, ipsetConfig) require.NoError(t, pMgr.RemovePolicy("wrong-policy-key", epList)) - - require.NoError(t, pMgr.RemovePolicy("x/test-netpol", nil)) + promVals{0, 0}.testPrometheusMetrics(t) } func TestNormalizeAndValidatePolicy(t *testing.T) {