From 78462b8b3ca3631f0facf25ec14d01d69a78333d Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Wed, 31 Aug 2022 13:15:52 -0700 Subject: [PATCH 01/11] adaptively modify linux max restore try count to prevent perpetual errors --- npm/pkg/dataplane/ipsets/ipsetmanager.go | 31 ++++++--- .../dataplane/ipsets/ipsetmanager_linux.go | 9 ++- .../ipsets/ipsetmanager_linux_test.go | 64 +++++++++++++++++-- 3 files changed, 84 insertions(+), 20 deletions(-) diff --git a/npm/pkg/dataplane/ipsets/ipsetmanager.go b/npm/pkg/dataplane/ipsets/ipsetmanager.go index 7815965394..736246f4f1 100644 --- a/npm/pkg/dataplane/ipsets/ipsetmanager.go +++ b/npm/pkg/dataplane/ipsets/ipsetmanager.go @@ -31,6 +31,9 @@ type IPSetMode string const ( ApplyAllIPSets IPSetMode = "all" ApplyOnNeed IPSetMode = "on-need" + + initialMaxRestoreTryCount int = 2 + largestMaxRestoreTryCount int = 1024 // 2^10 ) var ( @@ -50,7 +53,9 @@ type IPSetManager struct { emptySet *IPSet setMap map[string]*IPSet dirtyCache dirtyCacheInterface - ioShim *common.IOShim + // maxRestoreTryCount is only used in Linux + maxRestoreTryCount int + ioShim *common.IOShim sync.RWMutex } @@ -66,11 +71,12 @@ type IPSetManagerCfg struct { func NewIPSetManager(iMgrCfg *IPSetManagerCfg, ioShim *common.IOShim) *IPSetManager { return &IPSetManager{ - iMgrCfg: iMgrCfg, - emptySet: nil, // will be set if needed in calls to AddToLists - setMap: make(map[string]*IPSet), - dirtyCache: newDirtyCache(), - ioShim: ioShim, + iMgrCfg: iMgrCfg, + emptySet: nil, // will be set if needed in calls to AddToLists + setMap: make(map[string]*IPSet), + dirtyCache: newDirtyCache(), + maxRestoreTryCount: initialMaxRestoreTryCount, + ioShim: ioShim, } } @@ -463,12 +469,21 @@ func (iMgr *IPSetManager) ApplyIPSets() error { defer metrics.RecordIPSetExecTime(prometheusTimer) // record execution time regardless of failure err := iMgr.applyIPSets() if err != nil { - metrics.SendErrorLogAndMetric(util.IpsmID, "error: failed to apply ipsets: %s", err.Error()) + // exponentially increase maxRestoreTryCount + iMgr.maxRestoreTryCount *= 2 + if iMgr.maxRestoreTryCount > largestMaxRestoreTryCount { + iMgr.maxRestoreTryCount = largestMaxRestoreTryCount + } + + metrics.SendErrorLogAndMetric(util.IpsmID, "error: failed to apply ipsets. new restore try count (linux only): %d. err: %s", iMgr.maxRestoreTryCount, err.Error()) return err } - iMgr.clearDirtyCache() // TODO could also set the number of ipsets in NPM (not necessarily in kernel) here using len(iMgr.setMap) + iMgr.clearDirtyCache() + // reset the maxRestoreTryCount + iMgr.maxRestoreTryCount = initialMaxRestoreTryCount + fmt.Println("DEBUGME applied ipsets", iMgr.maxRestoreTryCount) return nil } diff --git a/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go b/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go index 286c1aaca5..77cbe375c7 100644 --- a/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go +++ b/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go @@ -50,7 +50,6 @@ const ( ipsetIPPortHashString = ipsetIPPortHashFlag // creator constants - maxTryCount = 3 destroySectionPrefix = "delete" addOrUpdateSectionPrefix = "add/update" ipsetRestoreLineFailurePattern = "Error in line (\\d+):" @@ -173,7 +172,7 @@ func (iMgr *IPSetManager) resetWithoutRestore() bool { // named returns to appease lint func (iMgr *IPSetManager) fileCreatorForFlushAll(ipsetListOutput []byte) (creator *ioutil.FileCreator, names []string, failedNames map[string]struct{}) { destroyFailureCount := 0 - creator = ioutil.NewFileCreator(iMgr.ioShim, maxTryCount, ipsetRestoreLineFailurePattern) + creator = ioutil.NewFileCreator(iMgr.ioShim, iMgr.maxRestoreTryCount, ipsetRestoreLineFailurePattern) names = make([]string, 0) failedNames = make(map[string]struct{}, 0) readIndex := 0 @@ -238,7 +237,7 @@ func (iMgr *IPSetManager) setsWithReferences() map[string]struct{} { func (iMgr *IPSetManager) fileCreatorForDestroyAll(names []string, failedNames, setsWithReferences map[string]struct{}) (creator *ioutil.FileCreator, failureCount *int) { failureCountVal := 0 failureCount = &failureCountVal - creator = ioutil.NewFileCreator(iMgr.ioShim, maxTryCount, ipsetRestoreLineFailurePattern) + creator = ioutil.NewFileCreator(iMgr.ioShim, iMgr.maxRestoreTryCount, ipsetRestoreLineFailurePattern) // destroy all the sets for _, hashedSetName := range names { @@ -364,7 +363,7 @@ func (iMgr *IPSetManager) applyIPSetsWithSaveFile() error { return npmerrors.SimpleErrorWrapper("ipset save failed when applying ipsets", saveError) } } - creator := iMgr.fileCreatorForApplyWithSaveFile(maxTryCount, saveFile) + creator := iMgr.fileCreatorForApplyWithSaveFile(iMgr.maxRestoreTryCount, saveFile) restoreError := creator.RunCommandWithFile(ipsetCommand, ipsetRestoreFlag) if restoreError != nil { return npmerrors.SimpleErrorWrapper("ipset restore failed when applying ipsets with save file", restoreError) @@ -405,7 +404,7 @@ example where: -X set4 */ func (iMgr *IPSetManager) applyIPSets() error { - creator := iMgr.fileCreatorForApply(maxTryCount) + creator := iMgr.fileCreatorForApply(iMgr.maxRestoreTryCount) restoreError := creator.RunCommandWithFile(ipsetCommand, ipsetRestoreFlag) if restoreError != nil { return npmerrors.SimpleErrorWrapper("ipset restore failed when applying ipsets", restoreError) diff --git a/npm/pkg/dataplane/ipsets/ipsetmanager_linux_test.go b/npm/pkg/dataplane/ipsets/ipsetmanager_linux_test.go index c9db0194b7..62dec490d2 100644 --- a/npm/pkg/dataplane/ipsets/ipsetmanager_linux_test.go +++ b/npm/pkg/dataplane/ipsets/ipsetmanager_linux_test.go @@ -36,6 +36,60 @@ var ( // TODO test that a reconcile list is updated for all the TestFailure UTs // TODO same exact TestFailure UTs for unknown errors +func TestMaxRestoreTryCount(t *testing.T) { + metrics.ReinitializeAll() + + tryCounts := []int{2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 1024} + numSets := 1025 + toAddIPSets := make([]*IPSetMetadata, numSets) + for k := range toAddIPSets { + toAddIPSets[k] = NewIPSetMetadata(fmt.Sprintf("bad-set-%d", k), Namespace) + } + numSetsWhenSuccessful := 1022 + toAddIPSetsForSuccess := make([]*IPSetMetadata, numSetsWhenSuccessful) + for k := range toAddIPSetsForSuccess { + toAddIPSetsForSuccess[k] = NewIPSetMetadata(fmt.Sprintf("good-set-%d", k), Namespace) + } + + errorInLine1 := testutils.TestCmd{ + Cmd: ipsetRestoreStringSlice, + Stdout: "Error in line 1: for some other error", + ExitCode: 1, + } + sumCounts := 0 + for _, tryCount := range tryCounts { + sumCounts += tryCount + } + numCalls := sumCounts + numSetsWhenSuccessful + calls := make([]testutils.TestCmd, numCalls) + for k := range calls { + calls[k] = errorInLine1 + } + calls[numCalls-1].ExitCode = 0 + calls[numCalls-1].Stdout = "" + + ioShim := common.NewMockIOShim(calls) + defer ioShim.VerifyCalls(t, calls) + iMgr := NewIPSetManager(applyAlwaysCfg, ioShim) + iMgr.CreateIPSets(toAddIPSets) + + for _, tryCount := range tryCounts { + require.Equal(t, tryCount, iMgr.maxRestoreTryCount) + require.Error(t, iMgr.ApplyIPSets()) + // cache behavior is currently undefined if there's an apply error + } + + // set the dirty cache to less sets and succeed by skipping all sets + iMgr.clearDirtyCache() + iMgr.CreateIPSets(toAddIPSetsForSuccess) + require.NoError(t, iMgr.ApplyIPSets()) + require.Equal(t, 2, iMgr.maxRestoreTryCount) + + execCount, err := metrics.GetIPSetExecCount() + promutil.NotifyIfErrors(t, err) + require.Equal(t, len(tryCounts)+1, execCount) +} + func TestApplyIPSets(t *testing.T) { type args struct { toAddUpdateSets []*IPSetMetadata @@ -110,7 +164,7 @@ func TestApplyIPSets(t *testing.T) { require.Greater(t, len(calls), 0) calls[len(calls)-1].ExitCode = 1 // then add errors as many times as we retry - for i := 1; i < maxTryCount; i++ { + for i := 1; i < initialMaxRestoreTryCount; i++ { calls = append(calls, testutils.TestCmd{Cmd: ipsetRestoreStringSlice, ExitCode: 1}) } } @@ -488,7 +542,6 @@ func TestDestroyNPMIPSets(t *testing.T) { {Cmd: []string{"grep", "azure-npm-"}, Stdout: resetIPSetsListOutputString}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, - {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, }, wantErr: true, }, @@ -505,7 +558,6 @@ func TestDestroyNPMIPSets(t *testing.T) { {Cmd: []string{"grep", "-o", "-P", "azure-npm-\\d+"}, ExitCode: 1}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, - {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, }, wantErr: true, }, @@ -726,8 +778,7 @@ func TestApplyIPSetsFailureOnSave(t *testing.T) { func TestApplyIPSetsFailureOnRestore(t *testing.T) { calls := []testutils.TestCmd{ - // fail 3 times because this is our max try count - {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, + // fail 2 times because this is our max try count {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, } @@ -743,8 +794,7 @@ func TestApplyIPSetsFailureOnRestore(t *testing.T) { calls = []testutils.TestCmd{ {Cmd: ipsetSaveStringSlice, PipedToCommand: true}, {Cmd: []string{"grep", "azure-npm-"}}, - // fail 3 times because this is our max try count - {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, + // fail 2 times because this is our max try count {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, } From 8b28fa9063673f4bb1f167ee752a2a9f6d87ece6 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Thu, 27 Oct 2022 21:18:37 -0700 Subject: [PATCH 02/11] remove debug print --- npm/pkg/dataplane/ipsets/ipsetmanager.go | 1 - 1 file changed, 1 deletion(-) diff --git a/npm/pkg/dataplane/ipsets/ipsetmanager.go b/npm/pkg/dataplane/ipsets/ipsetmanager.go index 736246f4f1..9db4239a09 100644 --- a/npm/pkg/dataplane/ipsets/ipsetmanager.go +++ b/npm/pkg/dataplane/ipsets/ipsetmanager.go @@ -483,7 +483,6 @@ func (iMgr *IPSetManager) ApplyIPSets() error { iMgr.clearDirtyCache() // reset the maxRestoreTryCount iMgr.maxRestoreTryCount = initialMaxRestoreTryCount - fmt.Println("DEBUGME applied ipsets", iMgr.maxRestoreTryCount) return nil } From e5c8f389744c967f376948aa82e953c73a89f089 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Thu, 27 Oct 2022 21:19:18 -0700 Subject: [PATCH 03/11] log restore file and send ipsetmanager_linux errors --- .../dataplane/ipsets/ipsetmanager_linux.go | 38 +++++++++---------- npm/util/ioutil/restore.go | 11 ++++++ 2 files changed, 30 insertions(+), 19 deletions(-) diff --git a/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go b/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go index 77cbe375c7..16523ba8c0 100644 --- a/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go +++ b/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go @@ -200,7 +200,7 @@ func (iMgr *IPSetManager) fileCreatorForFlushAll(ipsetListOutput []byte) (creato Definition: ioutil.AlwaysMatchDefinition, Method: ioutil.Continue, Callback: func() { - klog.Errorf("[RESET-IPSETS] marking flush and upcoming destroy for set %s as a failure due to unknown error", hashedSetName) + metrics.SendErrorLogAndMetric(util.IpsmID, "[RESET-IPSETS] marking flush and upcoming destroy for set %s as a failure due to unknown error", hashedSetName) destroyFailureCount++ failedNames[hashedSetName] = struct{}{} // TODO mark as a failure @@ -258,7 +258,7 @@ func (iMgr *IPSetManager) fileCreatorForDestroyAll(names []string, failedNames, Definition: setInUseByKernelDefinition, Method: ioutil.Continue, Callback: func() { - klog.Errorf("[RESET-IPSETS] marking destroy for set %s as a failure since the set is in use by a kernel component", hashedSetName) + metrics.SendErrorLogAndMetric(util.IpsmID, "[RESET-IPSETS] marking destroy for set %s as a failure since the set is in use by a kernel component", hashedSetName) failureCountVal++ // TODO mark the set as a failure and reconcile what iptables rule or ipset is referring to it }, @@ -274,7 +274,7 @@ func (iMgr *IPSetManager) fileCreatorForDestroyAll(names []string, failedNames, Definition: ioutil.AlwaysMatchDefinition, Method: ioutil.Continue, Callback: func() { - klog.Errorf("[RESET-IPSETS] marking destroy for set %s as a failure due to unknown error", hashedSetName) + metrics.SendErrorLogAndMetric(util.IpsmID, "[RESET-IPSETS] marking destroy for set %s as a failure due to unknown error", hashedSetName) failureCountVal++ // TODO mark the set as a failure and reconcile what iptables rule or ipset is referring to it }, @@ -538,7 +538,7 @@ func (iMgr *IPSetManager) updateDirtyKernelSets(setsToAddOrUpdate map[string]str if readIndex < len(saveFile) { line, readIndex = parse.Line(readIndex, saveFile) if !hasPrefix(line, createStringWithSpace) { - klog.Errorf("expected a create line in ipset save file, but got the following line: %s", string(line)) + metrics.SendErrorLogAndMetric(util.IpsmID, "expected a create line in ipset save file, but got the following line: %s", string(line)) // TODO send error snapshot line, readIndex = nextCreateLine(readIndex, saveFile) } @@ -594,7 +594,7 @@ func (iMgr *IPSetManager) updateDirtyKernelSets(setsToAddOrUpdate map[string]str break } if !hasPrefix(line, addStringWithSpace) { - klog.Errorf("expected an add line, but got the following line: %s", string(line)) + metrics.SendErrorLogAndMetric(util.IpsmID, "expected an add line, but got the following line: %s", string(line)) // TODO send error snapshot line, readIndex = nextCreateLine(readIndex, saveFile) break @@ -603,7 +603,7 @@ func (iMgr *IPSetManager) updateDirtyKernelSets(setsToAddOrUpdate map[string]str spaceSplitLineAfterAdd := strings.Split(lineAfterAdd, space) parent := spaceSplitLineAfterAdd[0] if len(spaceSplitLineAfterAdd) != 2 || parent != hashedName { - klog.Errorf("expected an add line for set %s in ipset save file, but got the following line: %s", hashedName, string(line)) + metrics.SendErrorLogAndMetric(util.IpsmID, "expected an add line for set %s in ipset save file, but got the following line: %s", hashedName, string(line)) // TODO send error snapshot line, readIndex = nextCreateLine(readIndex, saveFile) break @@ -654,23 +654,23 @@ func haveTypeProblem(set *IPSet, restOfSpaceSplitCreateLine []string) bool { case ipsetSetListString: if set.Kind != ListSet { lineString := fmt.Sprintf("create %s %s", set.HashedName, strings.Join(restOfSpaceSplitCreateLine, " ")) - klog.Errorf("expected to find a ListSet but have the line: %s", lineString) + metrics.SendErrorLogAndMetric(util.IpsmID, "expected to find a ListSet but have the line: %s", lineString) return true } case ipsetNetHashString: if set.Kind != HashSet || set.Type == NamedPorts { lineString := fmt.Sprintf("create %s %s", set.HashedName, strings.Join(restOfSpaceSplitCreateLine, " ")) - klog.Errorf("expected to find a non-NamedPorts HashSet but have the following line: %s", lineString) + metrics.SendErrorLogAndMetric(util.IpsmID, "expected to find a non-NamedPorts HashSet but have the following line: %s", lineString) return true } case ipsetIPPortHashString: if set.Type != NamedPorts { lineString := fmt.Sprintf("create %s %s", set.HashedName, strings.Join(restOfSpaceSplitCreateLine, " ")) - klog.Errorf("expected to find a NamedPorts set but have the following line: %s", lineString) + metrics.SendErrorLogAndMetric(util.IpsmID, "expected to find a NamedPorts set but have the following line: %s", lineString) return true } default: - klog.Errorf("unknown type string [%s] in line: %s", typeString, strings.Join(restOfSpaceSplitCreateLine, " ")) + metrics.SendErrorLogAndMetric(util.IpsmID, "unknown type string [%s] in line: %s", typeString, strings.Join(restOfSpaceSplitCreateLine, " ")) return true } return false @@ -693,7 +693,7 @@ func (iMgr *IPSetManager) flushSetForApply(creator *ioutil.FileCreator, prefixed Definition: ioutil.AlwaysMatchDefinition, Method: ioutil.ContinueAndAbortSection, Callback: func() { - klog.Errorf("skipping flush and upcoming destroy for set %s due to unknown error", prefixedName) + metrics.SendErrorLogAndMetric(util.IpsmID, "skipping flush and upcoming destroy for set %s due to unknown error", prefixedName) // TODO mark as a failure // would this ever happen? }, @@ -710,7 +710,7 @@ func (iMgr *IPSetManager) destroySetForApply(creator *ioutil.FileCreator, prefix Definition: setInUseByKernelDefinition, Method: ioutil.Continue, Callback: func() { - klog.Errorf("skipping destroy line for set %s since the set is in use by a kernel component", prefixedName) + metrics.SendErrorLogAndMetric(util.IpsmID, "skipping destroy line for set %s since the set is in use by a kernel component", prefixedName) // TODO mark the set as a failure and reconcile what iptables rule or ipset is referring to it }, }, @@ -718,7 +718,7 @@ func (iMgr *IPSetManager) destroySetForApply(creator *ioutil.FileCreator, prefix Definition: ioutil.AlwaysMatchDefinition, Method: ioutil.Continue, Callback: func() { - klog.Errorf("skipping destroy line for set %s due to unknown error", prefixedName) + metrics.SendErrorLogAndMetric(util.IpsmID, "skipping destroy line for set %s due to unknown error", prefixedName) }, }, } @@ -746,7 +746,7 @@ func (iMgr *IPSetManager) createSetForApply(creator *ioutil.FileCreator, set *IP Definition: setAlreadyExistsDefinition, Method: ioutil.ContinueAndAbortSection, Callback: func() { - klog.Errorf("skipping create and any following adds/deletes for set %s since the set already exists with different specs", prefixedName) + metrics.SendErrorLogAndMetric(util.IpsmID, "skipping create and any following adds/deletes for set %s since the set already exists with different specs", prefixedName) // TODO mark the set as a failure and handle this }, }, @@ -754,7 +754,7 @@ func (iMgr *IPSetManager) createSetForApply(creator *ioutil.FileCreator, set *IP Definition: ioutil.AlwaysMatchDefinition, Method: ioutil.ContinueAndAbortSection, Callback: func() { - klog.Errorf("skipping create and any following adds/deletes for set %s due to unknown error", prefixedName) + metrics.SendErrorLogAndMetric(util.IpsmID, "skipping create and any following adds/deletes for set %s due to unknown error", prefixedName) // TODO same as above error handler }, }, @@ -769,7 +769,7 @@ func (iMgr *IPSetManager) deleteMemberForApply(creator *ioutil.FileCreator, set Definition: ioutil.AlwaysMatchDefinition, Method: ioutil.Continue, Callback: func() { - klog.Errorf("skipping delete line for set %s due to unknown error", set.Name) + metrics.SendErrorLogAndMetric(util.IpsmID, "skipping delete line for set %s due to unknown error", set.Name) }, }, } @@ -784,7 +784,7 @@ func (iMgr *IPSetManager) addMemberForApply(creator *ioutil.FileCreator, set *IP Definition: memberSetDoesntExistDefinition, Method: ioutil.Continue, Callback: func() { - klog.Errorf("skipping add of %s to list %s since the member doesn't exist", member, set.Name) + metrics.SendErrorLogAndMetric(util.IpsmID, "skipping add of %s to list %s since the member doesn't exist", member, set.Name) // TODO reconcile }, }, @@ -792,7 +792,7 @@ func (iMgr *IPSetManager) addMemberForApply(creator *ioutil.FileCreator, set *IP Definition: ioutil.AlwaysMatchDefinition, Method: ioutil.Continue, Callback: func() { - klog.Errorf("skipping add of %s to list %s due to unknown error", member, set.Name) + metrics.SendErrorLogAndMetric(util.IpsmID, "skipping add of %s to list %s due to unknown error", member, set.Name) }, }, } @@ -802,7 +802,7 @@ func (iMgr *IPSetManager) addMemberForApply(creator *ioutil.FileCreator, set *IP Definition: ioutil.AlwaysMatchDefinition, Method: ioutil.Continue, Callback: func() { - klog.Errorf("skipping add line for hash set %s due to unknown error", set.Name) + metrics.SendErrorLogAndMetric(util.IpsmID, "skipping add line for hash set %s due to unknown error", set.Name) }, }, } diff --git a/npm/util/ioutil/restore.go b/npm/util/ioutil/restore.go index 9bd462b01d..966c41d34f 100644 --- a/npm/util/ioutil/restore.go +++ b/npm/util/ioutil/restore.go @@ -182,6 +182,7 @@ func (creator *FileCreator) runCommandOnceWithFile(fileString, cmd string, args } klog.Infof("running this restore command: [%s]", commandString) + creator.logLines(commandString) command := creator.ioShim.Exec.Command(cmd, args...) command.SetStdin(bytes.NewBufferString(fileString)) @@ -301,3 +302,13 @@ func (creator *FileCreator) handleLineError(stdErr, commandString string, lineNu } return false, creator.lines[lineIndex] } + +func (creator *FileCreator) logLines(commandString string) { + lineNum := 0 + for i, line := range creator.lines { + if _, ok := creator.lineNumbersToOmit[i]; !ok { + klog.Infof("line %d of restore command [%s] with section ID [%s]: [%s]", lineNum, commandString, line.sectionID, line.content) + lineNum++ + } + } +} From cb73cfeb65ccca2c21c4c05bc1dda896a38e4af8 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Thu, 27 Oct 2022 21:33:57 -0700 Subject: [PATCH 04/11] send other appropriate errors --- npm/metrics/ai-utils.go | 1 + npm/metrics/timer.go | 4 ++-- npm/pkg/dataplane/ipsets/ipset.go | 6 +++--- npm/pkg/dataplane/policies/chain-management_linux.go | 2 +- npm/pkg/dataplane/policies/policy_linux.go | 4 ++-- npm/util/ioutil/current-azure-chains.go | 6 +++--- npm/util/ioutil/restore.go | 7 +++---- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/npm/metrics/ai-utils.go b/npm/metrics/ai-utils.go index 769042dde1..20de3009ff 100644 --- a/npm/metrics/ai-utils.go +++ b/npm/metrics/ai-utils.go @@ -65,6 +65,7 @@ func SendErrorLogAndMetric(operationID int, format string, args ...interface{}) Value: util.ErrorValue, CustomDimensions: customDimensions, } + // SendMetric prints out a log visible in kubectl logs SendMetric(metric) // Send error logs diff --git a/npm/metrics/timer.go b/npm/metrics/timer.go index 17c37d8ebd..7060c4687b 100644 --- a/npm/metrics/timer.go +++ b/npm/metrics/timer.go @@ -3,8 +3,8 @@ package metrics import ( "time" + "github.com/Azure/azure-container-networking/npm/util" "github.com/prometheus/client_golang/prometheus" - "k8s.io/klog" ) // Timer is a one-time-use tool for recording time between a start and end point @@ -27,7 +27,7 @@ func (timer *Timer) stopAndRecord(observer prometheus.Summary) { func (timer *Timer) stopAndRecordCRUDExecTime(observer *prometheus.SummaryVec, op OperationKind, hadError bool) { timer.stop() if !op.isValid() { - klog.Errorf("Unknown operation [%v] when recording exec time", op) + SendErrorLogAndMetric(util.UtilID, "Unknown operation [%v] when recording exec time", op) return } if op != NoOp { diff --git a/npm/pkg/dataplane/ipsets/ipset.go b/npm/pkg/dataplane/ipsets/ipset.go index 5275f7eb3f..62e168db24 100644 --- a/npm/pkg/dataplane/ipsets/ipset.go +++ b/npm/pkg/dataplane/ipsets/ipset.go @@ -5,8 +5,8 @@ import ( "fmt" "github.com/Azure/azure-container-networking/log" + "github.com/Azure/azure-container-networking/npm/metrics" "github.com/Azure/azure-container-networking/npm/util" - "k8s.io/klog" ) type IPSetMetadata struct { @@ -64,10 +64,10 @@ func (setMetadata *IPSetMetadata) GetPrefixName() string { case EmptyHashSet: return fmt.Sprintf("%s%s", util.EmptySetPrefix, setMetadata.Name) case UnknownType: // adding this to appease golint - klog.Errorf("experienced unknown type in set metadata: %+v", setMetadata) + metrics.SendErrorLogAndMetric(util.UtilID, "experienced unknown type in set metadata: %+v", setMetadata) return Unknown default: - klog.Errorf("experienced unexpected type %d in set metadata: %+v", setMetadata.Type, setMetadata) + metrics.SendErrorLogAndMetric(util.UtilID, "experienced unexpected type %d in set metadata: %+v", setMetadata.Type, setMetadata) return Unknown } } diff --git a/npm/pkg/dataplane/policies/chain-management_linux.go b/npm/pkg/dataplane/policies/chain-management_linux.go index c2b65d9e1f..f324f8bd1a 100644 --- a/npm/pkg/dataplane/policies/chain-management_linux.go +++ b/npm/pkg/dataplane/policies/chain-management_linux.go @@ -178,7 +178,7 @@ func (pMgr *PolicyManager) bootup(_ []string) error { if deprecatedErrCode == 0 { klog.Infof("deleted deprecated jump rule from FORWARD chain to AZURE-NPM chain") } else if deprecatedErr != nil { - klog.Errorf("failed to delete deprecated jump rule from FORWARD chain to AZURE-NPM chain for unexpected reason with exit code %d and error: %s", deprecatedErrCode, deprecatedErr.Error()) + metrics.SendErrorLogAndMetric(util.IptmID, "failed to delete deprecated jump rule from FORWARD chain to AZURE-NPM chain for unexpected reason with exit code %d and error: %s", deprecatedErrCode, deprecatedErr.Error()) } currentChains, err := ioutil.AllCurrentAzureChains(pMgr.ioShim.Exec, util.IptablesDefaultWaitTime) diff --git a/npm/pkg/dataplane/policies/policy_linux.go b/npm/pkg/dataplane/policies/policy_linux.go index e40cfb5735..038f00a2f9 100644 --- a/npm/pkg/dataplane/policies/policy_linux.go +++ b/npm/pkg/dataplane/policies/policy_linux.go @@ -5,9 +5,9 @@ import ( "strconv" "strings" + "github.com/Azure/azure-container-networking/npm/metrics" "github.com/Azure/azure-container-networking/npm/pkg/dataplane/ipsets" "github.com/Azure/azure-container-networking/npm/util" - "k8s.io/klog" ) type UniqueDirection bool @@ -117,7 +117,7 @@ func (aclPolicy *ACLPolicy) comment() string { for _, info := range aclPolicy.DstList { if info.IPSet.Type == ipsets.NamedPorts { if foundNamedPortPeer { - klog.Errorf("while creating ACL comment, unexpectedly found more than one namedPort peer for ACL:\n%s", aclPolicy.PrettyString()) + metrics.SendErrorLogAndMetric(util.IptmID, "while creating ACL comment, unexpectedly found more than one namedPort peer for ACL:\n%s", aclPolicy.PrettyString()) } namedPortPeer = info foundNamedPortPeer = true diff --git a/npm/util/ioutil/current-azure-chains.go b/npm/util/ioutil/current-azure-chains.go index 70b95ba3ba..3a7629e334 100644 --- a/npm/util/ioutil/current-azure-chains.go +++ b/npm/util/ioutil/current-azure-chains.go @@ -7,9 +7,9 @@ import ( "fmt" "strings" + "github.com/Azure/azure-container-networking/npm/metrics" "github.com/Azure/azure-container-networking/npm/util" npmerrors "github.com/Azure/azure-container-networking/npm/util/errors" - "k8s.io/klog" utilexec "k8s.io/utils/exec" ) @@ -46,14 +46,14 @@ func AllCurrentAzureChains(exec utilexec.Interface, lockWaitTimeSeconds string) // remove the last empty line (since each line ends with a newline) lines = lines[:lastIndex] // this line doesn't impact the array that the slice references } else { - klog.Errorf(`while grepping for current Azure chains, expected last line to end in "" but got [%s]. full grep output: [%s]`, lastLine, string(searchResults)) + metrics.SendErrorLogAndMetric(util.IptmID, `while grepping for current Azure chains, expected last line to end in "" but got [%s]. full grep output: [%s]`, lastLine, string(searchResults)) } chainNames := make(map[string]struct{}, len(lines)) for _, line := range lines { // line of the form "Chain NAME (1 references)" spaceSeparatedLine := strings.Split(line, " ") if len(spaceSeparatedLine) < minSpacedSectionsForChainLine || len(spaceSeparatedLine[1]) < minAzureChainNameLength { - klog.Errorf("while grepping for current Azure chains, got unexpected line [%s] for all current azure chains. full grep output: [%s]", line, string(searchResults)) + metrics.SendErrorLogAndMetric(util.IptmID, "while grepping for current Azure chains, got unexpected line [%s] for all current azure chains. full grep output: [%s]", line, string(searchResults)) } else { chainNames[spaceSeparatedLine[1]] = struct{}{} } diff --git a/npm/util/ioutil/restore.go b/npm/util/ioutil/restore.go index 966c41d34f..a5e5bd8f4f 100644 --- a/npm/util/ioutil/restore.go +++ b/npm/util/ioutil/restore.go @@ -142,7 +142,6 @@ func (creator *FileCreator) RunCommandWithFile(cmd string, args ...string) error sameNew = "updated" } msg := fmt.Sprintf("on try number %d, failed to run command [%s]. Rerunning with %s file. err: [%s]", creator.tryCount, commandString, sameNew, err.Error()) - klog.Error(msg) metrics.SendErrorLogAndMetric(util.UtilID, "error: %s", msg) if wasFileAltered { @@ -243,17 +242,17 @@ func (creator *FileCreator) numLines() int { func (definition *ErrorDefinition) getErrorLineNumber(stdErr, commandString string, numLines int) int { result := definition.re.FindStringSubmatch(stdErr) if result == nil || len(result) < 2 { - klog.Errorf("expected error with line number, but couldn't detect one with error regex pattern [%s] for command [%s] with stdErr [%s]", definition.matchPattern, commandString, stdErr) + metrics.SendErrorLogAndMetric(util.UtilID, "expected error with line number, but couldn't detect one with error regex pattern [%s] for command [%s] with stdErr [%s]", definition.matchPattern, commandString, stdErr) return -1 } lineNumString := result[1] lineNum, err := strconv.Atoi(lineNumString) if err != nil { - klog.Errorf("expected error with line number, but error regex pattern %s didn't produce a number for command [%s] with stdErr [%s]", definition.matchPattern, commandString, stdErr) + metrics.SendErrorLogAndMetric(util.UtilID, "expected error with line number, but error regex pattern %s didn't produce a number for command [%s] with stdErr [%s]", definition.matchPattern, commandString, stdErr) return -1 } if lineNum < 1 || lineNum > numLines { - klog.Errorf( + metrics.SendErrorLogAndMetric(util.UtilID, "expected error with line number, but error regex pattern %s produced an invalid line number %d for command [%s] with stdErr [%s]", definition.matchPattern, lineNum, commandString, stdErr, ) From f4eb47ed274c2b58c167eaa1f760451936bfc280 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Fri, 28 Oct 2022 13:46:15 -0700 Subject: [PATCH 05/11] fix handleLineError function --- npm/util/ioutil/restore.go | 9 +++++---- npm/util/ioutil/restore_test.go | 22 +++++++++++++++++++++- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/npm/util/ioutil/restore.go b/npm/util/ioutil/restore.go index a5e5bd8f4f..2780ae0c61 100644 --- a/npm/util/ioutil/restore.go +++ b/npm/util/ioutil/restore.go @@ -264,15 +264,16 @@ func (definition *ErrorDefinition) getErrorLineNumber(stdErr, commandString stri // return whether the file was altered func (creator *FileCreator) handleLineError(stdErr, commandString string, lineNum int) (bool, *Line) { lineIndex := 0 - currentLineIndex := 0 + currentLineNum := 1 for i := range creator.lines { if _, isOmitted := creator.lineNumbersToOmit[i]; isOmitted { continue } - if currentLineIndex == lineNum-1 { - lineIndex = currentLineIndex + if currentLineNum == lineNum { + lineIndex = i + break } - currentLineIndex++ + currentLineNum++ } line := creator.lines[lineIndex] diff --git a/npm/util/ioutil/restore_test.go b/npm/util/ioutil/restore_test.go index e49800c3ff..dc54f197e6 100644 --- a/npm/util/ioutil/restore_test.go +++ b/npm/util/ioutil/restore_test.go @@ -17,6 +17,7 @@ const ( testCommandString = "test command" section1ID = "section1" section2ID = "section2" + section3ID = "section3" ) var ( @@ -182,11 +183,20 @@ func TestHandleLineErrorForContinueAndAbortSection(t *testing.T) { creator.AddLine(section2ID, errorHandlers, "line2-item1", "line2-item2", "line2-item3") creator.AddLine(section1ID, nil, "line3-item1", "line3-item2", "line3-item3") creator.AddLine(section2ID, nil, "line4-item1", "line4-item2", "line4-item3") + creator.AddLine(section3ID, nil, "line5-item1", "line5-item2", "line5-item3") wasFileAltered, err := creator.RunCommandOnceWithFile(testCommandString) require.Error(t, err) require.True(t, wasFileAltered) fileString := creator.ToString() - assert.Equal(t, "line3-item1 line3-item2 line3-item3\n", fileString) + assert.Equal(t, "line3-item1 line3-item2 line3-item3\nline5-item1 line5-item2 line5-item3\n", fileString) + + require.Equal(t, map[int]struct{}{0: {}, 1: {}, 3: {}}, creator.lineNumbersToOmit, "expected line 1, 2, and 4 to be marked omitted") + + _, line := creator.handleLineError("some error", testCommandString, 1) + require.Equal(t, creator.lines[2], line, "expected a failure in line 1 to map to original line 3") + + _, line = creator.handleLineError("some error", testCommandString, 2) + require.Equal(t, creator.lines[4], line, "expected a failure in line 2 to map to original line 5") } func TestHandleLineErrorForContinue(t *testing.T) { @@ -213,6 +223,14 @@ func TestHandleLineErrorForContinue(t *testing.T) { require.True(t, wasFileAltered) fileString := creator.ToString() assert.Equal(t, "line3-item1 line3-item2 line3-item3\nline4-item1 line4-item2 line4-item3\n", fileString) + + require.Equal(t, map[int]struct{}{0: {}, 1: {}}, creator.lineNumbersToOmit, "expected line 1 and 2 to be marked omitted") + + _, line := creator.handleLineError("some error", testCommandString, 1) + require.Equal(t, creator.lines[2], line, "expected a failure in line 1 to map to original line 3") + + _, line = creator.handleLineError("some error", testCommandString, 2) + require.Equal(t, creator.lines[3], line, "expected a failure in line 2 to map to original line 4") } func TestHandleLineErrorNoMatch(t *testing.T) { @@ -239,6 +257,8 @@ func TestHandleLineErrorNoMatch(t *testing.T) { require.False(t, wasFileAltered) fileStringAfter := creator.ToString() require.Equal(t, fileStringBefore, fileStringAfter) + + require.Equal(t, map[int]struct{}{}, creator.lineNumbersToOmit, "expected no lines to be marked omitted") } func TestAlwaysMatchDefinition(t *testing.T) { From e4d873d8924402f489a9b697953de2758c4c2afc Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Fri, 28 Oct 2022 13:47:48 -0700 Subject: [PATCH 06/11] fix printing restore lines and enhance a log --- npm/util/ioutil/restore.go | 34 +++++++++++++++++++++++++++++---- npm/util/ioutil/restore_test.go | 3 +++ 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/npm/util/ioutil/restore.go b/npm/util/ioutil/restore.go index 2780ae0c61..0db944187c 100644 --- a/npm/util/ioutil/restore.go +++ b/npm/util/ioutil/restore.go @@ -288,7 +288,7 @@ func (creator *FileCreator) handleLineError(stdErr, commandString string, lineNu creator.lineNumbersToOmit[i] = struct{}{} } case ContinueAndAbortSection: - klog.Infof("continuing after line %d and aborting section associated with the line for command [%s]", lineNum, commandString) + klog.Infof("continuing after line %d and aborting section [%s] for command [%s]", lineNum, line.sectionID, commandString) for i := 0; i <= lineIndex; i++ { creator.lineNumbersToOmit[i] = struct{}{} } @@ -304,11 +304,37 @@ func (creator *FileCreator) handleLineError(stdErr, commandString string, lineNu } func (creator *FileCreator) logLines(commandString string) { - lineNum := 0 - for i, line := range creator.lines { - if _, ok := creator.lineNumbersToOmit[i]; !ok { + if creator.tryCount == 0 { + // print every line + lineNum := 1 + for i, line := range creator.lines { + if _, ok := creator.lineNumbersToOmit[i]; ok { + metrics.SendErrorLogAndMetric(util.UtilID, "unexpectedly seeing an omitted line for tryCount=0. line num: %d", i) + continue + } klog.Infof("line %d of restore command [%s] with section ID [%s]: [%s]", lineNum, commandString, line.sectionID, line.content) lineNum++ } + + return + } + + // don't print every line because printing all lines can pollute the logs and we already know the lines + if len(creator.lineNumbersToOmit) == 0 { + klog.Infof("on try %d of restore command [%s]. repeating with same lines", creator.tryCount, commandString) + return + } + + lineNumMappings := make([]string, 0, creator.numLines()) + lineNum := 1 + for i := range creator.lines { + if _, ok := creator.lineNumbersToOmit[i]; ok { + continue + } + // this mapping could be off if we unexpectedly saw an omitted line for the first try (see error log in branch above) + lineNumMappings = append(lineNumMappings, fmt.Sprintf("%d->%d", lineNum, i+1)) + lineNum++ } + + klog.Infof("on try %d of restore command [%s]. mapping of current line numbers to original line numbers: %+v", creator.tryCount, commandString, lineNumMappings) } diff --git a/npm/util/ioutil/restore_test.go b/npm/util/ioutil/restore_test.go index dc54f197e6..f355100d07 100644 --- a/npm/util/ioutil/restore_test.go +++ b/npm/util/ioutil/restore_test.go @@ -190,6 +190,7 @@ func TestHandleLineErrorForContinueAndAbortSection(t *testing.T) { fileString := creator.ToString() assert.Equal(t, "line3-item1 line3-item2 line3-item3\nline5-item1 line5-item2 line5-item3\n", fileString) + creator.logLines(testCommandString) require.Equal(t, map[int]struct{}{0: {}, 1: {}, 3: {}}, creator.lineNumbersToOmit, "expected line 1, 2, and 4 to be marked omitted") _, line := creator.handleLineError("some error", testCommandString, 1) @@ -224,6 +225,7 @@ func TestHandleLineErrorForContinue(t *testing.T) { fileString := creator.ToString() assert.Equal(t, "line3-item1 line3-item2 line3-item3\nline4-item1 line4-item2 line4-item3\n", fileString) + creator.logLines(testCommandString) require.Equal(t, map[int]struct{}{0: {}, 1: {}}, creator.lineNumbersToOmit, "expected line 1 and 2 to be marked omitted") _, line := creator.handleLineError("some error", testCommandString, 1) @@ -258,6 +260,7 @@ func TestHandleLineErrorNoMatch(t *testing.T) { fileStringAfter := creator.ToString() require.Equal(t, fileStringBefore, fileStringAfter) + creator.logLines(testCommandString) require.Equal(t, map[int]struct{}{}, creator.lineNumbersToOmit, "expected no lines to be marked omitted") } From 435b2386144ad74a9487ef4a24cc335c7209688e Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Tue, 1 Nov 2022 11:15:51 -0700 Subject: [PATCH 07/11] fix lints and wrap chainLineNumber errors --- npm/pkg/dataplane/policies/chain-management_linux.go | 8 +++++--- npm/util/ioutil/restore.go | 8 ++++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/npm/pkg/dataplane/policies/chain-management_linux.go b/npm/pkg/dataplane/policies/chain-management_linux.go index f324f8bd1a..a529e1b51c 100644 --- a/npm/pkg/dataplane/policies/chain-management_linux.go +++ b/npm/pkg/dataplane/policies/chain-management_linux.go @@ -178,7 +178,9 @@ func (pMgr *PolicyManager) bootup(_ []string) error { if deprecatedErrCode == 0 { klog.Infof("deleted deprecated jump rule from FORWARD chain to AZURE-NPM chain") } else if deprecatedErr != nil { - metrics.SendErrorLogAndMetric(util.IptmID, "failed to delete deprecated jump rule from FORWARD chain to AZURE-NPM chain for unexpected reason with exit code %d and error: %s", deprecatedErrCode, deprecatedErr.Error()) + metrics.SendErrorLogAndMetric(util.IptmID, + "failed to delete deprecated jump rule from FORWARD chain to AZURE-NPM chain for unexpected reason with exit code %d and error: %s", + deprecatedErrCode, deprecatedErr.Error()) } currentChains, err := ioutil.AllCurrentAzureChains(pMgr.ioShim.Exec, util.IptablesDefaultWaitTime) @@ -437,12 +439,12 @@ func (pMgr *PolicyManager) chainLineNumber(chain string) (int, error) { lineNumberString := string(searchResults[0:firstSpaceIndex]) lineNum, err := strconv.Atoi(lineNumberString) if err != nil { - return 0, errNoLineNumber + return 0, npmerrors.SimpleErrorWrapper(fmt.Sprintf("unable to parse line number. lineNumberString: [%s]. searchResults: [%s]", lineNumberString, string(searchResults)), errNoLineNumber) } return lineNum, nil } } - return 0, errUnexpectedLineNumberString + return 0, npmerrors.SimpleErrorWrapper(fmt.Sprintf("unable to parse line number. searchResults: [%s]", string(searchResults)), errUnexpectedLineNumberString) } func onMarkSpecs(mark string) []string { diff --git a/npm/util/ioutil/restore.go b/npm/util/ioutil/restore.go index 0db944187c..781479482b 100644 --- a/npm/util/ioutil/restore.go +++ b/npm/util/ioutil/restore.go @@ -242,13 +242,17 @@ func (creator *FileCreator) numLines() int { func (definition *ErrorDefinition) getErrorLineNumber(stdErr, commandString string, numLines int) int { result := definition.re.FindStringSubmatch(stdErr) if result == nil || len(result) < 2 { - metrics.SendErrorLogAndMetric(util.UtilID, "expected error with line number, but couldn't detect one with error regex pattern [%s] for command [%s] with stdErr [%s]", definition.matchPattern, commandString, stdErr) + metrics.SendErrorLogAndMetric(util.UtilID, + "expected error with line number, but couldn't detect one with error regex pattern [%s] for command [%s] with stdErr [%s]", + definition.matchPattern, commandString, stdErr) return -1 } lineNumString := result[1] lineNum, err := strconv.Atoi(lineNumString) if err != nil { - metrics.SendErrorLogAndMetric(util.UtilID, "expected error with line number, but error regex pattern %s didn't produce a number for command [%s] with stdErr [%s]", definition.matchPattern, commandString, stdErr) + metrics.SendErrorLogAndMetric(util.UtilID, + "expected error with line number, but error regex pattern %s didn't produce a number for command [%s] with stdErr [%s]", + definition.matchPattern, commandString, stdErr) return -1 } if lineNum < 1 || lineNum > numLines { From 44c7bd10ea9f75531b5d2cc807f44934d6c39b75 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Tue, 1 Nov 2022 18:03:16 -0700 Subject: [PATCH 08/11] fix one off error for logging the try count --- npm/util/ioutil/restore.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/npm/util/ioutil/restore.go b/npm/util/ioutil/restore.go index 781479482b..a1b6731d5b 100644 --- a/npm/util/ioutil/restore.go +++ b/npm/util/ioutil/restore.go @@ -181,6 +181,7 @@ func (creator *FileCreator) runCommandOnceWithFile(fileString, cmd string, args } klog.Infof("running this restore command: [%s]", commandString) + creator.tryCount++ creator.logLines(commandString) command := creator.ioShim.Exec.Command(cmd, args...) @@ -188,7 +189,6 @@ func (creator *FileCreator) runCommandOnceWithFile(fileString, cmd string, args // run the command stdErrBytes, err := command.CombinedOutput() - creator.tryCount++ if err == nil { // success return false, nil From 8757b3b5456633e7913d8ef31440ad98df115f3f Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Thu, 3 Nov 2022 14:27:08 -0700 Subject: [PATCH 09/11] revert exponential increase to try limit --- npm/pkg/dataplane/ipsets/ipsetmanager.go | 30 +++------ .../dataplane/ipsets/ipsetmanager_linux.go | 9 +-- .../ipsets/ipsetmanager_linux_test.go | 64 ++----------------- 3 files changed, 20 insertions(+), 83 deletions(-) diff --git a/npm/pkg/dataplane/ipsets/ipsetmanager.go b/npm/pkg/dataplane/ipsets/ipsetmanager.go index 9db4239a09..7815965394 100644 --- a/npm/pkg/dataplane/ipsets/ipsetmanager.go +++ b/npm/pkg/dataplane/ipsets/ipsetmanager.go @@ -31,9 +31,6 @@ type IPSetMode string const ( ApplyAllIPSets IPSetMode = "all" ApplyOnNeed IPSetMode = "on-need" - - initialMaxRestoreTryCount int = 2 - largestMaxRestoreTryCount int = 1024 // 2^10 ) var ( @@ -53,9 +50,7 @@ type IPSetManager struct { emptySet *IPSet setMap map[string]*IPSet dirtyCache dirtyCacheInterface - // maxRestoreTryCount is only used in Linux - maxRestoreTryCount int - ioShim *common.IOShim + ioShim *common.IOShim sync.RWMutex } @@ -71,12 +66,11 @@ type IPSetManagerCfg struct { func NewIPSetManager(iMgrCfg *IPSetManagerCfg, ioShim *common.IOShim) *IPSetManager { return &IPSetManager{ - iMgrCfg: iMgrCfg, - emptySet: nil, // will be set if needed in calls to AddToLists - setMap: make(map[string]*IPSet), - dirtyCache: newDirtyCache(), - maxRestoreTryCount: initialMaxRestoreTryCount, - ioShim: ioShim, + iMgrCfg: iMgrCfg, + emptySet: nil, // will be set if needed in calls to AddToLists + setMap: make(map[string]*IPSet), + dirtyCache: newDirtyCache(), + ioShim: ioShim, } } @@ -469,20 +463,12 @@ func (iMgr *IPSetManager) ApplyIPSets() error { defer metrics.RecordIPSetExecTime(prometheusTimer) // record execution time regardless of failure err := iMgr.applyIPSets() if err != nil { - // exponentially increase maxRestoreTryCount - iMgr.maxRestoreTryCount *= 2 - if iMgr.maxRestoreTryCount > largestMaxRestoreTryCount { - iMgr.maxRestoreTryCount = largestMaxRestoreTryCount - } - - metrics.SendErrorLogAndMetric(util.IpsmID, "error: failed to apply ipsets. new restore try count (linux only): %d. err: %s", iMgr.maxRestoreTryCount, err.Error()) + metrics.SendErrorLogAndMetric(util.IpsmID, "error: failed to apply ipsets: %s", err.Error()) return err } - // TODO could also set the number of ipsets in NPM (not necessarily in kernel) here using len(iMgr.setMap) iMgr.clearDirtyCache() - // reset the maxRestoreTryCount - iMgr.maxRestoreTryCount = initialMaxRestoreTryCount + // TODO could also set the number of ipsets in NPM (not necessarily in kernel) here using len(iMgr.setMap) return nil } diff --git a/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go b/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go index 16523ba8c0..f5fdb91cea 100644 --- a/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go +++ b/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go @@ -50,6 +50,7 @@ const ( ipsetIPPortHashString = ipsetIPPortHashFlag // creator constants + maxTryCount = 3 destroySectionPrefix = "delete" addOrUpdateSectionPrefix = "add/update" ipsetRestoreLineFailurePattern = "Error in line (\\d+):" @@ -172,7 +173,7 @@ func (iMgr *IPSetManager) resetWithoutRestore() bool { // named returns to appease lint func (iMgr *IPSetManager) fileCreatorForFlushAll(ipsetListOutput []byte) (creator *ioutil.FileCreator, names []string, failedNames map[string]struct{}) { destroyFailureCount := 0 - creator = ioutil.NewFileCreator(iMgr.ioShim, iMgr.maxRestoreTryCount, ipsetRestoreLineFailurePattern) + creator = ioutil.NewFileCreator(iMgr.ioShim, maxTryCount, ipsetRestoreLineFailurePattern) names = make([]string, 0) failedNames = make(map[string]struct{}, 0) readIndex := 0 @@ -237,7 +238,7 @@ func (iMgr *IPSetManager) setsWithReferences() map[string]struct{} { func (iMgr *IPSetManager) fileCreatorForDestroyAll(names []string, failedNames, setsWithReferences map[string]struct{}) (creator *ioutil.FileCreator, failureCount *int) { failureCountVal := 0 failureCount = &failureCountVal - creator = ioutil.NewFileCreator(iMgr.ioShim, iMgr.maxRestoreTryCount, ipsetRestoreLineFailurePattern) + creator = ioutil.NewFileCreator(iMgr.ioShim, maxTryCount, ipsetRestoreLineFailurePattern) // destroy all the sets for _, hashedSetName := range names { @@ -363,7 +364,7 @@ func (iMgr *IPSetManager) applyIPSetsWithSaveFile() error { return npmerrors.SimpleErrorWrapper("ipset save failed when applying ipsets", saveError) } } - creator := iMgr.fileCreatorForApplyWithSaveFile(iMgr.maxRestoreTryCount, saveFile) + creator := iMgr.fileCreatorForApplyWithSaveFile(maxTryCount, saveFile) restoreError := creator.RunCommandWithFile(ipsetCommand, ipsetRestoreFlag) if restoreError != nil { return npmerrors.SimpleErrorWrapper("ipset restore failed when applying ipsets with save file", restoreError) @@ -404,7 +405,7 @@ example where: -X set4 */ func (iMgr *IPSetManager) applyIPSets() error { - creator := iMgr.fileCreatorForApply(iMgr.maxRestoreTryCount) + creator := iMgr.fileCreatorForApply(maxTryCount) restoreError := creator.RunCommandWithFile(ipsetCommand, ipsetRestoreFlag) if restoreError != nil { return npmerrors.SimpleErrorWrapper("ipset restore failed when applying ipsets", restoreError) diff --git a/npm/pkg/dataplane/ipsets/ipsetmanager_linux_test.go b/npm/pkg/dataplane/ipsets/ipsetmanager_linux_test.go index 62dec490d2..c9db0194b7 100644 --- a/npm/pkg/dataplane/ipsets/ipsetmanager_linux_test.go +++ b/npm/pkg/dataplane/ipsets/ipsetmanager_linux_test.go @@ -36,60 +36,6 @@ var ( // TODO test that a reconcile list is updated for all the TestFailure UTs // TODO same exact TestFailure UTs for unknown errors -func TestMaxRestoreTryCount(t *testing.T) { - metrics.ReinitializeAll() - - tryCounts := []int{2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 1024} - numSets := 1025 - toAddIPSets := make([]*IPSetMetadata, numSets) - for k := range toAddIPSets { - toAddIPSets[k] = NewIPSetMetadata(fmt.Sprintf("bad-set-%d", k), Namespace) - } - numSetsWhenSuccessful := 1022 - toAddIPSetsForSuccess := make([]*IPSetMetadata, numSetsWhenSuccessful) - for k := range toAddIPSetsForSuccess { - toAddIPSetsForSuccess[k] = NewIPSetMetadata(fmt.Sprintf("good-set-%d", k), Namespace) - } - - errorInLine1 := testutils.TestCmd{ - Cmd: ipsetRestoreStringSlice, - Stdout: "Error in line 1: for some other error", - ExitCode: 1, - } - sumCounts := 0 - for _, tryCount := range tryCounts { - sumCounts += tryCount - } - numCalls := sumCounts + numSetsWhenSuccessful - calls := make([]testutils.TestCmd, numCalls) - for k := range calls { - calls[k] = errorInLine1 - } - calls[numCalls-1].ExitCode = 0 - calls[numCalls-1].Stdout = "" - - ioShim := common.NewMockIOShim(calls) - defer ioShim.VerifyCalls(t, calls) - iMgr := NewIPSetManager(applyAlwaysCfg, ioShim) - iMgr.CreateIPSets(toAddIPSets) - - for _, tryCount := range tryCounts { - require.Equal(t, tryCount, iMgr.maxRestoreTryCount) - require.Error(t, iMgr.ApplyIPSets()) - // cache behavior is currently undefined if there's an apply error - } - - // set the dirty cache to less sets and succeed by skipping all sets - iMgr.clearDirtyCache() - iMgr.CreateIPSets(toAddIPSetsForSuccess) - require.NoError(t, iMgr.ApplyIPSets()) - require.Equal(t, 2, iMgr.maxRestoreTryCount) - - execCount, err := metrics.GetIPSetExecCount() - promutil.NotifyIfErrors(t, err) - require.Equal(t, len(tryCounts)+1, execCount) -} - func TestApplyIPSets(t *testing.T) { type args struct { toAddUpdateSets []*IPSetMetadata @@ -164,7 +110,7 @@ func TestApplyIPSets(t *testing.T) { require.Greater(t, len(calls), 0) calls[len(calls)-1].ExitCode = 1 // then add errors as many times as we retry - for i := 1; i < initialMaxRestoreTryCount; i++ { + for i := 1; i < maxTryCount; i++ { calls = append(calls, testutils.TestCmd{Cmd: ipsetRestoreStringSlice, ExitCode: 1}) } } @@ -542,6 +488,7 @@ func TestDestroyNPMIPSets(t *testing.T) { {Cmd: []string{"grep", "azure-npm-"}, Stdout: resetIPSetsListOutputString}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, + {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, }, wantErr: true, }, @@ -558,6 +505,7 @@ func TestDestroyNPMIPSets(t *testing.T) { {Cmd: []string{"grep", "-o", "-P", "azure-npm-\\d+"}, ExitCode: 1}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, + {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, }, wantErr: true, }, @@ -778,7 +726,8 @@ func TestApplyIPSetsFailureOnSave(t *testing.T) { func TestApplyIPSetsFailureOnRestore(t *testing.T) { calls := []testutils.TestCmd{ - // fail 2 times because this is our max try count + // fail 3 times because this is our max try count + {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, } @@ -794,7 +743,8 @@ func TestApplyIPSetsFailureOnRestore(t *testing.T) { calls = []testutils.TestCmd{ {Cmd: ipsetSaveStringSlice, PipedToCommand: true}, {Cmd: []string{"grep", "azure-npm-"}}, - // fail 2 times because this is our max try count + // fail 3 times because this is our max try count + {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, } From 35d2d51fefeb38d4a305d831de49630b96d41085 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Thu, 3 Nov 2022 15:35:10 -0700 Subject: [PATCH 10/11] update try count to 5 and update UTs --- npm/pkg/dataplane/ipsets/ipsetmanager_linux.go | 2 +- npm/pkg/dataplane/ipsets/ipsetmanager_linux_test.go | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go b/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go index f5fdb91cea..6d83ef4790 100644 --- a/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go +++ b/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go @@ -50,7 +50,7 @@ const ( ipsetIPPortHashString = ipsetIPPortHashFlag // creator constants - maxTryCount = 3 + maxTryCount = 5 destroySectionPrefix = "delete" addOrUpdateSectionPrefix = "add/update" ipsetRestoreLineFailurePattern = "Error in line (\\d+):" diff --git a/npm/pkg/dataplane/ipsets/ipsetmanager_linux_test.go b/npm/pkg/dataplane/ipsets/ipsetmanager_linux_test.go index c9db0194b7..b05db39a0a 100644 --- a/npm/pkg/dataplane/ipsets/ipsetmanager_linux_test.go +++ b/npm/pkg/dataplane/ipsets/ipsetmanager_linux_test.go @@ -489,6 +489,8 @@ func TestDestroyNPMIPSets(t *testing.T) { {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, + {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, + {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, }, wantErr: true, }, @@ -506,6 +508,8 @@ func TestDestroyNPMIPSets(t *testing.T) { {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, + {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, + {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, }, wantErr: true, }, @@ -726,7 +730,9 @@ func TestApplyIPSetsFailureOnSave(t *testing.T) { func TestApplyIPSetsFailureOnRestore(t *testing.T) { calls := []testutils.TestCmd{ - // fail 3 times because this is our max try count + // fail 5 times because this is our max try count + {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, + {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, @@ -743,7 +749,9 @@ func TestApplyIPSetsFailureOnRestore(t *testing.T) { calls = []testutils.TestCmd{ {Cmd: ipsetSaveStringSlice, PipedToCommand: true}, {Cmd: []string{"grep", "azure-npm-"}}, - // fail 3 times because this is our max try count + // fail 5 times because this is our max try count + {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, + {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, {Cmd: ipsetRestoreStringSlice, ExitCode: 1}, From 477e53a8a7bb839e58eb896a916a96c0d75ba4b8 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Mon, 21 Nov 2022 17:30:57 -0800 Subject: [PATCH 11/11] do not log lines for every restore call until perf is understood --- npm/util/ioutil/restore.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/npm/util/ioutil/restore.go b/npm/util/ioutil/restore.go index a1b6731d5b..8e544d251e 100644 --- a/npm/util/ioutil/restore.go +++ b/npm/util/ioutil/restore.go @@ -182,7 +182,8 @@ func (creator *FileCreator) runCommandOnceWithFile(fileString, cmd string, args klog.Infof("running this restore command: [%s]", commandString) creator.tryCount++ - creator.logLines(commandString) + // TODO uncomment for debugging or after ensuring no performance decrease + // creator.logLines(commandString) command := creator.ioShim.Exec.Command(cmd, args...) command.SetStdin(bytes.NewBufferString(fileString))