Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions npm/ipsm/ipsm.go
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,9 @@ func (ipsMgr *IpsetManager) DestroyNpmIpsets() error {
} else {
metrics.ResetNumIPSets()
}
// NOTE: in v2, we reset ipset entries, but in v1 we only remove entries for ipsets we delete.
// So v2 may underestimate the number of entries if there are destroy failures,
// but v1 may miss removing entries if some sets are in the prometheus metric but not in the kernel.

return nil
}
8 changes: 8 additions & 0 deletions npm/ipsm/ipsm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -528,6 +528,7 @@ func TestDestroyNpmIpsets(t *testing.T) {
calls := []testutils.TestCmd{
{Cmd: []string{"ipset", "-N", "-exist", util.GetHashedName(testSet1Name), "nethash"}},
{Cmd: []string{"ipset", "-N", "-exist", util.GetHashedName(testSet2Name), "nethash"}},
{Cmd: []string{"ipset", "-A", "-exist", util.GetHashedName(testSet1Name), "1.2.3.4"}},
{Cmd: []string{"ipset", "list"}, Stdout: ipsetListStdout},
{Cmd: []string{"ipset", "-F", "-exist", testSet1Name}},
{Cmd: []string{"ipset", "-F", "-exist", testSet2Name}},
Expand Down Expand Up @@ -555,6 +556,13 @@ func TestDestroyNpmIpsets(t *testing.T) {
t.Errorf(err.Error())
}

// expect prometheus to add this entry, but remove it when destroying npm sets
err = ipsMgr.AddToSet(testSet1Name, "1.2.3.4", util.IpsetNetHashFlag, "")
if err != nil {
t.Errorf("TestDestroyNpmIpsets failed @ ipsMgr.addToSet")
t.Errorf(err.Error())
}

err = ipsMgr.DestroyNpmIpsets()
if err != nil {
t.Errorf("TestDestroyNpmIpsets failed @ ipsMgr.DestroyNpmIpsets")
Expand Down
3 changes: 3 additions & 0 deletions npm/metrics/ipsets.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,9 @@ func DeleteIPSet(setName string) {
// It doesn't ever update the number of IPSets.
func ResetIPSetEntries() {
numIPSetEntries.Set(0)
for setName := range ipsetInventoryMap {
removeFromIPSetInventory(setName)
}
ipsetInventoryMap = make(map[string]int)
}

Expand Down
29 changes: 13 additions & 16 deletions npm/metrics/ipsets_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,11 +94,19 @@ func assertNumEntriesAndCounts(t *testing.T, sets ...*testSet) {
expectedTotal := 0
for _, set := range sets {
val, exists := ipsetInventoryMap[set.name]
if !exists {
if exists && set.entryCount == 0 {
require.FailNowf(t, "", "expected set %s to not exist since we expect an entry count of 0", set.name)
}
if !exists && set.entryCount > 0 {
require.FailNowf(t, "", "expected set %s to exist in map for ipset entries", set.name)
}
if set.entryCount != val {
require.FailNowf(t, "", "set %s has incorrect number of entries. Expected %d, got %d", set.name, set.entryCount, val)
require.FailNowf(t, "", "set %s has incorrect number of entries in the map. Expected %d, got %d", set.name, set.entryCount, val)
}
prometheusVal, err := GetNumEntriesForIPSet(set.name)
require.NoError(t, err, "unexpectedly got an error when getting prometheus num entries for set %s", set.name)
if set.entryCount != prometheusVal {
require.FailNow(t, "", "set %s has incorrect number of entries in the Prometheus metric. Expected %d, got %d", set.name, set.entryCount, prometheusVal)
}
expectedTotal += val
}
Expand All @@ -110,15 +118,6 @@ func assertNumEntriesAndCounts(t *testing.T, sets ...*testSet) {
}
}

func assertNotInMap(t *testing.T, setNames ...string) {
for _, setName := range setNames {
_, exists := ipsetInventoryMap[setName]
if exists {
require.FailNowf(t, "", "expected set %s to not exist in map for ipset entries", setName)
}
}
}

func assertMapIsGood(t *testing.T) {
assertEqualMapAndMetricElements(t)
assertNoZeroEntriesInMap(t)
Expand Down Expand Up @@ -160,8 +159,7 @@ func TestRemoveAllEntriesFromIPSet(t *testing.T) {
AddEntryToIPSet(testName1)
AddEntryToIPSet(testName2)
RemoveAllEntriesFromIPSet(testName1)
assertNotInMap(t, testName1)
assertNumEntriesAndCounts(t, &testSet{testName2, 1})
assertNumEntriesAndCounts(t, &testSet{testName1, 0}, &testSet{testName2, 1})
assertMapIsGood(t)
}

Expand All @@ -175,8 +173,7 @@ func TestDeleteIPSet(t *testing.T) {
AddEntryToIPSet(testName2)
DeleteIPSet(testName1)
assertNumSets(t, 1)
assertNotInMap(t, testName1)
assertNumEntriesAndCounts(t, &testSet{testName2, 1})
assertNumEntriesAndCounts(t, &testSet{testName1, 0}, &testSet{testName2, 1})
assertMapIsGood(t)
}

Expand All @@ -193,6 +190,6 @@ func TestResetIPSetEntries(t *testing.T) {
AddEntryToIPSet(testName1)
AddEntryToIPSet(testName2)
ResetIPSetEntries()
assertNotInMap(t, testName1, testName2)
assertNumEntriesAndCounts(t, &testSet{testName1, 0}, &testSet{testName2, 0})
assertMapIsGood(t)
}
2 changes: 2 additions & 0 deletions npm/pkg/dataplane/ipsets/ipsetmanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ type IPSetManagerCfg struct {
NetworkName string
}

// TODO delegate prometheus metrics logic to OS-specific ones?

func NewIPSetManager(iMgrCfg *IPSetManagerCfg, ioShim *common.IOShim) *IPSetManager {
return &IPSetManager{
iMgrCfg: iMgrCfg,
Expand Down
155 changes: 138 additions & 17 deletions npm/pkg/dataplane/ipsets/ipsetmanager_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"regexp"

"github.com/Azure/azure-container-networking/npm/metrics"
"github.com/Azure/azure-container-networking/npm/pkg/dataplane/ioutil"
"github.com/Azure/azure-container-networking/npm/pkg/dataplane/parse"
"github.com/Azure/azure-container-networking/npm/util"
Expand All @@ -15,6 +16,8 @@ const (
azureNPMPrefix = "azure-npm-"

ipsetCommand = "ipset"
ipsetListFlag = "list"
ipsetNameFlag = "--name"
ipsetSaveFlag = "save"
ipsetRestoreFlag = "restore"
ipsetCreateFlag = "-N"
Expand Down Expand Up @@ -57,14 +60,134 @@ var (
nameForAddRegex = regexp.MustCompile(fmt.Sprintf("%s (%s) ", ipsetAddString, hashedNamePattern))
)

/*
based on ipset list output with azure-npm- prefix, create an ipset restore file where we flush all sets first, then destroy all sets

overall error handling:
- if flush fails because the set doesn't exist (should never happen because we're listing sets right before), then ignore it and the destroy
- if flush fails otherwise, then add to destroyFailureCount and continue (aborting the destroy too)
- if destroy fails because the set doesn't exist (should never happen since the flush operation would have worked), then ignore it
- if destroy fails for another reason, then ignore it and add to destroyFailureCount and mark for reconcile (TODO)

example:
grep output:
azure-npm-123456
azure-npm-987654
azure-npm-777777

example restore file [flag meanings: -F (flush), -X (destroy)]:
-F azure-npm-123456
-F azure-npm-987654
-F azure-npm-777777
-X azure-npm-123456
-X azure-npm-987654
-X azure-npm-777777

prometheus metrics:
After this function, NumIPSets should be 0 or the number of NPM IPSets that existed and failed to be destroyed.
When NPM restarts, Prometheus metrics will initialize at 0, but NPM IPSets may exist.
We will reset ipset entry metrics if the restore succeeds whether or not some flushes/destroys failed (NOTE: this is different behavior than v1).
If a flush fails, we could update the num entries for that set, but that would be a lot of overhead.
*/
func (iMgr *IPSetManager) resetIPSets() error {
// called on failure or when NPM is created
// so no ipset cache. need to use ipset list like in ipsm.go
listCommand := iMgr.ioShim.Exec.Command(ipsetCommand, ipsetListFlag, ipsetNameFlag)
grepCommand := iMgr.ioShim.Exec.Command(ioutil.Grep, azureNPMPrefix)
azureIPSets, haveAzureIPSets, commandError := ioutil.PipeCommandToGrep(listCommand, grepCommand)
if commandError != nil {
return npmerrors.SimpleErrorWrapper("failed to run ipset list for resetting IPSets", commandError)
}
if !haveAzureIPSets {
metrics.ResetNumIPSets()
metrics.ResetIPSetEntries()
return nil
}
creator, originalNumAzureSets, destroyFailureCount := iMgr.fileCreatorForReset(azureIPSets)
restoreError := creator.RunCommandWithFile(ipsetCommand, ipsetRestoreFlag)
if restoreError != nil {
metrics.SetNumIPSets(originalNumAzureSets)
// NOTE: the num entries for sets may be incorrect if the restore fails
return npmerrors.SimpleErrorWrapper("failed to run ipset restore for resetting IPSets", restoreError)
}
if metrics.NumIPSetsIsPositive() {
metrics.SetNumIPSets(*destroyFailureCount)
} else {
metrics.ResetNumIPSets()
}
metrics.ResetIPSetEntries() // NOTE: the num entries for sets that fail to flush may be incorrect after this
return nil
}

// create restore file that flushes all sets, then deletes all sets
// technically don't need to flush a hashset
// this needs to be a separate function because we need to check creator contents in UTs
func (iMgr *IPSetManager) fileCreatorForReset(ipsetListOutput []byte) (*ioutil.FileCreator, int, *int) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems we do not return pointer.

(*ioutil.FileCreator, int, *int) -> (*ioutil.FileCreator, int, int)

and can use non-pointer operation in caller as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we might need the pointer because the destroyFailureCount is updated after we get the creator and run creator.Run(). I'll add a clarifying comment next PR

destroyFailureCount := 0
creator := ioutil.NewFileCreator(iMgr.ioShim, maxTryCount, ipsetRestoreLineFailurePattern)
names := make([]string, 0)
readIndex := 0
var line []byte
// flush all the sets and create a list of the sets so we can destroy them
for readIndex < len(ipsetListOutput) {
line, readIndex = parse.Line(readIndex, ipsetListOutput)
hashedSetName := string(line)
names = append(names, hashedSetName)
// error handlers specific to resetting ipsets
errorHandlers := []*ioutil.LineErrorHandler{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great work to classify all errors and corresponding actions.
I understood the high-level ideas about LineErrorHandler which holds necessary error handling information to the corresponding executed line when there is error.

While I am fine as is, but just curious.
Do we consolidate all error handlings code without this errorHandlers here - https://github.com/Azure/azure-container-networking/blob/master/npm/pkg/dataplane/ioutil/restore_linux.go#L199?
For maintenance, it would be nice to consolidate error handling in one places if possible.
I thought Callback has only error message, but it changed destroyFailureCount in some places.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some callbacks modify destroyFailureCount because the error is a kind of failure to delete (as opposed to the set not existing, which is technically an error but not a failure in our eyes). The line you reference is code which recognizes which line an error happened on and is somewhat unrelated to the error handlers. Did this answer your question?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep. I understood the intention of codes.

My fundamental comments are whether we can consolidate all error definitions and handlers in one place per ipset and iptables instead of sparsely declaring them in each function and using Callback.
It will be easier to understand and maintain codes, but I may miss somethings and it will need big changes.
wdyt?

BTW, it is fine as is.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Still might not fully understand your question. To try to help consolidate, for creatorForApply, I group all of the error handling code at the bottom of the file. I can't move the error handling code out of creatorForReset because of the dependency on destroyFailureCount.

We could consolidate the definition and method (e.g. ContinueAndAbortSection) but the callback will always be unique and will eventually do more than logging for most (e.g. mark a set as needing reconcile).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for instance though, we could make this error handler a function of the set name and destroyFailureCount pointer though

{
Definition: setDoesntExistDefinition,
Method: ioutil.ContinueAndAbortSection,
Callback: func() {
klog.Infof("[RESET-IPSETS] skipping flush and upcoming destroy for set %s since the set doesn't exist", hashedSetName)
},
},
{
Definition: ioutil.AlwaysMatchDefinition,
Method: ioutil.ContinueAndAbortSection,
Callback: func() {
klog.Errorf("[RESET-IPSETS] marking flush and upcoming destroy for set %s as a failure due to unknown error", hashedSetName)
destroyFailureCount++
// TODO mark as a failure
},
},
}
sectionID := sectionID(destroySectionPrefix, hashedSetName)
creator.AddLine(sectionID, errorHandlers, ipsetFlushFlag, hashedSetName) // flush set
}

return nil
// destroy all the sets
for _, hashedSetName := range names {
hashedSetName := hashedSetName // to appease go lint
errorHandlers := []*ioutil.LineErrorHandler{
// error handlers specific to resetting ipsets
{
Definition: setInUseByKernelDefinition,
Method: ioutil.Continue,
Callback: func() {
klog.Errorf("[RESET-IPSETS] marking destroy for set %s as a failure since the set is in use by a kernel component", hashedSetName)
destroyFailureCount++
// TODO mark the set as a failure and reconcile what iptables rule or ipset is referring to it
},
},
{
Definition: setDoesntExistDefinition,
Method: ioutil.Continue,
Callback: func() {
klog.Infof("[RESET-IPSETS] skipping destroy for set %s since the set does not exist", hashedSetName)
},
},
{
Definition: ioutil.AlwaysMatchDefinition,
Method: ioutil.Continue,
Callback: func() {
klog.Errorf("[RESET-IPSETS] marking destroy for set %s as a failure due to unknown error", hashedSetName)
destroyFailureCount++
// TODO mark the set as a failure and reconcile what iptables rule or ipset is referring to it
},
},
}
sectionID := sectionID(destroySectionPrefix, hashedSetName)
creator.AddLine(sectionID, errorHandlers, ipsetDestroyFlag, hashedSetName) // destroy set
}
originalNumAzureSets := len(names)
return creator, originalNumAzureSets, &destroyFailureCount
}

/*
Expand Down Expand Up @@ -134,7 +257,6 @@ example where every set in add/update cache should have ip 1.2.3.4 and 2.3.4.5:
-A new-set-3 2.3.4.5

*/

func (iMgr *IPSetManager) applyIPSets() error {
var saveFile []byte
var saveError error
Expand All @@ -144,7 +266,7 @@ func (iMgr *IPSetManager) applyIPSets() error {
return npmerrors.SimpleErrorWrapper("ipset save failed when applying ipsets", saveError)
}
}
creator := iMgr.fileCreator(maxTryCount, saveFile)
creator := iMgr.fileCreatorForApply(maxTryCount, saveFile)
restoreError := creator.RunCommandWithFile(ipsetCommand, ipsetRestoreFlag)
if restoreError != nil {
return npmerrors.SimpleErrorWrapper("ipset restore failed when applying ipsets", restoreError)
Expand All @@ -155,17 +277,17 @@ func (iMgr *IPSetManager) applyIPSets() error {
func (iMgr *IPSetManager) ipsetSave() ([]byte, error) {
command := iMgr.ioShim.Exec.Command(ipsetCommand, ipsetSaveFlag)
grepCommand := iMgr.ioShim.Exec.Command(ioutil.Grep, azureNPMPrefix)
searchResults, gotMatches, err := ioutil.PipeCommandToGrep(command, grepCommand)
saveFile, haveAzureSets, err := ioutil.PipeCommandToGrep(command, grepCommand)
if err != nil {
return nil, npmerrors.SimpleErrorWrapper("failed to run ipset save", err)
}
if !gotMatches {
if !haveAzureSets {
return nil, nil
}
return searchResults, nil
return saveFile, nil
}

func (iMgr *IPSetManager) fileCreator(maxTryCount int, saveFile []byte) *ioutil.FileCreator {
func (iMgr *IPSetManager) fileCreatorForApply(maxTryCount int, saveFile []byte) *ioutil.FileCreator {
creator := ioutil.NewFileCreator(iMgr.ioShim, maxTryCount, ipsetRestoreLineFailurePattern) // TODO make the line failure pattern into a definition constant eventually

// flush all sets first so we don't try to delete an ipset referenced by a list we're deleting too
Expand Down Expand Up @@ -350,23 +472,22 @@ func (iMgr *IPSetManager) flushSetInFile(creator *ioutil.FileCreator, prefixedNa
Definition: setDoesntExistDefinition,
Method: ioutil.ContinueAndAbortSection,
Callback: func() {
// no action needed
klog.Infof("skipping flush and upcoming delete for set %s since the set doesn't exist", prefixedName)
klog.Infof("skipping flush and upcoming destroy for set %s since the set doesn't exist", prefixedName)
},
},
{
Definition: ioutil.AlwaysMatchDefinition,
Method: ioutil.ContinueAndAbortSection,
Callback: func() {
klog.Errorf("skipping flush and upcoming delete for set %s due to unknown error", prefixedName)
klog.Errorf("skipping flush and upcoming destroy for set %s due to unknown error", prefixedName)
// TODO mark as a failure
// would this ever happen?
},
},
}
sectionID := sectionID(destroySectionPrefix, prefixedName)
hashedName := util.GetHashedName(prefixedName)
creator.AddLine(sectionID, errorHandlers, ipsetFlushFlag, hashedName)
creator.AddLine(sectionID, errorHandlers, ipsetFlushFlag, hashedName) // flush set
}

func (iMgr *IPSetManager) destroySetInFile(creator *ioutil.FileCreator, prefixedName string) {
Expand Down Expand Up @@ -438,7 +559,7 @@ func (iMgr *IPSetManager) deleteMemberInFile(creator *ioutil.FileCreator, set *I
},
},
}
creator.AddLine(sectionID, errorHandlers, ipsetDeleteFlag, set.HashedName, member)
creator.AddLine(sectionID, errorHandlers, ipsetDeleteFlag, set.HashedName, member) // delete member
}

func (iMgr *IPSetManager) addMemberInFile(creator *ioutil.FileCreator, set *IPSet, sectionID, member string) {
Expand Down Expand Up @@ -472,7 +593,7 @@ func (iMgr *IPSetManager) addMemberInFile(creator *ioutil.FileCreator, set *IPSe
},
}
}
creator.AddLine(sectionID, errorHandlers, ipsetAddFlag, set.HashedName, member)
creator.AddLine(sectionID, errorHandlers, ipsetAddFlag, set.HashedName, member) // add member
}

func sectionID(prefix, prefixedName string) string {
Expand Down
Loading