From 393f7415169c32f66dac3c9b3f182206f4397616 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Tue, 3 Jan 2023 15:41:14 -0800 Subject: [PATCH 1/2] ignore certain errors --- .../dataplane-test-cases_windows_test.go | 29 +++++++++++++++ npm/pkg/dataplane/dataplane.go | 35 +++++++++---------- npm/pkg/dataplane/dataplane_windows.go | 13 ++++--- 3 files changed, 51 insertions(+), 26 deletions(-) diff --git a/npm/pkg/dataplane/dataplane-test-cases_windows_test.go b/npm/pkg/dataplane/dataplane-test-cases_windows_test.go index 6aae19e33f..e403eb8dd1 100644 --- a/npm/pkg/dataplane/dataplane-test-cases_windows_test.go +++ b/npm/pkg/dataplane/dataplane-test-cases_windows_test.go @@ -15,6 +15,7 @@ const ( podCrudTag Tag = "pod-crud" nsCrudTag Tag = "namespace-crud" netpolCrudTag Tag = "netpol-crud" + reconcileTag Tag = "reconcile" ) const ( @@ -155,6 +156,7 @@ func getAllSerialTests() []*SerialTestCase { TestCaseMetadata: &TestCaseMetadata{ Tags: []Tag{ podCrudTag, + reconcileTag, }, DpCfg: defaultWindowsDPCfg, InitialEndpoints: nil, @@ -482,6 +484,33 @@ func getAllSerialTests() []*SerialTestCase { }, }, }, + { + Description: "issue 1613: remove last instance of label, then reconcile IPSets, then apply DP", + Actions: []*Action{ + CreateEndpoint(endpoint1, ip1), + CreatePod("x", "a", ip1, thisNode, map[string]string{"k1": "v1"}), + ApplyDP(), + UpdatePodLabels("x", "a", ip1, thisNode, map[string]string{"k1": "v1"}, nil), + ReconcileDP(), + ApplyDP(), + }, + TestCaseMetadata: &TestCaseMetadata{ + Tags: []Tag{ + podCrudTag, + reconcileTag, + }, + DpCfg: defaultWindowsDPCfg, + InitialEndpoints: nil, + ExpectedSetPolicies: []*hcn.SetPolicySetting{ + dptestutils.SetPolicy(emptySet), + dptestutils.SetPolicy(allNamespaces, emptySet.GetHashedName(), nsXSet.GetHashedName()), + dptestutils.SetPolicy(nsXSet, ip1), + }, + ExpectedEnpdointACLs: map[string][]*hnswrapper.FakeEndpointPolicy{ + endpoint1: {}, + }, + }, + }, } } diff --git a/npm/pkg/dataplane/dataplane.go b/npm/pkg/dataplane/dataplane.go index 8227da2abf..5cfaa24f19 100644 --- a/npm/pkg/dataplane/dataplane.go +++ b/npm/pkg/dataplane/dataplane.go @@ -134,7 +134,8 @@ func (dp *DataPlane) AddToSets(setNames []*ipsets.IPSetMetadata, podMetadata *Po if err != nil { return fmt.Errorf("[DataPlane] error while adding to set: %w", err) } - if dp.shouldUpdatePod() { + + if dp.shouldUpdatePod() && podMetadata.NodeName == dp.nodeName { klog.Infof("[DataPlane] Updating Sets to Add for pod key %s", podMetadata.PodKey) // lock updatePodCache while reading/modifying or setting the updatePod in the cache @@ -162,7 +163,7 @@ func (dp *DataPlane) RemoveFromSets(setNames []*ipsets.IPSetMetadata, podMetadat return fmt.Errorf("[DataPlane] error while removing from set: %w", err) } - if dp.shouldUpdatePod() { + if dp.shouldUpdatePod() && podMetadata.NodeName == dp.nodeName { klog.Infof("[DataPlane] Updating Sets to Remove for pod key %s", podMetadata.PodKey) // lock updatePodCache while reading/modifying or setting the updatePod in the cache @@ -214,10 +215,12 @@ func (dp *DataPlane) ApplyDataPlane() error { } if dp.shouldUpdatePod() { + // NOTE: ideally we won't refresh Pod Endpoints if the updatePodCache is empty err := dp.refreshPodEndpoints() if err != nil { metrics.SendErrorLogAndMetric(util.DaemonDataplaneID, "[DataPlane] failed to refresh endpoints while updating pods. err: [%s]", err.Error()) - return fmt.Errorf("[DataPlane] failed to refresh endpoints while updating pods. err: [%w]", err) + // return as success since this can be retried irrespective of other operations + return nil } // lock updatePodCache while driving goal state to kernel @@ -225,23 +228,16 @@ func (dp *DataPlane) ApplyDataPlane() error { dp.updatePodCache.Lock() defer dp.updatePodCache.Unlock() - var aggregateErr error for podKey, pod := range dp.updatePodCache.cache { err := dp.updatePod(pod) if err != nil { - if aggregateErr == nil { - aggregateErr = fmt.Errorf("failed to update pod while applying the dataplane. key: [%s], err: [%w]", podKey, err) - } else { - aggregateErr = fmt.Errorf("failed to update pod while applying the dataplane. key: [%s], err: [%s]. previous err: [%w]", podKey, err.Error(), aggregateErr) - } + // move on to the next and later return as success since this can be retried irrespective of other operations metrics.SendErrorLogAndMetric(util.DaemonDataplaneID, "failed to update pod while applying the dataplane. key: [%s], err: [%s]", podKey, err.Error()) continue } + delete(dp.updatePodCache.cache, podKey) } - if aggregateErr != nil { - return fmt.Errorf("[DataPlane] error while updating pods: %w", aggregateErr) - } } return nil } @@ -393,17 +389,18 @@ func (dp *DataPlane) createIPSetsAndReferences(sets []*ipsets.TranslatedIPSet, n } func (dp *DataPlane) deleteIPSetsAndReferences(sets []*ipsets.TranslatedIPSet, netpolName string, referenceType ipsets.ReferenceType) error { + for _, set := range sets { + prefixName := set.Metadata.GetPrefixName() + if err := dp.ipsetMgr.DeleteReference(prefixName, netpolName, referenceType); err != nil { + // with current implementation of DeleteReference(), err will be ipsets.ErrSetDoesNotExist + klog.Infof("[DataPlane] ignoring delete reference on non-existent set. ipset: %s. netpol: %s. referenceType: %s", prefixName, netpolName, referenceType) + } + } + npmErrorString := npmerrors.DeleteSelectorReference if referenceType == ipsets.NetPolType { npmErrorString = npmerrors.DeleteNetPolReference } - for _, set := range sets { - // TODO ignore set does not exist error - err := dp.ipsetMgr.DeleteReference(set.Metadata.GetPrefixName(), netpolName, referenceType) - if err != nil { - return npmerrors.Errorf(npmErrorString, false, fmt.Sprintf("[DataPlane] failed to deleteIPSetReferences with err: %s", err.Error())) - } - } // Check if any list sets are provided with members to delete // NOTE: every translated member will be deleted, even if the member is part of the same set in another policy diff --git a/npm/pkg/dataplane/dataplane_windows.go b/npm/pkg/dataplane/dataplane_windows.go index 856f96065e..f4b59b2338 100644 --- a/npm/pkg/dataplane/dataplane_windows.go +++ b/npm/pkg/dataplane/dataplane_windows.go @@ -115,11 +115,6 @@ func (dp *DataPlane) shouldUpdatePod() bool { // 2. Will check for existing applicable network policies and applies it on endpoint func (dp *DataPlane) updatePod(pod *updateNPMPod) error { klog.Infof("[DataPlane] updatePod called for Pod Key %s", pod.PodKey) - // Check if pod is part of this node - if pod.NodeName != dp.nodeName { - klog.Infof("[DataPlane] ignoring update pod as expected Node: [%s] got: [%s]", dp.nodeName, pod.NodeName) - return nil - } // lock the endpoint cache while we read/modify the endpoint with the pod's IP dp.endpointCache.Lock() @@ -167,7 +162,9 @@ func (dp *DataPlane) updatePod(pod *updateNPMPod) error { */ selectorReference, err := dp.ipsetMgr.GetSelectorReferencesBySet(setName) if err != nil { - return err + // ignore this set since it may have been deleted in the background reconcile thread + klog.Infof("[DataPlane] ignoring pod update for ipset to remove since the set does not exist. pod: %+v. set: %s", pod, setName) + continue } for policyKey := range selectorReference { @@ -204,7 +201,9 @@ func (dp *DataPlane) updatePod(pod *updateNPMPod) error { */ selectorReference, err := dp.ipsetMgr.GetSelectorReferencesBySet(setName) if err != nil { - return err + // ignore this set since it may have been deleted in the background reconcile thread + klog.Infof("[DataPlane] ignoring pod update for ipset to remove since the set does not exist. pod: %+v. set: %s", pod, setName) + continue } for policyKey := range selectorReference { From 45be65666b72022723929decff0023ec1e90fb74 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Thu, 5 Jan 2023 13:22:59 -0800 Subject: [PATCH 2/2] remove changes to updatePod tracking (on/off-node) --- npm/pkg/dataplane/dataplane.go | 6 ++---- npm/pkg/dataplane/dataplane_windows.go | 5 +++++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/npm/pkg/dataplane/dataplane.go b/npm/pkg/dataplane/dataplane.go index 5cfaa24f19..1d5a4ccdae 100644 --- a/npm/pkg/dataplane/dataplane.go +++ b/npm/pkg/dataplane/dataplane.go @@ -134,8 +134,7 @@ func (dp *DataPlane) AddToSets(setNames []*ipsets.IPSetMetadata, podMetadata *Po if err != nil { return fmt.Errorf("[DataPlane] error while adding to set: %w", err) } - - if dp.shouldUpdatePod() && podMetadata.NodeName == dp.nodeName { + if dp.shouldUpdatePod() { klog.Infof("[DataPlane] Updating Sets to Add for pod key %s", podMetadata.PodKey) // lock updatePodCache while reading/modifying or setting the updatePod in the cache @@ -163,7 +162,7 @@ func (dp *DataPlane) RemoveFromSets(setNames []*ipsets.IPSetMetadata, podMetadat return fmt.Errorf("[DataPlane] error while removing from set: %w", err) } - if dp.shouldUpdatePod() && podMetadata.NodeName == dp.nodeName { + if dp.shouldUpdatePod() { klog.Infof("[DataPlane] Updating Sets to Remove for pod key %s", podMetadata.PodKey) // lock updatePodCache while reading/modifying or setting the updatePod in the cache @@ -215,7 +214,6 @@ func (dp *DataPlane) ApplyDataPlane() error { } if dp.shouldUpdatePod() { - // NOTE: ideally we won't refresh Pod Endpoints if the updatePodCache is empty err := dp.refreshPodEndpoints() if err != nil { metrics.SendErrorLogAndMetric(util.DaemonDataplaneID, "[DataPlane] failed to refresh endpoints while updating pods. err: [%s]", err.Error()) diff --git a/npm/pkg/dataplane/dataplane_windows.go b/npm/pkg/dataplane/dataplane_windows.go index f4b59b2338..60552a3876 100644 --- a/npm/pkg/dataplane/dataplane_windows.go +++ b/npm/pkg/dataplane/dataplane_windows.go @@ -115,6 +115,11 @@ func (dp *DataPlane) shouldUpdatePod() bool { // 2. Will check for existing applicable network policies and applies it on endpoint func (dp *DataPlane) updatePod(pod *updateNPMPod) error { klog.Infof("[DataPlane] updatePod called for Pod Key %s", pod.PodKey) + // Check if pod is part of this node + if pod.NodeName != dp.nodeName { + klog.Infof("[DataPlane] ignoring update pod as expected Node: [%s] got: [%s]", dp.nodeName, pod.NodeName) + return nil + } // lock the endpoint cache while we read/modify the endpoint with the pod's IP dp.endpointCache.Lock()