From 1412e40e17175784c0c728701a8c8d770ffe2845 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Thu, 1 Sep 2022 00:48:50 -0700 Subject: [PATCH 1/2] do not remove policy from cache when updating pod --- npm/pkg/dataplane/dataplane.go | 2 +- npm/pkg/dataplane/policies/policymanager.go | 50 +++++++++++++++++-- .../policies/policymanager_linux_test.go | 10 ++-- .../dataplane/policies/policymanager_test.go | 4 +- .../policies/policymanager_windows.go | 3 ++ 5 files changed, 56 insertions(+), 13 deletions(-) diff --git a/npm/pkg/dataplane/dataplane.go b/npm/pkg/dataplane/dataplane.go index cf577b176a..a2514c2847 100644 --- a/npm/pkg/dataplane/dataplane.go +++ b/npm/pkg/dataplane/dataplane.go @@ -293,7 +293,7 @@ func (dp *DataPlane) RemovePolicy(policyKey string) error { return nil } // Use the endpoint list saved in cache for this network policy to remove - err := dp.policyMgr.RemovePolicy(policy.PolicyKey, nil) + err := dp.policyMgr.RemovePolicy(policy.PolicyKey) if err != nil { return fmt.Errorf("[DataPlane] error while removing policy: %w", err) } diff --git a/npm/pkg/dataplane/policies/policymanager.go b/npm/pkg/dataplane/policies/policymanager.go index 356e5d104c..b355fc5eaa 100644 --- a/npm/pkg/dataplane/policies/policymanager.go +++ b/npm/pkg/dataplane/policies/policymanager.go @@ -130,13 +130,18 @@ func (pMgr *PolicyManager) AddPolicy(policy *NPMNetworkPolicy, endpointList map[ metrics.RecordACLRuleExecTime(timer) // record execution time regardless of failure if err != nil { // NOTE: in Linux, Prometheus metrics may be off at this point since some ACL rules may have been applied successfully + // In Windows, Prometheus metrics may be off at this point since we don't know how many endpoints had rules applied successfully. msg := fmt.Sprintf("failed to add policy: %s", err.Error()) metrics.SendErrorLogAndMetric(util.IptmID, "error: %s", msg) return npmerrors.Errorf(npmerrors.AddPolicy, false, msg) } // update Prometheus metrics on success - metrics.IncNumACLRulesBy(policy.numACLRulesProducedInKernel()) + numEndpoints := 1 + if util.IsWindowsDP() { + numEndpoints = len(endpointList) + } + metrics.IncNumACLRulesBy(policy.numACLRulesProducedInKernel() * numEndpoints) pMgr.policyMap.cache[policy.PolicyKey] = policy return nil @@ -146,7 +151,7 @@ func (pMgr *PolicyManager) isFirstPolicy() bool { return len(pMgr.policyMap.cache) == 0 } -func (pMgr *PolicyManager) RemovePolicy(policyKey string, endpointList map[string]string) error { +func (pMgr *PolicyManager) RemovePolicy(policyKey string) error { policy, ok := pMgr.GetPolicy(policyKey) if !ok { @@ -162,22 +167,57 @@ func (pMgr *PolicyManager) RemovePolicy(policyKey string, endpointList map[strin defer pMgr.policyMap.Unlock() // Call actual dataplane function to apply changes - err := pMgr.removePolicy(policy, endpointList) + err := pMgr.removePolicy(policy, nil) // currently we only have acl rule exec time for "adding" rules, so we skip recording here if err != nil { - // NOTE: in Linux, Prometheus metrics may be off at this point since some ACL rules may have been applied successfully + // NOTE: in Linux, Prometheus metrics may be off at this point since some ACL rules may have been applied successfully. + // In Windows, Prometheus metrics may be off at this point since we don't know how many endpoints had rules applied successfully. msg := fmt.Sprintf("failed to remove policy: %s", err.Error()) metrics.SendErrorLogAndMetric(util.IptmID, "error: %s", msg) return npmerrors.Errorf(npmerrors.RemovePolicy, false, msg) } // update Prometheus metrics on success - metrics.DecNumACLRulesBy(policy.numACLRulesProducedInKernel()) + numEndpoints := 1 + if util.IsWindowsDP() { + numEndpoints = len(policy.PodEndpoints) + } + metrics.DecNumACLRulesBy(policy.numACLRulesProducedInKernel() * numEndpoints) + // remove policy from cache delete(pMgr.policyMap.cache, policyKey) return nil } +// RemovePolicyForEndpoints is identical to RemovePolicy except it will not remove the policy from the cache. +// This function is intended for Windows only. +func (pMgr *PolicyManager) RemovePolicyForEndpoints(policyKey string, endpointList map[string]string) error { + policy, ok := pMgr.GetPolicy(policyKey) + + if !ok { + return nil + } + + if len(policy.ACLs) == 0 { + klog.Infof("[DataPlane] No ACLs in policy %s to remove for endpoints", policyKey) + return nil + } + // Call actual dataplane function to apply changes + err := pMgr.removePolicy(policy, endpointList) + // currently we only have acl rule exec time for "adding" rules, so we skip recording here + if err != nil { + // NOTE: Prometheus metrics may be off at this point since we don't know how many endpoints had rules applied successfully. + msg := fmt.Sprintf("failed to remove policy. endpoints: [%+v]. err: [%s]", endpointList, err.Error()) + metrics.SendErrorLogAndMetric(util.IptmID, "error: %s", msg) + return npmerrors.Errorf(npmerrors.RemovePolicy, false, msg) + } + + // update Prometheus metrics on success + metrics.DecNumACLRulesBy(policy.numACLRulesProducedInKernel() * len(endpointList)) + + return nil +} + func (pMgr *PolicyManager) isLastPolicy() bool { // if we change our code to delete more than one policy at once, we can specify numPoliciesToDelete as an argument numPoliciesToDelete := 1 diff --git a/npm/pkg/dataplane/policies/policymanager_linux_test.go b/npm/pkg/dataplane/policies/policymanager_linux_test.go index 6d944890fa..9320074bc9 100644 --- a/npm/pkg/dataplane/policies/policymanager_linux_test.go +++ b/npm/pkg/dataplane/policies/policymanager_linux_test.go @@ -334,7 +334,7 @@ func TestRemovePoliciesAcceptableError(t *testing.T) { defer ioshim.VerifyCalls(t, calls) pMgr := NewPolicyManager(ioshim, ipsetConfig) require.NoError(t, pMgr.AddPolicy(bothDirectionsNetPol, epList)) - require.NoError(t, pMgr.RemovePolicy(bothDirectionsNetPol.PolicyKey, nil)) + require.NoError(t, pMgr.RemovePolicy(bothDirectionsNetPol.PolicyKey)) _, ok := pMgr.GetPolicy(bothDirectionsNetPol.PolicyKey) require.False(t, ok) promVals{0, 1}.testPrometheusMetrics(t) @@ -381,7 +381,7 @@ func TestRemovePoliciesError(t *testing.T) { pMgr := NewPolicyManager(ioshim, ipsetConfig) err := pMgr.AddPolicy(bothDirectionsNetPol, nil) require.NoError(t, err) - err = pMgr.RemovePolicy(bothDirectionsNetPol.PolicyKey, nil) + err = pMgr.RemovePolicy(bothDirectionsNetPol.PolicyKey) require.Error(t, err) promVals{6, 1}.testPrometheusMetrics(t) @@ -407,7 +407,7 @@ func TestUpdatingStaleChains(t *testing.T) { assertStaleChainsContain(t, pMgr.staleChains) // successful removal, so mark the policy's chains as stale - require.NoError(t, pMgr.RemovePolicy(bothDirectionsNetPol.PolicyKey, nil)) + require.NoError(t, pMgr.RemovePolicy(bothDirectionsNetPol.PolicyKey)) assertStaleChainsContain(t, pMgr.staleChains, bothDirectionsNetPolIngressChain, bothDirectionsNetPolEgressChain) // successful add, so keep the same stale chains @@ -415,7 +415,7 @@ func TestUpdatingStaleChains(t *testing.T) { assertStaleChainsContain(t, pMgr.staleChains, bothDirectionsNetPolIngressChain, bothDirectionsNetPolEgressChain) // failure to remove, so keep the same stale chains - require.Error(t, pMgr.RemovePolicy(ingressNetPol.PolicyKey, nil)) + require.Error(t, pMgr.RemovePolicy(ingressNetPol.PolicyKey)) assertStaleChainsContain(t, pMgr.staleChains, bothDirectionsNetPolIngressChain, bothDirectionsNetPolEgressChain) // successfully add a new policy. keep the same stale chains @@ -423,7 +423,7 @@ func TestUpdatingStaleChains(t *testing.T) { assertStaleChainsContain(t, pMgr.staleChains, bothDirectionsNetPolIngressChain, bothDirectionsNetPolEgressChain) // successful removal, so mark the policy's chains as stale - require.NoError(t, pMgr.RemovePolicy(egressNetPol.PolicyKey, nil)) + require.NoError(t, pMgr.RemovePolicy(egressNetPol.PolicyKey)) assertStaleChainsContain(t, pMgr.staleChains, bothDirectionsNetPolIngressChain, bothDirectionsNetPolEgressChain, egressNetPolChain) // failure to add, so keep the same stale chains the same diff --git a/npm/pkg/dataplane/policies/policymanager_test.go b/npm/pkg/dataplane/policies/policymanager_test.go index 7c59bd2e5c..31fbb9a242 100644 --- a/npm/pkg/dataplane/policies/policymanager_test.go +++ b/npm/pkg/dataplane/policies/policymanager_test.go @@ -172,7 +172,7 @@ func TestRemovePolicy(t *testing.T) { defer ioshim.VerifyCalls(t, calls) pMgr := NewPolicyManager(ioshim, ipsetConfig) require.NoError(t, pMgr.AddPolicy(testNetPol, epList)) - require.NoError(t, pMgr.RemovePolicy(testNetPol.PolicyKey, nil)) + require.NoError(t, pMgr.RemovePolicy(testNetPol.PolicyKey)) _, ok := pMgr.GetPolicy(testNetPol.PolicyKey) require.False(t, ok) promVals{0, 1}.testPrometheusMetrics(t) @@ -183,7 +183,7 @@ func TestRemoveNonexistentPolicy(t *testing.T) { metrics.ReinitializeAll() ioshim := common.NewMockIOShim(nil) pMgr := NewPolicyManager(ioshim, ipsetConfig) - require.NoError(t, pMgr.RemovePolicy("wrong-policy-key", epList)) + require.NoError(t, pMgr.RemovePolicy("wrong-policy-key")) promVals{0, 0}.testPrometheusMetrics(t) } diff --git a/npm/pkg/dataplane/policies/policymanager_windows.go b/npm/pkg/dataplane/policies/policymanager_windows.go index 1b68604d82..14f26a805b 100644 --- a/npm/pkg/dataplane/policies/policymanager_windows.go +++ b/npm/pkg/dataplane/policies/policymanager_windows.go @@ -144,6 +144,7 @@ func (pMgr *PolicyManager) removePolicy(policy *NPMNetworkPolicy, endpointList m if err != nil { return err } + // FIXME rulesToRemove is a list of pointers klog.Infof("[PolicyManagerWindows] To Remove Policy: %s \n To Delete ACLs: %+v \n To Remove From %+v endpoints", policy.PolicyKey, rulesToRemove, endpointList) // If remove bug is solved we can directly remove the exact policy from the endpoint // but if the bug is not solved then get all existing policies and remove relevant policies from list @@ -204,6 +205,7 @@ func (pMgr *PolicyManager) removePolicyByEndpointID(ruleID, epID string, noOfRul return nil } } + // FIXME epBuilder.aclPolicies is a list of pointers klog.Infof("[DataPlanewindows] Epbuilder ACL policies before removing %+v", epBuilder.aclPolicies) klog.Infof("[DataPlanewindows] Epbuilder Other policies before removing %+v", epBuilder.otherPolicies) epPolicies, err := epBuilder.getHCNPolicyRequest() @@ -246,6 +248,7 @@ func getEPPolicyReqFromACLSettings(settings []*NPMACLPolSettings) (hcn.PolicyEnd } for i, acl := range settings { + // FIXME a lot of prints klog.Infof("Acl settings: %+v", acl) byteACL, err := json.Marshal(acl) if err != nil { From 4c821b104412402dbbab00fc7b1490998489a658 Mon Sep 17 00:00:00 2001 From: Hunter Gregory Date: Thu, 1 Sep 2022 10:53:53 -0700 Subject: [PATCH 2/2] dp windows changes --- npm/pkg/dataplane/dataplane_windows.go | 2 +- npm/pkg/dataplane/policies/policymanager_windows_test.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/npm/pkg/dataplane/dataplane_windows.go b/npm/pkg/dataplane/dataplane_windows.go index 9cd38e6edb..f689491d5b 100644 --- a/npm/pkg/dataplane/dataplane_windows.go +++ b/npm/pkg/dataplane/dataplane_windows.go @@ -169,7 +169,7 @@ func (dp *DataPlane) updatePod(pod *updateNPMPod) error { endpointList := map[string]string{ endpoint.ip: endpoint.id, } - err := dp.policyMgr.RemovePolicy(policyKey, endpointList) + err := dp.policyMgr.RemovePolicyForEndpoints(policyKey, endpointList) if err != nil { return err } diff --git a/npm/pkg/dataplane/policies/policymanager_windows_test.go b/npm/pkg/dataplane/policies/policymanager_windows_test.go index 553c2d4317..f20ea4f08b 100644 --- a/npm/pkg/dataplane/policies/policymanager_windows_test.go +++ b/npm/pkg/dataplane/policies/policymanager_windows_test.go @@ -124,7 +124,7 @@ func TestRemovePolicies(t *testing.T) { verifyFakeHNSCacheACLs(t, expectedACLs, acls) } - err = pMgr.RemovePolicy(TestNetworkPolicies[0].PolicyKey, nil) + err = pMgr.RemovePolicy(TestNetworkPolicies[0].PolicyKey) require.NoError(t, err) verifyACLCacheIsCleaned(t, hns, len(endPointIDList)) } @@ -150,7 +150,7 @@ func TestRemovePoliciesEndpointNotFound(t *testing.T) { testendPointIDList := map[string]string{ "10.0.0.5": "test10", } - err = pMgr.RemovePolicy(TestNetworkPolicies[0].PolicyKey, testendPointIDList) + err = pMgr.RemovePolicyForEndpoints(TestNetworkPolicies[0].PolicyKey, testendPointIDList) require.NoError(t, err, err) verifyACLCacheIsCleaned(t, hns, len(endPointIDList)) }