From f3b66f86acb3563d5e4e546a0ed29ac933a5cd8a Mon Sep 17 00:00:00 2001 From: Vamsi Kalapala Date: Mon, 22 Nov 2021 10:41:33 -0800 Subject: [PATCH 1/2] feat: [NPM] adding windows ACL Policy reset support --- npm/pkg/dataplane/dataplane.go | 7 - npm/pkg/dataplane/dataplane_linux.go | 8 ++ npm/pkg/dataplane/dataplane_windows.go | 31 ++++- .../policies/chain-management_linux.go | 2 +- npm/pkg/dataplane/policies/policymanager.go | 4 +- .../policies/policymanager_windows.go | 123 +++++++++++++----- test/integration/npm/main.go | 2 +- 7 files changed, 127 insertions(+), 50 deletions(-) diff --git a/npm/pkg/dataplane/dataplane.go b/npm/pkg/dataplane/dataplane.go index 5baf87eab7..d3e690aaf5 100644 --- a/npm/pkg/dataplane/dataplane.go +++ b/npm/pkg/dataplane/dataplane.go @@ -101,13 +101,6 @@ func (dp *DataPlane) InitializeDataPlane() error { // ResetDataPlane helps in cleaning up dataplane sets and policies programmed // by NPM, returning a clean slate func (dp *DataPlane) ResetDataPlane() error { - // It is important to keep order to clean-up ACLs before ipsets. Otherwise we won't be able to delete ipsets referenced by ACLs - if err := dp.policyMgr.Reset(); err != nil { - return npmerrors.ErrorWrapper(npmerrors.ResetDataPlane, false, "failed to reset policy dataplane", err) - } - if err := dp.ipsetMgr.ResetIPSets(); err != nil { - return npmerrors.ErrorWrapper(npmerrors.ResetDataPlane, false, "failed to reset ipsets dataplane", err) - } return dp.resetDataPlane() } diff --git a/npm/pkg/dataplane/dataplane_linux.go b/npm/pkg/dataplane/dataplane_linux.go index bd4ebad9cf..319b68f56f 100644 --- a/npm/pkg/dataplane/dataplane_linux.go +++ b/npm/pkg/dataplane/dataplane_linux.go @@ -2,6 +2,7 @@ package dataplane import ( "github.com/Azure/azure-container-networking/npm/pkg/dataplane/policies" + npmerrors "github.com/Azure/azure-container-networking/npm/util/errors" "k8s.io/klog" ) @@ -26,5 +27,12 @@ func (dp *DataPlane) updatePod(pod *updateNPMPod) error { } func (dp *DataPlane) resetDataPlane() error { + // It is important to keep order to clean-up ACLs before ipsets. Otherwise we won't be able to delete ipsets referenced by ACLs + if err := dp.policyMgr.Reset(nil); err != nil { + return npmerrors.ErrorWrapper(npmerrors.ResetDataPlane, false, "failed to reset policy dataplane", err) + } + if err := dp.ipsetMgr.ResetIPSets(); err != nil { + return npmerrors.ErrorWrapper(npmerrors.ResetDataPlane, false, "failed to reset ipsets dataplane", err) + } return nil } diff --git a/npm/pkg/dataplane/dataplane_windows.go b/npm/pkg/dataplane/dataplane_windows.go index 91c5274e05..491558876d 100644 --- a/npm/pkg/dataplane/dataplane_windows.go +++ b/npm/pkg/dataplane/dataplane_windows.go @@ -4,6 +4,7 @@ import ( "fmt" "github.com/Azure/azure-container-networking/npm/pkg/dataplane/policies" + npmerrors "github.com/Azure/azure-container-networking/npm/util/errors" "github.com/Microsoft/hcsshim/hcn" "k8s.io/klog" ) @@ -42,6 +43,24 @@ func (dp *DataPlane) initializeDataPlane() error { return nil } +func (dp *DataPlane) resetDataPlane() error { + // initialize the DP so the podendpoints will get updated. + if err := dp.initializeDataPlane(); err != nil { + return err + } + + epIDs := dp.getAllEndpointIDs() + + // It is important to keep order to clean-up ACLs before ipsets. Otherwise we won't be able to delete ipsets referenced by ACLs + if err := dp.policyMgr.Reset(epIDs); err != nil { + return npmerrors.ErrorWrapper(npmerrors.ResetDataPlane, false, "failed to reset policy dataplane", err) + } + if err := dp.ipsetMgr.ResetIPSets(); err != nil { + return npmerrors.ErrorWrapper(npmerrors.ResetDataPlane, false, "failed to reset ipsets dataplane", err) + } + return nil +} + func (dp *DataPlane) shouldUpdatePod() bool { return true } @@ -195,10 +214,6 @@ func (dp *DataPlane) getEndpointsToApplyPolicy(policy *policies.NPMNetworkPolicy return endpointList, nil } -func (dp *DataPlane) resetDataPlane() error { - return nil -} - func (dp *DataPlane) getAllPodEndpoints() ([]hcn.HostComputeEndpoint, error) { klog.Infof("Getting all endpoints for Network ID %s", dp.networkID) endpoints, err := dp.ioShim.Hns.ListEndpointsOfNetwork(dp.networkID) @@ -269,3 +284,11 @@ func (dp *DataPlane) getEndpointByIP(podIP string) (*NPMEndpoint, error) { return nil, nil } + +func (dp *DataPlane) getAllEndpointIDs() []string { + endpointIDs := make([]string, 0, len(dp.endpointCache)) + for _, endpoint := range dp.endpointCache { + endpointIDs = append(endpointIDs, endpoint.ID) + } + return endpointIDs +} diff --git a/npm/pkg/dataplane/policies/chain-management_linux.go b/npm/pkg/dataplane/policies/chain-management_linux.go index d25d8407d7..ece94acfa8 100644 --- a/npm/pkg/dataplane/policies/chain-management_linux.go +++ b/npm/pkg/dataplane/policies/chain-management_linux.go @@ -100,7 +100,7 @@ func (pMgr *PolicyManager) initialize() error { return nil } -func (pMgr *PolicyManager) reset() error { +func (pMgr *PolicyManager) reset(_ []string) error { if err := pMgr.removeNPMChains(); err != nil { return npmerrors.SimpleErrorWrapper("failed to remove NPM chains", err) } diff --git a/npm/pkg/dataplane/policies/policymanager.go b/npm/pkg/dataplane/policies/policymanager.go index 86c5d0eb4b..9ebc4d118d 100644 --- a/npm/pkg/dataplane/policies/policymanager.go +++ b/npm/pkg/dataplane/policies/policymanager.go @@ -56,8 +56,8 @@ type PolicyManagerCfg struct { Mode PolicyManagerMode } -func (pMgr *PolicyManager) Reset() error { - if err := pMgr.reset(); err != nil { +func (pMgr *PolicyManager) Reset(epIDs []string) error { + if err := pMgr.reset(epIDs); err != nil { return npmerrors.ErrorWrapper(npmerrors.ResetPolicyMgr, false, "failed to reset policy manager", err) } return nil diff --git a/npm/pkg/dataplane/policies/policymanager_windows.go b/npm/pkg/dataplane/policies/policymanager_windows.go index fb206a438d..84f773efaa 100644 --- a/npm/pkg/dataplane/policies/policymanager_windows.go +++ b/npm/pkg/dataplane/policies/policymanager_windows.go @@ -4,18 +4,23 @@ import ( "encoding/json" "errors" "fmt" + "strings" "github.com/Microsoft/hcsshim/hcn" "k8s.io/klog" ) var ( - ErrFailedMarshalACLSettings = errors.New("Failed to marshal ACL settings") - ErrFailedUnMarshalACLSettings = errors.New("Failed to unmarshal ACL settings") + ErrFailedMarshalACLSettings = errors.New("Failed to marshal ACL settings") + ErrFailedUnMarshalACLSettings = errors.New("Failed to unmarshal ACL settings") + resetAllACLs shouldResetAllACLs = true + removeOnlyGivenPolicy shouldResetAllACLs = false ) type staleChains struct{} // unused in Windows +type shouldResetAllACLs bool + type endpointPolicyBuilder struct { aclPolicies []*NPMACLPolSettings otherPolicies []hcn.EndpointPolicy @@ -35,9 +40,16 @@ func (pMgr *PolicyManager) initialize() error { return nil } -func (pMgr *PolicyManager) reset() error { - // TODO - return nil +func (pMgr *PolicyManager) reset(epIDs []string) error { + var aggregateErr error + for _, epID := range epIDs { + err := pMgr.removePolicyByEndpointID("", epID, 0, resetAllACLs) + if err != nil { + aggregateErr = fmt.Errorf("[DataPlane Windows] Skipping removing policies on %s ID Endpoint with %s err\n Previous %w", epID, err.Error(), aggregateErr) + continue + } + } + return aggregateErr } func (pMgr *PolicyManager) reconcile() { @@ -120,44 +132,62 @@ func (pMgr *PolicyManager) removePolicy(policy *NPMNetworkPolicy, endpointList m // but if the bug is not solved then get all existing policies and remove relevant policies from list // then apply remaining policies onto the endpoint var aggregateErr error + noOfRulesToRemove := len(rulesToRemove) for epIPAddr, epID := range endpointList { - epObj, err := pMgr.getEndpointByID(epID) + err := pMgr.removePolicyByEndpointID(rulesToRemove[0].Id, epID, noOfRulesToRemove, removeOnlyGivenPolicy) if err != nil { - // Do not return if one endpoint fails, try all endpoints. - // aggregate the error message and return it at the end aggregateErr = fmt.Errorf("[DataPlane Windows] Skipping removing policies on %s ID Endpoint with %s err\n Previous %w", epID, err.Error(), aggregateErr) continue } - if len(epObj.Policies) == 0 { - klog.Infof("[DataPlanewindows] No Policies to remove on %s ID Endpoint", epID) - continue - } - epBuilder, err := splitEndpointPolicies(epObj.Policies) - if err != nil { - aggregateErr = fmt.Errorf("[DataPlane Windows] Skipping removing policies on %s ID Endpoint with %s err\n Previous %w", epID, err.Error(), aggregateErr) - continue - } + // Delete podendpoint from policy cache + delete(policy.PodEndpoints, epIPAddr) + } - epBuilder.compareAndRemovePolicies(rulesToRemove[0].Id, len(rulesToRemove)) - klog.Infof("[DataPlanewindows] Epbuilder ACL policies before removing %+v", epBuilder.aclPolicies) - klog.Infof("[DataPlanewindows] Epbuilder Other policies before removing %+v", epBuilder.otherPolicies) - epPolicies, err := epBuilder.getHCNPolicyRequest() - if err != nil { - aggregateErr = fmt.Errorf("[DataPlanewindows] Skipping removing policies on %s ID Endpoint with %s err\n Previous %w", epID, err.Error(), aggregateErr) - continue - } + return aggregateErr +} - err = pMgr.updatePoliciesOnEndpoint(epObj, epPolicies) - if err != nil { - aggregateErr = fmt.Errorf("[DataPlanewindows] Skipping removing policies on %s ID Endpoint with %s err\n Previous %w", epID, err.Error(), aggregateErr) - continue - } +func (pMgr *PolicyManager) removePolicyByEndpointID(ruleID, epID string, noOfRulesToRemove int, resetAllACL shouldResetAllACLs) error { + epObj, err := pMgr.getEndpointByID(epID) + var aggregateErr error + if err != nil { + // Do not return if one endpoint fails, try all endpoints. + // aggregate the error message and return it at the end + aggregateErr = fmt.Errorf("[DataPlane Windows] Skipping removing policies on %s ID Endpoint with %s err\n Previous %w", epID, err.Error(), aggregateErr) + } + if len(epObj.Policies) == 0 { + klog.Infof("[DataPlanewindows] No Policies to remove on %s ID Endpoint", epID) + } - // Delete podendpoint from policy cache - delete(policy.PodEndpoints, epIPAddr) + epBuilder, err := splitEndpointPolicies(epObj.Policies) + if err != nil { + aggregateErr = fmt.Errorf("[DataPlane Windows] Skipping removing policies on %s ID Endpoint with %s err\n Previous %w", epID, err.Error(), aggregateErr) + } + + if resetAllACL { + klog.Infof("[DataPlane Windows] Resetting all ACL Policies on %s ID Endpoint", epID) + if !epBuilder.resetAllNPMAclPolicies() { + klog.Infof("[DataPlane Windows] No Azure-NPM ACL Policies on %s ID Endpoint to reset", epID) + return nil + } + } else { + klog.Infof("[DataPlane Windows] Resetting only ACL Policies with %s ID on %s ID Endpoint", ruleID, epID) + if !epBuilder.compareAndRemovePolicies(ruleID, noOfRulesToRemove) { + klog.Infof("[DataPlane Windows] No Policies with ID %s on %s ID Endpoint", ruleID, epID) + return nil + } + } + klog.Infof("[DataPlanewindows] Epbuilder ACL policies before removing %+v", epBuilder.aclPolicies) + klog.Infof("[DataPlanewindows] Epbuilder Other policies before removing %+v", epBuilder.otherPolicies) + epPolicies, err := epBuilder.getHCNPolicyRequest() + if err != nil { + aggregateErr = fmt.Errorf("[DataPlanewindows] Skipping removing policies on %s ID Endpoint with %s err\n Previous %w", epID, err.Error(), aggregateErr) } + err = pMgr.updatePoliciesOnEndpoint(epObj, epPolicies) + if err != nil { + aggregateErr = fmt.Errorf("[DataPlanewindows] Skipping removing policies on %s ID Endpoint with %s err\n Previous %w", epID, err.Error(), aggregateErr) + } return aggregateErr } @@ -283,12 +313,13 @@ func (epBuilder *endpointPolicyBuilder) compareAndRemovePolicies(ruleIDToRemove aclFound = true } } - epBuilder.removeACLPolicyAtIndex(toDeleteIndexes) // If ACl Policies are not found, it means that we might have removed them earlier // or never applied them if !aclFound { klog.Infof("[DataPlane Windows] ACL with ID %s is not Found in Dataplane", ruleIDToRemove) + return aclFound } + epBuilder.removeACLPolicyAtIndex(toDeleteIndexes) // if there are still rules to remove, it means that we might have not added all the policies in the add // case and were only able to find a portion of the rules to remove if lenOfRulesToRemove > 0 { @@ -297,11 +328,33 @@ func (epBuilder *endpointPolicyBuilder) compareAndRemovePolicies(ruleIDToRemove return aclFound } -func (epBuilder *endpointPolicyBuilder) resetAllNPMAclPolicies() { - epBuilder.aclPolicies = []*NPMACLPolSettings{} +func (epBuilder *endpointPolicyBuilder) resetAllNPMAclPolicies() bool { + if len(epBuilder.aclPolicies) == 0 { + return false + } + aclFound := false + toDeleteIndexes := map[int]struct{}{} + for i, acl := range epBuilder.aclPolicies { + // First check if ID is present and equal, this saves compute cycles to compare both objects + if strings.HasPrefix(acl.Id, policyIDPrefix) { + // Remove the ACL policy from the list + klog.Infof("[DataPlane Windows] Found ACL with ID %s and removing it", acl.Id) + toDeleteIndexes[i] = struct{}{} + aclFound = true + } + } + if len(toDeleteIndexes) == len(epBuilder.aclPolicies) { + epBuilder.aclPolicies = []*NPMACLPolSettings{} + return aclFound + } + epBuilder.removeACLPolicyAtIndex(toDeleteIndexes) + return aclFound } func (epBuilder *endpointPolicyBuilder) removeACLPolicyAtIndex(indexes map[int]struct{}) { + if len(indexes) == 0 { + return + } tempAclPolicies := []*NPMACLPolSettings{} for i, acl := range epBuilder.aclPolicies { if _, ok := indexes[i]; !ok { diff --git a/test/integration/npm/main.go b/test/integration/npm/main.go index 98021f2996..c0243bf3bb 100644 --- a/test/integration/npm/main.go +++ b/test/integration/npm/main.go @@ -149,7 +149,7 @@ func main() { func testPolicyManager() { pMgr := policies.NewPolicyManager(common.NewIOShim()) - panicOnError(pMgr.Reset()) + panicOnError(pMgr.Reset(nil)) printAndWait(false) panicOnError(pMgr.Initialize()) From 9b1e85f5bd8eee37eae5d57813a9a1cb5cf0f1b7 Mon Sep 17 00:00:00 2001 From: Vamsi Kalapala Date: Wed, 1 Dec 2021 13:16:33 -0800 Subject: [PATCH 2/2] Applying some comments --- .../dataplane/policies/policymanager_windows.go | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/npm/pkg/dataplane/policies/policymanager_windows.go b/npm/pkg/dataplane/policies/policymanager_windows.go index 84f773efaa..c7c492bda1 100644 --- a/npm/pkg/dataplane/policies/policymanager_windows.go +++ b/npm/pkg/dataplane/policies/policymanager_windows.go @@ -132,9 +132,9 @@ func (pMgr *PolicyManager) removePolicy(policy *NPMNetworkPolicy, endpointList m // but if the bug is not solved then get all existing policies and remove relevant policies from list // then apply remaining policies onto the endpoint var aggregateErr error - noOfRulesToRemove := len(rulesToRemove) + numOfRulesToRemove := len(rulesToRemove) for epIPAddr, epID := range endpointList { - err := pMgr.removePolicyByEndpointID(rulesToRemove[0].Id, epID, noOfRulesToRemove, removeOnlyGivenPolicy) + err := pMgr.removePolicyByEndpointID(rulesToRemove[0].Id, epID, numOfRulesToRemove, removeOnlyGivenPolicy) if err != nil { aggregateErr = fmt.Errorf("[DataPlane Windows] Skipping removing policies on %s ID Endpoint with %s err\n Previous %w", epID, err.Error(), aggregateErr) continue @@ -149,11 +149,8 @@ func (pMgr *PolicyManager) removePolicy(policy *NPMNetworkPolicy, endpointList m func (pMgr *PolicyManager) removePolicyByEndpointID(ruleID, epID string, noOfRulesToRemove int, resetAllACL shouldResetAllACLs) error { epObj, err := pMgr.getEndpointByID(epID) - var aggregateErr error if err != nil { - // Do not return if one endpoint fails, try all endpoints. - // aggregate the error message and return it at the end - aggregateErr = fmt.Errorf("[DataPlane Windows] Skipping removing policies on %s ID Endpoint with %s err\n Previous %w", epID, err.Error(), aggregateErr) + return fmt.Errorf("[DataPlane Windows] Skipping removing policies on %s ID Endpoint with %s err", epID, err.Error()) } if len(epObj.Policies) == 0 { klog.Infof("[DataPlanewindows] No Policies to remove on %s ID Endpoint", epID) @@ -161,7 +158,7 @@ func (pMgr *PolicyManager) removePolicyByEndpointID(ruleID, epID string, noOfRul epBuilder, err := splitEndpointPolicies(epObj.Policies) if err != nil { - aggregateErr = fmt.Errorf("[DataPlane Windows] Skipping removing policies on %s ID Endpoint with %s err\n Previous %w", epID, err.Error(), aggregateErr) + return fmt.Errorf("[DataPlane Windows] Skipping removing policies on %s ID Endpoint with %s err", epID, err.Error()) } if resetAllACL { @@ -181,14 +178,14 @@ func (pMgr *PolicyManager) removePolicyByEndpointID(ruleID, epID string, noOfRul klog.Infof("[DataPlanewindows] Epbuilder Other policies before removing %+v", epBuilder.otherPolicies) epPolicies, err := epBuilder.getHCNPolicyRequest() if err != nil { - aggregateErr = fmt.Errorf("[DataPlanewindows] Skipping removing policies on %s ID Endpoint with %s err\n Previous %w", epID, err.Error(), aggregateErr) + return fmt.Errorf("[DataPlanewindows] Skipping removing policies on %s ID Endpoint with %s err", epID, err.Error()) } err = pMgr.updatePoliciesOnEndpoint(epObj, epPolicies) if err != nil { - aggregateErr = fmt.Errorf("[DataPlanewindows] Skipping removing policies on %s ID Endpoint with %s err\n Previous %w", epID, err.Error(), aggregateErr) + return fmt.Errorf("[DataPlanewindows] Skipping removing policies on %s ID Endpoint with %s err", epID, err.Error()) } - return aggregateErr + return nil } // addEPPolicyWithEpID given an EP ID and a list of policies, add the policies to the endpoint